<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://teaching.healthtech.dtu.dk:443/22112/index.php?action=history&amp;feed=atom&amp;title=Hash_usage</id>
	<title>Hash usage - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://teaching.healthtech.dtu.dk:443/22112/index.php?action=history&amp;feed=atom&amp;title=Hash_usage"/>
	<link rel="alternate" type="text/html" href="https://teaching.healthtech.dtu.dk:443/22112/index.php?title=Hash_usage&amp;action=history"/>
	<updated>2026-04-29T03:51:54Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.41.0</generator>
	<entry>
		<id>https://teaching.healthtech.dtu.dk:443/22112/index.php?title=Hash_usage&amp;diff=55&amp;oldid=prev</id>
		<title>WikiSysop at 09:33, 17 June 2024</title>
		<link rel="alternate" type="text/html" href="https://teaching.healthtech.dtu.dk:443/22112/index.php?title=Hash_usage&amp;diff=55&amp;oldid=prev"/>
		<updated>2024-06-17T09:33:15Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table style=&quot;background-color: #fff; color: #202122;&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;Revision as of 11:33, 17 June 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l1&quot;&gt;Line 1:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 1:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;{| width=500  style=&amp;quot;float:right; margin-left: 10px; margin-top: -56px;&amp;quot;&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;{| width=500  style=&amp;quot;float:right; margin-left: 10px; margin-top: -56px;&amp;quot;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|Previous: [[&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;MapReduce and &lt;/del&gt;Binary representation]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|Previous: [[Binary representation]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|Next: [[Exercises and Organization]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|Next: [[Exercises and Organization]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|}&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|}&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l8&quot;&gt;Line 8:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 8:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=df2d4e4c-5e16-4671-bba0-af170070db98 Count-min Sketch, a probabilistic counter]&amp;lt;br&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=df2d4e4c-5e16-4671-bba0-af170070db98 Count-min Sketch, a probabilistic counter]&amp;lt;br&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=dac4011c-dec5-4c3f-8b8c-af170070b46d MinHash, a probabilistic set similarity score]&amp;lt;br&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=dac4011c-dec5-4c3f-8b8c-af170070b46d MinHash, a probabilistic set similarity score]&amp;lt;br&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Powerpoint: [https://teaching.healthtech.dtu.dk/material/22112/&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;HPCLife09&lt;/del&gt;-Hashes.ppt The use of hash]&amp;lt;br&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Powerpoint: [https://teaching.healthtech.dtu.dk/material/22112/&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;22112_12&lt;/ins&gt;-Hashes.ppt The use of hash]&amp;lt;br&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=e6c44ada-9074-4821-bb91-af1700708f13 Exercises]&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=e6c44ada-9074-4821-bb91-af1700708f13 Exercises]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>WikiSysop</name></author>
	</entry>
	<entry>
		<id>https://teaching.healthtech.dtu.dk:443/22112/index.php?title=Hash_usage&amp;diff=26&amp;oldid=prev</id>
		<title>WikiSysop: Created page with &quot;{| width=500  style=&quot;float:right; margin-left: 10px; margin-top: -56px;&quot; |Previous: MapReduce and Binary representation |Next: Exercises and Organization |} == Material for the lesson == Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=f253830c-9a70-4aaa-b61e-af170071451a Hashes, properties and use]&lt;br&gt; Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=52b59412-1df9-4e2b-b8de-af1700710cab Bloom filter, a probabilistic set]&lt;br&gt; Video: [https:/...&quot;</title>
		<link rel="alternate" type="text/html" href="https://teaching.healthtech.dtu.dk:443/22112/index.php?title=Hash_usage&amp;diff=26&amp;oldid=prev"/>
		<updated>2024-03-06T10:57:50Z</updated>

		<summary type="html">&lt;p&gt;Created page with &amp;quot;{| width=500  style=&amp;quot;float:right; margin-left: 10px; margin-top: -56px;&amp;quot; |Previous: &lt;a href=&quot;/22112/index.php/MapReduce_and_Binary_representation&quot; title=&quot;MapReduce and Binary representation&quot;&gt;MapReduce and Binary representation&lt;/a&gt; |Next: &lt;a href=&quot;/22112/index.php/Exercises_and_Organization&quot; title=&quot;Exercises and Organization&quot;&gt;Exercises and Organization&lt;/a&gt; |} == Material for the lesson == Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=f253830c-9a70-4aaa-b61e-af170071451a Hashes, properties and use]&amp;lt;br&amp;gt; Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=52b59412-1df9-4e2b-b8de-af1700710cab Bloom filter, a probabilistic set]&amp;lt;br&amp;gt; Video: [https:/...&amp;quot;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;{| width=500  style=&amp;quot;float:right; margin-left: 10px; margin-top: -56px;&amp;quot;&lt;br /&gt;
|Previous: [[MapReduce and Binary representation]]&lt;br /&gt;
|Next: [[Exercises and Organization]]&lt;br /&gt;
|}&lt;br /&gt;
== Material for the lesson ==&lt;br /&gt;
Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=f253830c-9a70-4aaa-b61e-af170071451a Hashes, properties and use]&amp;lt;br&amp;gt;&lt;br /&gt;
Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=52b59412-1df9-4e2b-b8de-af1700710cab Bloom filter, a probabilistic set]&amp;lt;br&amp;gt;&lt;br /&gt;
Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=df2d4e4c-5e16-4671-bba0-af170070db98 Count-min Sketch, a probabilistic counter]&amp;lt;br&amp;gt;&lt;br /&gt;
Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=dac4011c-dec5-4c3f-8b8c-af170070b46d MinHash, a probabilistic set similarity score]&amp;lt;br&amp;gt;&lt;br /&gt;
Powerpoint: [https://teaching.healthtech.dtu.dk/material/22112/HPCLife09-Hashes.ppt The use of hash]&amp;lt;br&amp;gt;&lt;br /&gt;
Video: [https://panopto.dtu.dk/Panopto/Pages/Viewer.aspx?id=e6c44ada-9074-4821-bb91-af1700708f13 Exercises]&lt;br /&gt;
&lt;br /&gt;
== Exercises ==&lt;br /&gt;
&amp;#039;&amp;#039;&amp;#039;Determine if unknown DNA is human or not&amp;#039;&amp;#039;&amp;#039;&amp;lt;br&amp;gt;&lt;br /&gt;
The way to go about this is to create a Bloom filter containing all 30-mers from the human.fsa file and then query that with the unknown DNA in the mixeddna.fsa file. A 30-mer is chosen, as it is very likely unique.&lt;br /&gt;
Make two programs; one that creates the Bloom filter from the human DNA and saves it to a file, and one program that reads in the filter from file, and queries it based on an input fasta file.&lt;br /&gt;
&lt;br /&gt;
&amp;#039;&amp;#039;&amp;#039;1)&amp;#039;&amp;#039;&amp;#039; Computing the Bloom filter parameters&amp;lt;br&amp;gt;&lt;br /&gt;
The false positive rate, &amp;#039;&amp;#039;&amp;#039;p&amp;#039;&amp;#039;&amp;#039; must be 1% or lower. Figure out the size of the input, i.e. approximately how many elements will be put in the filter, &amp;#039;&amp;#039;&amp;#039;n&amp;#039;&amp;#039;&amp;#039;. Compute the size of the filter, &amp;#039;&amp;#039;&amp;#039;m&amp;#039;&amp;#039;&amp;#039;, and the number of hash functions, &amp;#039;&amp;#039;&amp;#039;k&amp;#039;&amp;#039;&amp;#039; necessary. Adjust values so they fit your purpose, and make a final calculation of &amp;#039;&amp;#039;&amp;#039;p&amp;#039;&amp;#039;&amp;#039;. Pick hash function(s) and explain your choice. &lt;br /&gt;
&lt;br /&gt;
&amp;#039;&amp;#039;&amp;#039;2)&amp;#039;&amp;#039;&amp;#039; Making the Bloom filter&amp;lt;br&amp;gt;&lt;br /&gt;
You can use previous functions and ideas. The Bloom filter must be a bytearray and it can be written directly to a binary file.&lt;br /&gt;
Since Bloom filters can be added together then you can parallelize the filter construction per chromosome. This is not required. Realize that using joblib creates several filters and they are sizable, i.e. running out of memory is possible.&lt;br /&gt;
&lt;br /&gt;
To help out a bit I have created 3 functions: one that takes a k-mer (as a bytes object) and gives back an integer representing a 32 byte (256 bit) integer number -&lt;br /&gt;
the hash-number. One that finds the next position of the bit to set, given a hash-number, and one that sets a bit in a bytearray given positions. These functions are here to increase your understanding, not the programs performance. Using them as they are will not give you the performance boost you need.&lt;br /&gt;
I used 7 hours mainly in calculating the Bloom filter in a not-parallel program.&lt;br /&gt;
&amp;lt;pre&amp;gt;&lt;br /&gt;
import sys, hashlib&lt;br /&gt;
&lt;br /&gt;
# Computes a hash based on a kmer&lt;br /&gt;
def hashit(kmer):&lt;br /&gt;
    myhashobj = hashlib.sha256()&lt;br /&gt;
    myhashobj.update(kmer)&lt;br /&gt;
    return int.from_bytes(myhashobj.digest(), byteorder=sys.byteorder)&lt;br /&gt;
&lt;br /&gt;
# The size of the bit field (the slice of bits taken from the hash, how many bits you need)&lt;br /&gt;
bitfieldsize = xxxx # your number that correlates with m.&lt;br /&gt;
&lt;br /&gt;
# &amp;quot;eats&amp;quot; a slice from the hash and returns the position in the bloom filter that slice corresponds to.&lt;br /&gt;
# Also returns the &amp;quot;leftover&amp;quot; hash, so it can be used again for a new position.&lt;br /&gt;
def nextposition(hashnumber):&lt;br /&gt;
    position = hashnumber &amp;amp; (2**bitfieldsize - 1)&lt;br /&gt;
    byteposition = position &amp;gt;&amp;gt; 3&lt;br /&gt;
    bitposition = position &amp;amp; 7&lt;br /&gt;
    hashnumber &amp;gt;&amp;gt;= bitfieldsize # discarding the used hash slice&lt;br /&gt;
    return(hashnumber, byteposition, bitposition)&lt;br /&gt;
&lt;br /&gt;
# Sets the bit in the bloom filter&lt;br /&gt;
def setbit(bloomfilter, byteposition, bitposition):&lt;br /&gt;
    bloomfilter[byteposition] |= 1 &amp;lt;&amp;lt; bitposition&lt;br /&gt;
&amp;lt;/pre&amp;gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&amp;#039;&amp;#039;&amp;#039;3)&amp;#039;&amp;#039;&amp;#039; Querying the Bloom filter.&lt;br /&gt;
The Bloom filter can/should be loaded with one read. When querying for sequence, also try the reverse complement. Query all 30-mers in the input fasta file, mixeddna.fsa,&lt;br /&gt;
and output the no_of_hits/no_of_kmers ratio in percent per sequence. If near 100%, it is human DNA, else not.&lt;br /&gt;
&lt;br /&gt;
&amp;lt;pre&amp;gt;&lt;br /&gt;
# This function return True if a bit is set on the position given.&lt;br /&gt;
def isbitset(bloomfilter, byteposition, bitposition):&lt;br /&gt;
    return (bloomfilter[byteposition] &amp;amp; (1 &amp;lt;&amp;lt; bitposition)) != 0&lt;br /&gt;
&amp;lt;/pre&amp;gt;&lt;/div&gt;</summary>
		<author><name>WikiSysop</name></author>
	</entry>
</feed>