Example code - Functions
File used in example
Finding consensus sequence from protein alignment
This simple consensus sequence finder simply looks at each position on all the sequences in an alignment and picks the most frequent occurring amino acid as the consensus for that position. In case of tie, the first one according to the sorting algorithm is chosen. Stupid and random, but a good practical example of functions.
#!/usr/bin/python3 import sys, re # A general error function that is called whenever a problem occurs. # Shows a message if one exist, and prints how to use the program. def usage(message=''): '''A usage message to be displayed when the user gives faulty input''' if message != '': print(message) print("Usage: consensusaa.py <filename>") print("Calculates the consensus sequence of an alignment i fasta format.") print("Sounds cool, but it is really just displaying the most frequent") print("amino acid in any position of the aligned sequences.") sys.exit(1) # A function to read a fasta file, if given the filename # Returns the headers and sequences as two separate lists def readFasta(filename): '''Reads a fasta file and returns 2 lists, one with headers and one with sequences.''' headers = list() sequences = list() try: fastafile = open(filename, 'r') seqlines = list() for line in fastafile: if line.startswith('>'): if len(headers) != 0: sequences.append(''.join(seqlines)) seqlines.clear() headers.append(line.rstrip()) else: # efficient one-liner to get rid of all whitespace line = ''.join(line.split()) if re.search(r'[^A-Z-]', line): usage("Illegal char in protein sequence: " + line) seqlines.append(line) sequences.append(''.join(seqlines)) fastafile.close() except IOError as err: usage("Can't open file (" + filename + "), reason: " + str(err)) return(headers, sequences) # An alignment has by nature the same sequence length in all sequences. # This is verified here, when given a list of the aligned sequences. def verifyAlignment(seqlist): '''Test if all sequences have the same length.''' if len(seqlist) == 0: usage("Empty fasta file") length = len(seqlist[0]) for seq in seqlist: if length != len(seq): usage("Aligned sequences do not have the same length") return None # Finding the most frequent occurring char in a list of sequences # and returns the consensus string of chars. def findConsensus(seqlist): '''Finding consensus of an alignment, winner takes all on any position''' consensus = '' for position in range(len(seqlist[0])): chardict = dict() for seqno in range(len(seqlist)): char = seqlist[seqno][position] if char in chardict: chardict[char] += 1 else: chardict[char] = 1 # Top pick consensus += sorted(chardict.keys(), reverse=True, key=chardict.get)[0] return consensus # Main program if len(sys.argv) == 1: # no commandline arguments usage('Missing filename') (headers, sequences) = readFasta(sys.argv[1]) verifyAlignment(sequences) sequence = findConsensus(sequences) print("The consensus sequence is:") print(sequence)