Example code - Functions

From 22113
Jump to navigation Jump to search

File used in example

aln7.fsa

Finding consensus sequence from protein alignment

This simple consensus sequence finder simply looks at each position on all the sequences in an alignment and picks the most frequent occurring amino acid as the consensus for that position. In case of tie, the first one according to the sorting algorithm is chosen. Stupid and random, but a good practical example of functions.

#!/usr/bin/python3
import sys, re

# A general error function that is called whenever a problem occurs.
# Shows a message if one exist, and prints how to use the program.
def usage(message=''):
    '''A usage message to be displayed when the user gives faulty input'''
    if message != '':
        print(message)
    print("Usage: consensusaa.py <filename>")
    print("Calculates the consensus sequence of an alignment i fasta format.")
    print("Sounds cool, but it is really just displaying the most frequent")
    print("amino acid in any position of the aligned sequences.")
    sys.exit(1)

# A function to read a fasta file, if given the filename
# Returns the headers and sequences as two separate lists
def readFasta(filename):
    '''Reads a fasta file and returns 2 lists, one with headers and one with sequences.'''
    headers = list()
    sequences = list()
    try:
        fastafile = open(filename, 'r')
        seqlines = list()
        for line in fastafile:
            if line.startswith('>'):
                if len(headers) != 0:
                    sequences.append(''.join(seqlines))
                seqlines.clear()
                headers.append(line.rstrip())
            else:
                # efficient one-liner to get rid of all whitespace
                line = ''.join(line.split())
                if re.search(r'[^A-Z-]', line):
                    usage("Illegal char in protein sequence: " + line)
                seqlines.append(line)
        sequences.append(''.join(seqlines))
        fastafile.close()
    except IOError as err:
        usage("Can't open file (" + filename + "), reason: " + str(err))
    return(headers, sequences)

# An alignment has by nature the same sequence length in all sequences.
# This is verified here, when given a list of the aligned sequences.
def verifyAlignment(seqlist):
    '''Test if all sequences have the same length.'''
    if len(seqlist) == 0:
        usage("Empty fasta file")
    length = len(seqlist[0])
    for seq in seqlist:
        if length != len(seq):
            usage("Aligned sequences do not have the same length")
    return None

# Finding the most frequent occurring char in a list of sequences
# and returns the consensus string of chars.
def findConsensus(seqlist):
    '''Finding consensus of an alignment, winner takes all on any position'''
    consensus = ''
    for position in range(len(seqlist[0])):
        chardict = dict()
        for seqno in range(len(seqlist)):
            char = seqlist[seqno][position]
            if char in chardict:
                chardict[char] += 1
            else:
                chardict[char] = 1
        # Top pick
        consensus += sorted(chardict.keys(), reverse=True, key=chardict.get)[0]
    return consensus


# Main program
if len(sys.argv) == 1:          # no commandline arguments
    usage('Missing filename')

(headers, sequences) = readFasta(sys.argv[1])
verifyAlignment(sequences)
sequence = findConsensus(sequences)

print("The consensus sequence is:")
print(sequence)