import string import sys import random import tools #import sw ## removed by Reina def test_gibbs(): #random.seed(0) ##Reina is fixing the seed for the pset seqs = ['ACGGAGATTATATATATATATATATAGCGGTAATTAGTAC', 'ACCGGGATTAGCGGTAATTATATATATATATATAGTATGAC', 'ACGCGAATTAGCGGTGACGTACTGGTATATATATATATTTAG', 'GATAAGATTAGCGGTAATTAGTATATATATATATATACGGA', 'ACGATTATTAGCGGTAGCCTGTATATATATATATATAGGAC', 'ACGAGTATTATACTTAATTATATATATATATATATGGCCAT'] return iterate_gibbs(seqs, 6) def test2_gibbs(file): random.seed(0) mti = open(file,'r') # Need to specify file c = mti.read() seqs = c.split('#') for i in range(seqs.count('')): seqs.remove('') # need to remove empty strings # this is a printing subroutine #for i in range(len(seqs)): # print(seqs[i]) #return simple_align(seqs, 6) return iterate_gibbs(seqs, 6) def iterate_gibbs(seqs, k): origseqs = seqs[:] all_motifs = [] # for long sequences while min(map(len, seqs)) >= k: test=min(map(len,seqs)) print test # Find the best motif motif, indices = simple_align(seqs, k) all_motifs.append((motif,indices)) # remove that motif from the sequences for i in range(0,len(seqs)): seqs[i] = seqs[i][:indices[i]]+seqs[i][indices[i]+k:] print_motif(motif) print "Indices are %s"%indices return all_motifs def simple_align(seqs, k): # the basic algorithm simply tries to find an optimal # alignment of the sequences so that k consecutive columns # form an optimal motif # initialize alignment indices to random values indices = map(lambda len,k=k: random.randint(0,len-k), map(len, seqs)) num_iterations = 10 last_indices = None for iter in range(0,num_iterations): # # Part I - re-compute motif for current positions # motif = gather_motif(seqs, indices, k) #print "Current motif is: " #print_motif(motif) # # Part II - re-align each of the sequences # #print "Current indices are: [%s]"%tools.display_list(indices) for i in range(0,len(seqs)):\ # first subtract local instance of the motif before realigning # otherwise, you're biased to align to the same spot #print "here motif" #print_motif(motif) submotif = map(tools.copy_dic,motif) submotif = subtract_seq(submotif, seqs[i][indices[i]:][:len(submotif)]) #print "here submotif" #print_motif(submotif) indices[i],score = best_fit(seqs[i], submotif) # no index changed, break if last_indices == indices and last_motif == motif: print "Found local maximum, breaking" break last_indices = indices last_motif = motif return motif, indices def gather_motif(seqs, indices, k): # makes a motif from a set of aligned sequences # the alignment indices are indices, # the sequences are seqs # the length of the motif to build is k # simply adds the different character instances # for each position. motif = create_empty_motif(k) for seq, index in map(None, seqs, indices): add_seq(motif, seq[index:][:k]) return motif def create_empty_motif(k): return map(lambda i: {'A': 0, 'C': 0, 'G': 0, 'T': 0}, range(0,k)) def add_seq(motif, seq): # add a sequence to a motif assert(len(motif) == len(seq)) for char,c in map(None, motif, seq): char[c] = char[c] + 1 return motif def subtract_seq(motif, seq): # subtract a sequence from a motif # assumes the sequence is already part of the motif # otherwise we'll get negative numbers assert(len(motif) == len(seq)) for char,c in map(None, motif, seq): char[c] = char[c] - 1 assert(char[c] >= 0) return motif def print_motif(motif, f=sys.stdout): # a motif is a list of chardictionaries f.write(' %2s '%'') for c in 'ACGT': f.write('| %2s '%c) f.write('\n----') for c in 'ACGT': f.write('+----') for i in range(0,len(motif)): char = motif[i] f.write('\n %2s '%i) for c in 'ACGT': f.write('| %2s '%char[c]) f.write('| %2s'%tools.IUPAC_from_chardic(char)) f.write('\n') def best_fit(seq, motif): # find the best fit of a motif in a sequence. # returns the index at which the best score was found, # and the corresponding score assert(len(seq) >= len(motif)) k = len(motif) max_score, max_i = score_motif(seq[:k],motif), 0 for i in range(1, len(seq) - k): score = score_motif(seq[i:][:k], motif) if score > max_score: max_score = score max_i = i # print "Seq: %s, Motif: %s -> Score: %s for i=%s, %s"%( # seq, string.join(map(tools.IUPAC_from_chardic,motif),''), # max_score,max_i,seq[max_i:][:len(motif)]) return max_i, max_score def score_motif(seq, motif): # motif and seq are the same length # simply multiply out the motif numbers assert(len(seq) == len(motif)) score = 0 for i in range(0,len(motif)): score = score + motif[i][seq[i]] return score