1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 from __future__ import generators
20
21
22 import sys, logging, weakref, bisect, time, os, os.path
23 import numpy.oldnumeric as Numeric
24 import psyco
25
26 import recouvrement
27 import _suffix_tree
28
29
31 """Fonctions de parcours de l'arbre et de recherche des chaines répétées"""
def generate_MEM(self):
    '''Iterator through only MAximal exact matches.

    Walks self.post_order_nodes_2 and yields the internal (non-leaf)
    nodes only.  NOTE(review): the def header was elided from this
    listing; the name is reconstructed from the
    ``MEM = property(generate_MEM, ...)`` registration below.
    '''
    for node in self.post_order_nodes_2:
        if not node.is_leaf:
            yield node
38 'Iterator through only Maximal exact matches'
39 def dfs(n):
40 nb_children_leaf = 0
41 for c in n.children:
42 if c.is_leaf: nb_children_leaf += 1
43 for m in dfs(c):
44 yield m
45 if nb_children_leaf>=1: yield n
46 for n in dfs(self.root):
47 yield n
48
def generate_post_order_nodes(self):
    '''Iterator through all nodes in the tree in post-order.

    NOTE(review): header reconstructed -- registered below as the
    ``post_order_nodes`` property.
    '''
    def dfs(node):
        # children first, then the node itself
        for child in node.children:
            for descendant in dfs(child):
                yield descendant
        yield node
    for node in dfs(self.root):
        yield node
58
def generate_post_order_nodes_max(self):
    '''Post-order iterator restricted to nodes with nb_occ > 1.

    NOTE(review): header reconstructed -- registered below as the
    ``post_order_nodes_max`` property.  The outer nb_occ filter is
    redundant (dfs already filters) but is kept for fidelity.
    '''
    def dfs(node):
        for child in node.children:
            for descendant in dfs(child):
                if descendant.nb_occ > 1:
                    yield descendant
        if node.nb_occ > 1:
            yield node
    for node in dfs(self.root):
        if node.nb_occ > 1:
            yield node
68
def generate_pre_order_nodes(self):
    '''Iterator through all nodes in the tree in pre-order.

    NOTE(review): header reconstructed -- registered below as the
    ``pre_order_nodes`` property.
    '''
    def dfs(node):
        # node first, then its subtree
        yield node
        for child in node.children:
            for descendant in dfs(child):
                yield descendant
    for node in dfs(self.root):
        yield node
78
84
90
91
# Generator-backed read-only properties of the suffix-tree interface.
# NOTE(review): generate_leaves, generate_inner_nodes and
# generate_inner_nodes_2 are defined on lines elided from this listing.
# The doc label of MEM ("post_order_nodes2") looks copy-pasted; it is
# kept byte-for-byte to avoid touching runtime strings.
MEM = property(generate_MEM, None, None,
               "post_order_nodes2")
post_order_nodes = property(generate_post_order_nodes, None, None,
                            "post_order_nodes")
post_order_nodes_max = property(generate_post_order_nodes_max, None, None,
                                "post_order_nodes_max")
pre_order_nodes = property(generate_pre_order_nodes, None, None,
                           "pre_order_nodes")
leaves = property(generate_leaves, None, None, "leaves")
inner_nodes = property(generate_inner_nodes, None, None, "inner_nodes")
inner_nodes_2 = property(generate_inner_nodes_2, None, None, "inner_nodes_2")
103
105 """Ajoute à chaque noeud interne(répété) ses positions dans la séquence. """
106 for n in self.post_order_nodes:
107 if n.is_leaf:
108 n.path_indices = n.path_position
109 else:
110 n.path_indices = []
111 for c in n.children:
112 try:
113 n.path_indices.extend(c.path_indices)
114 except TypeError:
115 n.path_indices.append(c.path_indices)
116
117 n.nb_occ = len(n.path_indices)
118
120 """Renvoie un dico de tous les Maximal Exact Matches de taille min min_size
121
122 Le dico est indexé par la taille de MEM qui renvoie une liste de toutes
123 les positions de cette taille.
124 Linéaire(nb de MEM) < linéaire(taille de la séquence)"""
125
126
127
128 seq_repeat = {}
129 for n in self.generate_inner_nodes():
130 longueur_MEM = n.edge_label_end - n.path_position
131 if longueur_MEM >= min_size:
132 n.path_indices = []
133 for c in n.children:
134 if c.is_leaf: n.path_indices.append(c.path_position)
135 elif len(c.path_indices) == 1: n.path_indices.extend(c.path_indices)
136
137 for pi in n.path_indices:
138 seq_repeat[pi] = pi+longueur_MEM
139
140 e="""for i in xrange(pi, pi+longueur_chaine):
141 if not seq_repeat.has_key(i):
142 seq_repeat[i] = pi+longueur_chaine
143 elif pi+longueur_chaine < seq_repeat[i]:
144 break
145 else:
146 seq_repeat[i] = pi+longueur_chaine"""
147
148 dic_MEM = {}
149 for debut in seq_repeat:
150 fin = seq_repeat[debut]
151 if (fin > debut):
152 try:
153 dic_MEM[fin-debut].append(debut)
154 except KeyError:
155 dic_MEM[fin-debut] = [debut]
156
157 return dic_MEM
158
class SuffixTree(SuffixTreeInterface, _suffix_tree.SuffixTree):
    """A higher-level wrapper around the C suffix tree type,
    _suffix_tree.SuffixTree. This class adds a few methods to the suffix
    tree, methods that are more easily expressed in Python than in C, and
    that can be written using the primitives exported from C. """

    def __init__(self, s):
        """Build a suffix tree from the input string s. The string
        must not contain the special symbol $."""
        # BUGFIX(review): the original raised a bare string (string
        # exceptions were removed from Python); ValueError keeps the
        # same message.
        if '$' in s:
            raise ValueError("The suffix tree string must not contain $!")
        self.sequence = s
        _suffix_tree.SuffixTree.__init__(self, s)
177
class SuffixNodeU(dict):
    """One explicit node of the Ukkonen suffix tree.

    The node maps a first character to a ``(k, p, child)`` transition
    triple, hence the dict base class.  NOTE(review): the ``class``
    header line was elided from this listing; the ``dict`` base is
    inferred from the ``self[...]`` / ``len(self)`` / ``itervalues``
    usage throughout the file.
    """

    def __init__(self, debut, fin, slink=None, depth=0):
        # debut/fin: begin/end indices of the edge label in the sequence.
        self.suffixLink = slink
        self.depth = depth
        self.edge_label_begin = debut
        self.edge_label_end = fin

    def changeValues(self, debut, fin):
        # Re-point the edge label (used when an edge is split).
        self.edge_label_begin = debut
        self.edge_label_end = fin

    def getChildren(self):
        '''Iterator through all the node's children'''
        for k, p, node in self.itervalues():
            yield node
    children = property(getChildren, None, None, "children of the node")

    def isLeaf(self):
        # A node with no outgoing transition is a leaf.
        return len(self) == 0
    is_leaf = property(isLeaf, None, None, "leaf or not leaf ?")

    def __str__(self, seq=None, space='', short=False, charOnly=None):
        # Recursive pretty-printer; ``charOnly`` restricts the dump to
        # one outgoing transition.
        res = "deb="+str(self.edge_label_begin)+" / fin="+str(self.edge_label_end)
        try:
            res += " / path_position="+str(self.path_position)
        except AttributeError:
            pass
        if seq is not None:
            res += " / " + seq[self.edge_label_begin:self.edge_label_end]
        res += "\n"
        if short and seq is not None:
            res = seq[self.edge_label_begin:self.edge_label_end]+'\n'
        space += '|---'
        if charOnly is not None and self.has_key(charOnly):
            k, p, node = self[charOnly]
            res += space + node.__str__(seq, space, short)
        else:
            for char, (k, p, node) in self.iteritems():
                res += space + node.__str__(seq, space, short)
        return res
220
222 """Construit un suffix tree avec l'algorithme d'Ukkonen"""
224 return self.root.__str__(seq=self.sequence)
def __init__(self, sequence, generalizedST=False):
    """Build the tree over ``sequence`` with Ukkonen's on-line algorithm.

    Unless a generalised tree is being built, a unique terminator '¤'
    is appended.  NOTE(review): header reconstructed from the body; in
    the generalised case self.sequence is apparently set elsewhere --
    confirm upstream.
    """
    assert isinstance(sequence, basestring)
    if not generalizedST:
        # BUGFIX(review): string exception replaced by ValueError with
        # the same message.
        if '¤' in sequence:
            raise ValueError("The suffix tree string must not contain ¤!")
        self.sequence = sequence + '¤'
    self.inf = len(sequence)
    self.bottom = SuffixNodeU(-2, -2)           # auxiliary state below the root
    self.root = SuffixNodeU(0, 0, self.bottom)
    s = self.root
    k = 0; i = -1
    for t in self.sequence:
        i += 1
        # bottom accepts every character, letting canonize fall through it
        self.bottom[t] = -2, -2, self.root
        s, k = self.update(s, k, i)
        s, k = self.canonize(s, k, i)
    self.annotate_path_position()
    self.sequence = sequence
248
250 """Ajout du caractère en i """
251
252 oldroot = self.root
253 t = self.sequence[i]
254
255 end_point,r = self.testAndSplit(s,k,i-1,t)
256 while not end_point:
257 r2 = SuffixNodeU(i,self.inf)
258 r[t] = i,self.inf,r2
259
260 if not (oldroot is self.root): oldroot.suffixLink = r
261 oldroot = r
262 s,k = self.canonize(s.suffixLink,k,i-1)
263 end_point,r = self.testAndSplit(s,k,i-1,t)
264 if not (oldroot is self.root): oldroot.suffixLink = r
265 return s,k
266
268 """Test si s,k,p est le end point et sinon explicite et renvoie s,k,p"""
269 if k <= p:
270 tk = self.sequence[k]
271 k2,p2,s2 = s[tk]
272
273 if t == self.sequence[k2+p-k+1]: return True,s
274 else:
275 tk2 = self.sequence[k2]
276 end = k2+p-k
277 r = SuffixNodeU(k2,end+1)
278 s[tk2] = k2, end, r
279 s2.changeValues(end+1, p2+1)
280 tk3 = self.sequence[end+1]
281 r[tk3] = end+1, p2, s2
282 return False,r
283
284 elif not s.has_key(t): return False,s
285 else: return True,s
286
288 """Si s,k,p implicite, renvoie son plus proche ancêtre explicite"""
289 if p < k: return s,k
290 else:
291 tk = self.sequence[k]
292 k2,p2,s2 = s[tk]
293
294 while p2-k2 <= p-k:
295 k = k+p2-k2+1
296 s = s2
297 if k <= p:
298 tk = self.sequence[k]
299 k2,p2,s2 = s[tk]
300 return s,k
301
303 """ajoute l'attribut path_position qui correspond à la position du 1er caractère de la branche"""
304 node = self.root
305 self.root.path_position = 0
306 self.__annotate_path_position(node, 0)
def __annotate_path_position(self, node, longueur):
    # Recursive helper: ``longueur`` is the path length above ``node``.
    # NOTE(review): header reconstructed from the recursive call below.
    for s, k, fils in node.itervalues():
        # Both leaf and internal children got the same assignment in the
        # original if/else; the duplicated branch is collapsed here and
        # only internal children recurse.
        fils.path_position = fils.edge_label_begin - longueur
        if len(fils) != 0:
            self.__annotate_path_position(
                fils, longueur + fils.edge_label_end - fils.edge_label_begin)
316
317
318
320
321 """A suffix tree for a set of strings."""
322
def __init__(self, sequences):
    '''Build a generalised suffix tree. The strings must not
    contain the special symbols $ or ascii numbers from 1 to the number of
    sequences.'''
    self.sequences = sequences
    self.start_positions = [0]
    self.concat_string = ''
    for i in xrange(len(sequences)):
        # chr(i+1) is the unique terminator appended to sequence i
        if chr(i + 1) in sequences[i]:
            # BUGFIX(review): string exception replaced by ValueError
            # with the same message.
            raise ValueError(
                "The suffix tree string must not contain chr(%d)!" % (i + 1))
        self.concat_string += sequences[i] + chr(i + 1)
        self.start_positions += [len(self.concat_string)]
    # sentinel so _translate_index can bracket the last position
    self.start_positions += [self.start_positions[-1] + 1]
    self.sequences += ['']
    SuffixTree.__init__(self, self.concat_string)
    self._annotate_nodes()
344
def _translate_index(self, idx):
    'Translate a concat-string index into a (string_no,idx) pair.'
    for i in xrange(len(self.start_positions) - 1):
        if self.start_positions[i] <= idx < self.start_positions[i + 1]:
            return (i, idx - self.start_positions[i])
    # the py2-only ``raise E, msg`` form is replaced by the portable call
    raise IndexError("Index out of range: " + str(idx))
351
def _annotate_nodes(self):
    """Annotate every node with path_indices, sequences and nb_occ.

    Leaves get them from their path position; inner nodes aggregate
    over their children.  NOTE(review): header reconstructed from the
    ``self._annotate_nodes()`` call in __init__.
    """
    for n in self.post_order_nodes:
        if n.is_leaf:
            seq, idx = self._translate_index(n.path_position)
            n.path_indices = [(seq, idx)]
            n.sequences = [seq]
            n.nb_occ = len(n.path_indices)
        else:
            path_indices = []; sequences = []
            for c in n.children:
                path_indices += c.path_indices
                sequences += c.sequences
            # de-duplicate the sequence numbers (dict used as a set)
            seqs_in_subtree = {}
            for s in sequences:
                seqs_in_subtree[s] = 1
            n.path_indices = path_indices
            n.sequences = [s for s in seqs_in_subtree]
            n.nb_occ = len(n.path_indices)
372
def shared_substrings(self, minimum_length=0):
    '''Iterator through shared sub-strings. Returned as a list of triples
    (sequence,from,to).

    NOTE(review): header reconstructed; the default of 0 matches the
    no-argument call in the test code at the bottom of the file.'''
    for n in self.inner_nodes:
        # - 1 because self.sequences carries a trailing '' sentinel
        if len(n.sequences) == len(self.sequences) - 1:
            l = n.edge_label_end - n.path_position
            if l >= minimum_length:
                yield [(seq, idx, idx + l) for (seq, idx) in n.path_indices]
381
382
383
384
385
def inter_inclus(self, i, d):
    # True iff interval d is entirely contained in interval i (closed
    # bounds).  NOTE(review): header reconstructed from the
    # ``self.inter_inclus(x, (deb, deb+long_path))`` call sites.
    return i[0] <= d[0] <= i[1] and i[0] <= d[1] <= i[1]
389
391 '''Iterator through shared sub-strings. Returned as a list of triples
392 (sequence,from,to).'''
393 DRes={}
394 maxIdxDres=0
395 longTexte1=len(self.sequences[0])+1
396 for n in self.inner_nodes:
397
398
399
400 long_path=n.edge_label_end-n.path_position
401 if long_path<long_min_pivots: continue
402 ladd=[]
403 for pi in n.path_indices:
404
405
406 if pi[0]==1: deb=pi[1]+longTexte1-1
407 else: deb=pi[1]
408 found=False
409 for i in xrange(maxIdxDres,0,-1):
410 if DRes.has_key(i):
411 for x in DRes[i]:
412 if self.inter_inclus(x,(deb,deb+long_path)):
413 found=True
414 break
415 if (found or i-1<long_path):break
416 if not found:
417 ladd.append((deb,deb+long_path))
418
419 if len(ladd)>1:
420 if DRes.has_key(long_path): DRes[long_path].extend(ladd)
421 else:
422 maxIdxDres=max(maxIdxDres,long_path)
423 DRes[long_path]=ladd
424
425 LRes=[]
426 for i in xrange(maxIdxDres,0,-1):
427 if DRes.has_key(i):
428 print str(i)+" "+str(DRes[i])
429 for x in DRes[i]:
430 for y in LRes:
431 if self.inter_inclus(y,x): break
432 else:
433 LRes.append(x)
434 print "text n="+str(len(self.sequences[0])+len(self.sequences[1]))+" / leaves="+str(nleaf)+" / internal nodes="+str(nnode)
435 return LRes
436
def shared_substrings3(self, long_min_pivots=1):
    '''Shared sub-strings via the MediteAppli interval utilities.

    Returns the list of kept (begin, end) intervals over the
    concatenated texts.  NOTE(review): header reconstructed;
    ``MediteAppli`` is not imported in the visible part of this file --
    confirm the import exists upstream.  The nesting of the interval
    work inside the non-leaf branch is also reconstructed.'''
    lInter = []
    longTexte1 = len(self.sequences[0]) + 1
    nleaf = nnode = 0
    for n in self.post_order_nodes:
        if n.is_leaf:
            nleaf += 1
        else:
            nnode += 1
            long_path = n.edge_label_end - n.path_position
            if long_path < long_min_pivots: continue
            ladd = []
            for pi in n.path_indices:
                # re-map positions of sequence 1 onto the concatenation
                if pi[0] == 1: deb = pi[1] + longTexte1 - 1
                else: deb = pi[1]
                if not MediteAppli.inclus((deb, deb + long_path), lInter):
                    ladd.append((deb, deb + long_path))
            if len(ladd) > 1:
                lInter = MediteAppli.union2(ladd, lInter)
    sys.stderr.write("text n="+str(len(self.sequences[0])+len(self.sequences[1]))+" / leaves="+str(nleaf)+" / internal nodes="+str(nnode)+"\n")
    return lInter
468
def shared_substrings4(self, long_min_pivots=1):
    '''Shared sub-strings via a per-position coverage array.

    Builds ``lMax`` (for each position of the concatenated text, the
    end of the longest kept repeat covering it) and returns
    (dic2, lMax) where dic2 maps a repeated substring, present in both
    texts, to its sorted occurrence positions.  NOTE(review): the def
    line was elided from this listing; name and signature are
    reconstructed.'''
    longTexte1 = len(self.sequences[0])
    t = self.sequences[0] + self.sequences[1]
    longTexte = len(t)
    lMax = [0] * longTexte
    nleaf = nnode = 0
    for n in self.post_order_nodes:
        if n.is_leaf:
            nleaf += 1
        else:
            nnode += 1
            long_path = n.edge_label_end - n.path_position
            if long_path < long_min_pivots: continue
            ladd = []
            for pi in n.path_indices:
                if pi[0] == 1: deb = pi[1] + longTexte1
                else: deb = pi[1]
                fin = deb + long_path
                # skip occurrences already covered by a longer repeat
                if (lMax[deb] >= fin): continue
                else: ladd.append((deb, fin))
            # keep only repeats occurring in both texts
            if (len(ladd) > 1 and ladd[0][0] < longTexte1 and ladd[-1][0] >= longTexte1):
                for x in ladd:
                    for i in xrange(x[0], x[1]):
                        if x[1] > lMax[i]: lMax[i] = x[1]
    # collect maximal covered stretches into a substring -> positions dict
    dic = {}
    prevInserted = lastmax = 0
    for i in xrange(longTexte):
        pos = lMax[i]
        # BUGFIX(review): removed ``long = pos - i`` (unused, shadowed
        # the builtin) and the unused ``nbPos`` counter.
        if pos != prevInserted and pos > 0 and pos > lastmax:
            texte = t[i:pos]
            if not dic.has_key(texte):
                dic[texte] = []
            dic[texte].append(i)
            prevInserted = pos
        if pos > lastmax: lastmax = pos
    # filter: long enough, repeated, and crossing both texts
    dic2 = {}
    for x in dic:
        a = dic[x]
        if (len(x) < long_min_pivots or len(a) < 2 or
                not (a[0] < longTexte1 and a[-1] >= longTexte1)):
            continue
        else: dic2[x] = a
    return dic2, lMax
532
534 """Renvoie un dictionnaire de toutes les chaines répétées.
535
536 Indexé par la taille des chaines. En valeur une liste de positions.
537 min_size: taille mini des chaines répétées
538 Linéaire(nb de noeuds de l'arbre)=linéaire(taille de la séquence)"""
539 longueur_seq1 = len(self.sequences[0])
540 dic = {}
541 for n in self.generate_inner_nodes():
542 longueur_chaine = n.edge_label_end - n.path_position
543 if longueur_chaine >= min_size:
544 seq1 = seq2 = False
545 lPos = []
546 for (seq,pos) in n.path_indices:
547 if seq == 0:
548 seq1 = True
549 lPos.append(pos)
550 else:
551 seq2 = True
552 lPos.append(pos + longueur_seq1)
553 if seq1 and seq2:
554 try:
555 dic[longueur_chaine].extend(lPos)
556
557 except KeyError:
558 dic[longueur_chaine] = lPos
559 return dic
560
def get_seq_repeat(self, min_size=1):
    # Earliest variant: build, for every position of the concatenated
    # string, the (begin, end) interval of the best repeated string
    # covering it.  NOTE(review): header reconstructed; this definition
    # appears to be superseded by the later same-named variants (last
    # definition wins in Python).
    longueur_seq1 = len(self.sequences[0])
    seq_repeat = []
    for i in xrange(len(self.concat_string)):
        seq_repeat.append((i, i))          # degenerate interval = "no repeat"
    i = nbpitot = 0
    for n in self.generate_inner_nodes():
        i += 1
        longueur_chaine = n.edge_label_end - n.path_position
        n.path_indices = []
        for c in n.children:
            if c.is_leaf: n.path_indices.append(c.path_position)
            else: n.path_indices.extend(c.path_indices)
        if longueur_chaine >= min_size and len(n.path_indices) > 1:
            # keep only repeats occurring in both sequences
            seq1 = seq2 = False
            for pos in n.path_indices:
                if pos < longueur_seq1: seq1 = True
                else: seq2 = True
            if seq1 and seq2:
                for pi in n.path_indices:
                    for pos in xrange(pi, pi + longueur_chaine):
                        # widen the stored interval when the new repeat
                        # covers or extends it
                        if (pi <= seq_repeat[pos][0] <= seq_repeat[pos][1] <= pi + longueur_chaine or
                                seq_repeat[pos][0] <= pi <= seq_repeat[pos][1] <= pi + longueur_chaine):
                            seq_repeat[pos] = (pi, pi + longueur_chaine)
    return seq_repeat
621
def get_seq_repeat(self, min_size=1):
    """Per-position repeat intervals stored in two Numeric int arrays.

    Returns (seq_repeat_deb, seq_repeat_fin).  Only left-maximal
    matches are kept: an inner node qualifies when all its children are
    leaves and the characters preceding its occurrences are pairwise
    distinct.  NOTE(review): header reconstructed from the
    ``self.st.get_seq_repeat(min_size)`` call that unpacks a pair.
    """
    longueur_seq1 = len(self.sequences[0])
    seq_repeat_deb = Numeric.zeros(len(self.concat_string), Numeric.Int)
    seq_repeat_fin = Numeric.zeros(len(self.concat_string), Numeric.Int)
    for i in xrange(len(self.concat_string)):
        seq_repeat_deb[i] = i
        seq_repeat_fin[i] = i
    nb_nodes = nbpitot = 0
    for n in self.generate_inner_nodes():
        nb_nodes += 1
        longueur_chaine = n.edge_label_end - n.path_position
        n.path_indices = []
        all_children_leaf = True
        for c in n.children:
            if c.is_leaf: n.path_indices.append(c.path_position)
            else:
                n.path_indices.extend(c.path_indices)
                all_children_leaf = False
        # characters preceding each occurrence: if all distinct, the
        # match is left-maximal
        # BUGFIX(review): the original reused ``i`` both as the node
        # counter and as the comprehension/loop variable, clobbering the
        # counter (Python 2 list comprehensions leak their variable);
        # the variables are renamed here.
        listePos = [p - 1 for p in n.path_indices]
        listeCarac = []
        for p in listePos:
            assert -1 <= p < len(self.concat_string), (p, len(self.concat_string))
            if p == -1: x = chr(4)   # sentinel for "before the text"
            else: x = self.concat_string[p]
            listeCarac.append(x)
        setCarac = set(listeCarac)
        if (longueur_chaine >= min_size and len(n.path_indices) > 1 and
            all_children_leaf and len(setCarac) == len(listeCarac)):
            # keep only repeats occurring in both sequences
            seq1 = seq2 = False
            for pos in n.path_indices:
                if pos < longueur_seq1: seq1 = True
                else: seq2 = True
            if seq1 and seq2:
                for pi in n.path_indices:
                    for pos in xrange(pi, pi + longueur_chaine):
                        # widen the stored interval when the new repeat
                        # covers or extends it
                        if (pi <= seq_repeat_deb[pos] <= seq_repeat_fin[pos] <= pi + longueur_chaine or
                                seq_repeat_deb[pos] <= pi <= seq_repeat_fin[pos] <= pi + longueur_chaine):
                            seq_repeat_deb[pos] = pi
                            seq_repeat_fin[pos] = pi + longueur_chaine
    return seq_repeat_deb, seq_repeat_fin
700
def get_seq_repeat(self, min_size=1):
    # MEM-filtering variant: keeps only occurrences whose preceding
    # character is unique within its sequence (left-maximality per
    # sequence).  NOTE(review): header reconstructed; same name as the
    # variants above -- in the original module the last definition wins.
    texte = self.sequences[0] + '$' + self.sequences[1]
    longueur_seq1 = len(self.sequences[0])
    seq_repeat = []
    for i in xrange(len(self.concat_string)):
        seq_repeat.append((i, i))
    i = nbpitot = 0
    for n in self.generate_inner_nodes():
        i += 1
        longueur_chaine = n.edge_label_end - n.path_position
        n.path_indices = []
        all_children_leaf = True
        seq1 = seq2 = False
        car_seq1 = []; car_seq2 = []
        # BUGFIX(review): the two per-character dicts were re-created
        # inside the leaf branch, so only the last leaf child survived
        # (and a non-leaf first child made the later reads raise
        # NameError); they are now initialised once per node.
        dic_car_seq1 = {}; dic_car_seq2 = {}
        for c in n.children:
            if c.is_leaf:
                n.path_indices.append(c.path_position)
                if c.path_position <= longueur_seq1:
                    seq1 = True
                    try: dic_car_seq1[texte[c.path_position]].append(c.path_position)
                    except KeyError: dic_car_seq1[texte[c.path_position]] = [c.path_position]
                else:
                    assert c.path_position < len(self.concat_string), (c.path_position, len(self.concat_string))
                    seq2 = True; car = self.concat_string[c.path_position]
                    try: dic_car_seq2[car].append(c.path_position)
                    except KeyError: dic_car_seq2[car] = [c.path_position]
            else: all_children_leaf = False
        if not all_children_leaf: continue
        if (not seq1) and (not seq2): continue
        if longueur_chaine >= min_size and len(n.path_indices) > 1:
            # an occurrence is a MEM candidate iff its preceding
            # character occurs once in its sequence bucket
            seq1 = seq2 = False; lOccMEM = []
            for car, lOcc in dic_car_seq1.iteritems():
                if len(lOcc) == 1:
                    lOccMEM.append(lOcc[0])
                    seq1 = True
            for car, lOcc in dic_car_seq2.iteritems():
                if len(lOcc) == 1:
                    lOccMEM.append(lOcc[0])
                    seq2 = True
            logging.debug(lOccMEM)
            if seq1 and seq2:
                for pi in lOccMEM:
                    for pos in xrange(pi, pi + longueur_chaine):
                        if (pi <= seq_repeat[pos][0] <= seq_repeat[pos][1] <= pi + longueur_chaine or
                                seq_repeat[pos][0] <= pi <= seq_repeat[pos][1] <= pi + longueur_chaine):
                            seq_repeat[pos] = (pi, pi + longueur_chaine)
    return seq_repeat
753
755
756 """Classe proxy qui construit un GeneralisedSuffixTree
757 Celui-ci est construit et on extrait les infos nécessaires pour trouver les MEM
758 Ensuite on supprime l'objet GeneralisedSuffixTree, ce qui permet d'économiser beaucoup de mémoire"""
759
def __init__(self, sequences):
    # Build the concatenation bookkeeping exactly like the real
    # generalised tree, then delegate the heavy lifting to it via
    # ``self.st`` (deleted later by get_MEM* to save memory).
    # NOTE(review): the def line was elided from this listing.
    self.sequences = sequences
    self.start_positions = [0]
    self.concat_string = ''
    for i in xrange(len(sequences)):
        if chr(i + 1) in sequences[i]:
            # BUGFIX(review): string exception replaced by ValueError.
            raise ValueError(
                "The suffix tree string must not contain chr(%d)!" % (i + 1))
        self.concat_string += sequences[i] + chr(i + 1)
        self.start_positions += [len(self.concat_string)]
    self.start_positions += [self.start_positions[-1] + 1]
    self.sequences += ['']
    self.st = TrueGeneralisedSuffixTree(sequences)
775
777 """Renvoie un dico de tous les Maximal Exact Matches de taille min min_size
778
779 Le dico est indexé par la taille de MEM qui renvoie une liste de toutes
780 les positions de cette taille.
781 Linéaire(nb de MEM) < linéaire(taille de la séquence)"""
782 dic_repeat = self.st.get_repeat(min_size)
783 del self.st
784
785 seq_repeat = {}
786 seq_repeat[-1] = -1
787 for i in xrange(len(self.sequences[0])+len(self.sequences[1])):
788 seq_repeat[i] = i
789 for length in dic_repeat:
790 for pos in dic_repeat[length]:
791 for pos_i in xrange(pos,pos+length):
792 seq_repeat[pos_i] = max(seq_repeat[pos_i], pos+length)
793
794
795
796 assert len(seq_repeat) == len(self.sequences[0])+len(self.sequences[1])+1
797 dic_MEM = {}
798 for debut in seq_repeat:
799 fin = seq_repeat[debut]
800
801 if (fin > debut and seq_repeat[debut-1] != fin):
802 try:
803 dic_MEM[fin-debut].append(debut)
804 except KeyError:
805 dic_MEM[fin-debut] = [debut]
806
807 return dic_MEM
808
809
811 """Renvoie un dico de tous les Maximal Exact Matches de taille min min_size
812
813 Le dico est indexé par la taille de MEM qui renvoie une liste de toutes
814 les positions de cette taille.
815 Linéaire(nb de MEM) < linéaire(taille de la séquence)"""
816 seq_repeat = self.get_seq_repeat(min_size)
817 if sr==3: return seq_repeat
818 elif sr == 1:
819 dic_MEM = {}
820
821 pos = len(seq_repeat)-1
822 while pos >= 0:
823 (debut,fin) = seq_repeat[pos]
824 if fin > debut:
825 try:
826 dic_MEM[fin-debut].append(debut)
827 except KeyError:
828 dic_MEM[fin-debut] = [debut]
829 pos = debut-1
830
831
832 return dic_MEM
833 elif sr==4:
834 dic_MEM = {}
835 longueur_s1 = len(self.sequences[0])
836 texte = self.sequences[0] + self.sequences[1]
837 pos = len(seq_repeat)-1
838 while pos >= 0:
839
840 (debut,fin) = seq_repeat[pos]
841 if fin > debut:
842 longueur = fin - debut
843 if debut > longueur_s1: pos_debut = debut - 1
844 else: pos_debut = debut
845 t = hash(texte[pos_debut:pos_debut+longueur])
846 if not dic_MEM.has_key(longueur): dic_MEM[longueur] = {}
847 try:
848 dic_MEM[longueur][t].append(pos_debut)
849 except KeyError:
850 dic_MEM[longueur][t] = [pos_debut]
851 pos = debut-1
852
853
854
855
856 return dic_MEM
857
def get_MEM(self, min_size=1, sr=1):
    """Return a dict of all Maximal Exact Matches of min size min_size.

    Mines ``self.st`` (the real suffix tree), then deletes it to free
    memory.  ``sr`` selects the output format: 3 the raw (begin, end)
    interval list, 1 a dict length -> positions, 4 a dict
    length -> {hash(string): positions}.
    """
    seq_repeat_deb, seq_repeat_fin = self.st.get_seq_repeat(min_size)
    del self.st
    logging.debug("suffixTree deleted")
    if sr == 3:
        # BUGFIX(review): the original returned the undefined name
        # ``seq_repeat`` (NameError); rebuild the expected list of
        # (begin, end) pairs from the two arrays instead.
        return zip(seq_repeat_deb, seq_repeat_fin)
    elif sr == 1:
        dic_MEM = {}
        # walk right-to-left, jumping over each repeat found
        pos = len(seq_repeat_deb) - 1
        while pos >= 0:
            debut = seq_repeat_deb[pos]
            fin = seq_repeat_fin[pos]
            if fin > debut:
                try:
                    dic_MEM[fin - debut].append(debut)
                except KeyError:
                    dic_MEM[fin - debut] = [debut]
            pos = debut - 1
        return dic_MEM
    elif sr == 4:
        dic_MEM = {}
        longueur_s1 = len(self.sequences[0])
        texte = self.sequences[0] + self.sequences[1]
        pos = len(seq_repeat_deb) - 1
        while pos >= 0:
            debut = seq_repeat_deb[pos]
            fin = seq_repeat_fin[pos]
            if fin > debut:
                longueur = fin - debut
                # cancel the separator offset for positions in sequence 1
                if debut > longueur_s1: pos_debut = debut - 1
                else: pos_debut = debut
                t = hash(texte[pos_debut:pos_debut + longueur])
                if not dic_MEM.has_key(longueur): dic_MEM[longueur] = {}
                try:
                    dic_MEM[longueur][t].append(pos_debut)
                except KeyError:
                    dic_MEM[longueur][t] = [pos_debut]
            pos = debut - 1
        return dic_MEM
910
912 """ Modifie les coordonnées de la 2e chaine en retirant 1
913 à cause du séparateur pour le suffix tree """
914 if sr == 4:
915 seq_repeat = self.get_MEM(min_size, sr=4)
916 logging.log(5,'get_MEM done')
917 return seq_repeat
918 elif sr == 3:
919 seq_repeat = self.get_MEM(min_size, sr=3)
920 logging.log(5,'get_MEM done')
921
922 seq2 = []
923 longueur_s1 = len(self.sequences[0])
924 pos = len(seq_repeat)-1
925 for pos in xrange(len(seq_repeat)-1):
926 deb,fin = seq_repeat[pos]
927 if pos > longueur_s1:
928 seq2.append((deb-1,fin-1))
929 elif pos == longueur_s1:
930 continue
931 else:
932 seq2.append((deb,fin))
933
934 assert len(seq2) == len(self.sequences[0]) + len(self.sequences[1])
935
936
937 return seq2
938 else:
939 dic_repeat = self.get_MEM(min_size, sr=1)
940 logging.log(5,'get_MEM done')
941
942 dic_repeat2 = {}
943 longueur_s1 = len(self.sequences[0])
944 texte = self.sequences[0] + self.sequences[1]
945 for longueur,liste in dic_repeat.iteritems():
946 if longueur >= min_size:
947 liste2 = []
948 for deb in liste:
949 if deb > longueur_s1:
950 liste2.append(deb-1)
951 else:
952 liste2.append(deb)
953 for deb in liste2:
954 t = texte[deb:deb+longueur]
955 try:
956
957 bisect.insort_right(dic_repeat2[t],deb)
958 except KeyError:
959 dic_repeat2[t] = [deb]
960 del dic_repeat
961 for t,l in dic_repeat2.iteritems():
962
963 assert len(t) >= min_size
964 return dic_repeat2
965
967 """ just_keep_words: si Vrai, on rogne les homologies de façon à n'avoir que des mots
968 (ou suites de mots) entiers
969 renvoie un dico indexé par (cle,longueur) ou cle représente hash(chaine) la chaine
970 dont on fait référence et la longueur de la chaine, la valeur et la liste d'occurences
971 de la chaine"""
972
973 seq = self.get_MEM_index_chaine2(min_size, sr)
974
975 logging.log(5,'get_MEM_index_chaine2 done')
976 if sr == 4:
977
978 texte = self.sequences[0]+self.sequences[1]
979 for longueur, dicoOcc in seq.iteritems():
980 for cle_hash, lOcc in dicoOcc.iteritems():
981
982
983 logging.debug(texte[lOcc[0]:lOcc[0]+longueur])
984
985 if eliminRecouv:
986 a = time.time()
987 recouv = recouvrement.Recouvrement4(self.sequences[0]+self.sequences[1],seq,len(self.sequences[0]),min_size)
988
989
990 blocs = recouv.eliminer_recouvrements()
991 b = time.time()
992 else:
993 blocs = {}
994
995 texte = self.sequences[0]+self.sequences[1]
996 for longueur, dicoOcc in seq.iteritems():
997 for cle_hash, lOcc in dicoOcc.iteritems():
998 for occ in lOcc:
999 cle = hash(texte[occ:occ+longueur])
1000 try:
1001 blocs[(cle,longueur)].append(occ)
1002 except KeyError:
1003 blocs[(cle,longueur)] = [occ]
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013 if eliminRecouv and recouv.SMEM_nb_bloc >= 100:
1014 logFile = os.path.join(os.getcwd(), 'resultat_complex_recouv.csv')
1015 if not os.path.exists(logFile):
1016
1017 csv = open(logFile, 'w')
1018 csv.write("SMEM_nb_bloc;SMEM_nb_occ;SMEM_tot_size;NOSMEM_nb_bloc;NOSMEM_nb_occ;NOSMEM_tot_size;temps;A;B\n")
1019 else:
1020 csv = open(logFile, 'a')
1021 csv.write(str(recouv.SMEM_nb_bloc)+';'+str(recouv.SMEM_nb_occ)+';'+str(recouv.SMEM_tot_size)+';'+
1022 str(recouv.NOSMEM_nb_bloc)+';'+str(recouv.NOSMEM_nb_occ)+';'+str(recouv.NOSMEM_tot_size)+';'+
1023 str(b-a)+';'+str(len(self.sequences[0]))+';'+str(len(self.sequences[1]))+'\n')
1024 csv.close()
1025
1026 elif sr == 3:
1027 recouv = recouvrement.Recouvrement3(self.sequences[0]+self.sequences[1],seq,len(self.sequences[0]))
1028 blocs = recouv.eliminer_recouvrements()
1029 else:
1030
1031 recouv = recouvrement.Recouvrement(self.sequences[0]+self.sequences[1],seq,len(self.sequences[0]),min_size)
1032 blocs1 = recouv.eliminer_recouvrements()
1033
1034
1035
1036 blocs = {}
1037 for texte,liste in blocs1.iteritems():
1038 cle = hash(texte)
1039 longueur = len(texte)
1040
1041
1042 liste.reverse()
1043 blocs[(cle,longueur)] = liste
1044 del blocs1
1045
1046 logging.log(5,'eliminer_recouvrements done')
1047 dic_chaine2 = {}
1048 longueur_s1 = len(self.sequences[0])
1049
1050
1051
1052
1053 sep = """. !\r,\n:\t;-?"'`() """
1054
1055 for (cle,longueur),liste_pos in blocs.iteritems():
1056 if (longueur >= min_size and len(liste_pos) >= 2):
1057
1058 liste_pos.reverse()
1059 if (liste_pos[0] < longueur_s1 <= liste_pos[-1]):
1060 if just_keep_words:
1061
1062
1063
1064
1065
1066 assert self.sequences[0][liste_pos[0]] == self.sequences[1][liste_pos[-1]-longueur_s1]
1067 assert self.sequences[0][liste_pos[0]+longueur-1] == self.sequences[1][liste_pos[-1]+longueur-1-longueur_s1], chaine+' / '+self.sequences[0][liste_pos[0]:liste_pos[0]+longueur]+' / '+self.sequences[1][liste_pos[-1]-longueur_s1:liste_pos[-1]+longueur-longueur_s1]+' / *'+self.sequences[0][liste_pos[0]+longueur]+'* / *'+self.sequences[1][liste_pos[-1]+longueur-longueur_s1]+'**'
1068
1069 i = liste_pos[0] ; i2 = liste_pos[-1]
1070 if (i == 0 or self.sequences[0][i-1] in sep) and (i2-longueur_s1 == 0 or self.sequences[1][i2-1-longueur_s1] in sep):
1071
1072
1073 decalage_avant = 0
1074 else:
1075 while (i < liste_pos[0]+longueur and self.sequences[0][i] not in sep and
1076 self.sequences[1][i2-longueur_s1] not in sep): i += 1 ; i2 += 1
1077 decalage_avant = i - liste_pos[0]
1078
1079 assert i >= liste_pos[0]
1080
1081 j = liste_pos[0]+longueur-1 ; j2 = liste_pos[-1]+longueur-1
1082 if ((j == len(self.sequences[0])-1 or self.sequences[0][j+1] in sep) and
1083 (j2-longueur_s1 == len(self.sequences[1])-1 or self.sequences[1][j2+1-longueur_s1] in sep)):
1084 pass
1085 else:
1086 while (j >= liste_pos[0] and self.sequences[0][j] not in sep and
1087 j2 >= liste_pos[-1] and self.sequences[1][j2-longueur_s1] not in sep): j -= 1 ; j2 -= 1
1088 assert j <= liste_pos[0]+longueur-1
1089
1090 if i < j:
1091 longueur2 = j+1 -i ; assert longueur2 <= longueur
1092 if (longueur2 >= min_size):
1093 cle2 = hash(self.sequences[0][i:j+1])
1094 tt = [x+decalage_avant for x in liste_pos]
1095 dic_chaine2[(cle2,longueur2)] = tt
1096
1097 else:
1098 dic_chaine2[(cle,longueur)] = liste_pos
1099
1100
1101 logging.log(10,'len(blocs)='+str(len(blocs))+' / len(dic_chaine2)='+str(len(dic_chaine2))
1102 +' / taille chaines dic_chaine2='+str(sum([longueur*len(li) for (c,longueur),li in dic_chaine2.iteritems()])))
1103 texte = self.sequences[0] + self.sequences[1]
1104 for (cle,longueur),lOcc in dic_chaine2.iteritems():
1105 logging.debug(texte[lOcc[0]:lOcc[0]+longueur])
1106 return dic_chaine2
1107
1109 """ Converti le dic de sortie pour le module MediteAppli
1110
1111 index par la chaine directement qui donne les positions de cette chaine"""
1112 dic_MEM = self.get_MEM(min_size)
1113 texte = self.sequences[0] + '$'+ self.sequences[1]
1114 longueur_s1 = len(self.sequences[0])
1115 dic_chaine = {}
1116 for longueur,liste_pos in dic_MEM.iteritems():
1117 if len(liste_pos) < 2: continue
1118 for pos in liste_pos:
1119 chaine = texte[pos:pos+longueur]
1120
1121 if pos > longueur_s1: pos -= 1
1122 try:
1123 dic_chaine[chaine].append(pos)
1124 except KeyError:
1125 dic_chaine[chaine] = [pos]
1126
1127 dic_chaine2 = {}
1128 for chaine,liste_pos in dic_chaine.iteritems():
1129 if (len(chaine) >= min_size and len(liste_pos) >= 2):
1130
1131 liste_pos.reverse()
1132 if (liste_pos[0] < longueur_s1 <= liste_pos[-1]):
1133 dic_chaine2[chaine] = liste_pos
1134 return dic_chaine2
1135
def trace(commande, dic_locals):
    """Profile ``commande`` with the given locals and print a
    time-sorted report (Windows-specific stats-file path)."""
    import profile, pstats, sys
    # raw strings: the path contains backslashes ('\w', '\m', '\s')
    # that relied on not being recognised escape sequences
    profile.runctx(commande, globals(), dic_locals, r'c:\workspace\medite\statFile')
    s = pstats.Stats(r'c:\workspace\medite\statFile')
    s.sort_stats('time')
    s.print_stats()
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1154 print 'SIMPLE TEST'
1155 st = SuffixTree('mississippi')
1156 for l in st.leaves:
1157 print 'leaf:', '"'+l.path_label+'"', ':', '"'+l.edge_label+'" path_position '+ \
1158 str(l.path_position)+' / edge_label_start '+str(l.edge_label_start)+ \
1159 ' / edge_label_end '+str(l.edge_label_end)
1160
1161 for n in st.inner_nodes:
1162 print 'inner:', '"'+n.edge_label+'" path_position '+ \
1163 str(n.path_position)+' / edge_label_start '+str(n.edge_label_start)+ \
1164 ' / edge_label_end '+str(n.edge_label_end)
1165 print 'done.\n\n'
1166
1168 print 'GENERALISED TEST'
1169 sequences = ['xabxa','babxba']
1170 st = GeneralisedSuffixTree(sequences)
1171
1172 for shared in st.shared_substrings():
1173 print '-'*70
1174 for seq,start,stop in shared:
1175 print seq, '['+str(start)+':'+str(stop)+']',
1176 print sequences[seq][start:stop],
1177 print sequences[seq][start:]
1178 print '='*70
1179
1180 for shared in st.shared_substrings(2):
1181 print '-'*70
1182 for seq,start,stop in shared:
1183 print seq, '['+str(start)+':'+str(stop)+']',
1184 print sequences[seq][start:stop],
1185 print sequences[seq][start:]
1186 print '='*70
1187
1188 print 'done.\n\n'
1189
1191 print 'TEST3'
1192
1193
1194 sequences = ["Le 7 décembre, à onze heures du matin, et à deux heures et demie du soir, en pinçant la patte dont les vaisseaux sont liés, on y observe des mouvements réflexes. On laisse la grenouille dans les mêmes conditions que la veille.",
1195 "Le 7 décembre, à 11 h. du matin et à 2 h. 1/2, en pinçant la patte antérieure et la patte liée, on a des mouvements réflexes dans cette dernière."]
1196 st = GeneralisedSuffixTree(sequences)
1197
1198 texte=sequences[0]+sequences[1]
1199 for (start,stop) in st.shared_substrings2():
1200 print '['+str(start)+':'+str(stop)+']',
1201 print "*"+texte[start:stop]+ "* \n ",
1202 print '='*70
1203
1206
1207
1208
1209
if __name__ == '__main__':
    # NOTE(review): test() itself is defined on lines elided from this
    # listing (presumably calling the test functions above).
    test()
1212