medite.MediteAppli.suffix

1 # -*- coding: iso-8859-1 -*- 2 # Copyright 20003 - 2008: Julien Bourdaillet (julien.bourdaillet@lip6.fr), Jean-Gabriel Ganascia (jean-gabriel.ganascia@lip6.fr) 3 # This file is part of MEDITE. 4 # 5 # MEDITE is free software; you can redistribute it and/or modify 6 # it under the terms of the GNU General Public License as published by 7 # the Free Software Foundation; either version 2 of the License, or 8 # (at your option) any later version. 9 # 10 # MEDITE is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU General Public License for more details. 14 # 15 # You should have received a copy of the GNU General Public License 16 # along with Foobar; if not, write to the Free Software 17 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 19 from __future__ import generators 20 #from __future__ import absolute_import 21 22 import sys, logging, weakref, bisect, time, os, os.path 23 import numpy.oldnumeric as Numeric 24 import psyco 25 26 import recouvrement 27 import _suffix_tree 28 29

30 -class SuffixTreeInterface(object):

31 """Fonctions de parcours de l'arbre et de recherche des chaines répétées"""

32 - def generate_inner_nodes_2(self):

33 'Iterator through only MAximal exact matches' 34 for n in self.post_order_nodes_2: 35 if not n.is_leaf: 36 yield n

37 - def generate_MEM(self):

38 'Iterator through only Maximal exact matches' 39 def dfs(n): 40 nb_children_leaf = 0 41 for c in n.children: 42 if c.is_leaf: nb_children_leaf += 1 43 for m in dfs(c): 44 yield m 45 if nb_children_leaf>=1: yield n

46 for n in dfs(self.root): 47 yield n 48

49 - def generate_post_order_nodes(self):

50 'Iterator through all nodes in the tree in post-order.' 51 def dfs(n): 52 for c in n.children: 53 for m in dfs(c): 54 yield m 55 yield n

56 for n in dfs(self.root): 57 yield n 58

59 - def generate_post_order_nodes_max(self):

60 'Iterator through all nodes in the tree in post-order.' 61 def dfs(n): 62 for c in n.children: 63 for m in dfs(c): 64 if m.nb_occ>1: yield m 65 if n.nb_occ>1: yield n

66 for n in dfs(self.root): 67 if n.nb_occ>1: yield n 68

69 - def generate_pre_order_nodes(self):

70 'Iterator through all nodes in the tree in post-order.' 71 def dfs(n): 72 yield n 73 for c in n.children: 74 for m in dfs(c): 75 yield m

76 for n in dfs(self.root): 77 yield n 78

79 - def generate_leaves(self):

80 'Iterator through all leaves in the tree.' 81 for n in self.post_order_nodes: 82 if n.is_leaf: 83 yield n

84

85 - def generate_inner_nodes(self):

86 'Iterator through all leaves in the tree.' 87 for n in self.post_order_nodes: 88 if not n.is_leaf: 89 yield n

90 91 # set class properties 92 MEM = property(generate_MEM, None, None, 93 "post_order_nodes2") 94 post_order_nodes = property(generate_post_order_nodes, None, None, 95 "post_order_nodes") 96 post_order_nodes_max = property(generate_post_order_nodes_max, None, None, 97 "post_order_nodes_max") 98 pre_order_nodes = property(generate_pre_order_nodes, None, None, 99 "pre_order_nodes") 100 leaves = property(generate_leaves, None, None, "leaves") 101 inner_nodes = property(generate_inner_nodes, None, None, "inner_nodes") 102 inner_nodes_2 = property(generate_inner_nodes_2, None, None, "inner_nodes_2") 103

104 - def _annotate_nodes(self):

105 """Ajoute à chaque noeud interne(répété) ses positions dans la séquence. """ 106 for n in self.post_order_nodes: 107 if n.is_leaf: 108 n.path_indices = n.path_position 109 else: 110 n.path_indices = [] 111 for c in n.children: 112 try: 113 n.path_indices.extend(c.path_indices) 114 except TypeError: 115 n.path_indices.append(c.path_indices) 116 #print path_indices 117 n.nb_occ = len(n.path_indices)

118

119 - def get_MEM(self, min_size=1):

120 """Renvoie un dico de tous les Maximal Exact Matches de taille min min_size 121 122 Le dico est indexé par la taille de MEM qui renvoie une liste de toutes 123 les positions de cette taille. 124 Linéaire(nb de MEM) < linéaire(taille de la séquence)""" 125 # à chaque position de seq_repeat doit correspondre la position 126 # de la fin de la répétition la plus longue commençant à cette position 127 #self._annotate_nodes() 128 seq_repeat = {} 129 for n in self.generate_inner_nodes():#generate_MEM(): 130 longueur_MEM = n.edge_label_end - n.path_position 131 if longueur_MEM >= min_size: 132 n.path_indices = [] 133 for c in n.children: 134 if c.is_leaf: n.path_indices.append(c.path_position) 135 elif len(c.path_indices) == 1: n.path_indices.extend(c.path_indices) 136 137 for pi in n.path_indices: 138 seq_repeat[pi] = pi+longueur_MEM 139 140 e="""for i in xrange(pi, pi+longueur_chaine): 141 if not seq_repeat.has_key(i): 142 seq_repeat[i] = pi+longueur_chaine 143 elif pi+longueur_chaine < seq_repeat[i]: 144 break 145 else: 146 seq_repeat[i] = pi+longueur_chaine""" 147 #print seq_repeat 148 dic_MEM = {} 149 for debut in seq_repeat: 150 fin = seq_repeat[debut] 151 if (fin > debut): 152 try: 153 dic_MEM[fin-debut].append(debut) 154 except KeyError: 155 dic_MEM[fin-debut] = [debut] 156 #print dic_MEM 157 return dic_MEM

158

159 -class SuffixTree(SuffixTreeInterface,_suffix_tree.SuffixTree):

160 """A higher-level wrapper around the C suffix tree type, 161 _suffix_tree.SuffixTree. This class adds a few methods to the suffix 162 tree, methods that are more easily expressed in Python than in C, and 163 that can be written using the primitives exported from C. """ 164

165 - def __init__(self,s):

166 """Build a suffix tree from the input string s. The string 167 must not contain the special symbol $.""" 168 if '$' in s: raise "The suffix tree string must not contain $!" 169 self.sequence = s 170 #print len(s) 171 #s = unicode(s,'utf-8') #raw_unicode_escape') 172 #print len(s) 173 #if isinstance(s, unicode): 174 # s = s.encode('utf-8') 175 #print "SuffixTree.__init__ type(s) ",type(s) ," len ",len(s) 176 _suffix_tree.SuffixTree.__init__(self,s)

177

178 -class SuffixNodeU(dict):

179 - def __init__(self, debut, fin, slink=None, depth=0):

180 self.suffixLink = slink 181 self.depth = depth 182 # représente le début et la fin de l'arc qui pointe sur ce noeud 183 # ces 2 valeurs servent pour le parcours de l'arbre ultérieur à 184 # sa construction permetant la recherche des MEMs 185 self.edge_label_begin = debut # debut de l'arc 186 self.edge_label_end = fin # fin de l'arc

187 # path_position correspond au 1er caractère de la branche, 188 # celui du 1er caractère du noeud fils de root 189 # ainsi texte[path_position:edge_label_end] décrit la chaine depuis root

190 - def changeValues(self, debut, fin):

191 self.edge_label_begin = debut 192 self.edge_label_end = fin

193 - def getChildren(self):

194 '''Iterator through all the node's children''' 195 for k,p,node in self.itervalues(): 196 yield node

197 children = property(getChildren, None, None, "children of the node")

198 - def isLeaf(self):

199 if len(self) == 0: return True 200 else: return False

201 is_leaf = property(isLeaf, None, None, "leaf or not leaf ?")

202 - def __str__(self, seq=None, space='', short=False, charOnly=None):

203 res = "deb="+str(self.edge_label_begin)+" / fin="+str(self.edge_label_end) 204 try: 205 res += " / path_position="+str(self.path_position) 206 except AttributeError: pass 207 if seq is not None: 208 res += " / " +seq[self.edge_label_begin:self.edge_label_end] 209 res += "\n" 210 if short and seq is not None: 211 res = seq[self.edge_label_begin:self.edge_label_end]+'\n' 212 space += '|---' 213 if charOnly is not None and self.has_key(charOnly): 214 k,p,node = self[charOnly] 215 res += space + node.__str__(seq, space, short) 216 else: 217 for char,(k,p,node) in self.iteritems(): 218 res += space + node.__str__(seq, space, short) 219 return res

220

221 -class SuffixTree_(SuffixTreeInterface):

222 """Construit un suffix tree avec l'algorithme d'Ukkonen"""

223 - def __str__(self):

224 return self.root.__str__(seq=self.sequence)

225 - def __init__(self, sequence, generalizedST=False):

226 assert isinstance(sequence,basestring) 227 if not generalizedST: 228 if '¤' in sequence: raise "The suffix tree string must not contain ¤!" 229 self.sequence = sequence+'¤' 230 self.inf = len(sequence) # infini 231 self.bottom = SuffixNodeU(-2,-2) 232 self.root = SuffixNodeU(0,0,self.bottom) 233 s = self.root 234 k = 0 ; i = -1 235 for t in self.sequence: 236 # pour chaque char t en position i, consturction de ST(sequence[:i]) 237 i += 1 238 # transition t entre bottom en root 239 self.bottom[t] = -2,-2,self.root 240 # ajout de t en renvoie du end point de ST(sequence[:i]) 241 s,k = self.update(s,k,i) 242 # le end point n'est pas forcément explicite, on le canonize 243 s,k = self.canonize(s,k,i) 244 # end point + 1 char = active point de ST(sequence[:i+1]) 245 #print self 246 self.annotate_path_position() 247 self.sequence = sequence

248

249 - def update(self, s,k,i):

250 """Ajout du caractère en i """ 251 # active point = s,k,i-1 252 oldroot = self.root 253 t = self.sequence[i] 254 # active point = end point ? renvoie l'active point explicité 255 end_point,r = self.testAndSplit(s,k,i-1,t) 256 while not end_point: 257 r2 = SuffixNodeU(i,self.inf) 258 r[t] = i,self.inf,r2 # crée nouvelle transition 259 # crée lien suffixe du noeud de l'étape précédente 260 if not (oldroot is self.root): oldroot.suffixLink = r 261 oldroot = r 262 s,k = self.canonize(s.suffixLink,k,i-1) # remonte lien suffixe 263 end_point,r = self.testAndSplit(s,k,i-1,t) # end point ? 264 if not (oldroot is self.root): oldroot.suffixLink = r 265 return s,k # renvoie le end point

266

267 - def testAndSplit(self, s,k,p,t):

268 """Test si s,k,p est le end point et sinon explicite et renvoie s,k,p""" 269 if k <= p: # s,k,p implicite 270 tk = self.sequence[k] 271 k2,p2,s2 = s[tk] 272 # le char suivant l'arc s[tk] = t ? => end point 273 if t == self.sequence[k2+p-k+1]: return True,s 274 else:# sinon on split l'arc 275 tk2 = self.sequence[k2] 276 end = k2+p-k 277 r = SuffixNodeU(k2,end+1)#+1 pour avoir un slicing correct en python 278 s[tk2] = k2, end, r 279 s2.changeValues(end+1, p2+1)#MAJ valeur pour le slicing python 280 tk3 = self.sequence[end+1] 281 r[tk3] = end+1, p2, s2 282 return False,r 283 # s,k,p explicite 284 elif not s.has_key(t): return False,s # pas de tk-transition 285 else: return True,s # tk-transition => end point

286

287 - def canonize(self, s,k,p):

288 """Si s,k,p implicite, renvoie son plus proche ancêtre explicite""" 289 if p < k: return s,k # s,k,p explicite 290 else: 291 tk = self.sequence[k] 292 k2,p2,s2 = s[tk] 293 # si l'arc qui commence en s[tk] est inférieur à w, on avance dessus 294 while p2-k2 <= p-k: 295 k = k+p2-k2+1 296 s = s2 297 if k <= p: # essaye un nouvel arc 298 tk = self.sequence[k] 299 k2,p2,s2 = s[tk] 300 return s,k

301

302 - def annotate_path_position(self):

303 """ajoute l'attribut path_position qui correspond à la position du 1er caractère de la branche""" 304 node = self.root 305 self.root.path_position = 0 306 self.__annotate_path_position(node, 0)

307 - def __annotate_path_position(self, node, longueur):

308 for s,k,fils in node.itervalues(): 309 if len(fils) == 0: # feuille 310 # début de l'arc - longueur des arcs traversés depuis root 311 fils.path_position = fils.edge_label_begin - longueur 312 else: 313 fils.path_position = fils.edge_label_begin - longueur 314 self.__annotate_path_position(fils, 315 longueur+fils.edge_label_end-fils.edge_label_begin)

316 317 318

319 -class TrueGeneralisedSuffixTree(SuffixTree):

320 321 """A suffix tree for a set of strings.""" 322

323 - def __init__(self,sequences):

324 '''Build a generalised suffix tree. The strings must not 325 contain the special symbols $ or ascii numbers from 1 to the number of 326 sequences.''' 327 328 self.sequences = sequences 329 self.start_positions = [0] 330 self.concat_string = '' 331 for i in xrange(len(sequences)): 332 if chr(i+1) in sequences[i]: 333 raise "The suffix tree string must not contain chr(%d)!"%(i+1) 334 self.concat_string += sequences[i]+chr(i+1) 335 self.start_positions += [len(self.concat_string)] 336 337 self.start_positions += [self.start_positions[-1]+1] # empty string 338 self.sequences += [''] 339 #print self.start_positions 340 #print "GeneralisedSuffixTree.__init__ type(self.concat_string) ",type(self.concat_string) ," len ",len(self.concat_string) 341 SuffixTree.__init__(self,self.concat_string) 342 self._annotate_nodes()

343 344

345 - def _translate_index(self,idx):

346 'Translate a concat-string index into a (string_no,idx) pair.' 347 for i in xrange(len(self.start_positions)-1): 348 if self.start_positions[i] <= idx < self.start_positions[i+1]: 349 return (i,idx-self.start_positions[i]) 350 raise IndexError, "Index out of range: "+str(idx)

351

352 - def _annotate_nodes(self):

353 for n in self.post_order_nodes: 354 if n.is_leaf: 355 seq,idx = self._translate_index(n.path_position) 356 n.path_indices = [(seq, idx)] 357 n.sequences = [seq] 358 n.nb_occ = len(n.path_indices) 359 else: 360 path_indices = [] ; sequences = [] 361 for c in n.children: 362 path_indices += c.path_indices 363 sequences += c.sequences 364 365 seqs_in_subtree = {} 366 for s in sequences: 367 seqs_in_subtree[s] = 1 368 369 n.path_indices = path_indices 370 n.sequences = [s for s in seqs_in_subtree] 371 n.nb_occ = len(n.path_indices)

372

373 - def shared_substrings(self,minimum_length=0):

374 '''Iterator through shared sub-strings. Returned as a list of triples 375 (sequence,from,to).''' 376 for n in self.inner_nodes: 377 if len(n.sequences) == len(self.sequences) - 1: 378 l = n.edge_label_end - n.path_position 379 if l >= minimum_length: 380 yield [(seq, idx, idx+l) for (seq,idx) in n.path_indices]

381 # for (seq,idx) in n.path_indices: 382 # print "idx="+str(idx)+" / idx+1="+str(idx+1) 383 # yield (seq, idx, idx+l) 384 385

386 - def inter_inclus(self,i,d):

387 if (i[0]<=d[0]<=i[1] and i[0]<=d[1]<=i[1]): return True 388 else: return False

389

390 - def shared_substrings2naif(self,long_min_pivots):

391 '''Iterator through shared sub-strings. Returned as a list of triples 392 (sequence,from,to).''' 393 DRes={} 394 maxIdxDres=0 395 longTexte1=len(self.sequences[0])+1 396 for n in self.inner_nodes: 397 #for n in self.post_order_nodes: 398 #print "n.pathlabel= *"+n.path_label+"* edge_label= *"+n.edge_label+"*" 399 # print "edge_label= *"+n.edge_label+"*" 400 long_path=n.edge_label_end-n.path_position 401 if long_path<long_min_pivots: continue 402 ladd=[] 403 for pi in n.path_indices: 404 # print "pi= "+str(pi)+" / n.path_position= "+str(n.path_position)+" / n.edge_label_start= "+str(n.edge_label_start)+" / n.edge_label_end= "+str(n.edge_label_end) 405 # print "edge= *"+self.sequences[pi[0]][pi[1]:pi[1]+long_path] +"*" 406 if pi[0]==1: deb=pi[1]+longTexte1-1 407 else: deb=pi[1] 408 found=False 409 for i in xrange(maxIdxDres,0,-1): 410 if DRes.has_key(i): 411 for x in DRes[i]: 412 if self.inter_inclus(x,(deb,deb+long_path)): 413 found=True 414 break 415 if (found or i-1<long_path):break 416 if not found: 417 ladd.append((deb,deb+long_path)) 418 419 if len(ladd)>1: 420 if DRes.has_key(long_path): DRes[long_path].extend(ladd) 421 else: 422 maxIdxDres=max(maxIdxDres,long_path) 423 DRes[long_path]=ladd 424 425 LRes=[] 426 for i in xrange(maxIdxDres,0,-1): 427 if DRes.has_key(i): 428 print str(i)+" "+str(DRes[i]) 429 for x in DRes[i]: 430 for y in LRes: 431 if self.inter_inclus(y,x): break 432 else: 433 LRes.append(x) 434 print "text n="+str(len(self.sequences[0])+len(self.sequences[1]))+" / leaves="+str(nleaf)+" / internal nodes="+str(nnode) 435 return LRes

436

437 - def shared_substrings2(self,long_min_pivots):

438 '''Iterator through shared sub-strings. Returned as a list of triples 439 (sequence,from,to).''' 440 lInter=[] 441 longTexte1=len(self.sequences[0])+1 442 nleaf=nnode=0 443 #for n in self.inner_nodes: 444 for n in self.post_order_nodes: 445 if n.is_leaf: 446 nleaf+=1 447 else: 448 nnode+=1 449 #print "n.pathlabel= *"+n.path_label+"* edge_label= *"+n.edge_label+"*" 450 # print "edge_label= *"+n.edge_label+"*" 451 long_path=n.edge_label_end-n.path_position 452 if long_path<long_min_pivots: continue 453 ladd=[] 454 for pi in n.path_indices: 455 # print "pi= "+str(pi)+" / n.path_position= "+str(n.path_position)+" / n.edge_label_start= "+str(n.edge_label_start)+" / n.edge_label_end= "+str(n.edge_label_end) 456 # print "edge= *"+self.sequences[pi[0]][pi[1]:pi[1]+long_path] +"*" 457 if pi[0]==1: deb=pi[1]+longTexte1-1 458 else: deb=pi[1] 459 if not MediteAppli.inclus((deb,deb+long_path),lInter): 460 ladd.append((deb,deb+long_path)) 461 462 if len(ladd)>1: 463 lInter=MediteAppli.union2(ladd,lInter) 464 # for x in ladd: 465 # lInter = MediteAppli.ajout_intervalle(lInter,x) 466 sys.stderr.write("text n="+str(len(self.sequences[0])+len(self.sequences[1]))+" / leaves="+str(nleaf)+" / internal nodes="+str(nnode)+"\n") 467 return lInter

468

469 - def shared_substrings3(self,long_min_pivots):

470 '''Iterator through shared sub-strings. Returned as a list of triples 471 (sequence,from,to).''' 472 longTexte1=len(self.sequences[0]) # longueur du 1er texte 473 t=self.sequences[0] + self.sequences[1] # texte total 474 longTexte=len(t) # longueur du texte total 475 lMax=[] 476 for i in xrange(longTexte): 477 lMax.append(0) 478 nleaf=nnode=0 479 for n in self.post_order_nodes: 480 if n.is_leaf: 481 nleaf+=1 482 else: 483 nnode+=1 484 long_path=n.edge_label_end-n.path_position 485 if long_path<long_min_pivots: continue 486 ladd=[] 487 for pi in n.path_indices: 488 if pi[0]==1: deb=pi[1]+longTexte1 489 else: deb=pi[1] 490 fin=deb+long_path 491 if (lMax[deb]>=fin):continue 492 else: ladd.append((deb,fin)) 493 494 if (len(ladd)>1 and ladd[0][0]<longTexte1 and ladd[-1][0]>=longTexte1): 495 # if (ladd[0][0]<longTexte1 and not ladd[0][1]<longTexte1): 496 # sys.stderr.write("ladd[0]="+str(ladd[0])+" / "+self.sequences[0][ladd[0][0]:ladd[0][1]]+" / longTexte1 ="+str(longTexte1)+"\n") 497 for x in ladd: 498 for i in xrange(x[0],x[1]): 499 if x[1]>lMax[i]: lMax[i]=x[1] 500 #else: break 501 502 dic={} 503 prevInserted=lastmax=0 504 nbPos=0 505 for i in xrange(longTexte): 506 nbPos+=1 507 pos=lMax[i] 508 long=pos-i 509 if pos!=prevInserted and pos>0 and pos>lastmax: 510 texte=t[i:pos] 511 if not dic.has_key(texte): 512 dic[texte]=[] 513 dic[texte].append(i) 514 prevInserted=pos 515 if pos>lastmax: lastmax=pos 516 #sys.stderr.write("len dic = "+str(len(dic))+"\n") 517 #print dic 518 dic2={} 519 for x in dic: 520 a=dic[x] 521 if (len(x)<long_min_pivots or len(a)<2 or 522 not(a[0]<longTexte1 and a[-1]>=longTexte1)): 523 # sys.stderr.write("SUPP "+x+" "+str(a)+"\n") 524 continue 525 else:dic2[x]=a 526 # sys.stderr.write("len dic2 = "+str(len(dic2))+"\n") 527 # sys.stderr.write("text n="+str(len(self.sequences[0])+len(self.sequences[1]))+ 528 # " / leaves="+str(nleaf)+" / internal nodes="+str(nnode)+ 529 # " / nbPos="+str(nbPos)+"\n") 530 #print dic2 531 return dic2,lMax

532

533 - def get_repeat(self, min_size=1):

534 """Renvoie un dictionnaire de toutes les chaines répétées. 535 536 Indexé par la taille des chaines. En valeur une liste de positions. 537 min_size: taille mini des chaines répétées 538 Linéaire(nb de noeuds de l'arbre)=linéaire(taille de la séquence)""" 539 longueur_seq1 = len(self.sequences[0]) 540 dic = {} 541 for n in self.generate_inner_nodes():# chaines répétées 542 longueur_chaine = n.edge_label_end - n.path_position 543 if longueur_chaine >= min_size: 544 seq1 = seq2 = False 545 lPos = [] 546 for (seq,pos) in n.path_indices: 547 if seq == 0: 548 seq1 = True 549 lPos.append(pos) 550 else: 551 seq2 = True 552 lPos.append(pos + longueur_seq1) 553 if seq1 and seq2: 554 try: 555 dic[longueur_chaine].extend(lPos) 556 #print n.path_position 557 except KeyError: 558 dic[longueur_chaine] = lPos 559 return dic

560

561 - def get_seq_repeat_list(self,min_size=1):

562 # à chaque position de seq_repeat doit correspondre la position 563 # de la fin de la répétition la plus longue commençant à cette position 564 #print self.root.__str__(self.concat_string, short=True) 565 longueur_seq1 = len(self.sequences[0]) 566 seq_repeat = [] 567 for i in xrange(len(self.concat_string)): 568 seq_repeat.append((i,i)) 569 i=nbpitot=0 570 for n in self.generate_inner_nodes():#generate_MEM(): 571 i+=1 572 #print i,n.__str__(self.concat_string) 573 #if n.path_position<longueur_seq1 and n.edge_label_end>longueur_seq1: 574 # n.edge_label_end= longueur_seq1 575 #assert (0<=n.path_position<=n.edge_label_begin<=n.edge_label_end<=longueur_seq1 or 576 # longueur_seq1<=n.path_position<=n.edge_label_begin<=n.edge_label_end<=len(self.sequences[0])+len(self.sequences[1])) 577 longueur_chaine = n.edge_label_end - n.path_position 578 n.path_indices = [] 579 for c in n.children: 580 if c.is_leaf: n.path_indices.append(c.path_position) 581 #elif len(c.path_indices) == 1: n.path_indices.extend(c.path_indices) 582 else: n.path_indices.extend(c.path_indices) 583 if longueur_chaine >= min_size and len(n.path_indices) > 1: 584 #print n.path_indices 585 seq1 = seq2 = False 586 for pos in n.path_indices: 587 #print self.concat_string[pos:pos+longueur_chaine] 588 if pos < longueur_seq1: seq1 = True 589 else: seq2 = True 590 if seq1 and seq2: 591 for pi in n.path_indices: 592 #try: 593 # if seq_repeat[pi][1] < pi+longueur_chaine: 594 # seq_repeat[pi] = (pi,pi+longueur_chaine) 595 #except KeyError: 596 # seq_repeat[pi] = (pi,pi+longueur_chaine) 597 for pos in xrange(pi,pi+longueur_chaine): 598 #for pos in xrange(pi+longueur_chaine,pi,-1): 599 #if (seq_repeat[pos][1] < pi+longueur_chaine or 600 # pi < seq_repeat[pos][0] and seq_repeat[pos][1] == pi+longueur_chaine): 601 # seq_repeat[pos] = (pi,pi+longueur_chaine) 602 # si bloc décrit à la pos sourante est inclus dans le bloc qu'on ajoute 603 if (pi <= seq_repeat[pos][0] <= seq_repeat[pos][1] <= pi+longueur_chaine or 604 seq_repeat[pos][0] <= pi <= seq_repeat[pos][1] <= pi+longueur_chaine): 605 seq_repeat[pos] = (pi,pi+longueur_chaine) 606 #for pos in xrange(pi+longueur_chaine-1,pi-1,-1): 607 # if (pi <= seq_repeat[pos][0] <= seq_repeat[pos][1] <= pi+longueur_chaine or 608 #pi <= seq_repeat[pos][0] <= pi+longueur_chaine <= seq_repeat[pos][1]): 609 # seq_repeat[pos][0] <= pi <= seq_repeat[pos][1] < pi+longueur_chaine): 610 # seq_repeat[pos] = (pi,pi+longueur_chaine) 611 612 613 #print seq_repeat 614 #if seq_repeat[pi][1] < pi+longueur_chaine: 615 # seq_repeat[pi] = (pi,pi+longueur_chaine) 616 #nbpitot+=1 617 #seq_repeat[pi] = pi+longueur_chaine 618 #else: print n.path_indices 619 #print seq_repeat 620 return seq_repeat

621

622 - def get_seq_repeat(self,min_size=1):

623 # à chaque position de seq_repeat doit correspondre la position 624 # de la fin de la répétition la plus longue commençant à cette position 625 #print self.root.__str__(self.concat_string, short=True) 626 longueur_seq1 = len(self.sequences[0]) 627 seq_repeat_deb = Numeric.zeros(len(self.concat_string),Numeric.Int) 628 seq_repeat_fin = Numeric.zeros(len(self.concat_string),Numeric.Int) 629 for i in xrange(len(self.concat_string)): 630 seq_repeat_deb[i] = i 631 seq_repeat_fin[i] = i 632 i=nbpitot=0 633 for n in self.generate_inner_nodes():#generate_MEM(): 634 i+=1 635 #print i,n.__str__(self.concat_string) 636 #if n.path_position<longueur_seq1 and n.edge_label_end>longueur_seq1: 637 # n.edge_label_end= longueur_seq1 638 #assert (0<=n.path_position<=n.edge_label_begin<=n.edge_label_end<=longueur_seq1 or 639 # longueur_seq1<=n.path_position<=n.edge_label_begin<=n.edge_label_end<=len(self.sequences[0])+len(self.sequences[1])) 640 longueur_chaine = n.edge_label_end - n.path_position 641 n.path_indices = [] 642 all_children_leaf = True 643 for c in n.children: 644 if c.is_leaf: n.path_indices.append(c.path_position) 645 #elif len(c.path_indices) == 1: n.path_indices.extend(c.path_indices) 646 else: 647 n.path_indices.extend(c.path_indices) 648 all_children_leaf = False 649 listePos = [i-1 for i in n.path_indices] 650 listeCarac = [] 651 for i in listePos: 652 assert -1 <= i < len(self.concat_string),(i,len(self.concat_string)) 653 if i == -1: x = chr(4) 654 else: x = self.concat_string[i] 655 listeCarac.append(x) 656 #for idx,pos in listePos: 657 # if idx == 0: listeCarac.append(self.sequences[0][pos]) 658 # else: listeCarac.append(self.sequences[1][pos]) 659 setCarac = set(listeCarac) 660 if (longueur_chaine >= min_size and len(n.path_indices) > 1 and 661 all_children_leaf and len(setCarac)==len(listeCarac)): 662 #print n.path_indices 663 seq1 = seq2 = False 664 for pos in n.path_indices: 665 #print self.concat_string[pos:pos+longueur_chaine] 666 if pos < longueur_seq1: seq1 = True 667 else: seq2 = True 668 if seq1 and seq2: 669 for pi in n.path_indices: 670 #try: 671 # if seq_repeat[pi][1] < pi+longueur_chaine: 672 # seq_repeat[pi] = (pi,pi+longueur_chaine) 673 #except KeyError: 674 # seq_repeat[pi] = (pi,pi+longueur_chaine) 675 for pos in xrange(pi,pi+longueur_chaine): 676 #for pos in xrange(pi+longueur_chaine,pi,-1): 677 #if (seq_repeat[pos][1] < pi+longueur_chaine or 678 # pi < seq_repeat[pos][0] and seq_repeat[pos][1] == pi+longueur_chaine): 679 # seq_repeat[pos] = (pi,pi+longueur_chaine) 680 # si bloc décrit à la pos sourante est inclus dans le bloc qu'on ajoute 681 if (pi <= seq_repeat_deb[pos] <= seq_repeat_fin[pos] <= pi+longueur_chaine or 682 seq_repeat_deb[pos] <= pi <= seq_repeat_fin[pos] <= pi+longueur_chaine): 683 seq_repeat_deb[pos] = pi 684 seq_repeat_fin[pos] = pi+longueur_chaine 685 #for pos in xrange(pi+longueur_chaine-1,pi-1,-1): 686 # if (pi <= seq_repeat[pos][0] <= seq_repeat[pos][1] <= pi+longueur_chaine or 687 #pi <= seq_repeat[pos][0] <= pi+longueur_chaine <= seq_repeat[pos][1]): 688 # seq_repeat[pos][0] <= pi <= seq_repeat[pos][1] < pi+longueur_chaine): 689 # seq_repeat[pos] = (pi,pi+longueur_chaine) 690 691 692 #print seq_repeat 693 #if seq_repeat[pi][1] < pi+longueur_chaine: 694 # seq_repeat[pi] = (pi,pi+longueur_chaine) 695 #nbpitot+=1 696 #seq_repeat[pi] = pi+longueur_chaine 697 #else: print n.path_indices 698 #print seq_repeat 699 return seq_repeat_deb,seq_repeat_fin

700

701 - def get_seq_repeat2(self,min_size=1):

702 # à chaque position de seq_repeat doit correspondre la position 703 # de la fin de la répétition la plus longue commençant à cette position 704 #print self.root.__str__(self.concat_string, short=True) 705 texte = self.sequences[0] + '$' + self.sequences[1] 706 longueur_seq1 = len(self.sequences[0]) 707 seq_repeat = [] 708 for i in xrange(len(self.concat_string)): 709 seq_repeat.append((i,i)) 710 i=nbpitot=0 711 for n in self.generate_inner_nodes():#generate_MEM(): 712 i+=1 713 longueur_chaine = n.edge_label_end - n.path_position 714 n.path_indices = [] 715 all_children_leaf = True 716 seq1 = seq2 = False 717 car_seq1 = [] ; car_seq2 = [] 718 for c in n.children: 719 if c.is_leaf: 720 n.path_indices.append(c.path_position) 721 dic_car_seq1 = {} ; dic_car_seq2 = {} 722 if c.path_position <= longueur_seq1: 723 seq1 = True 724 try: dic_car_seq1[texte[c.path_position]].append(c.path_position) 725 except KeyError: dic_car_seq1[texte[c.path_position]] = [c.path_position] 726 else: 727 #if c.path_position < len(self.concat_string): 728 assert c.path_position < len(self.concat_string),(c.path_position, len(self.concat_string)) 729 seq2 = True ; car = self.concat_string[c.path_position] 730 try: dic_car_seq2[car].append(c.path_position) 731 except KeyError: dic_car_seq2[car] = [c.path_position] 732 else: all_children_leaf = False 733 if not all_children_leaf: continue 734 if (not seq1) and (not seq2): continue 735 if longueur_chaine >= min_size and len(n.path_indices) > 1: 736 seq1 = seq2 = False ; lOccMEM = [] 737 for car,lOcc in dic_car_seq1.iteritems(): 738 if len(lOcc) == 1: 739 lOccMEM.append(lOcc[0]) 740 seq1 = True 741 for car,lOcc in dic_car_seq2.iteritems(): 742 if len(lOcc) == 1: 743 lOccMEM.append(lOcc[0]) 744 seq2 = True 745 logging.debug(lOccMEM) 746 if seq1 and seq2: 747 for pi in lOccMEM: 748 for pos in xrange(pi,pi+longueur_chaine): 749 if (pi <= seq_repeat[pos][0] <= seq_repeat[pos][1] <= pi+longueur_chaine or 750 seq_repeat[pos][0] <= pi <= seq_repeat[pos][1] <= pi+longueur_chaine): 751 seq_repeat[pos] = (pi,pi+longueur_chaine) 752 return seq_repeat

753

754 -class GeneralisedSuffixTree(object):

755 756 """Classe proxy qui construit un GeneralisedSuffixTree 757 Celui-ci est construit et on extrait les infos nécessaires pour trouver les MEM 758 Ensuite on supprime l'objet GeneralisedSuffixTree, ce qui permet d'économiser beaucoup de mémoire""" 759

760 - def __init__(self,sequences):

761 762 self.sequences = sequences 763 self.start_positions = [0] 764 self.concat_string = '' 765 for i in xrange(len(sequences)): 766 if chr(i+1) in sequences[i]: 767 raise "The suffix tree string must not contain chr(%d)!"%(i+1) 768 self.concat_string += sequences[i]+chr(i+1) 769 self.start_positions += [len(self.concat_string)] 770 771 self.start_positions += [self.start_positions[-1]+1] # empty string 772 self.sequences += [''] 773 774 self.st = TrueGeneralisedSuffixTree(sequences)

775

776 - def get_MEM_(self, min_size=1):

777 """Renvoie un dico de tous les Maximal Exact Matches de taille min min_size 778 779 Le dico est indexé par la taille de MEM qui renvoie une liste de toutes 780 les positions de cette taille. 781 Linéaire(nb de MEM) < linéaire(taille de la séquence)""" 782 dic_repeat = self.st.get_repeat(min_size) 783 del self.st 784 #print dic_repeat 785 seq_repeat = {} 786 seq_repeat[-1] = -1 787 for i in xrange(len(self.sequences[0])+len(self.sequences[1])): 788 seq_repeat[i] = i 789 for length in dic_repeat: 790 for pos in dic_repeat[length]: 791 for pos_i in xrange(pos,pos+length): 792 seq_repeat[pos_i] = max(seq_repeat[pos_i], pos+length) 793 # à chaque position de seq_repeat doit correspondre la position 794 # de la fin de la répétition la plus longue commençant à cette posistion 795 #print seq_repeat 796 assert len(seq_repeat) == len(self.sequences[0])+len(self.sequences[1])+1 797 dic_MEM = {} 798 for debut in seq_repeat: 799 fin = seq_repeat[debut] 800 # si ça fait plus de 1 carac et si on ne vient pas d'ajouter la répet 801 if (fin > debut and seq_repeat[debut-1] != fin): 802 try: 803 dic_MEM[fin-debut].append(debut) 804 except KeyError: 805 dic_MEM[fin-debut] = [debut] 806 #print dic_MEM 807 return dic_MEM

808 809

810 - def get_MEM_list(self, min_size=1, sr=1):

811 """Renvoie un dico de tous les Maximal Exact Matches de taille min min_size 812 813 Le dico est indexé par la taille de MEM qui renvoie une liste de toutes 814 les positions de cette taille. 815 Linéaire(nb de MEM) < linéaire(taille de la séquence)""" 816 seq_repeat = self.get_seq_repeat(min_size) 817 if sr==3: return seq_repeat 818 elif sr == 1: 819 dic_MEM = {} 820 #for (debut,fin) 821 pos = len(seq_repeat)-1 822 while pos >= 0: 823 (debut,fin) = seq_repeat[pos] 824 if fin > debut: 825 try: 826 dic_MEM[fin-debut].append(debut) 827 except KeyError: 828 dic_MEM[fin-debut] = [debut] 829 pos = debut-1 830 831 #print dic_MEM 832 return dic_MEM 833 elif sr==4: 834 dic_MEM = {} 835 longueur_s1 = len(self.sequences[0]) 836 texte = self.sequences[0] + self.sequences[1] 837 pos = len(seq_repeat)-1 838 while pos >= 0: 839 # attention, ne pas mélanger debut et pos_debut, 2 compteurs différents 840 (debut,fin) = seq_repeat[pos] 841 if fin > debut: 842 longueur = fin - debut 843 if debut > longueur_s1: pos_debut = debut - 1 844 else: pos_debut = debut 845 t = hash(texte[pos_debut:pos_debut+longueur]) 846 if not dic_MEM.has_key(longueur): dic_MEM[longueur] = {} 847 try: 848 dic_MEM[longueur][t].append(pos_debut) 849 except KeyError: 850 dic_MEM[longueur][t] = [pos_debut] 851 pos = debut-1 852 #logging.debug( dic_MEM) 853 #for longueur,dic_cle in dic_MEM.iteritems(): 854 # for cle,lOcc in dic_cle.iteritems(): 855 # logging.debug(str(longueur)+' / '+texte[lOcc[0]:lOcc[0]+longueur]+'/ '+str(lOcc)) 856 return dic_MEM

857

858 - def get_MEM(self, min_size=1, sr=1):

859 """Renvoie un dico de tous les Maximal Exact Matches de taille min min_size 860 861 Le dico est indexé par la taille de MEM qui renvoie une liste de toutes 862 les positions de cette taille. 863 Linéaire(nb de MEM) < linéaire(taille de la séquence)""" 864 seq_repeat_deb,seq_repeat_fin = self.st.get_seq_repeat(min_size) 865 del self.st # permet d'économiser beaucoup de mémoire 866 logging.debug("suffixTree deleted") 867 868 if sr==3: return seq_repeat 869 elif sr == 1: 870 dic_MEM = {} 871 #for (debut,fin) 872 pos = len(seq_repeat_deb)-1 873 while pos >= 0: 874 debut = seq_repeat_deb[pos] 875 fin = seq_repeat_fin[pos] 876 if fin > debut: 877 try: 878 dic_MEM[fin-debut].append(debut) 879 except KeyError: 880 dic_MEM[fin-debut] = [debut] 881 pos = debut-1 882 883 #print dic_MEM 884 return dic_MEM 885 elif sr==4: 886 dic_MEM = {} 887 longueur_s1 = len(self.sequences[0]) 888 texte = self.sequences[0] + self.sequences[1] 889 pos = len(seq_repeat_deb)-1 890 while pos >= 0: 891 # attention, ne pas mélanger debut et pos_debut, 2 compteurs différents 892 debut = seq_repeat_deb[pos] 893 fin = seq_repeat_fin[pos] 894 if fin > debut: 895 longueur = fin - debut 896 if debut > longueur_s1: pos_debut = debut - 1 897 else: pos_debut = debut 898 t = hash(texte[pos_debut:pos_debut+longueur]) 899 if not dic_MEM.has_key(longueur): dic_MEM[longueur] = {} 900 try: 901 dic_MEM[longueur][t].append(pos_debut) 902 except KeyError: 903 dic_MEM[longueur][t] = [pos_debut] 904 pos = debut-1 905 #logging.debug( dic_MEM) 906 #for longueur,dic_cle in dic_MEM.iteritems(): 907 # for cle,lOcc in dic_cle.iteritems(): 908 # logging.debug(str(longueur)+' / '+texte[lOcc[0]:lOcc[0]+longueur]+'/ '+str(lOcc)) 909 return dic_MEM

910

911 - def get_MEM_index_chaine2(self, min_size=1, sr = 1):

912 """ Modifie les coordonnées de la 2e chaine en retirant 1 913 à cause du séparateur pour le suffix tree """ 914 if sr == 4: 915 seq_repeat = self.get_MEM(min_size, sr=4) 916 logging.log(5,'get_MEM done') 917 return seq_repeat 918 elif sr == 3: 919 seq_repeat = self.get_MEM(min_size, sr=3) 920 logging.log(5,'get_MEM done') 921 #print seq_repeat[:100] 922 seq2 = [] 923 longueur_s1 = len(self.sequences[0]) 924 pos = len(seq_repeat)-1 925 for pos in xrange(len(seq_repeat)-1): 926 deb,fin = seq_repeat[pos] 927 if pos > longueur_s1: # 2e chaine 928 seq2.append((deb-1,fin-1)) 929 elif pos == longueur_s1: # position du séparateur des 2 séquences 930 continue 931 else: # première chaine 932 seq2.append((deb,fin)) 933 #print seq2 934 assert len(seq2) == len(self.sequences[0]) + len(self.sequences[1]) 935 # renvoie une séquence de la longueur des 2 chaines concaténées 936 # dont chaque item est une paire(debut du bloc, fon du bloc) 937 return seq2 938 else: # sr 1 939 dic_repeat = self.get_MEM(min_size, sr=1) 940 logging.log(5,'get_MEM done') 941 #print seq_repeat[:100] 942 dic_repeat2 = {} 943 longueur_s1 = len(self.sequences[0]) 944 texte = self.sequences[0] + self.sequences[1] 945 for longueur,liste in dic_repeat.iteritems(): 946 if longueur >= min_size: 947 liste2 = [] 948 for deb in liste: 949 if deb > longueur_s1: 950 liste2.append(deb-1) 951 else: 952 liste2.append(deb) 953 for deb in liste2: 954 t = texte[deb:deb+longueur] 955 try: 956 #dic_repeat2[t].append(deb) 957 bisect.insort_right(dic_repeat2[t],deb) 958 except KeyError: 959 dic_repeat2[t] = [deb] 960 del dic_repeat 961 for t,l in dic_repeat2.iteritems(): 962 #logging.debug((t,l)) 963 assert len(t) >= min_size 964 return dic_repeat2

965

966 - def get_MEM_index_chaine3(self, min_size=1, just_keep_words=True, sr = 4, eliminRecouv=True):

967 """ just_keep_words: si Vrai, on rogne les homologies de façon à n'avoir que des mots 968 (ou suites de mots) entiers 969 renvoie un dico indexé par (cle,longueur) ou cle représente hash(chaine) la chaine 970 dont on fait référence et la longueur de la chaine, la valeur et la liste d'occurences 971 de la chaine""" 972 973 seq = self.get_MEM_index_chaine2(min_size, sr) 974 #print seq 975 logging.log(5,'get_MEM_index_chaine2 done') 976 if sr == 4: 977 #logging.debug('Recouv 4 eliminrecouv') 978 texte = self.sequences[0]+self.sequences[1] 979 for longueur, dicoOcc in seq.iteritems(): 980 for cle_hash, lOcc in dicoOcc.iteritems(): 981 #for occ in lOcc: 982 # logging.debug(texte[occ:occ+longueur]) 983 logging.debug(texte[lOcc[0]:lOcc[0]+longueur]) 984 #print '====' 985 if eliminRecouv: 986 a = time.time() 987 recouv = recouvrement.Recouvrement4(self.sequences[0]+self.sequences[1],seq,len(self.sequences[0]),min_size) 988 #recouv = recouvrement.RecouvrementNaif(self.sequences[0]+self.sequences[1],seq,len(self.sequences[0]),min_size) 989 #trace('blocs = recouv.eliminer_recouvrements()',locals()) 990 blocs = recouv.eliminer_recouvrements() 991 b = time.time() 992 else: 993 blocs = {} 994 #blocs.NOSMEM_nb_bloc = 0 995 texte = self.sequences[0]+self.sequences[1] 996 for longueur, dicoOcc in seq.iteritems(): 997 for cle_hash, lOcc in dicoOcc.iteritems(): 998 for occ in lOcc: 999 cle = hash(texte[occ:occ+longueur]) 1000 try: 1001 blocs[(cle,longueur)].append(occ) 1002 except KeyError: 1003 blocs[(cle,longueur)] = [occ] 1004 #blocs.NOSMEM_nb_bloc += 1 1005 #for cle,lOcc in blocs.iteritems(): 1006 # lOcc.reverse() # on reverse car dans la suite dans l'algo c'est nécessaire 1007 1008 #for (cle,longueur),liste_pos in blocs.iteritems(): 1009 # for occ in liste_pos: 1010 # print texte[occ:occ+longueur] 1011 #print '=========================' 1012 #logging.debug('Recouv 4 FIN eliminrecouv') 1013 if eliminRecouv and recouv.SMEM_nb_bloc >= 100: 1014 logFile = os.path.join(os.getcwd(), 'resultat_complex_recouv.csv') 1015 if not os.path.exists(logFile): 1016 #le fichier csv n'existe pas, on écrit sur la premiere ligne le nom des params 1017 csv = open(logFile, 'w') #mode creation 1018 csv.write("SMEM_nb_bloc;SMEM_nb_occ;SMEM_tot_size;NOSMEM_nb_bloc;NOSMEM_nb_occ;NOSMEM_tot_size;temps;A;B\n") 1019 else: 1020 csv = open(logFile, 'a') #mode append 1021 csv.write(str(recouv.SMEM_nb_bloc)+';'+str(recouv.SMEM_nb_occ)+';'+str(recouv.SMEM_tot_size)+';'+ 1022 str(recouv.NOSMEM_nb_bloc)+';'+str(recouv.NOSMEM_nb_occ)+';'+str(recouv.NOSMEM_tot_size)+';'+ 1023 str(b-a)+';'+str(len(self.sequences[0]))+';'+str(len(self.sequences[1]))+'\n') 1024 csv.close() 1025 1026 elif sr == 3: 1027 recouv = recouvrement.Recouvrement3(self.sequences[0]+self.sequences[1],seq,len(self.sequences[0])) 1028 blocs = recouv.eliminer_recouvrements() 1029 else: # 1 1030 #logging.debug('Recouv 1 eliminrecouv') 1031 recouv = recouvrement.Recouvrement(self.sequences[0]+self.sequences[1],seq,len(self.sequences[0]),min_size) 1032 blocs1 = recouv.eliminer_recouvrements() 1033 #blocs1 = {} 1034 #trace('blocs1 = recouv.eliminer_recouvrements()',locals()) 1035 #logging.info('Recouv 1 FIN eliminrecouv') 1036 blocs = {} 1037 for texte,liste in blocs1.iteritems(): 1038 cle = hash(texte) 1039 longueur = len(texte) 1040 # ATTENTION, on doit inverser ici SSI on inverse présédemment 1041 # dans get_MEM_index_chaine2(sr=1) lors de l'ajout dans dic_repeat2 1042 liste.reverse() 1043 blocs[(cle,longueur)] = liste 1044 del blocs1 1045 #print 'blocs:'+str(blocs) 1046 logging.log(5,'eliminer_recouvrements done') 1047 dic_chaine2 = {} 1048 longueur_s1 = len(self.sequences[0]) 1049 #print len(blocs),longueur_s1,len(self.sequences[0])+len(self.sequences[1]),min_size 1050 # ATTENTION le 1er espace (début de liste) est l'espace habituel 1051 # le second (fin de liste) est ALT+0160 qui est un espace insécable inséré, en particulier, 1052 # par Word avant les ponctuations faibles : ; ! ? et guillemets 1053 sep = """. !\r,\n:\t;-?"'`’() """ #liste des séparateurs 1054 # on ne garde que les chaine de taille > min et répétées 1055 for (cle,longueur),liste_pos in blocs.iteritems(): 1056 if (longueur >= min_size and len(liste_pos) >= 2): 1057 # on inverse car les pos ont été ajoutées dans l'ordre décroissant dans get_MEM 1058 liste_pos.reverse() 1059 if (liste_pos[0] < longueur_s1 <= liste_pos[-1]): 1060 if just_keep_words: 1061 # basé sur seulement 2 homologies dans la liste, quid du comportement avec + d'homologies ? 1062 # car pour couper une homologies, on regarde les caractères précédents et suivants 1063 # dans les 2 chaines, la premiere de la liste (texte 1) et la dernière (texte 2) 1064 #logging.debug( '1:' +self.sequences[0][liste_pos[0]:liste_pos[0]+longueur]+'/'+str(liste_pos)) 1065 # les car de début et fin des chaines doivent êrre égaux 1066 assert self.sequences[0][liste_pos[0]] == self.sequences[1][liste_pos[-1]-longueur_s1] 1067 assert self.sequences[0][liste_pos[0]+longueur-1] == self.sequences[1][liste_pos[-1]+longueur-1-longueur_s1], chaine+' / '+self.sequences[0][liste_pos[0]:liste_pos[0]+longueur]+' / '+self.sequences[1][liste_pos[-1]-longueur_s1:liste_pos[-1]+longueur-longueur_s1]+' / *'+self.sequences[0][liste_pos[0]+longueur]+'* / *'+self.sequences[1][liste_pos[-1]+longueur-longueur_s1]+'**' 1068 # recherche du premier séparateur dans la chaine 1069 i = liste_pos[0] ; i2 = liste_pos[-1] 1070 if (i == 0 or self.sequences[0][i-1] in sep) and (i2-longueur_s1 == 0 or self.sequences[1][i2-1-longueur_s1] in sep): 1071 # soit début de séquence et dans ce cas le car précédent est forcément un sep car on est en mode mot 1072 # soit caractère précédent est un séparateur 1073 decalage_avant = 0 1074 else: # sinon on cherche le 1er sep dans la chaine courante 1075 while (i < liste_pos[0]+longueur and self.sequences[0][i] not in sep and 1076 self.sequences[1][i2-longueur_s1] not in sep): i += 1 ; i2 += 1 1077 decalage_avant = i - liste_pos[0] 1078 1079 assert i >= liste_pos[0] 1080 # recherche du séparateur le + à droite dans la chaine 1081 j = liste_pos[0]+longueur-1 ; j2 = liste_pos[-1]+longueur-1 1082 if ((j == len(self.sequences[0])-1 or self.sequences[0][j+1] in sep) and 1083 (j2-longueur_s1 == len(self.sequences[1])-1 or self.sequences[1][j2+1-longueur_s1] in sep)): 1084 pass # idem que i: car de fin de séquence ou car precedent sep 1085 else: 1086 while (j >= liste_pos[0] and self.sequences[0][j] not in sep and 1087 j2 >= liste_pos[-1] and self.sequences[1][j2-longueur_s1] not in sep): j -= 1 ; j2 -= 1 1088 assert j <= liste_pos[0]+longueur-1 1089 #logging.debug('decalage_avanti='+str(decalage_avant)+', longueur2='+str(j+1-i)) 1090 if i < j: # si la nouvelle chaine n'est pas vide 1091 longueur2 = j+1 -i ; assert longueur2 <= longueur 1092 if (longueur2 >= min_size): 1093 cle2 = hash(self.sequences[0][i:j+1]) 1094 tt = [x+decalage_avant for x in liste_pos] 1095 dic_chaine2[(cle2,longueur2)] = tt 1096 #logging.debug('2:'+self.sequences[0][tt[0]:tt[0]+longueur2]+'/'+str(tt)) 1097 else: # cas standard 1098 dic_chaine2[(cle,longueur)] = liste_pos 1099 #print 'dic_chaine2:'+str(dic_chaine2) 1100 #print len(dic_chaine2)#,dic_chaine2 1101 logging.log(10,'len(blocs)='+str(len(blocs))+' / len(dic_chaine2)='+str(len(dic_chaine2)) 1102 +' / taille chaines dic_chaine2='+str(sum([longueur*len(li) for (c,longueur),li in dic_chaine2.iteritems()]))) 1103 texte = self.sequences[0] + self.sequences[1] 1104 for (cle,longueur),lOcc in dic_chaine2.iteritems(): 1105 logging.debug(texte[lOcc[0]:lOcc[0]+longueur]) 1106 return dic_chaine2

1107

1108 - def get_MEM_index_chaine(self, min_size=1):

1109 """ Converti le dic de sortie pour le module MediteAppli 1110 1111 index par la chaine directement qui donne les positions de cette chaine""" 1112 dic_MEM = self.get_MEM(min_size) 1113 texte = self.sequences[0] + '$'+ self.sequences[1] 1114 longueur_s1 = len(self.sequences[0]) 1115 dic_chaine = {} 1116 for longueur,liste_pos in dic_MEM.iteritems(): 1117 if len(liste_pos) < 2: continue 1118 for pos in liste_pos: 1119 chaine = texte[pos:pos+longueur] 1120 # à cause du $ dans la 2e chaine, tous les indices sont décalés de 1, donc on les MAJ correctement 1121 if pos > longueur_s1: pos -= 1 1122 try: 1123 dic_chaine[chaine].append(pos) 1124 except KeyError: 1125 dic_chaine[chaine] = [pos] 1126 #print dic_chaine 1127 dic_chaine2 = {} 1128 for chaine,liste_pos in dic_chaine.iteritems(): 1129 if (len(chaine) >= min_size and len(liste_pos) >= 2): 1130 # on inverse car les pos ont été ajoutées dans l'ordre décroissant dans get_MEM 1131 liste_pos.reverse() 1132 if (liste_pos[0] < longueur_s1 <= liste_pos[-1]): 1133 dic_chaine2[chaine] = liste_pos 1134 return dic_chaine2

1135

1136 -def trace(commande,dic_locals):

1137 #import hotshot,hotshot.stats,sys 1138 import profile,pstats,sys 1139 profile.runctx(commande,globals(), dic_locals,'c:\workspace\medite\statFile') 1140 s = pstats.Stats('c:\workspace\medite\statFile') 1141 s.sort_stats('time') 1142 s.print_stats()

1143 #sys.stderr.flush() 1144 #sys.stdout.flush() 1145 #prof = hotshot.Profile("c:\workspace\medite\statFile") 1146 #benchtime = prof.runctx(commande,globals(), dic_locals) 1147 #prof.close() 1148 #stats = hotshot.stats.load("c:\workspace\medite\statFile") 1149 #stats.strip_dirs() 1150 #stats.sort_stats('time', 'calls') 1151 #stats.print_stats(50) 1152

1153 -def simple_test():

1154 print 'SIMPLE TEST' 1155 st = SuffixTree('mississippi') 1156 for l in st.leaves: 1157 print 'leaf:', '"'+l.path_label+'"', ':', '"'+l.edge_label+'" path_position '+ \ 1158 str(l.path_position)+' / edge_label_start '+str(l.edge_label_start)+ \ 1159 ' / edge_label_end '+str(l.edge_label_end) 1160 1161 for n in st.inner_nodes: 1162 print 'inner:', '"'+n.edge_label+'" path_position '+ \ 1163 str(n.path_position)+' / edge_label_start '+str(n.edge_label_start)+ \ 1164 ' / edge_label_end '+str(n.edge_label_end) 1165 print 'done.\n\n'

1166

1167 -def generalised_test():

1168 print 'GENERALISED TEST' 1169 sequences = ['xabxa','babxba'] 1170 st = GeneralisedSuffixTree(sequences) 1171 1172 for shared in st.shared_substrings(): 1173 print '-'*70 1174 for seq,start,stop in shared: 1175 print seq, '['+str(start)+':'+str(stop)+']', 1176 print sequences[seq][start:stop], 1177 print sequences[seq][start:] 1178 print '='*70 1179 1180 for shared in st.shared_substrings(2): 1181 print '-'*70 1182 for seq,start,stop in shared: 1183 print seq, '['+str(start)+':'+str(stop)+']', 1184 print sequences[seq][start:stop], 1185 print sequences[seq][start:] 1186 print '='*70 1187 1188 print 'done.\n\n'

1189

1190 -def test3():

1191 print 'TEST3' 1192 #sequences = ['le chat est grand','1 chat est vert'] 1193 #['le chat est grand','un grand chat est vert'] 1194 sequences = ["Le 7 décembre, à onze heures du matin, et à deux heures et demie du soir, en pinçant la patte dont les vaisseaux sont liés, on y observe des mouvements réflexes. On laisse la grenouille dans les mêmes conditions que la veille.", 1195 "Le 7 décembre, à 11 h. du matin et à 2 h. 1/2, en pinçant la patte antérieure et la patte liée, on a des mouvements réflexes dans cette dernière."] 1196 st = GeneralisedSuffixTree(sequences) 1197 # st.shared_substrings2() 1198 texte=sequences[0]+sequences[1] 1199 for (start,stop) in st.shared_substrings2(): 1200 print '['+str(start)+':'+str(stop)+']', 1201 print "*"+texte[start:stop]+ "* \n ", 1202 print '='*70

1203

1204 -def test():

1205 simple_test()

1206 #generalised_test() 1207 #test3() 1208 1209 1210 if __name__ == '__main__': 1211 test() 1212

Source Code for Module medite.MediteAppli.suffix_tree