1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 import sys,string, logging, bisect, gc
20 from math import *
21 import psyco
22 import suffix_tree, recouvrement, utile
23 import numpy.oldnumeric as Numeric
24 import numpy.numarray as numarray
25 import numpy
26 import aligne
27
28
29
30
32 """ Interface, regroupe les fonctions communes """
34 """ Constructeur
35
36 #pre: isinstance(texte,str)
37 # len(texte)>0
38 """
39
40 self.tool = utile.Utile()
41
43 """ Fonction qui aligne 2 liste d'intervalles du texte
44
45 pre: isinstance(L1,list)
46 isinstance(L2,list)
47 len(L1)>0
48 len(L2)>0
49 forall([L1[i][0] >= L1[i-1][1] for i in range(1, len(L1))])
50 forall([L2[i][0] >= L2[i-1][1] for i in range(1, len(L2))])
51 """
52 pass
53
55 """ Détermine s'il y a une occurrence inférieure et une occurrence supérieure éà la frontière
56
57 pre: 0<=l_texte1<=self.l_texte1
58 """
59 return (not liste_occ == [] and liste_occ[0] < l_texte1 and liste_occ[-1] >= l_texte1)
60
62 sys.stderr.write(str+"\n")
63
65 if __debug__:
66 for i in xrange(1,len(L)):
67 assert L[i-1][1]<=L[i][0]
69 if __debug__:
70 for i in xrange(1,len(L)):
71 assert d<=L[i-1][1]<=L[i][0]<=f,"d "+str(d)+" / L["+str(i-1)+"][1] "+str(L[i-1][1])+" / L["+str(i)+"][0] "+str(L[i][0])+" / f "+str(f)+ \
72 "\nL "+str(L)+"\n"+self.lint2str(L,texte)
74 """ Transforme une liste d'intervalle sur self.texte en leur transcription textuelle
75
76 pre: isinstance(L,list)
77 post: isinstance(__return__,str) """
78 res = ""
79 if len(L) == 0: return ''
80 for x in L[:-1]:
81 res += texte[x[0]:x[1]] + " / "
82 res += texte[L[-1][0]:L[-1][1]]
83 return res
84
86 """ Aligneur naif, 1ere version utilisée dans MEDITE """
88
89 L = []
90 while S1 <> [] or S2 <> []:
91 if S1 == []:
92 L = self.tool.addition_intervalle(L, S2[0])
93 S2 = S2[1:]
94 elif S2 == []:
95 L = self.tool.addition_intervalle(L, S1[0])
96 S1 = S1[1:]
97 elif self.texte[S1[0][0]:S1[0][1]] == self.texte[S2[0][0]:S2[0][1]]:
98 S1 = S1[1:]
99 S2 = S2[1:]
100 elif self.absent(S1[0], S2):
101 L = self.tool.addition_intervalle(L, S1[0])
102 S1 = S1[1:]
103 elif self.absent(S2[0], S1):
104 L = self.tool.addition_intervalle(L, S2[0])
105 S2 = S2[1:]
106 else:
107 ecart1 = self.rang(S1[0], S2)
108 ecart2 = self.rang(S2[0], S1)
109 if (ecart1 > ecart2 or
110 (ecart1 == ecart2 and
111 (S1[0][1] - S1[0][0]) < (S2[0][1] - S2[0][0]))):
112 L = self.tool.addition_intervalle(L, S1[0])
113 S1 = S1[1:]
114 else:
115 L = self.tool.addition_intervalle(L, S2[0])
116 S2 = S2[1:]
117 return L
118
120
121
122 while S <> []:
123 if self.texte[I[0]:I[1]] == self.texte[S[0][0]:S[0][1]]:
124 return 0
125 else:
126 S = S[1:]
127 return 1
128
129 - def rang(self, I, S):
132
134 ncar = 0
135 while S <> [] and self.texte[I[0]:I[1]] <> self.texte[S[0][0]:S[0][1]]:
136 ncar = ncar + S[0][1] - S[0][0]
137 S = S[1:]
138 return ncar
139
140
141
143 """ Alignement dichotomique """
145 L=L2[:]
146 i=0
147
148 self.dicoPos = {}
149 while L <> []:
150 B2=L[0]
151 chaine = self.texte[B2[0]:B2[1]]
152 l=B2[1]-B2[0]
153 if not self.dicoPos.has_key(chaine):
154 self.dicoPos[chaine] = [(i,l)]
155 else:self.dicoPos[chaine].append((i,l))
156 i+=1
157 L=L[1:]
158 self.L1=L1
159 self.L2=L2
160 LRes1,LRes2 = self.trouver_deplacement(0,0,len(L1),0,len(L2))
161 i=c=0.0
162 for b in self.dicoPos:
163 i+=1
164 for p,l in self.dicoPos[b]:
165 c+=1
166
167 del self.L1, self.L2, self.dicoPos
168
169 dicoBest1={}
170 dicoBest2={}
171 LRes=[]
172 LResBC=[]
173 for x in LRes1: dicoBest1[x[0]]=1
174 for x in LRes2: dicoBest2[x[0]]=1
175 for i in L1:
176 if dicoBest1.has_key(i[0]):LRes.append(i)
177 else:LResBC.append(i)
178 for i in L2:
179 if dicoBest2.has_key(i[0]):LRes.append(i)
180 else:LResBC.append(i)
181 return LRes,LResBC
182
184
185
186
187 compt+=1
188 ecartMaxInit = max(fin1-deb2,fin2-deb1)
189
190
191 scoreMax = 0
192 LTemp1 = self.L1[deb1:fin1]
193 PosB1 = deb1
194 PosCandidatB1 = PosCandidatB2 = 0
195 candidatB = None
196 for Bi in LTemp1:
197 for (PosB2,longB2) in self.dicoPos[self.texte[Bi[0]:Bi[1]]]:
198 ecart = abs(PosB2 - PosB1)
199
200
201
202
203 score = 0*(ecartMaxInit-ecart) + 1*longB2
204
205
206 if (deb2<=PosB2<=fin2 and score>scoreMax):
207
208 candidatB=Bi
209
210
211 scoreMax=score
212 PosCandidatB1=PosB1
213 PosCandidatB2=PosB2
214 PosB1+=1
215 if candidatB != None:
216
217 La1,La2 = self.trouver_deplacement(compt,deb1,PosCandidatB1,deb2,PosCandidatB2)
218 Lb1,Lb2 = self.trouver_deplacement(compt,PosCandidatB1+1,fin1,PosCandidatB2+1,fin2)
219
220
221
222
223
224
225
226
227 return La1+Lb1,La2+Lb2
228 else:
229 return self.L1[deb1:fin1],self.L2[deb2:fin2]
230
231
233 """ Alignement avec ordre partiel et A* """
235 """ Calcule des dictionnaires des poistions et des couts fixes
236 pre: isinstance(L,list)
237 len(L)>0
238 post: forall([self.texte[x[0]:x[1]] in __return__[0].keys() for x in L])
239 """
240 dicoPos = {}
241 coutFixe = {}
242 cumul=0
243 for i in xrange(len(L)):
244 chaine = self.texte[L[i][0]:L[i][1]]
245 if not dicoPos.has_key(chaine): dicoPos[chaine] = []
246 dicoPos[chaine].append(i)
247 coutFixe[i]=cumul
248 cumul+=L[i][1]-L[i][0]
249 return dicoPos,coutFixe
250
252
253 dicoPos = {}
254 coutFixe = {}
255 dicoPos[1], coutFixe[1] = self.calcPosCoutFixe(L1)
256 dicoPos[2], coutFixe[2] = self.calcPosCoutFixe(L2)
257
258 diffSym = self.preTraitDiffSym(L1,L2,dicoPos)
259 best, closedSet = self.deplacements_pond_Astar(L1,L2,diffSym,dicoPos,coutFixe)
260 dicoBest1={}
261 dicoBest2={}
262 while best != (-1,-1):
263 dicoBest1[best[0]]=1
264 dicoBest2[best[1]]=1
265 best=closedSet[best][1]
266
267 LResDep=[]
268 LResBC=[]
269 for i in xrange(len(L1)):
270 if not dicoBest1.has_key(i):LResDep.append(L1[i])
271 else:LResBC.append(L1[i])
272 for i in xrange(len(L2)):
273 if not dicoBest2.has_key(i):LResDep.append(L2[i])
274 else:LResBC.append(L2[i])
275
276 del dicoPos, coutFixe, dicoBest1, dicoBest2
277
278 return LResDep,LResBC
279
280
282 """ implémentation de A* pour rechercher le meilleur chemin dans l'arbre des appariement entre blocs
283 Renvoie le noeud du dernier appariement """
284 openSet={}
285 closedSet={}
286 L1=L1Static
287 L2=L2Static
288 best=(-1,-1)
289 while True:
290 for posB1 in xrange(len(L1)):
291 for posB2 in dicoPos[2][self.texte[L1[posB1][0]:L1[posB1][1]]]:
292 if posB2<best[1]+1: continue
293 node=(posB1+best[0]+1,posB2)
294 cout=self.cout(best[0]+1,best[1]+1,posB1+best[0]+1,posB2,diffSym,coutFixe)
295 if closedSet.has_key(node):
296 if closedSet[node][0]>cout:
297 openSet[node]=(cout,best)
298 del closedSet[node]
299 elif openSet.has_key(node):
300 if openSet[node][0]>cout:
301 openSet[node]=(cout,best)
302 else: openSet[node]=(cout,best)
303 best=None
304 bestValue=None
305 for node in openSet:
306 if (best==None or openSet[node][0]<bestValue[0]):
307 best=node
308 bestValue=openSet[node]
309 L1=L1Static[best[0]+1:]
310 L2=L2Static[best[1]+1:]
311 del openSet[best]
312 closedSet[best]=bestValue
313 if (L1==[] or L2==[] or len(L1)+len(L2)==diffSym[(best[0],best[1])][1]):
314 return best,closedSet
315
316 - def cout(self,d1,d2,f1,f2,diffSym,coutFixe):
317 """ calcul du cout d'un noeud """
318 cF1=coutFixe[1][f1]-coutFixe[1][d1]
319 cF2=coutFixe[2][f2]-coutFixe[2][d2]
320 coutHeuristique=diffSym[(f1,f2)][0]
321 return cF1+cF2+coutHeuristique
322
324 """ Précalcul de toutes les différences symétriques possibles.
325 Ainsi chacune est calculée une seule fois et non un nombre combinatoire de fois si on faisait le calcul à la demande
326 pre: forall([self.texte[x[0]:x[1]] in dicoPos[1].keys() for x in L1])
327 forall([self.texte[x[0]:x[1]] in dicoPos[2].keys() for x in L1])
328 forall([self.texte[x[0]:x[1]] in dicoPos[1].keys() for x in L2])
329 forall([self.texte[x[0]:x[1]] in dicoPos[2].keys() for x in L2])
330 len(self.tool.difference(L1,L2))>0
331 """
332 diffSym={}
333
334 if niveau>0:
335
336
337
338 pass
339 posB1=0
340 for B1 in L1:
341 for posB2 in dicoPos[2][self.texte[B1[0]:B1[1]]]:
342 L=self.difference_symetrique(L1[posB1+1:],L2[posB2+1:],posB1,posB2,dicoPos)
343 cout=0
344 for B in L:
345 cout += B[1]-B[0]
346 diffSym[(posB1,posB2)]=(cout,len(L))
347 posB1+=1
348 return diffSym
349
351 """ différence symétrique entre 2 listes de blocs
352 pre: forall([self.texte[x[0]:x[1]] in dicoPos[1].iterkeys() for x in L1])
353 forall([self.texte[x[0]:x[1]] in dicoPos[2].iterkeys() for x in L1])
354 forall([self.texte[x[0]:x[1]] in dicoPos[1].iterkeys() for x in L2])
355 forall([self.texte[x[0]:x[1]] in dicoPos[2].iterkeys() for x in L2])
356 len(self.tool.difference(L1,L2))>0
357 forall(L1, lambda x:self.texte[x[0]:x[1]] in dicoPos[1].iterkeys())
358 forall(L1, lambda x:self.texte[x[0]:x[1]] in dicoPos[2].iterkeys())
359 forall(L2, lambda x:self.texte[x[0]:x[1]] in dicoPos[1].iterkeys())
360 forall(L2, lambda x:self.texte[x[0]:x[1]] in dicoPos[2].iterkeys())
361 """
362 if (len(L1)==0 and len(L2)==0): return []
363 elif (len(L1)==0): return L2
364 elif (len(L2)==0): return L1
365 LRes=[]
366 for B1 in L1:
367 found=False
368 for posB2 in dicoPos[2][self.texte[B1[0]:B1[1]]]:
369 if posB2>=deb2+1: found=True
370 if not found: LRes.append(B1)
371 for B2 in L2:
372 found=False
373 for posB1 in dicoPos[1][self.texte[B2[0]:B2[1]]]:
374 if posB1>=deb1+1: found=True
375 if not found: LRes.append(B2)
376
377 return LRes
378
381 res = 0
382 for cle,val in self:
383 res += val
384 return res
385
387 """Multiset implémenté par un dictionnaire"""
389 assert isinstance(liste, list)
390 self.valeur = 0
391 self.length = 0
392 for (val,deb,fin) in liste:
393 try:
394 nb,longueur = self[val]
395 self[val] = nb+1,longueur
396 except KeyError:
397 self[val] = (1, fin-deb)
398 self.valeur += fin - deb
399 self.length += 1
401 """Différence entre l'objet courant et mset2, renvoie un nouveau mset"""
402 assert isinstance(mset2, Mset)
403 res = Mset()
404 for cle,(nb, longueur) in self.iteritems():
405 if cle not in mset2:
406 res[cle] = (nb, longueur)
407 res.valeur += nb * longueur
408 res.length += nb
409 else:
410 (nb2, longueur2) = mset2[cle]
411 if nb > nb2:
412 res[cle] = (nb-nb2, longueur)
413 res.valeur += (nb-nb2) * longueur
414 res.length += nb-nb2
415 return res
417 assert isinstance(mset2, list)
418 res = Mset()
419 for cle,(nb, longueur) in self.iteritems():
420 if cle not in mset2:
421 res[cle] = (nb, longueur)
422 res.valeur += nb * longueur
423 res.length += nb
424 else:
425 (nb2, longueur2) = mset2[cle]
426 if nb > nb2:
427 res[cle] = (nb-nb2, longueur)
428 res.valeur += (nb-nb2) * longueur
429 res.length += nb-nb2
430 return res
432 """Intersection entre l'objet courant et mset2, renvoie un nouveau mset"""
433 assert isinstance(mset2, Mset)
434 res = Mset()
435 if self.length <= mset2.length: grand = mset2 ; petit = self
436 else: grand = self ; petit = mset2
437 for cle,(nb, longueur) in petit.iteritems():
438 try:
439 (nb2, longueur2) = grand[cle]
440 if nb <= nb2: nbToAdd = nb
441 else: nbToAdd = nb2
442 res[cle] = (nbToAdd, longueur)
443 res.valeur += nbToAdd * longueur
444 res.length += nbToAdd
445 except KeyError:
446 pass
447 return res
449 """Union entre l'objet courant et mset2, ATTENTION modifie l'objet courant"""
450 assert isinstance(mset2, Mset)
451 for cle,(nb, longueur) in mset2.iteritems():
452 try:
453 nb2, longueur2 = self[cle]
454 self[cle] = nb+nb2, longueur
455 except KeyError:
456 self[cle] = nb, longueur
457 self.valeur += nb * longueur
458 self.length += nb
459
461 """ texte passé en paramétre des fonctions """
463 """ Calcule des dictionnaires des positions et des couts fixes
464
465 pre: isinstance(L,list)
466 len(L)>0
467 #post: forall([texte[x[0]:x[1]] in __return__[0].keys() for x in L])
468 """
469 dicoPos = {}
470 coutFixe = {}
471 cumul=0
472 for i in xrange(len(L)):
473
474
475 chaine, longueur = L[i]
476 try:
477 dicoPos[chaine].append(i)
478 except KeyError:
479 dicoPos[chaine] = [i]
480 coutFixe[i]=cumul
481 cumul+=longueur
482 return dicoPos,coutFixe
483
485 L1 = [] ; L2 = [] ; Lcf1 = [] ; Lcf2 = [] ; LH1 = [] ; LH2 = []
486 for bloc in Lv1:
487 cle = hash(self.texte[bloc[0]:bloc[1]])
488 L1.append(cle)
489 Lcf1.append((cle,bloc[1]-bloc[0]))
490 LH1.append((cle, bloc[0], bloc[1]))
491 print LH1
492 logging.debug('init L1 done')
493 for bloc in Lv2:
494 cle = hash(self.texte[bloc[0]:bloc[1]])
495 L2.append(cle)
496 Lcf2.append((cle,bloc[1]-bloc[0]))
497 LH2.append((cle, bloc[0], bloc[1]))
498 logging.debug('init L2 done')
499
500 dicoPos = {}
501 coutFixe = {}
502 dicoPos[1], coutFixe[1] = self.calcPosCoutFixe(Lcf1,self.texte)
503 dicoPos[2], coutFixe[2] = self.calcPosCoutFixe(Lcf2,self.texte)
504 for cle in dicoPos[1]: assert cle in dicoPos[2]
505 for cle in dicoPos[2]: assert cle in dicoPos[1]
506
507 diffSym = self.preTraitDiffSym(LH1,LH2,self.texte,dicoPos)
508 best, closedSet = self.deplacements_pond_Astar(L1,L2,self.texte,diffSym,dicoPos,coutFixe)
509 dicoBest1={}
510 dicoBest2={}
511 while best != (-1,-1):
512 dicoBest1[best[0]]=1
513 dicoBest2[best[1]]=1
514 best=closedSet[best][1]
515
516 LResDep=[]
517 LResBC=[]
518 for i in xrange(len(L1)):
519 if not dicoBest1.has_key(i):LResDep.append(L1[i])
520 else:LResBC.append(L1[i])
521 for i in xrange(len(L2)):
522 if not dicoBest2.has_key(i):LResDep.append(L2[i])
523 else:LResBC.append(L2[i])
524
525 del dicoPos, coutFixe, dicoBest1, dicoBest2
526
527 return LResDep,LResBC
528
529
531 """ implémentation de A* pour rechercher le meilleur chemin dans l'arbre des appariement entre blocs
532 Renvoie le noeud du dernier appariement """
533 openSet={}
534 closedSet={}
535 L1=L1Static
536 L2=L2Static
537 logging.debug('len(L1)='+str(len(L1))+' / len(L2)='+str(len(L2)))
538
539
540 best=(-1,-1)
541 while True:
542 cptEval = 0
543 for posB1 in xrange(len(L1)):
544
545 for posB2 in dicoPos[2][L1[posB1]]:
546 if posB2<best[1]+1: continue
547 cptEval += 1
548 node=(posB1+best[0]+1,posB2)
549 cout=self.cout(best[0]+1,best[1]+1,posB1+best[0]+1,posB2,diffSym,coutFixe)
550 if closedSet.has_key(node):
551 if closedSet[node][0]>cout:
552 openSet[node]=(cout,best)
553 del closedSet[node]
554 elif openSet.has_key(node):
555 if openSet[node][0]>cout:
556 openSet[node]=(cout,best)
557 else: openSet[node]=(cout,best)
558 best=None
559 bestValue=None
560 for node in openSet:
561 if (best==None or openSet[node][0]<bestValue[0]):
562 best=node
563 bestValue=openSet[node]
564 L1=L1Static[best[0]+1:]
565 L2=L2Static[best[1]+1:]
566 del openSet[best]
567 closedSet[best]=bestValue
568 if (L1==[] or L2==[] or len(L1)+len(L2)==diffSym[(best[0],best[1])][1]):
569 return best,closedSet
570
571
572
573
574
575
576
577
578 - def deplacements_pond_Astar(self, L1Static, L2Static, diffSym, dicoPos,
579 coutFixe, dicoMinMax1=None, dicoMinMax2=None,lSuppression=None, lInsertion=None,
580 arrayRepetT1=None, arrayRepetT2=None, MO=False):
581 """ implémentation de A* pour rechercher le meilleur chemin dans l'arbre des appariement entre blocs
582 Renvoie le noeud du dernier appariement """
583 openSet = {}
584 closedSet = {}
585 goalNodes = {}
586 L1 = L1Static
587 L2 = L2Static
588 logging.debug('len(L1)='+str(len(L1))+' / len(L2)='+str(len(L2)))
589 logging.debug('openSet = '+str(openSet))
590 logging.debug('closedSet = '+str(closedSet))
591 best = (-1,-1)
592 if MO: bestValue = (0.0+sys.maxint, best, (0,0,0,0,0,0,0,0,0,0,0,0))
593 else: bestValue = (0.0+sys.maxint, best, 0)
594
595 while True:
596 cptEval = 0
597 for posB1 in xrange(len(L1)):
598
599 for posB2 in dicoPos[2][L1[posB1]]:
600 if posB2 < best[1]+1: continue
601 cptEval += 1
602 node = (posB1+best[0]+1,posB2)
603
604 if MO:
605 nodeValue = self.cout(best, node, diffSym, coutFixe,
606 bestValue, dicoMinMax1, dicoMinMax2, lSuppression, lInsertion,
607 arrayRepetT1, arrayRepetT2)
608
609 cout = nodeValue[0]
610 else:
611 cout, cF = self.cout(best, node, diffSym, coutFixe, bestValue)
612 nodeValue = (cout,best,cF)
613 if closedSet.has_key(node):
614 if closedSet[node][0] > cout:
615 openSet[node] = nodeValue
616 del closedSet[node]
617 elif openSet.has_key(node):
618 if openSet[node][0] > cout:
619 openSet[node] = nodeValue
620 else: openSet[node] = nodeValue
621
622
623
624
625
626
627
628
629
630
631
632 best = None
633 bestValue = None
634 if len(openSet) > 0:
635 for node in openSet:
636 if (best == None or openSet[node][0] < bestValue[0]):
637 best = node
638 bestValue = openSet[node]
639 logging.debug( best)
640 L1 = L1Static[best[0]+1:]
641 L2 = L2Static[best[1]+1:]
642 del openSet[best]
643 closedSet[best] = bestValue
644
645
646
647 if (L1==[] or L2==[] or len(L1)+len(L2)==diffSym[(best[0],best[1])][1] or
648 len(openSet) == 0):
649 goalNodes[best] = bestValue
650 if len(openSet) == 0 or len(goalNodes) == 5:
651 best = None ; bestValue = None
652 for node in goalNodes:
653 if (best == None or goalNodes[node][0] < bestValue[0]):
654 best = node
655 bestValue = goalNodes[node]
656 return best,closedSet
657
658
659
660
661
662
663
664
665
666
667 - def cout(self,(d1,d2),(f1,f2),diffSym,coutFixe, bestValue):
668
669 """ calcul du cout d'un noeud """
670 d1 += 1 ; d2 += 1
671 cF1=coutFixe[1][f1]-coutFixe[1][d1]
672 cF2=coutFixe[2][f2]-coutFixe[2][d2]
673 coutHeuristique=diffSym[(f1,f2)][0]
674
675 return cF1+cF2+coutHeuristique + bestValue[2], cF1+cF2
676
677
679 try:
680 return diffSym[(f1,f2)]
681 except KeyError:
682 pass
683
685 diffSym={}
686 LH1=L1 ; LH2=L2
687 a='''LH1 = [] ; LH2 = []
688 for bloc in L1:
689 LH1.append((bloc[0], bloc[1]-bloc[1]))
690 logging.debug('init LH1 done')
691 for bloc in L2:
692 LH2.append((bloc[0], bloc[1]-bloc[1]))
693 logging.debug('init LH2 done')'''
694 logging.log(5,'len(LH2)= '+str(len(LH2))+' / len(LH1)= '+str(len(LH1)))
695
696 len_L1 = len(LH1) ; len_L2 = len(LH2)
697
698
699
700
701
702 i=0
703 for posB1 in xrange(len_L1-1,-1,-1):
704 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,'itération LH1: '+str(i))
705 LL1 = Mset(LH1[posB1+1:])
706 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done MSet1')
707 liste_pos2 = dicoPos[2][LH1[posB1][0]]
708 j = len(liste_pos2)-1
709 while j >= 0:
710 if j == len(liste_pos2)-1:
711 LL2 = Mset(LH2[liste_pos2[j]+1:]) ; ds = nb = 0
712 LL1res = LL1.difference(LL2)
713 LL2res = LL2.difference(LL1)
714 else:
715 posB2 = liste_pos2[j]
716 next_posB2 = liste_pos2[j+1]
717
718
719
720
721
722
723
724
725 new_LL2 = Mset(LH2[posB2+1:next_posB2])
726 t2 = new_LL2.difference(LL1res)
727 LL2res.union(t2)
728 LL1res = LL1res.difference(new_LL2)
729
730
731 ds2 = LL1res.valeur + LL2res.valeur
732 nb2 = LL1res.length + LL2.length
733
734
735 diffSym[(posB1,liste_pos2[j])] = (ds2,nb2)
736 j -= 1
737 diffSym[(posB1,liste_pos2[0])] = (ds2,nb2)
738 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done itération interne')
739 i+=1
740 return diffSym
741
743 logging.log(5,'begin preTraitDiffSymVect')
744 diffSym={}
745 LH1=L1 ; LH2=L2
746 logging.log(5,'len(LH2)= '+str(len(LH2))+' / len(LH1)= '+str(len(LH1)))
747 dic_alphabet = {}
748 for cle,deb,fin in L1:
749 dic_alphabet[cle] = fin-deb
750 for cle,deb,fin in L2:
751 dic_alphabet[cle] = fin-deb
752 taille_alphabet = len(dic_alphabet)
753 temp = []
754 for cle in dic_alphabet:
755 bisect.insort_right(temp, cle)
756 array_pos_alphabet = numarray.array(temp)
757 assert len(array_pos_alphabet) == taille_alphabet
758 array_valeur_alphabet = numarray.zeros(taille_alphabet)
759 for pos in xrange(taille_alphabet):
760 cle = array_pos_alphabet[pos]
761 val = dic_alphabet[cle]
762
763 dic_alphabet[cle] = pos
764 array_valeur_alphabet[pos] = val
765
766 LH1 = [cle for cle,deb,fin in LH1]
767 LHH1 = [dic_alphabet[cle] for cle in LH1]
768
769
770
771
772
773
774
775
776
777
778
779
780
781 LH2 = [cle for cle,deb,fin in LH2]
782 LHH2 = [dic_alphabet[cle] for cle in LH2]
783
784
785
786
787
788
789
790
791
792 logging.log(4,'LH1 = '+str(LH1))
793 logging.log(4,'LH2 = '+str(LH2))
794 logging.log(4,'array_pos_alphabet = '+str(array_pos_alphabet))
795 logging.log(4,'array_valeur_alphabet = '+str(array_valeur_alphabet))
796 logging.log(4,'dic_alphabet = '+str(dic_alphabet))
797
798
799
800
801 logging.log(5,'creation alphabet done')
802
803 current = numarray.zeros(taille_alphabet)
804 accu = numarray.zeros(taille_alphabet)
805 def liste_to_alphab_array(liste,array):
806
807
808
809
810
811
812
813
814
815 numarray.logical_and(current,array_zero, current)
816 numarray.logical_and(accu,array_zero, accu)
817 for i in xrange(len(liste)-1,-1,-1):
818 pos = LHH2[i]
819 numarray.where(numarray.arange(taille_alphabet)==pos,1,0,out=current)
820 numarray.add(accu,current,accu)
821 array = accu
822
823
824
825
826
827
828 assert len(liste) == numarray.sum(array),str(len(liste))+' / '+str(numarray.sum(array))
829
830 def alphab_array_val_length(array):
831 val = length = 0
832 for pos in xrange(taille_alphabet):
833 nb = max(0,array[pos])
834 length += nb
835 val += nb * array_valeur_alphabet[pos]
836
837
838 assert 0 <= length <= val
839
840
841
842 return val, length
843
844 len_L1 = len(LH1) ; len_L2 = len(LH2)
845 i=0
846 array_zero = numarray.zeros(taille_alphabet)
847 array_inter = numarray.zeros(taille_alphabet)
848 LL1 = numarray.zeros(taille_alphabet)
849 LL2 = numarray.zeros(taille_alphabet)
850 LL1res = numarray.zeros(taille_alphabet)
851 LL2res = numarray.zeros(taille_alphabet)
852 new_LL1 = numarray.zeros(taille_alphabet)
853 for posB1 in xrange(len_L1-1,-1,-1):
854 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,'itération LH1: '+str(i))
855 if i == 300: break
856
857 numarray.logical_and(LL1,array_zero, LL1)
858 assert numarray.sum(LL1) == 0
859 liste_to_alphab_array(LHH1[posB1+1:], LL1)
860
861
862
863
864 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done MSet1')
865
866 liste_pos2 = dicoPos[2][LH1[posB1]]
867 j = len(liste_pos2)-1
868 while j >= 0:
869
870
871
872
873
874 if j == len(liste_pos2)-1:
875 numarray.logical_and(LL2,array_zero, LL2)
876 numarray.logical_and(LL1res,array_zero, LL1res)
877 numarray.logical_and(LL2res,array_zero, LL2res)
878 assert numarray.sum(LL2) == numarray.sum(LL1res) == numarray.sum(LL2res) == 0
879
880 liste_to_alphab_array(LHH2[liste_pos2[j]+1:], LL2) ; ds = nb = 0
881
882
883
884
885
886 numarray.subtract(LL1,LL2, LL1res)
887
888
889 numarray.subtract(LL2,LL1, LL2res)
890
891
892
893 else:
894 posB2 = liste_pos2[j]
895 next_posB2 = liste_pos2[j+1]
896 numarray.logical_and(new_LL1,array_zero, new_LL1)
897 numarray.logical_and(array_inter,array_zero, array_inter)
898 assert numarray.sum(new_LL1) == numarray.sum(array_inter) == 0
899
900 liste_to_alphab_array(LHH1[posB2+1:next_posB2+1], new_LL1)
901
902
903
904
905
906
907 numarray.minimum(LL2res,new_LL1,array_inter)
908
909
910 numarray.subtract(LL1res,array_inter, LL1res)
911
912
913 numarray.subtract(LL2res,new_LL1, LL2res)
914
915
916
917
918
919
920 d1,n1 = alphab_array_val_length(LL1res)
921 d2,n2 = alphab_array_val_length(LL2res)
922 ds2 = d1 + d2 ; nb2 = n1 + n2
923 diffSym[(posB1,liste_pos2[j])] = (ds2,nb2)
924 j -= 1
925 diffSym[(posB1,liste_pos2[0])] = (ds2,nb2)
926 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done itération interne')
927 i+=1
928
929 return diffSym
930
932 r='''class dicH(weakref.WeakKeyDictionary):
933 def __init__(self, dico=None):
934 if dico is None:
935 self.totalItem = 0
936 self.valeur = 0
937 else:
938 for cle,nbItem in dico.iteritems():
939 self[cle] = nbItem #[pos for pos in liste_pos]
940 self.totalItem = dico.totalItem
941 self.valeur = dico.valeur'''
942 def _addBloc(bloc):
943 (cle, debut, fin) = bloc
944 try:
945
946 dico[cle] += 1
947 except KeyError:
948 dico[cle] = 1
949 dico['valeur'] += fin - debut
950 dico['totalItem'] += 1
951 def _testBloc(bloc):
952 (cle, debut, fin) = bloc
953 try:
954
955
956
957
958
959 if diffSymCourante[cle] == 1:
960 del diffSymCourante[cle]
961 else:
962 diffSymCourante[cle] -= 1
963 diffSymCourante['valeur'] -= fin - debut
964 diffSymCourante['totalItem'] -= 1
965 except KeyError:
966 diffSymCourante[cle] = 1
967 diffSymCourante['valeur'] += fin - debut
968 diffSymCourante['totalItem'] += 1
969
970 diffSym={}
971
972 LH1 = [] ; LH2 = []
973 for bloc in L1:
974 LH1.append((hash(texte[bloc[0]:bloc[1]]), bloc[0], bloc[1]))
975 logging.debug('init LH1 done')
976 for bloc in L2:
977 LH2.append((hash(texte[bloc[0]:bloc[1]]), bloc[0], bloc[1]))
978 logging.debug('init LH2 done')
979
980 dico = {'valeur':0, 'totalItem':0}
981
982
983
984
985 ld=i=0
986 logging.debug('len(LH2)= '+str(len(LH2))+' / len(LH1)= '+str(len(LH1)))
987 for pos2 in xrange(len(LH2)-1,-1,-1):
988 if (i<10 or i>len(LH2)-10 or i%10==0): logging.debug('itération LH2: '+str(i));k=True
989 else:k=False
990 if pos2 < (len(LH2)-1):
991 _addBloc(LH2[pos2+1])
992 assert dico['totalItem'] == ld + 1 ; ld+=1
993
994 if pos2 > 0: diffSymCourante = weakref.WeakKeyDictionary(dico.copy())
995 else: diffSymCourante = dico
996 logging.debug(' copie dico done')
997 j=0
998 for pos1 in xrange(len(LH1)-1,-1,-1):
999
1000 if pos1 < (len(LH1)-1): _testBloc(LH1[pos1+1])
1001 diffSym[(pos1,pos2)] = (diffSymCourante['valeur'],diffSymCourante['totalItem'])
1002 j+=1
1003 logging.debug(' itération interne done')
1004 i+=1
1005
1006 a='''for pos2 in xrange(len(LH2)-1,-1,-1):
1007 if pos2 < len(LH2)-1: _testBloc(LH2[pos2+1], True)
1008 diffSymCourante = dico.copy()
1009 prev_taille = diffSymCourante['totalItem']
1010 for pos1 in xrange(len(LH1)):
1011 if pos1 < len(LH1)-1: _testBloc(LH1[pos1+1])
1012 diffSym[(pos1,pos2)] = (diffSymCourante['valeur'],diffSymCourante['totalItem'])
1013 #assert diffSymCourante['totalItem'] < prev_taille, str(diffSymCourante['totalItem'])+' '+str(prev_taille)'''
1014 assert len(diffSym) == len(L1) * len(L2), str(len(diffSym))+' '+str(len(L1))+' '+str(len(L2))
1015
1016
1017
1018
1019
1020
1021 return diffSym
1022
1023
1025 def _addBloc(bloc):
1026 cle = hash(texte[bloc[0]:bloc[1]])
1027 try:
1028 dico[cle].append(bloc[0])
1029 except KeyError:
1030 dico[cle] = [bloc[0]]
1031 dico['valeur'] += bloc[1] - bloc[0]
1032 dico['nbItem'] += 1
1033 diffSym={}
1034
1035 L = {1:L1, 2:L2}
1036 tabDiffSym = {}
1037 dico = {'valeur':0, 'nbItem':0}
1038 tabDiffSym[(len(L1),-1)] = {}
1039 for i in xrange(len(L1)-1,-1,-1):
1040 _addBloc(L1[i])
1041 dico_courant = dico.copy()
1042 tabDiffSym[(i,-1)] = dico_courant
1043 assert dico['nbItem'] == len(L1)
1044
1045 tabDiffSym[(len(L1),-1)] = {'valeur':0, 'nbItem':0} ; bloc = {}
1046 for j in xrange(len(L2)-1,-1,-1):
1047 bloc[2] = L2[j]
1048 for i in xrange(len(L1)-1,-1,-1):
1049 bloc[1] = L1[i]
1050 diffSymCourante = tabDiffSym[(i+1,-1)]
1051
1052 for k in [1,2]:
1053 cle = hash(texte[bloc[k][0]:bloc[k][1]])
1054 try:
1055 liste_pos_bloc = diffSymCourante[cle]
1056 try:
1057 liste_pos_bloc.pop()
1058 except IndexError:
1059 del diffSymCourante[cle]
1060 diffSymCourante['valeur'] -= bloc[k][1] - bloc[k][0]
1061 diffSymCourante['nbItem'] -= 1
1062 except KeyError:
1063 diffSymCourante[cle] = [bloc[k][0]]
1064 diffSymCourante['valeur'] += bloc[k][1] - bloc[k][0]
1065 diffSymCourante['nbItem'] += 1
1066 tabDiffSym[(i,0)] = diffSymCourante
1067 diffSym[(i,j)] = (diffSymCourante['valeur'],diffSymCourante['nbItem'])
1068 for i in xrange(len(L1)):
1069 tabDiffSym[(i,-1)] = tabDiffSym[(i,0)]
1070 del tabDiffSym[(i,0)]
1071 dico = tabDiffSym[(len(L1),-1)]
1072 _addBloc(L2[j])
1073 tabDiffSym[(len(L1),-1)] = dico
1074 assert len(tabDiffSym) == len(L1)+1
1075 assert len(diffSym) == len(L1) * len(L2),str(len(diffSym))+str(len(L1))+str(len(L2))
1076 return diffSym
1077
1079 """ Précalcul de toutes les différences symétriques possibles.
1080 Ainsi chacune est calculée une seule fois et non un nombre combinatoire de fois si on faisait le calcul à la demande
1081 pre: forall([texte[x[0]:x[1]] in dicoPos[1].keys() for x in L1])
1082 forall([texte[x[0]:x[1]] in dicoPos[2].keys() for x in L1])
1083 forall([texte[x[0]:x[1]] in dicoPos[1].keys() for x in L2])
1084 forall([texte[x[0]:x[1]] in dicoPos[2].keys() for x in L2])
1085 len(self.tool.difference(L1,L2))>0
1086 """
1087 diffSym={}
1088
1089 posB1=0
1090 for B1 in L1:
1091 for posB2 in dicoPos[2][texte[B1[0]:B1[1]]]:
1092 L=self.difference_symetrique(L1[posB1+1:],L2[posB2+1:],texte,posB1,posB2,dicoPos)
1093 cout=0
1094 for B in L:
1095 cout += B[1]-B[0]
1096 diffSym[(posB1,posB2)]=(cout,len(L))
1097 posB1+=1
1098 assert len(diffSym) <= len(L1) * len(L2),str(len(diffSym))+' '+str(len(L1))+' '+str(len(L2))
1099 return diffSym
1100
1102 """ différence symétrique entre 2 listes de blocs
1103 pre: forall([texte[x[0]:x[1]] in dicoPos[1].iterkeys() for x in L1])
1104 forall([texte[x[0]:x[1]] in dicoPos[2].iterkeys() for x in L1])
1105 forall([texte[x[0]:x[1]] in dicoPos[1].iterkeys() for x in L2])
1106 forall([texte[x[0]:x[1]] in dicoPos[2].iterkeys() for x in L2])
1107 len(self.tool.difference(L1,L2))>0
1108 forall(L1, lambda x:texte[x[0]:x[1]] in dicoPos[1].iterkeys())
1109 forall(L1, lambda x:texte[x[0]:x[1]] in dicoPos[2].iterkeys())
1110 forall(L2, lambda x:texte[x[0]:x[1]] in dicoPos[1].iterkeys())
1111 forall(L2, lambda x:texte[x[0]:x[1]] in dicoPos[2].iterkeys())
1112 """
1113 if (len(L1)==0 and len(L2)==0): return []
1114 elif (len(L1)==0): return L2
1115 elif (len(L2)==0): return L1
1116 LRes=[]
1117 for B1 in L1:
1118 found=False
1119 for posB2 in dicoPos[2][texte[B1[0]:B1[1]]]:
1120 if posB2>=deb2+1: found=True
1121 if not found: LRes.append(B1)
1122 for B2 in L2:
1123 found=False
1124 for posB1 in dicoPos[1][texte[B2[0]:B2[1]]]:
1125 if posB1>=deb1+1: found=True
1126 if not found: LRes.append(B2)
1127
1128 return LRes
1129
1130
1131
1132 a='''
1133 class AlignAstarRecur3(AlignAstar):
1134 def __init__(self,texte,l_texte1,long_min_pivots):
1135 AlignAstar.__init__(self,texte)
1136 self.long_min_pivots=long_min_pivots
1137 self.l_texte1 = l_texte1
1138 def deplacements_pond(self, L1, L2):
1139 """ Fonction qui aligne 2 listes d'intervalles du texte
1140 pre: isinstance(L1,list)
1141 isinstance(L2,list)
1142 len(L1)>0
1143 len(L2)>0
1144 forall([self.l_texte1 >= L1[i][0] >= L1[i-1][1] >= 0 for i in range(1, len(L1))])
1145 forall([len(self.texte) >= L2[i][0] >= L2[i-1][1] >= self.l_texte1 for i in range(1, len(L2))])
1146 post:forall([len(self.texte) >=__return__[1][i][0] >= __return__[1][i-1][1] >= 0 for i in range(1, len(__return__[1]))])
1147 """
1148 #forall([len(self.texte) >=__return__[0][i][0] >= __return__[0][i-1][1] >= 0 for i in range(1, len(__return__[0]))])
1149 #forall([len(self.texte) >=__return__[2][i][0] >= __return__[2][i-1][1] >= 0 for i in range(1, len(__return__[2]))])
1150 LResDep1,LResDep2,LResBC1,LResBC2 = self.deplacements_pond__(L1, L2, 0,self.l_texte1,self.l_texte1,len(self.texte), 0)
1151 LDep,LUnique = self.removeUnique(LResDep1+LResDep2)
1152 return LDep,LResBC1+LResBC2,LUnique
1153
1154 def removeUnique(self,L):
1155 """ Scinde L en 2 listes, une pour les chaines ayant plusieurs occurences
1156 et l'autre pour celles en ayant une seule
1157 #pre: forall([len(self.texte) >= L[i][0] >= L[i-1][1] >= 0 for i in range(1, len(L))])
1158 #post: forall([len(self.texte) >=__return__[0][i][0] >= __return__[0][i-1][1] >= 0 for i in range(1, len(__return__[0]))])
1159 # forall([len(self.texte) >=__return__[1][i][0] >= __return__[1][i-1][1] >= 0 for i in range(1, len(__return__[1]))])
1160 """
1161 dicDep = {}
1162 for x in L:
1163 t=self.texte[x[0]:x[1]]
1164 if not dicDep.has_key(t): dicDep[t]=[]
1165 dicDep[t].append(x[0])
1166 LDep=[]
1167 LUnique=[]
1168 for clef in dicDep:
1169 locc=dicDep[clef]
1170 if self.repetition(locc,self.l_texte1):
1171 for occ in locc: LDep = self.tool.addition_intervalle(LDep,[occ, occ+len(clef)])
1172 else:
1173 for occ in locc: LUnique = self.tool.addition_intervalle(LUnique,[occ, occ+len(clef)])#sans fusion
1174 return LDep,LUnique
1175
1176 def deplacements_pond__(self, L1, L2, debT1, debT2, finT1, finT2, niveau):
1177 """ Fonction qui aligne 2 listes d'intervalles du texte
1178 pre: isinstance(L1,list)
1179 isinstance(L2,list)
1180 len(L1)>0
1181 len(L2)>0
1182 forall([debT1 >= L1[i][0] >= L1[i-1][1] >= finT1 for i in range(1, len(L1))])
1183 forall([debT2 >= L2[i][0] >= L2[i-1][1] >= finT2 for i in range(1, len(L2))])
1184 #post[debT1, debT2]:
1185 # forall([finT1>=__return__[2][i][0] >= __return__[2][i-1][1] >= __old__.debT1 for i in range(1, len(__return__[2]))])
1186 # forall([finT2>=__return__[3][i][0] >= __return__[3][i-1][1] >= __old__.debT2 for i in range(1, len(__return__[3]))])
1187 #forall([finT1>=__return__[0][i][0] >= __return__[0][i-1][1] >= __old__.debT1 for i in range(1, len(__return__[0]))])
1188 #forall([finT2>=__return__[1][i][0] >= __return__[1][i-1][1] >= __old__.debT2 for i in range(1, len(__return__[1]))])
1189 """
1190 # dicoPos[1] stocke pour chaque bloc de L1 ses positions dans cette liste
1191 dicoPos = {}
1192 coutFixe = {}
1193 dicoPos[1], coutFixe[1] = self.calcPosCoutFixe(L1)
1194 dicoPos[2], coutFixe[2] = self.calcPosCoutFixe(L2)
1195 if niveau>0:
1196 #sys.stderr.write("t1="+self.texte[debT1:finT1]+"\n")
1197 #sys.stderr.write("t2="+self.texte[debT2:finT2]+"\n")
1198 #assert(dicoPos[1].has_key('.on.'))
1199 #sys.stderr.flush()
1200 pass
1201 for x in dicoPos[1]:
1202 assert(x in self.texte[debT1:finT1])
1203 assert(x in self.texte[debT2:finT2])
1204 for x in dicoPos[2]:
1205 assert(x in self.texte[debT1:finT1])
1206 assert(x in self.texte[debT2:finT2])
1207 diffSym = self.preTraitDiffSym(L1,L2,dicoPos,niveau) # pré-calcul des différences symétriques
1208 best, closedSet = self.deplacements_pond_Astar(L1,L2,diffSym,dicoPos,coutFixe) # A*
1209 LBest = {1:[], 2:[]}
1210 while best != (-1,-1): # on remonte le meilleur parcours trouvé
1211 LBest[1].append(best[0])
1212 LBest[2].append(best[1])
1213 best=closedSet[best][1] # noeud pére
1214 LBest[1].reverse()
1215 LBest[2].reverse()
1216 for i in xrange(1,len(LBest[1])):
1217 assert(0<=LBest[1][i-1]<=LBest[1][i]<=len(L1))
1218 assert(0<=LBest[2][i-1]<=LBest[2][i]<=len(L2))
1219 LResDep={1:[],2:[]}
1220 LResBC={1:[],2:[]}
1221 debL1=0
1222 debL2=0
1223 debdebT1=debT1
1224 debdebT2=debT2
1225 for i in xrange(len(LBest[1])):
1226 #print niveau,i
1227 posL1 = LBest[1][i]
1228 posL2 = LBest[2][i]
1229 B1 = L1[posL1]
1230 B2 = L2[posL2]
1231 t1 = self.texte[debT1:B1[0]]
1232 t2 = self.texte[debT2:B2[0]]
1233 assert(debT1<=B1[0])
1234 assert(debT2<=B2[0])
1235 LDepTemp = {1:L1[debL1:posL1], 2:L2[debL2:posL2]}
1236 assert(debL1<=posL1)
1237 assert(debL2<=posL2)
1238 self.ass2__(LDepTemp[1],debT1,B1[0])
1239 self.ass2__(LDepTemp[2],debT2,B2[0])
1240 LResBC,LDepTemp=self.fff(t1,t2,debT1,debT2,LResBC,LDepTemp,B1[0],B2[0],niveau)
1241 #for x in LResBC1:
1242 # print str(i)+self.texte[x[0]:x[1]]
1243 #print str(i)+"-----------------------------------------"
1244 LResBC[1].append(B1)
1245 LResBC[2].append(B2)
1246 self.ass2__(LResBC[1],debdebT1,B1[1])
1247 self.ass2__(LResBC[2],debdebT2,B2[1])
1248 LResDep[1].extend(LDepTemp[1])
1249 LResDep[2].extend(LDepTemp[2])
1250 #self.ass__(LResDep)
1251 #self.ass2__(LResDep[1],debdebT1,B1[0])
1252 #self.ass2__(LResDep[2],debdebT2,B2[0])
1253 debT1=B1[1]
1254 debT2=B2[1]
1255 debL1=posL1+1
1256 debL2=posL2+1
1257 # fin
1258 t1=self.texte[debT1:finT1]
1259 t2=self.texte[debT2:finT2]
1260 assert(debT1<=finT1)
1261 assert(debT2<=finT2)
1262 if len(t1)>0 and len(t2)>0:
1263 LDepTemp = {1:L1[debL1:], 2:L2[debL2:]}
1264 if len(LDepTemp[1])>0: self.ass2__(LDepTemp[1],L1[debL1][0],finT1)
1265 if len(LDepTemp[2])>0: self.ass2__(LDepTemp[2],L2[debL2][0],finT2)
1266 LResBC,LDepTemp=self.fff(t1,t2,debT1,debT2,LResBC,LDepTemp,finT1,finT2,niveau)
1267 self.ass__(LResBC)
1268 self.ass2__(LResBC[1],debdebT1,finT1)
1269 self.ass2__(LResBC[2],debdebT2,finT2)
1270 LResDep[1].extend(LDepTemp[1])
1271 LResDep[2].extend(LDepTemp[2])
1272 #self.ass2__(LResDep[1],debdebT1,finT1)
1273 #self.ass2__(LResDep[2],debdebT2,finT2)
1274 #self.ass__(LResDep)
1275 return LResDep[1],LResDep[2],LResBC[1],LResBC[2]
1276
1277 def fff(self,t1,t2,debT1,debT2,LResBC,LDepTemp,fT1,fT2,niveau):
1278 """
1279 pre: debT1>=0
1280 #forall([fT1>=__return__[0][1][i][0] >= __return__[0][1][i-1][1] >= __old__.debT1 for i in range(1, len(__return__[0][1]))])
1281 #forall([fT2>=__return__[0][2][i][0] >= __return__[0][2][i-1][1] >= __old__.debT2 for i in range(1, len(__return__[0][2]))])
1282 #forall([fT1>=__return__[1][1][i][0] >= __return__[1][1][i-1][1] >= __old__.debT1 for i in range(1, len(__return__[1][1]))])
1283 #forall([fT2>=__return__[1][2][i][0] >= __return__[1][2][i-1][1] >= __old__.debT2 for i in range(1, len(__return__[1][2]))])
1284 """
1285 #return LResBC,LDepTemp
1286 if len(t1)==0 or len(t2)==0: return LResBC,LDepTemp
1287 st = suffix_tree.GeneralisedSuffixTree([t1,t2])
1288 blocs_texte = st.shared_substrings3(self.long_min_pivots)
1289 #self.syserr("BT1 "+str(blocs_texte))
1290 recouv = MediteAppli.Recouvrement(t1+t2,blocs_texte,len(t1))
1291 blocs_texte = recouv.eliminer_recouvrements()
1292 #self.syserr("BT2 "+str(blocs_texte))
1293 blocs_texte,r2 = self.remUnique(blocs_texte,len(t1))
1294 #self.syserr("BT3 "+str(blocs_texte))
1295 #self.syserr("r2 "+str(r2))
1296 NL1=[]
1297 NL2=[]
1298 for clef in blocs_texte:
1299 for occ in blocs_texte[clef]:
1300 if occ<len(t1):
1301 NL1 = self.tool.addition_intervalle(NL1,[occ+debT1, occ+debT1+len(clef)])
1302 else: NL2 = self.tool.addition_intervalle(NL2,[occ+debT2-len(t1), occ+debT2+len(clef)-len(t1)])
1303
1304 if len(NL1)>0 and len(NL2)>0:
1305 self.ass2__(NL1,debT1,fT1)
1306 self.ass2__(NL2,debT2,fT2)
1307 self.ass2__(NL1+NL2,debT1,fT2)
1308 #ds=self.tool.difference_sym(self.toLTexte(NL1),self.toLTexte(NL2))
1309 #if len(ds)>0: self.syserr("dSym "+str(ds))
1310 #if len(r2)>0: self.syserr("r2 "+str(r2))
1311 NewLResDep1,NewLResDep2,NewLResBC1,NewLResBC2 = self.deplacements_pond__(NL1, NL2, debT1,debT2,fT1,fT2,niveau+1)
1312 self.ass2__(NewLResDep1,debT1,fT1)
1313 self.ass2__(NewLResDep2,debT2,fT2)
1314 self.ass2__(NewLResBC1,debT1,fT1)
1315 self.ass2__(NewLResBC2,debT2,fT2)
1316 LResBC[1].extend(NewLResBC1)
1317 LResBC[2].extend(NewLResBC2)
1318 self.ass__(LResBC)
1319 LDepTemp[1] = self.tool.soustr_l_intervalles(LDepTemp[1],NewLResBC1)
1320 LDepTemp[2] = self.tool.soustr_l_intervalles(LDepTemp[2],NewLResBC2)
1321 LDepTemp[1] = self.tool.addition_l_intervalle(LDepTemp[1],NewLResDep1)
1322 LDepTemp[2] = self.tool.addition_l_intervalle(LDepTemp[2],NewLResDep2)
1323 #self.ass2__(LDepTemp[1],debT1,fT1)
1324 #self.ass2__(LDepTemp[2],debT2,fT2)
1325 return LResBC,LDepTemp
1326 def remUnique(self,dic,l_texte1):
1327 """post: len(__return__[0])<=len(dic)
1328 forall([x in dic.keys()],lambda x:x in __return__[0].keys() and self.repetition(__return__[0][x]))
1329 """
1330 notUnique={}
1331 unique={}
1332 for x in dic:
1333 y=dic[x]
1334 if self.repetition(dic[x],l_texte1): notUnique[x]=y
1335 else: unique[x]=y
1336 return notUnique,unique
1337
1338 def toLTexte(self,L):
1339 """
1340 #pre:forall(L,lambda x:isinstance(x,(int,int)))
1341 """
1342 #print L
1343 res=[]
1344 for x in L:
1345 res.append(self.texte[x[0]:x[1]])
1346 return res
1347
1348 def pr(self,L):
1349 y=""
1350 for x in L:
1351 y += self.texte[x[0]:x[1]] +"/"
1352 return y
1353
1354 def ass__(self,L):
1355 if __debug__:
1356 for x in L:
1357 for i in xrange(1,len(L[x])):
1358 assert(L[x][i-1][1]<=L[x][i][0])
1359
1360
1361 class AlignAstarRecur4(AlignAstar2,AlignAstarRecur3):
1362 """ texte passé en paramétre des récursions """
1363 def deplacements_pond(self, L1, L2):
1364 """ Fonction qui aligne 2 listes d'intervalles du texte
1365 pre: isinstance(L1,list)
1366 isinstance(L2,list)
1367 len(L1)>0
1368 len(L2)>0
1369 forall([self.l_texte1 >= L1[i][0] >= L1[i-1][1] >= 0 for i in range(1, len(L1))])
1370 forall([len(self.texte) >= L2[i][0] >= L2[i-1][1] >= self.l_texte1 for i in range(1, len(L2))])
1371 post:forall([len(self.texte) >=__return__[1][i][0] >= __return__[1][i-1][1] >= 0 for i in range(1, len(__return__[1]))])
1372 #forall([len(self.texte) >=__return__[0][i][0] >= __return__[0][i-1][1] >= 0 for i in range(1, len(__return__[0]))])
1373 #forall([len(self.texte) >=__return__[2][i][0] >= __return__[2][i-1][1] >= 0 for i in range(1, len(__return__[2]))])
1374 """
1375 LResDep1,LResDep2,LResBC1,LResBC2 = self.deplacements_pond__(L1, L2, self.texte, self.l_texte1, 0)
1376 LDep,LUnique = self.removeUnique(LResDep1+LResDep2)
1377 return LDep,LResBC1+LResBC2,LUnique
1378
1379 def deplacements_pond__(self, L1, L2, texte, l_texte1, niveau):
1380 """ Fonction qui aligne 2 listes d'intervalles du texte
1381 pre: isinstance(L1,list)
1382 isinstance(L2,list)
1383 len(L1)>0
1384 len(L2)>0
1385 forall([0 >= L1[i][0] >= L1[i-1][1] >= l_texte1 for i in range(1, len(L1))])
1386 forall([l_texte1 >= L2[i][0] >= L2[i-1][1] >= len(texte) for i in range(1, len(L2))])
1387 #post[debT1, debT2]:
1388 # forall([finT1>=__return__[2][i][0] >= __return__[2][i-1][1] >= __old__.debT1 for i in range(1, len(__return__[2]))])
1389 # forall([finT2>=__return__[3][i][0] >= __return__[3][i-1][1] >= __old__.debT2 for i in range(1, len(__return__[3]))])
1390 # forall([finT1>=__return__[0][i][0] >= __return__[0][i-1][1] >= __old__.debT1 for i in range(1, len(__return__[0]))])
1391 # forall([finT2>=__return__[1][i][0] >= __return__[1][i-1][1] >= __old__.debT2 for i in range(1, len(__return__[1]))])
1392 """
1393 debT1=0
1394 finT1=debT2=l_texte1
1395 finT2=len(texte)
1396 texte1=texte[:l_texte1]
1397 texte2=texte[l_texte1:]
1398 # dicoPos[1] stocke pour chaque bloc de L1 ses positions dans cette liste
1399 dicoPos = {}
1400 coutFixe = {}
1401 dicoPos[1], coutFixe[1] = self.calcPosCoutFixe(L1,texte)
1402 dicoPos[2], coutFixe[2] = self.calcPosCoutFixe(L2,texte)
1403 if niveau>0:
1404 #sys.stderr.write("t1="+self.texte[debT1:finT1]+"\n")
1405 #sys.stderr.write("t2="+self.texte[debT2:finT2]+"\n")
1406 #assert(dicoPos[1].has_key('.on.'))
1407 #sys.stderr.flush()
1408 pass
1409 for x in dicoPos[1]:
1410 assert(x in texte1)
1411 assert(x in texte2)
1412 for x in dicoPos[2]:
1413 assert(x in texte1)
1414 assert(x in texte2)
1415 diffSym = self.preTraitDiffSym(L1,L2,texte,dicoPos,niveau) # pré-calcul des différences symétriques
1416 best, closedSet = self.deplacements_pond_Astar(L1,L2,texte,diffSym,dicoPos,coutFixe) # A*
1417 LBest = {1:[], 2:[]}
1418 while best != (-1,-1): # on remonte le meilleur parcours trouvé
1419 LBest[1].append(best[0])
1420 LBest[2].append(best[1])
1421 best=closedSet[best][1] # noeud pére
1422 LBest[1].reverse()
1423 LBest[2].reverse()
1424 for i in xrange(1,len(LBest[1])):
1425 assert(0<=LBest[1][i-1]<=LBest[1][i]<=len(L1))
1426 assert(0<=LBest[2][i-1]<=LBest[2][i]<=len(L2))
1427 LResDep={1:[],2:[]}
1428 LResBC={1:[],2:[]}
1429 debL1=0
1430 debL2=0
1431 debdebT1=debT1
1432 debdebT2=debT2
1433 for i in xrange(len(LBest[1])):
1434 #print niveau,i
1435 posL1 = LBest[1][i]
1436 posL2 = LBest[2][i]
1437 B1 = L1[posL1]
1438 B2 = L2[posL2]
1439 t1 = texte[debT1:B1[0]]
1440 t2 = texte[debT2:B2[0]]
1441 assert(debT1<=B1[0])
1442 assert(debT2<=B2[0])
1443 LDepTemp = {1:L1[debL1:posL1], 2:L2[debL2:posL2]}
1444 assert(debL1<=posL1)
1445 assert(debL2<=posL2)
1446 self.ass2__(LDepTemp[1],debT1,B1[0])
1447 self.ass2__(LDepTemp[2],debT2,B2[0])
1448 LResBC,LDepTemp=self.fff(t1,t2,debT1,debT2,LResBC,LDepTemp,B1[0],B2[0],niveau)
1449 #for x in LResBC1:
1450 # print str(i)+self.texte[x[0]:x[1]]
1451 #print str(i)+"-----------------------------------------"
1452 LResBC[1].append(B1)
1453 LResBC[2].append(B2)
1454 self.ass2__(LResBC[1],debdebT1,B1[1])
1455 self.ass2__(LResBC[2],debdebT2,B2[1])
1456 LResDep[1].extend(LDepTemp[1])
1457 LResDep[2].extend(LDepTemp[2])
1458 #self.ass__(LResDep)
1459 #self.ass2__(LResDep[1],debdebT1,B1[0])
1460 #self.ass2__(LResDep[2],debdebT2,B2[0])
1461 debT1=B1[1]
1462 debT2=B2[1]
1463 debL1=posL1+1
1464 debL2=posL2+1
1465 # fin
1466 t1=texte[debT1:finT1]
1467 t2=texte[debT2:finT2]
1468 assert(debT1<=finT1)
1469 assert(debT2<=finT2)
1470 if len(t1)>0 and len(t2)>0:
1471 LDepTemp = {1:L1[debL1:], 2:L2[debL2:]}
1472 if len(LDepTemp[1])>0: self.ass2__(LDepTemp[1],L1[debL1][0],finT1)
1473 if len(LDepTemp[2])>0: self.ass2__(LDepTemp[2],L2[debL2][0],finT2)
1474 LResBC,LDepTemp=self.fff(t1,t2,debT1,debT2,LResBC,LDepTemp,finT1,finT2,niveau)
1475 self.ass__(LResBC)
1476 self.ass2__(LResBC[1],debdebT1,finT1)
1477 self.ass2__(LResBC[2],debdebT2,finT2)
1478 LResDep[1].extend(LDepTemp[1])
1479 LResDep[2].extend(LDepTemp[2])
1480 #self.ass2__(LResDep[1],debdebT1,finT1)
1481 #self.ass2__(LResDep[2],debdebT2,finT2)
1482 #self.ass__(LResDep)
1483 return LResDep[1],LResDep[2],LResBC[1],LResBC[2]
1484
1485 def fff(self,t1,t2,debT1,debT2,LResBC,LDepTemp,fT1,fT2,niveau):
1486 """
1487 pre: debT1>=0
1488 #forall([fT1>=__return__[0][1][i][0] >= __return__[0][1][i-1][1] >= __old__.debT1 for i in range(1, len(__return__[0][1]))])
1489 #forall([fT2>=__return__[0][2][i][0] >= __return__[0][2][i-1][1] >= __old__.debT2 for i in range(1, len(__return__[0][2]))])
1490 #forall([fT1>=__return__[1][1][i][0] >= __return__[1][1][i-1][1] >= __old__.debT1 for i in range(1, len(__return__[1][1]))])
1491 #forall([fT2>=__return__[1][2][i][0] >= __return__[1][2][i-1][1] >= __old__.debT2 for i in range(1, len(__return__[1][2]))])
1492 """
1493 #return LResBC,LDepTemp
1494 if len(t1)==0 or len(t2)==0: return LResBC,LDepTemp
1495 st = suffix_tree.GeneralisedSuffixTree([t1,t2])
1496 blocs_texte = st.shared_substrings3(self.long_min_pivots)
1497 recouv = MediteAppli.Recouvrement(t1+t2,blocs_texte,len(t1))
1498 blocs_texte = recouv.eliminer_recouvrements()
1499 blocs_texte,r2 = self.remUnique(blocs_texte,len(t1))
1500 NL1=[]
1501 NL2=[]
1502 for clef in blocs_texte:
1503 for occ in blocs_texte[clef]:
1504 if occ<len(t1):
1505 NL1 = self.tool.addition_intervalle(NL1,[occ, occ+len(clef)])
1506 else: NL2 = self.tool.addition_intervalle(NL2,[occ, occ+len(clef)])
1507
1508 if len(NL1)>0 and len(NL2)>0:
1509 self.ass2__(NL1,0,len(t1))
1510 self.ass2__(NL2,len(t1),len(t1+t2))
1511 self.ass2__(NL1+NL2,0,len(t1+t2))
1512 NewLResDep1,NewLResDep2,NewLResBC1,NewLResBC2 = self.deplacements_pond__(NL1, NL2, t1+t2, len(t1),niveau+1)
1513 NewLResDep1=self.addNumLInter(NewLResDep1,debT1)
1514 NewLResDep2=self.addNumLInter(NewLResDep2,debT2-len(t1))
1515 NewLResBC1=self.addNumLInter(NewLResBC1,debT1)
1516 NewLResBC2=self.addNumLInter(NewLResBC2,debT2-len(t1))
1517 self.ass2__(NewLResDep1,debT1,fT1)
1518 self.ass2__(NewLResDep2,debT2,fT2)
1519 self.ass2__(NewLResBC1,debT1,fT1)
1520 self.ass2__(NewLResBC2,debT2,fT2)
1521 LResBC[1].extend(NewLResBC1)
1522 LResBC[2].extend(NewLResBC2)
1523 self.ass__(LResBC)
1524 LDepTemp[1] = self.tool.soustr_l_intervalles(LDepTemp[1],NewLResBC1)
1525 LDepTemp[2] = self.tool.soustr_l_intervalles(LDepTemp[2],NewLResBC2)
1526 LDepTemp[1] = self.tool.addition_l_intervalle(LDepTemp[1],NewLResDep1)
1527 LDepTemp[2] = self.tool.addition_l_intervalle(LDepTemp[2],NewLResDep2)
1528 #self.ass2__(LDepTemp[1],debT1,fT1)
1529 #self.ass2__(LDepTemp[2],debT2,fT2)
1530 return LResBC,LDepTemp
1531
1532 def addNumLInter(self,L,num):
1533 res=[]
1534 for x in L:
1535 res.append([x[0]+num,x[1]+num])
1536 return res
1537 '''
1539
1540 - def __init__(self,l_texte1,long_min_pivots=1, algoAlign="", coeff=None, sep=True):
1541 """ Constructeur
1542
1543 #pre: isinstance(l_texte1, int) and isinstance(long_min_pivots, int)
1544
1545 @param texte1: la premiere version du texte a comparer
1546 @type texte1: String
1547 @param long_min_pivots: longueur minimale des chaines répétées
1548 @param long_min_pivots: integer
1549 @param algoAlign: algo d'alignement, par défaut A*
1550 @type algoAlign: string
1551 @param coeff: poids des params pour MOA*
1552 @type coeff: triplet
1553 @param sep: sensible aux séparateurs si Vrai
1554 @type sep: boolean
1555 """
1556 AlignAstar2.__init__(self)
1557 self.long_min_pivots=long_min_pivots
1558 self.l_texte1 = l_texte1
1559 self.algoAligneur = algoAlign
1560 self.separatorSensivitive = sep
1561
1562 - def run(self, t1, t2):
1563 """ pre: isinstance(t1,str) and isinstance(t2,str)
1564 """
1565
1566 self.MAXRECURSION = 1000
1567 self.l_texte2 = len(t2)
1568
1569 self.preTraitDiffSym = psyco.proxy(self.preTraitDiffSym)
1570
1571
1572 self.addSubDep = True
1573
1574 sepTable = string.maketrans("$",".")
1575 t1 = t1.translate(sepTable)
1576 t2 = t2.translate(sepTable)
1577 lDEP1,lDEP2,lBC1,lBC2=self.deplacements_pond2(t1,t2)
1578
1579 lDEP1.extend(lDEP2)
1580
1581 LDEP = self.cleanDep(lDEP1,t1,t2)
1582 lBC1.extend(lBC2)
1583 return LDEP,lBC1
1584
1585 - def cleanDep(self,LDEP,texte1,texte2):
1586 """Enleve les deplacements inclus dans un autre deplacement et ceux qui ne sont plus répétés
1587
1588 #pre: forall([len(texte) >= LDEP[i][0] >= LDEP[i-1][1] >= 0 for i in range(1, len(LDEP))])
1589 #post: forall([len(texte1)+len(texte2) >=__return__[i][0] >= __return__[i-1][1] >= 0 for i in range(1, len(__return__))])
1590 """
1591 size_LDEP = len(LDEP)+1
1592 while len(LDEP) < size_LDEP:
1593 size_LDEP = len(LDEP)
1594
1595 LDEP = self.removeInclude(LDEP)
1596
1597
1598 LDEP = self.removeUnique(LDEP,texte1,texte2)
1599
1600
1601
1602
1603 return LDEP
1605 """ Enleve les deplacements inclus dans un autre déplacement
1606
1607 #pre: forall([len(texte) >= L[i][0] >= L[i-1][1] >= 0 for i in range(1, len(L))])
1608 #post: forall([len(texte) >=__return__[i][0] >= __return__[i-1][1] >= 0 for i in range(1, len(__return__))])
1609 # forall([len(texte) >=__return__[1][i][0] >= __return__[1][i-1][1] >= 0 for i in range(1, len(__return__[1]))])
1610 """
1611 LDep = []
1612 prevInter = (0,0)
1613 for (deb,fin) in L:
1614 if prevInter[0] <= deb <= fin <= prevInter[1]:
1615 continue
1616 else:
1617 LDep.append([deb,fin])
1618 prevInter = (deb,fin)
1619 return LDep
1620
1622 """ Scinde L en 2 listes, une pour les chaines ayant plusieurs occurences
1623 et l'autre pour celles en ayant une seule
1624
1625 #pre: forall([len(texte) >= L[i][0] >= L[i-1][1] >= 0 for i in range(1, len(L))])
1626 #post: forall([len(texte) >=__return__[0][i][0] >= __return__[0][i-1][1] >= 0 for i in range(1, len(__return__[0]))])
1627 # forall([len(texte) >=__return__[1][i][0] >= __return__[1][i-1][1] >= 0 for i in range(1, len(__return__[1]))])
1628 """
1629 texte = texte1+texte2
1630 dicDep = {}
1631 for x in L:
1632 t=texte[x[0]:x[1]]
1633 try:
1634 dicDep[t].append(x[0])
1635 except KeyError:
1636 dicDep[t]=[x[0]]
1637
1638
1639
1640 LDep=[]
1641 LUnique=[]
1642 for clef,locc in dicDep.iteritems():
1643 len_clef = len(clef)
1644 if self.repetition(locc,self.l_texte1):
1645 assert texte[locc[0]:locc[0]+len_clef] == texte[locc[-1]:locc[-1]+len_clef]
1646 assert texte1[locc[0]:locc[0]+len_clef] == texte2[locc[-1]-self.l_texte1:locc[-1]-self.l_texte1+len_clef] , texte1[locc[0]:locc[0]+len_clef] + texte2[locc[-1]-self.l_texte1:locc[-1]-self.l_texte1+len_clef]
1647 for occ in locc:
1648
1649 bisect.insort_right(LDep,[occ, occ+len_clef])
1650 else:
1651 for occ in locc:
1652
1653 bisect.insort_right(LUnique,[occ, occ+len_clef])
1654
1655 return LDep
1657 """ Scinde L en 2 listes, une pour les chaines ayant plusieurs occurences
1658 et l'autre pour celles en ayant une seule
1659
1660 #pre: forall([len(texte) >= L[i][0] >= L[i-1][1] >= 0 for i in range(1, len(L))])
1661 #post: forall([len(texte) >=__return__[0][i][0] >= __return__[0][i-1][1] >= 0 for i in range(1, len(__return__[0]))])
1662 # forall([len(texte) >=__return__[1][i][0] >= __return__[1][i-1][1] >= 0 for i in range(1, len(__return__[1]))])
1663 """
1664 dicDep = {}
1665 for (deb,fin) in L:
1666 longueur = fin - deb
1667 if deb < self.l_texte1:
1668 cle = hash(texte1[deb:fin])
1669 else:
1670 cle = hash(texte2[deb-self.l_texte1:fin-self.l_texte1])
1671 try:
1672 dicDep[(cle,longueur)].append(deb)
1673 except KeyError:
1674 dicDep[(cle,longueur)]=[deb]
1675
1676 LDep=[]
1677
1678 for (cle,longueur),locc in dicDep.iteritems():
1679
1680 if self.repetition(locc,self.l_texte1):
1681 assert texte1[locc[0]:locc[0]+longueur] == texte2[locc[-1]-self.l_texte1:locc[-1]-self.l_texte1+longueur] , texte1[locc[0]:locc[0]+longueur] + texte2[locc[-1]-self.l_texte1:locc[-1]-self.l_texte1+longueur]
1682 for occ in locc:
1683
1684 bisect.insort_right(LDep,[occ, occ+longueur])
1685
1686
1687
1688
1689
1690 return LDep
1691
1693 """ prends les 2 textes en entrée et renvoie 2 listes au format
1694 [(BC,[BDeps le précédant])] """
1695 aligneSMEMS = True
1696 if aligneSMEMS:
1697 s1,s2 = self._texteToSeqHomo(t1,t2)
1698 else:
1699
1700 s1,s2 = self._texteToSeqHomoNGrammes(t1,t2,1,self.long_min_pivots)
1701
1702
1703 if __debug__:
1704 texte = t1+t2
1705 self.ass2__(s1,0,len(t1),texte)
1706 self.ass2__(s2,len(t1),len(t1)+len(t2),texte)
1707 if len(s1)==0 or len(s2)==0: return [],[]
1708
1709
1710
1711 LResT1,LResT2 = self._appelAlgo(s1,s2,t1,t2,len(t1))
1712 return LResT1,LResT2
1713
1715 if self.algoAligneur.lower() == 'LIS'.lower():
1716 a = aligne.AlignLIS()
1717 return a.alignement(s1, s2, t1, t2, t)
1718 elif self.algoAligneur.lower() == 'HIS'.lower():
1719 a = aligne.AlignHIS()
1720 return a.alignement(s1, s2, t1, t2, t)
1721 elif self.algoAligneur.lower() == 'HISCont'.lower():
1722 a = aligne.AlignHISCont()
1723 return a.alignement(s1, s2, t1, t2, t)
1724 elif self.algoAligneur.lower() == 'HISVO'.lower():
1725 a = aligne.AlignHISVo()
1726 return a.alignement(s1, s2, t1, t2, t)
1727 elif self.algoAligneur.lower() == 'glouton'.lower():
1728 a = aligne.AlignGlouton()
1729 return a.alignement(s1, s2, t1, t2, t)
1730 elif self.algoAligneur.lower() == 'dyn'.lower():
1731 a = aligne.AlignProgDyn()
1732 return a.alignement(s1, s2, t1, t2, t)
1733 elif self.algoAligneur.lower() == 'dyn2'.lower():
1734 a = aligne.AlignProgDyn2()
1735 return a.alignement(s1, s2, t1, t2, t)
1736 else:
1737 return self._appelAstar(s1,s2,t1,t2,t)
1738
1740 """ pre: isinstance(t1,str) and isinstance(t2,str)
1741 """
1742
1743 if len(t1)==0 or len(t2)==0: return [],[],[],[]
1744 if niveau > self.MAXRECURSION: return [],[],[],[]
1745
1746 logging.log(5,"debut dep_pond niveau "+str(niveau))
1747 LResT1,LResT2 = self.compute_alignement(t1,t2)
1748 if len(LResT1)==0 or len(LResT2)==0: return [],[],[],[]
1749 texte=t1+t2
1750 debutT1=i=0 ; debutT2=len(t1)
1751 lResBC1=[] ; lResBC2=[] ; lResDEP1=[] ; lResDEP2=[]
1752
1753 while (i<min(len(LResT1),len(LResT2))):
1754 BC1,lDep1=LResT1[i]
1755 BC2,lDep2=LResT2[i]
1756
1757 if BC1 is not None: finT1=BC1[0]
1758 else: finT1=len(t1)
1759 if BC2 is not None: finT2=BC2[0]
1760 else: finT2=len(t2)
1761
1762
1763 for x in lDep1: bisect.insort_right(lResDEP1, x)
1764 for x in lDep2: bisect.insort_right(lResDEP2, x)
1765
1766
1767
1768 nt1=texte[debutT1:finT1]
1769 nt2=texte[debutT2:finT2]
1770 NewLResDep1,NewLResDep2,NewLResBC1,NewLResBC2 = self.deplacements_pond2(nt1,nt2,niveau+1)
1771
1772 if self.addSubDep:
1773 NewLResDep1 = self._filtreDepRec(NewLResDep1)
1774 self.ass2__(NewLResDep1,0,len(nt1),nt1)
1775 if self.addSubDep:
1776 NewLResDep2 = self._filtreDepRec(NewLResDep2)
1777 self.ass2__(NewLResDep2,len(nt1),len(nt1)+len(nt2),nt1+nt2)
1778 self.ass2__(NewLResBC1,0,len(nt1),nt1)
1779 self.ass2__(NewLResBC2,len(nt1),len(nt1)+len(nt2),nt1+nt2)
1780 if self.addSubDep: NewLResDep1 = [[x[0]+debutT1,x[1]+debutT1] for x in NewLResDep1]
1781 if self.addSubDep: NewLResDep2 = [[x[0]+debutT2-len(nt1),x[1]+debutT2-len(nt1)] for x in NewLResDep2]
1782 NewLResBC1 = [[x[0]+debutT1,x[1]+debutT1] for x in NewLResBC1]
1783 NewLResBC2 = [[x[0]+debutT2-len(nt1),x[1]+debutT2-len(nt1)] for x in NewLResBC2]
1784 if self.addSubDep: self.ass2__(NewLResDep1,debutT1,finT1,t1)
1785 if self.addSubDep: self.ass2__(NewLResDep2,debutT2,finT2,texte)
1786 self.ass2__(NewLResBC1,debutT1,finT1,t1)
1787 self.ass2__(NewLResBC2,debutT2,finT2,texte)
1788 self.ass2__(lResBC1,0,debutT1,t1)
1789 self.ass2__(lResBC2,len(t1),debutT2,texte)
1790 lResBC1.extend(NewLResBC1)
1791 lResBC2.extend(NewLResBC2)
1792 if BC1 is not None: self.ass2__(lResBC1,0,finT1,t1)
1793 if BC2 is not None: self.ass2__(lResBC2,len(t1),finT2,texte)
1794 lResDEP1 = self.tool.soustr_l_intervalles(lResDEP1,NewLResBC1)
1795 lResDEP2 = self.tool.soustr_l_intervalles(lResDEP2,NewLResBC2)
1796 if self.addSubDep:
1797 for x in NewLResDep1: bisect.insort_right(lResDEP1, x)
1798 if self.addSubDep:
1799 for x in NewLResDep2: bisect.insort_right(lResDEP2, x)
1800 if BC1 is not None: lResBC1.append(BC1)
1801 if BC2 is not None: lResBC2.append(BC2)
1802 i+=1
1803 if BC1 is not None: debutT1 = BC1[1]
1804 if BC2 is not None: debutT2 = BC2[1]
1805
1806 if len(LResT1)>len(LResT2):
1807 assert(len(LResT1)==len(LResT2)+1 and LResT1[-1][0] is None)
1808 lResDEP1.extend(LResT1[-1][1])
1809 elif len(LResT2)>len(LResT1):
1810 assert(len(LResT2)==len(LResT1)+1 and LResT2[-1][0] is None)
1811 lResDEP2.extend(LResT2[-1][1])
1812 else: assert(len(LResT1)==len(LResT2))
1813 logging.log(5,"fin dep_pond niveau "+str(niveau))
1814 return lResDEP1,lResDEP2,lResBC1,lResBC2
1815
1817 """recherche d'un point fixe"""
1818 taille_originale = len(liste)
1819 taille_prev = 0
1820 liste2 = self.__filtreDepRec(liste)
1821 taille_courante = len(liste2)
1822 while taille_prev != taille_courante:
1823 taille_prev = taille_courante
1824 liste2 = self.__filtreDepRec(liste2)
1825 taille_courante = len(liste2)
1826 return liste2
1827
1829 """filtrage des déplacements se chevauchant"""
1830 liste2 = []
1831 if len(liste) < 2:
1832 liste2.extend(liste)
1833 else:
1834 i = 0
1835 while i < len(liste)-1:
1836 segment1 = liste[i] ; long1 = segment1[1]-segment1[0]
1837 segment2 = liste[i+1] ; long2 = segment2[1]-segment2[0]
1838 if segment1[1] > segment2[0]:
1839 if long1 >= long2:
1840 liste2.append(segment1)
1841 i += 2
1842 else:
1843 liste2.append(segment2)
1844 i += 1
1845 else:
1846 liste2.append(segment1)
1847 i += 1
1848 if i == len(liste)-1:
1849 liste2.append(liste[-1])
1850 return liste2
1851
1852 - def _texteToSeqHomo(self,t1,t2):
1853 """ Extrait des 2 textes, les 2 séquences de blocs répétés """
1854 logging.log(5,"debut _texteToSeqHomo")
1855 st = suffix_tree.GeneralisedSuffixTree([t1,t2])
1856 logging.log(5,"fin construction ST")
1857
1858
1859 eliminationRecouv = True
1860 if self.algoAligneur.lower() == 'HISCont'.lower():
1861 eliminationRecouv = False
1862 blocs_texte = st.get_MEM_index_chaine3(self.long_min_pivots, eliminRecouv=eliminationRecouv)
1863
1864 logging.log(5,"fin extraction MEM")
1865 del st
1866
1867
1868
1869
1870
1871 blocs_texte = self.remUnique(blocs_texte,len(t1), t1, t2)
1872 NL1=[] ; NL2=[]
1873 len_t1 = len(t1)
1874 for (cle,longueur),liste_occ in blocs_texte.iteritems():
1875
1876 for occ in iter(liste_occ):
1877 if occ < len_t1:
1878
1879 bisect.insort_right(NL1,[occ, occ+longueur])
1880 else:
1881 bisect.insort_right(NL2,[occ, occ+longueur])
1882 logging.log(5,"fin _texteToSeqHomo")
1883
1884 return NL1,NL2
1885
1886
1887 - def _texteToSeqHomoNGrammes(self,t1,t2,n,min_size):
1888 """Recherche naive d'homologies
1889
1890 @param t1: le texte 1
1891 @param t2: le texte 2
1892 @param n: le nombre de mots que l'on desire dans un bloc (1=> monogramme, 2=> bigramme etc...)
1893 @param min_size: taille minimale des blocs
1894 """
1895 t12=t1+t2 ; lenT1 = len(t1) ; lenT12 = len(t12)
1896
1897 t1Split = self.__splitNGrammes(t1,n,0)
1898 t2Split = self.__splitNGrammes(t2, n,lenT1)
1899
1900 dicoT1 = {}
1901 for key,(debut,fin) in t1Split:
1902 assert 0 <= debut < fin <= lenT1
1903 longueur = fin - debut
1904 assert longueur > 0
1905 if not dicoT1.has_key(longueur): dicoT1[longueur] = {}
1906 try:
1907 dicoT1[longueur][key].append(debut)
1908 except KeyError :
1909 dicoT1[longueur][key] = [debut]
1910
1911 dicoT2 = {}
1912 for key,(debut,fin) in t2Split:
1913 assert lenT1 <= debut < fin <= lenT12
1914 longueur = fin - debut
1915 assert longueur > 0
1916 if not dicoT2.has_key(longueur): dicoT2[longueur] = {}
1917 try:
1918 dicoT2[longueur][key].append(debut)
1919 except KeyError :
1920 dicoT2[longueur][key] = [debut]
1921
1922
1923 dico = {}
1924 for longueur,dicoLongueurT1 in dicoT1.iteritems():
1925 if dicoT2.has_key(longueur):
1926
1927 for key,listePosT1 in dicoLongueurT1.iteritems():
1928 if dicoT2[longueur].has_key(key):
1929 listePosT2 = dicoT2[longueur][key]
1930 if not dico.has_key(longueur): dico[longueur] = {}
1931 try:
1932 dico[longueur][key].extend(listePosT1)
1933 dico[longueur][key].extend(listePosT2)
1934 except KeyError:
1935 dico[longueur][key] = listePosT1
1936 dico[longueur][key].extend(listePosT2)
1937 dico[longueur][key].sort()
1938
1939 if __debug__:
1940
1941 for longueur, dicoLongueur in dico.iteritems():
1942 for key, listePos in dicoLongueur.iteritems():
1943 texte = t12[listePos[0]:listePos[0]+longueur]
1944 listeTextes = [texte]
1945 i = 1 ; OK = True
1946 while i < len(listePos):
1947
1948 texte2 = t12[listePos[i]:listePos[i]+longueur]
1949 listeTextes.append(texte2)
1950 if texte != texte2: OK = False
1951 i += 1
1952 if not OK: print listeTextes
1953
1954 if len(dico) > 1:
1955
1956 pass
1957 recouv = recouvrement.Recouvrement4(t12,dico,lenT1,min_size)
1958 blocs = recouv.eliminer_recouvrements()
1959
1960
1961
1962 for cle, listePos in blocs.iteritems():
1963 listePos.reverse()
1964 if len(blocs) > 1:
1965
1966 pass
1967 blocs = self.remUnique(blocs,lenT1,t1,t2)
1968 if len(blocs) > 1:
1969
1970 pass
1971
1972 NL1=[] ; NL2=[]
1973
1974 for key,pos in blocs.iteritems():
1975 for posD in pos :
1976 if posD < len(t1) : NL1.append([posD,posD+key[1]])
1977 else : NL2.append([posD,posD+key[1]])
1978 NL1.sort()
1979 NL2.sort()
1980 return NL1,NL2
1981
1983 """ Découpage en Ngrammes
1984
1985 Appel plusieurs fois __splitNGrammes2 pour avoir tous les ngrammes
1986 car celle-ci ne renvoie que des ngrammes disjoints 2 à 2
1987
1988 @param texte: le texte
1989 @param tailleNgram: de 1 à 3, le nombre de mots que l'on desire dans un bloc (1=> monogramme, 2=> bigramme etc...)
1990 @param lenT1: lmongueur du texte1
1991 """
1992 assert 1 <= tailleNgram <= 3
1993 if self.separatorSensivitive:
1994 sep=" "
1995 else: sep="."
1996
1997 if tailleNgram == 1:
1998 return self.__splitNGrammes2( texte, tailleNgram, lenT1)
1999 else:
2000
2001 r1 = self.__splitNGrammes2( texte, tailleNgram, lenT1)
2002 t = texte.split(sep) ; i = 0
2003 if len(t) > 0:
2004
2005 while len(t[0]) == 0:
2006 t = t[1:] ; i += 1
2007
2008 assert len(t[0]) > 0
2009 i += len(t[0]) ; t = t[1:]
2010
2011 while len(t) > 0 and len(t[0]) == 0:
2012 t = t[1:] ; i += 1
2013 else: i += 1
2014
2015 r2 = self.__splitNGrammes2( texte[i:], tailleNgram, lenT1+i)
2016 r1.extend(r2)
2017 if tailleNgram == 2:
2018 return r1
2019 else:
2020 if len(t) > 0:
2021
2022 assert len(t[0]) > 0
2023 i += len(t[0]) ; t = t[1:]
2024
2025 while len(t) > 0 and len(t[0]) == 0:
2026 t = t[1:] ; i += 1
2027 else: i += 1
2028
2029 r3 = self.__splitNGrammes2( texte[i:], tailleNgram, lenT1+i)
2030 r1.extend(r3)
2031 return r1
2032
2034 """Découpage effectif en Ngrammes
2035
2036 !! Attention, la fonction ne retourne que les Ngrammes disjoints 2à2
2037 Il faut l'appellet plusieurs fois en décalant le début pour avoir les autres ngrammes
2038
2039 @param texte: le texte
2040 @param tailleNgram: le nombre de mots que l'on desire dans un bloc (1=> monogramme, 2=> bigramme etc...)
2041 @param lenT1: l'offset pour la position (0 pour le texte1, len(texte1) pour le texte 2
2042 """
2043 if self.separatorSensivitive:
2044 sep=" "
2045 else: sep="."
2046 res = []
2047 i = 0
2048 prev = 0
2049 curNgram = 0
2050 accu = []
2051 while i < len(texte):
2052 car = texte[i]
2053 accu.append(car)
2054 if car == sep:
2055 curNgram += 1
2056
2057 if curNgram == tailleNgram:
2058 ngram = ''.join(accu)
2059 debut = prev +lenT1
2060 fin = debut + len(ngram)
2061
2062 assert ngram == texte[debut-lenT1:fin-lenT1], (ngram, texte[debut-lenT1:fin-lenT1])
2063 res.append((hash(ngram),[debut,fin]))
2064 accu = [] ; curNgram = 0 ; prev = i + 1
2065 i += 1
2066 return res
2067
2069 """Sort les blocs d'un texte en nGrammes
2070 @param texte: le texte
2071 @param tailleNgram: le nombre de mots que l'on desire dans un bloc (1=> monogramme, 2=> bigramme etc...)
2072 @param lenT1: l'offset pour la position (0 pour le texte1, len(texte1) pour le texte 2
2073 """
2074 if self.separatorSensivitive:
2075 sep=" "
2076 else: sep="."
2077 tSplit = texte.split(sep)
2078 assert len(texte) == sum([len(x) for x in tSplit]) + len(tSplit)-1
2079 print tSplit[:5]
2080
2081 res = [] ; posTexte = 0 ; pos = 0
2082 while pos < len(tSplit)-tailleNgram+1:
2083 mot = tSplit[pos]
2084 if len(mot) == 0:
2085 pos += 1 ; posTexte += 1
2086 else:
2087 valNgramme = mot
2088 tailleCurrentNgramme = 1 ; i = 1 ; nbSep = 0
2089 while tailleCurrentNgramme < tailleNgram:
2090 next = tSplit[pos+i]
2091 if len(next) == 0:
2092 nbSep += 1
2093 else:
2094 valNgramme += sep + nbSep * sep + next
2095 tailleCurrentNgramme += 1
2096 i += 1
2097 if tailleCurrentNgramme == tailleNgram:
2098 debut = posTexte + lenT1
2099 fin = posTexte + lenT1 + len(valNgramme) + nbSep
2100 print (valNgramme, texte[debut:fin])
2101 res.append((hash(valNgramme),[debut,fin]))
2102 assert valNgramme == texte[debut:fin], (valNgramme, texte[debut:fin])
2103 pos += 1 ; posTexte += len(mot)+1
2104 if __debug__:
2105 print pos, posTexte
2106 som = 0
2107 for x in tSplit[:pos]:
2108 if len(x) == 0: som += 1
2109 else: som += len(x)+1
2110 assert posTexte-1 == som, (posTexte-1,som)
2111
2113 """Sort les blocs d'un texte en nGrammes
2114 @param texte: le texte
2115 @param tailleNgram: le nombre de mots que l'on desire dans un bloc (1=> monogramme, 2=> bigramme etc...)
2116 @param lenT1: l'offset pour la position (0 pour le texte1, len(texte1) pour le texte 2
2117 """
2118 if self.separatorSensivitive:
2119 sep=" "
2120 else: sep="."
2121
2122 tSplit = texte.split(sep)
2123 assert len(texte) == sum([len(x) for x in tSplit]) + len(tSplit)-1
2124 posT = -1
2125 res = []
2126 for pos in xrange(len(tSplit)):
2127 mot = tSplit[pos]
2128 posT += 1
2129 if len(mot) != 0:
2130
2131 valNgramme = mot
2132 lenNgramme = len(mot)
2133
2134 tailleCurrentNgramme = 1
2135 nbEspace = 0
2136 while (tailleCurrentNgramme < tailleNgram and
2137 pos+tailleCurrentNgramme+nbEspace < len(tSplit)):
2138 if len(tSplit[pos+tailleCurrentNgramme+nbEspace]) == 0 :
2139 nbEspace += 1
2140 valNgramme += sep
2141 else:
2142 valNgramme += sep+tSplit[pos+tailleCurrentNgramme+nbEspace]
2143 tailleCurrentNgramme += 1
2144 if tailleCurrentNgramme == tailleNgram:
2145
2146 debut = posT + lenT1
2147
2148 fin = posT+lenT1 + len(valNgramme) + nbEspace+tailleNgram-1
2149 res.append((hash(valNgramme),[debut,fin]))
2150 assert valNgramme == texte[debut:fin], (valNgramme, texte[debut:fin])
2151 posT += len(mot)
2152 else:
2153
2154 posT += 1
2155 return res
2156
2158 """ Alignement A* entre les 2 séquences
2159 pre: isinstance(L1,list) and isinstance(L2,list) and isinstance(texte1,str) and isinstance(texte2,str)
2160 post: (len(__return__[0])==len(__return__[1])) or (len(__return__[0])==len(__return__[1])+1) or \
2161 (len(__return__[0])+1==len(__return__[1]))"""
2162
2163 logging.log(5,"debut _appelAstar")
2164 LC1 = [] ; LC2 = []
2165 Lcf1 = [] ; Lcf2 = []
2166 LH1 = [] ; LH2 = []
2167 for bloc in L1:
2168 cle = hash(texte1[bloc[0]:bloc[1]])
2169 LC1.append(cle)
2170 Lcf1.append((cle,bloc[1]-bloc[0]))
2171 LH1.append((cle, bloc[0], bloc[1]))
2172
2173 logging.log(5,'init L1 done')
2174 for bloc in L2:
2175 cle = hash(texte2[bloc[0]-lt1:bloc[1]-lt1])
2176 LC2.append(cle)
2177 Lcf2.append((cle,bloc[1]-bloc[0]))
2178 LH2.append((cle, bloc[0], bloc[1]))
2179 logging.log(5,'init L2 done')
2180 dicoPos = {}
2181 coutFixe = {}
2182 dicoPos[1], coutFixe[1] = self.calcPosCoutFixe(Lcf1)
2183 dicoPos[2], coutFixe[2] = self.calcPosCoutFixe(Lcf2)
2184 if __debug__:
2185 for cle in dicoPos[1]:
2186 assert cle in dicoPos[2], str(cle)+' / '+str([texte1[d:f] for d,f in L1])+' '+str([texte2[d-lt1:f-lt1] for d,f in L2])
2187 for cle in dicoPos[2]: assert cle in dicoPos[1]
2188 dic = {} ; tri = []
2189 for cle,deb,fin in LH1:
2190 try:
2191 val,lpos = dic[cle]
2192 lpos.append((deb,fin))
2193 dic[cle] = val+1, lpos
2194 except KeyError: dic[cle] = 1 , [(deb,fin)]
2195 for cle,(nb,lpos) in dic.iteritems():
2196 bisect.insort_right(tri,(nb,lpos))
2197 for nb,lpos in tri[-20:]:
2198 deb = lpos[0][0] ; fin =lpos[0][1]
2199
2200 dic = {} ; tri = []
2201 for cle,deb,fin in LH2:
2202 try:
2203 val,lpos = dic[cle]
2204 lpos.append((deb,fin))
2205 dic[cle] = val+1, lpos
2206 except KeyError: dic[cle] = 1 , [(deb,fin)]
2207 for cle,(nb,lpos) in dic.iteritems():
2208 bisect.insort_right(tri,(nb,lpos))
2209 for nb,lpos in tri[-20:]:
2210 deb = lpos[0][0] ; fin =lpos[0][1]
2211
2212
2213 tri = []
2214 for cle,liste in dicoPos[1].iteritems():
2215 bisect.insort_right(tri,len(liste))
2216
2217 logging.debug("longueur liste1 = %d, moyenne = %.2f, mediane = %d",len(tri),+(0.0+sum(tri))/len(tri),tri[len(tri)/2])
2218 tri = []
2219 for cle,liste in dicoPos[2].iteritems():
2220 bisect.insort_right(tri,len(liste))
2221
2222 logging.debug('longueur liste2 = %d, moyenne = %.2f, mediane = %d',len(tri),(0.0+sum(tri))/len(tri),tri[len(tri)/2])
2223 logging.log(5,"fin calcPosCoutFixe")
2224
2225 diffSym = self.preTraitDiffSym(LH1,LH2,dicoPos)
2226
2227
2228
2229
2230
2231 logging.log(5,"fin preTraitDiffSym")
2232
2233
2234
2235
2236
2237
2238
2239 best, closedSet = self.deplacements_pond_Astar(LC1,LC2,diffSym,dicoPos,coutFixe)
2240 logging.log(5,"fin deplacements_pond_Astar")
2241 dicoBest={1:{}, 2:{}}
2242 while best != (-1,-1):
2243 dicoBest[1][best[0]]=1
2244 dicoBest[2][best[1]]=1
2245 best=closedSet[best][1]
2246
2247 s1=self._makeLRes(dicoBest[1],L1)
2248 s2=self._makeLRes(dicoBest[2],L2)
2249 logging.log(5,'s1 = '+str(s1))
2250 logging.log(5,'s2 = '+str(s2))
2251 if __debug__:
2252 s11=[]
2253 s12=[]
2254 for i in xrange(len(s1)):
2255 s11.append(s1[i][0])
2256 s12.extend(s1[i][1])
2257 if s11[-1] is None: s11=s11[:-1]
2258 s21=[]
2259 s22=[]
2260 for i in xrange(len(s2)):
2261 s21.append(s2[i][0])
2262 s22.extend(s2[i][1])
2263 if s21[-1] is None: s21=s21[:-1]
2264 self.ass2__(s11,0,lt1,texte1)
2265 self.ass2__(s12,0,lt1,texte1)
2266 texte=texte1+texte2
2267 self.ass2__(s21,lt1,len(texte1)+len(texte2),texte)
2268 self.ass2__(s22,lt1,len(texte1)+len(texte2),texte)
2269 self.ass2__(s11+s21,0,len(texte1)+len(texte2),texte)
2270 self.ass2__(s12+s22,0,len(texte1)+len(texte2),texte)
2271 del dicoPos, coutFixe, dicoBest
2272 logging.log(5,"fin _appelAstar")
2273 return s1,s2
2274
2276 """ Donne une liste [(BC,[BDeps le précédant])]
2277 pre: isinstance(dicoBest,dict) and isinstance(L,list)
2278 #post: forall([ __return__[i-1][0][1] <= __return__[i][0][0] <= __return__[0][1][0][0] for i in range(1, len(__return__))]) """
2279 LRes = []
2280 accuDep = []
2281 for i in xrange(len(L)):
2282 if not dicoBest.has_key(i): accuDep.append(L[i])
2283 else:
2284 LRes.append((L[i],accuDep))
2285 accuDep = []
2286 if len(accuDep)>0: LRes.append((None,accuDep))
2287 return LRes
2288
2289 - def remUnique(self,dic,l_texte1, texte1 ,texte2):
2290 """#post: len(__return__[0])<=len(dic)
2291 # forall([x in dic.keys()],lambda x:x in __return__[0].keys() and self.repetition(__return__[0][x]))
2292 """
2293 notUnique={}
2294 unique={}
2295 for cle, listePos in dic.iteritems():
2296 if self.repetition(listePos, l_texte1):
2297 notUnique[cle] = listePos
2298 else: unique[cle] = listePos
2299 a="""logging.debug('len(unique)='+str(len(unique)))
2300 unique1 = {} ; unique2 = {}
2301 for (cle,longueur),listePos in unique:
2302 if listePos[0] < l_texte1:
2303 unique1[(cle,longueur)] = listePos
2304 else:
2305 unique2[(cle,longueur)] = listePos
2306 assert len(unique) == len(unique1)+len(unique2)
2307 sep = " " # espace + ALT+0160
2308 i = 0
2309 while i < len(unique1):
2310 (cle,longueur),listePos = unique1.popitem()
2311 pos = listePos[0]
2312 assert pos < l_texte1
2313 segment1 = texte1[pos:pos+longueur]
2314 segment1_ = segment1.strip(sep)
2315 stop = False
2316 for (cle2,longueur2),listePos2 in unique2.iteritems():
2317 assert listePos2[0] >= l_texte1
2318 pos2 = listePos2[0] - l_texte1
2319 segment2 = texte2[pos2:pos2+longueur2]
2320 segment2_ = segment2.strip(sep)
2321 if segment1_ == segment2_ and longueur == longueur2:
2322 stop = True
2323 if stop: break
2324 if stop:
2325 notUnique[(cle,longueur)] = listePos.extend(listePos2)
2326 unique2.pop((cle2,longueur2))
2327 i += 1
2328 """
2329
2330 return notUnique
2331
2332
2333
2334
2335
2336
2337
2338
2340 LC1 = [] ; LC2 = [] ; Lcf1 = [] ; Lcf2 = [] ; LH1 = [] ; LH2 = []
2341 for bloc in L1:
2342 cle = hash(texte1[bloc[0]:bloc[1]])
2343 LC1.append((cle,bloc[1]-bloc[0]))
2344 for bloc in L2:
2345 cle = hash(texte2[bloc[0]-lt1:bloc[1]-lt1])
2346 LC2.append((cle,bloc[1]-bloc[0]))
2347
2348 dom_matches = {}
2349 for i in xrange(len(L1)):
2350 x,lenght_x = LC1[i]
2351 for j in xrange(len(L2)):
2352 pass
2353
2355 """Multiset gérant le + petit invariant et le + petit move trouvé"""
2357 Mset.__init__(self, liste)
2358 self.inv_min = sys.maxint ; self.mov_min = sys.maxint
2359 self.inv_max = 0 ; self.mov_max = 0
2360
2362 """Différence entre l'objet courant et mset2, renvoie un nouveau mset2"""
2363 assert isinstance(mset2, Mset2)
2364 res = Mset2()
2365 for cle,(nb, longueur) in self.iteritems():
2366 if cle not in mset2:
2367 res[cle] = (nb, longueur)
2368 res.valeur += nb * longueur
2369 res.length += nb
2370 self.mov_min = min(longueur, self.mov_min)
2371 self.mov_max = max(longueur, self.mov_max)
2372 else:
2373 (nb2, longueur2) = mset2[cle]
2374 self.inv_min = min(longueur, self.inv_min)
2375 self.inv_max = max(longueur, self.inv_max)
2376 if nb > nb2:
2377 res[cle] = (nb-nb2, longueur)
2378 res.valeur += (nb-nb2) * longueur
2379 res.length += nb-nb2
2380 self.mov_min = min(longueur, self.mov_min)
2381 self.mov_max = max(longueur, self.mov_max)
2382 return res, self.inv_min, self.inv_max, self.mov_min, self.mov_max
2383
2384 - def union(self, mset2):
2385 """Union entre l'objet courant et mset2, ATTENTION modifie l'objet courant"""
2386 assert isinstance(mset2, Mset2)
2387 for cle,(nb, longueur) in mset2.iteritems():
2388 try:
2389 nb2, longueur2 = self[cle]
2390 self[cle] = nb+nb2, longueur
2391 except KeyError:
2392 self[cle] = nb, longueur
2393 self.valeur += nb * longueur
2394 self.length += nb
2395
2397 """ Recursive A* avec fonction d'évaluation des noeuds multi-critére """
2398
2399 - def __init__(self, l_texte1,algoAlign="",long_min_pivots=1, seuil_remplacement=0.5,
2400 coeff = None, sep=True):
2401
2402 AlignAstarRecur.__init__(self, l_texte1, long_min_pivots, sep=sep)
2403 self.seuil_remplacements = seuil_remplacement
2404 global algo
2405 algo=algoAlign
2406 if coeff == None: coeff = (0.34, 0.33, 0.33)
2407
2408 self.coeffX = coeff[0]
2409 self.coeffY = coeff[1]
2410 self.coeffZ = coeff[2]
2411 assert self.coeffX + self.coeffY + self.coeffZ == 1
2412
2413 self.cout = psyco.proxy(self.cout)
2414
2415
2417 """ Calcule des dictionnaires des positions et des couts fixes
2418
2419 pre: isinstance(L,list) and len(L)>0
2420 isinstance(longueur_texte,int)
2421 #post: forall([texte[x[0]:x[1]] in __return__[0].keys() for x in L])
2422 """
2423 assert isinstance(longueur_texte,int) and longueur_texte > 0
2424 dicoPos = {}
2425 coutFixeRepete = {-1 : 0}
2426 coutFixeNonRepete = {-1 : 0}
2427 cumulRepete = 0
2428 cumulNonRepete = 0
2429 nbRepete = 0
2430 nbNonRepete = 0
2431 pos_texte = 0
2432 lNonRepete = []
2433
2434 for i in xrange(len(L)):
2435 chaine, debut, fin = L[i]
2436 debut -= debut_texte
2437 fin -= debut_texte
2438 assert 0 <= debut < fin <= longueur_texte, str(debut) +'/'+ str(fin) + '/'+ str(longueur_texte)+'/'+str(L)
2439 longueur = fin - debut
2440 try:
2441 dicoPos[chaine].append(i)
2442 except KeyError:
2443 dicoPos[chaine] = [i]
2444
2445 cumulRepete += longueur
2446 coutFixeRepete[i] = cumulRepete
2447
2448 cumulNonRepete += debut - pos_texte
2449 coutFixeNonRepete[i] = cumulNonRepete
2450 if debut - pos_texte > 0:
2451
2452
2453 lNonRepete.append((pos_texte,debut))
2454 else: lNonRepete.append((-1,-1))
2455 pos_texte = fin
2456
2457 assert fin == cumulRepete + cumulNonRepete
2458
2459 if fin != longueur_texte:
2460 cumulNonRepete += longueur_texte - fin
2461 coutFixeNonRepete[i+1] = cumulNonRepete
2462 pos_texte = longueur_texte
2463 lNonRepete.append((fin,longueur_texte))
2464
2465 assert pos_texte == longueur_texte
2466
2467
2468 assert (0 == lNonRepete[0][0] < L[0][1]-debut_texte or 0 == L[0][1]-debut_texte < lNonRepete[0][0] or
2469 -1 == lNonRepete[0][0] and L[0][1]-debut_texte == 0) , str(lNonRepete[0][0]) + '/' +str( L[0][1]-debut_texte)
2470
2471 assert longueur_texte == lNonRepete[-1][1] > L[-1][2]-debut_texte or longueur_texte == L[-1][2]-debut_texte > lNonRepete[-1][1], str(lNonRepete[-1][1]) + '/' +str( L[-1][2]-debut_texte) + '/' +str(longueur_texte)
2472 assert abs(len(lNonRepete)-len(L)) <= 1, abs(len(lNonRepete)-len(L))
2473
2474
2475
2476 return dicoPos, (coutFixeRepete, coutFixeNonRepete), lNonRepete
2477
2479 """ Calcule des dictionnaires des positions et des couts fixes
2480
2481 pre: isinstance(L,list) and len(L)>0
2482 isinstance(longueur_texte,int)
2483 #post: forall([texte[x[0]:x[1]] in __return__[0].keys() for x in L])
2484 """
2485 assert isinstance(longueur_texte,int) and longueur_texte > 0
2486 dicoPos = {}
2487
2488
2489
2490
2491 coutFixe = numpy.zeros((2,len(L)+1), numpy.int_)
2492
2493 cumulRepete = 0
2494 cumulNonRepete = 0
2495 nbRepete = 0
2496 nbNonRepete = 0
2497 pos_texte = 0
2498 lNonRepete = []
2499
2500 for i in xrange(len(L)):
2501 chaine, debut, fin = L[i]
2502 debut -= debut_texte
2503 fin -= debut_texte
2504 assert 0 <= debut < fin <= longueur_texte, str(debut) +'/'+ str(fin) + '/'+ str(longueur_texte)+'/'+str(L)
2505 longueur = fin - debut
2506 try:
2507 dicoPos[chaine].append(i)
2508 except KeyError:
2509 dicoPos[chaine] = [i]
2510
2511 cumulRepete += longueur
2512
2513 coutFixe[0][i] = cumulRepete
2514
2515 cumulNonRepete += debut - pos_texte
2516
2517 coutFixe[1][i] = cumulNonRepete
2518 if debut - pos_texte > 0:
2519
2520
2521 lNonRepete.append((pos_texte,debut))
2522 else: lNonRepete.append((-1,-1))
2523 pos_texte = fin
2524
2525 assert fin == cumulRepete + cumulNonRepete
2526
2527 if fin != longueur_texte:
2528 cumulNonRepete += longueur_texte - fin
2529 coutFixeNonRepete[i+1] = cumulNonRepete
2530 pos_texte = longueur_texte
2531 lNonRepete.append((fin,longueur_texte))
2532
2533 assert pos_texte == longueur_texte
2534
2535
2536 assert (0 == lNonRepete[0][0] < L[0][1]-debut_texte or 0 == L[0][1]-debut_texte < lNonRepete[0][0] or
2537 -1 == lNonRepete[0][0] and L[0][1]-debut_texte == 0) , str(lNonRepete[0][0]) + '/' +str( L[0][1]-debut_texte)
2538
2539 assert longueur_texte == lNonRepete[-1][1] > L[-1][2]-debut_texte or longueur_texte == L[-1][2]-debut_texte > lNonRepete[-1][1], str(lNonRepete[-1][1]) + '/' +str( L[-1][2]-debut_texte) + '/' +str(longueur_texte)
2540 assert abs(len(lNonRepete)-len(L)) <= 1, abs(len(lNonRepete)-len(L))
2541
2542
2543
2544 return dicoPos, (coutFixeRepete, coutFixeNonRepete), lNonRepete
2545
2547 """Extrait d'une liste le + petit et le + grand bloc situé avant et
2548 après chaque position i de cette liste
2549 dicoMinMax[0][i] = bloc le + petit avant la bloc i
2550 dicoMinMax[1][i] = bloc le + grand avant la bloc i
2551 dicoMinMax[2][i] = bloc le + petit après la bloc i
2552 dicoMinMax[3][i] = bloc le + grand après la bloc i
2553 """
2554 dicoMinMax = {0 : {}, 1 : {}, 2 : {}, 3 : {}}
2555
2556
2557 min_before = min_after = int(sys.maxint/10)
2558 max_before = max_after = 0
2559 for i in xrange(len(liste)):
2560 debut,fin = liste[i]
2561 if fin - debut > 0:
2562 min_before = min(min_before, fin-debut)
2563 max_before = max(max_before, fin-debut)
2564 dicoMinMax[0][i] = min_before
2565 dicoMinMax[1][i] = max_before
2566
2567
2568 for i in xrange(len(liste)-1, -1, -1):
2569 debut,fin = liste[i]
2570 if fin - debut > 0:
2571 min_after = min(min_after, fin-debut)
2572 max_after = max(max_after, fin-debut)
2573 dicoMinMax[2][i] = min_after
2574 dicoMinMax[3][i] = max_after
2575
2576 if __debug__:
2577 i = 0
2578 while i < len(dicoMinMax[0])-2:
2579 assert dicoMinMax[0][i] >= dicoMinMax[0][i+1], dicoMinMax[0]
2580 i += 1
2581 i = 0
2582 while i < len(dicoMinMax[1])-2:
2583 assert dicoMinMax[1][i] <= dicoMinMax[1][i+1]
2584 i += 1
2585 i = 0
2586 while i < len(dicoMinMax[2])-2:
2587 assert dicoMinMax[2][i] <= dicoMinMax[2][i+1]
2588 i += 1
2589 i = 0
2590 while i < len(dicoMinMax[3])-2:
2591 assert dicoMinMax[3][i] >= dicoMinMax[3][i+1]
2592 i += 1
2593 return dicoMinMax
2594
2596
2597 diff_taille = len(L1) - len(L2)
2598 if diff_taille < 0:
2599
2600 L2 = L2[:len(L1)]
2601 elif diff_taille > 0:
2602
2603 L1 = L1[:len(L2)]
2604
2605 assert len(L1) == len(L2), str(len(L1))+'/'+str(len(L2))
2606
2607 seuil_remp = self.seuil_remplacements
2608 inverse_seuil_remp = 1.0 / seuil_remp
2609
2610
2611
2612 tab1 = numpy.array(L1, numpy.float_)
2613 tab2 = numpy.array(L2)
2614
2615
2616 tab1 = numpy.where((tab1 / tab2) > seuil_remp , tab1, 0)
2617 tab2 = numpy.where((tab1 / tab2) < inverse_seuil_remp , tab2, 0)
2618
2619
2620 masque_remp = numpy.logical_and(tab1, tab2)
2621
2622 tab1 = numpy.where(masque_remp, tab1, 0)
2623 tab2 = numpy.where(masque_remp, tab2, 0)
2624
2625 tab1 = numpy.where(tab1 >= 0, tab1, 0)
2626 tab2 = numpy.where(tab2 >= 0, tab2, 0)
2627
2628 return tab1, tab2
2629
2631
2632 diff_taille = len(L1) - len(L2)
2633 if diff_taille < 0:
2634 L1.extend([-1] * abs(diff_taille))
2635 elif diff_taille > 0:
2636 L2.extend([-1] * diff_taille)
2637
2638 assert len(L1) == len(L2), str(len(L1))+'/'+str(len(L2))
2639
2640 seuil_remp = self.seuil_remplacements
2641 inverse_seuil_remp = 1.0 / seuil_remp
2642
2643
2644 tab1 = numarray.array(L1, type = 'Float32')
2645 tab2 = numarray.array(L2)
2646
2647 numarray.where((tab1 / tab2) > seuil_remp , tab1, 0, tab1)
2648 numarray.where((tab1 / tab2) < inverse_seuil_remp , tab2, 0, tab2)
2649
2650 masque_remp = numarray.logical_and(tab1, tab2)
2651
2652 numarray.where(masque_remp, tab1, 0, tab1)
2653 numarray.where(masque_remp, tab2, 0, tab2)
2654 return tab1, tab2
2655
2657
2658 diff_taille = len(L1) - len(L2)
2659 if diff_taille < 0:
2660 L1.extend([-1] * abs(diff_taille))
2661 elif diff_taille > 0:
2662 L2.extend([-1] * diff_taille)
2663
2664 assert len(L1) == len(L2), str(len(L1))+'/'+str(len(L2))
2665
2666 seuil_remp = self.seuil_remplacements
2667 inverse_seuil_remp = 1.0 / seuil_remp
2668
2669
2670 tab1 = Numeric.array(L1, Numeric.Float)
2671 tab2 = Numeric.array(L2)
2672
2673 tab1 = Numeric.where((tab1 / tab2) > seuil_remp , tab1, 0)
2674 tab2 = Numeric.where((tab1 / tab2) < inverse_seuil_remp , tab2, 0)
2675
2676 masque_remp = Numeric.logical_and(tab1, tab2)
2677
2678 tab1 = Numeric.where(masque_remp, tab1, 0)
2679 tab2 = Numeric.where(masque_remp, tab2, 0)
2680
2681 return tab1, tab2
2682
2684 """ Alignement A* entre les 2 séquences
2685 pre: isinstance(L1,list) and isinstance(L2,list) and isinstance(texte1,str) and isinstance(texte2,str)
2686 post: (len(__return__[0])==len(__return__[1])) or (len(__return__[0])==len(__return__[1])+1) or \
2687 (len(__return__[0])+1==len(__return__[1]))"""
2688
2689 logging.log(5,"debut _appelAstar")
2690 bu_l_texte1 = self.l_texte1 ; bu_l_texte2 = self.l_texte2
2691 self.l_texte1 = lt1 ; self.l_texte2 = len(texte2)
2692
2693 LC1 = [] ; LC2 = [] ; LH1 = [] ; LH2 = [] ; lRepetT1 = [] ; lRepetT2 = []
2694 for bloc in L1:
2695 cle = hash(texte1[bloc[0]:bloc[1]])
2696 LC1.append(cle)
2697 LH1.append((cle, bloc[0], bloc[1]))
2698 lRepetT1.append(bloc[1]-bloc[0])
2699 logging.log(5,'init L1 done')
2700 for bloc in L2:
2701 cle = hash(texte2[bloc[0]-lt1:bloc[1]-lt1])
2702 LC2.append(cle)
2703 LH2.append((cle, bloc[0], bloc[1]))
2704 lRepetT2.append(bloc[1]-bloc[0])
2705 logging.log(5,'init L2 done')
2706
2707 arrayRepetT1 = numpy.array(lRepetT1)
2708 arrayRepetT2 = numpy.array(lRepetT2)
2709 dicoPos = {} ; coutFixe = {} ; dicoMinMax = {}
2710
2711 dicoPos[1], (coutFixeRepeteT1, coutFixeNonRepeteT1), lSup = self.calcPosCoutFixe(LH1, self.l_texte1)
2712 dicoPos[2], (coutFixeRepeteT2, coutFixeNonRepeteT2), lIns = self.calcPosCoutFixe(LH2, self.l_texte2, debut_texte=self.l_texte1)
2713 coutFixe = ((coutFixeRepeteT1, coutFixeNonRepeteT1), (coutFixeRepeteT2, coutFixeNonRepeteT2))
2714 logging.log(5,"fin calcPosCoutFixe")
2715
2716 dicoMinMax1 = self.extractMinMax(lSup)
2717 dicoMinMax2 = self.extractMinMax(lIns)
2718
2719
2720 diffSym = self.preTraitDiffSym(LH1,LH2,dicoPos, lRepetT1, lRepetT2)
2721 logging.log(5,"fin preTraitDiffSym")
2722
2723 lSuppression = [] ; lInsertion = []
2724 for deb,fin in lSup:
2725 if fin-deb == 0: lSuppression.append(-1)
2726 else: lSuppression.append(fin-deb)
2727 for deb,fin in lIns:
2728 if fin-deb == 0: lInsertion.append(-1)
2729 else: lInsertion.append(fin-deb)
2730
2731
2732
2733 best, closedSet = self.deplacements_pond_Astar(LC1,LC2,diffSym,dicoPos,coutFixe, dicoMinMax1,dicoMinMax2, lSuppression, lInsertion, arrayRepetT1, arrayRepetT2, MO=True)
2734
2735 logging.log(5,"fin deplacements_pond_Astar")
2736 dicoBest={1:{}, 2:{}}
2737 while best != (-1,-1):
2738 dicoBest[1][best[0]]=1
2739 dicoBest[2][best[1]]=1
2740 best=closedSet[best][1]
2741
2742 s1=self._makeLRes(dicoBest[1],L1)
2743 s2=self._makeLRes(dicoBest[2],L2)
2744 logging.log(5,'s1 = '+str(s1))
2745 logging.log(5,'s2 = '+str(s2))
2746
2747 del dicoPos, coutFixe, dicoBest
2748 self.l_texte1 = bu_l_texte1 ; self.l_texte2 = bu_l_texte2
2749 logging.log(5,"fin _appelAstar")
2750 return s1,s2
2751
2752
2753 - def cout__(self,(k,l),(i,j),diffSym,((coutFixeRepeteT1, coutFixeNonRepeteT1),
2754 (coutFixeRepeteT2, coutFixeNonRepeteT2)),
2755 bestValue, dicoMinMax1, dicoMinMax2, lSuppression, lInsertion,
2756 arrayRepetT1, arrayRepetT2):
2757 return cost.fct_cout(k,l,i,j,diffSym,coutFixeRepeteT1, coutFixeNonRepeteT1,
2758 coutFixeRepeteT2, coutFixeNonRepeteT2,
2759 bestValue, dicoMinMax1, dicoMinMax2, lSuppression, lInsertion,
2760 arrayRepetT1, arrayRepetT2, self.coeffX, self.coeffY, self.coeffZ,
2761 self.l_texte1, self.l_texte2)
2762
2763
2764 - def cout(self,(k,l),(i,j),diffSym,((coutFixeRepeteT1, coutFixeNonRepeteT1),
2765 (coutFixeRepeteT2, coutFixeNonRepeteT2)),
2766 bestValue, dicoMinMax1, dicoMinMax2, lSuppression, lInsertion,
2767 arrayRepetT1, arrayRepetT2):
2768 """ calcul du cout d'un noeud """
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793 tot_remp1 = tot_remp2 = nb_remp1 = nb_remp2 = 0
2794 assert nb_remp1 == nb_remp2
2795
2796 (best_cumulINV, best_cumulMOV, best_cumulINS, best_cumulSUP,
2797 best_cumulREP, best_nbINV, best_nbMOV, best_nbINS, best_nbSUP,
2798 best_nbREP, best_inv_max, best_mov_max) = bestValue[2]
2799
2800 assert arrayRepetT1[i] == coutFixeRepeteT1[i] - coutFixeRepeteT1[i-1]
2801 assert arrayRepetT2[j] == coutFixeRepeteT2[j] - coutFixeRepeteT2[j-1]
2802 cumulINV = best_cumulINV + ( coutFixeRepeteT1[i] - coutFixeRepeteT1[i-1] +
2803 coutFixeRepeteT2[j] - coutFixeRepeteT2[j-1] )
2804
2805
2806 assert sum(arrayRepetT1[k+1:i]) == coutFixeRepeteT1[i-1] - coutFixeRepeteT1[k]
2807 assert sum(arrayRepetT2[l+1:j]) == coutFixeRepeteT2[j-1] - coutFixeRepeteT2[l]
2808 cumulMOV = best_cumulMOV + ( coutFixeRepeteT1[i-1] - coutFixeRepeteT1[k] +
2809 coutFixeRepeteT2[j-1] - coutFixeRepeteT2[l] )
2810
2811
2812 assert coutFixeNonRepeteT2[j] - coutFixeNonRepeteT2[l] - tot_remp2 >= 0,(coutFixeNonRepeteT2[j],coutFixeNonRepeteT2[l],tot_remp2)
2813 cumulINS = best_cumulINS + ( coutFixeNonRepeteT2[j] -
2814 coutFixeNonRepeteT2[l] )
2815
2816
2817 assert coutFixeNonRepeteT1[i] - coutFixeNonRepeteT1[k] - tot_remp1 >= 0, (coutFixeNonRepeteT1[i], coutFixeNonRepeteT1[k],tot_remp1)
2818 cumulSUP = best_cumulSUP + ( coutFixeNonRepeteT1[i] -
2819 coutFixeNonRepeteT1[k])
2820
2821
2822
2823 assert tot_remp1 + tot_remp2 >= 0, (tot_remp1, tot_remp2)
2824 cumulREP = best_cumulREP + ( tot_remp1 + tot_remp2 )
2825
2826 assert best_cumulINV < cumulINV <= self.l_texte1 + self.l_texte2 - cumulMOV, (best_cumulINV, cumulINV, self.l_texte1, self.l_texte2, cumulMOV)
2827 assert best_cumulMOV <= cumulMOV <= self.l_texte1 + self.l_texte2 - cumulINV
2828 assert best_cumulINS <= cumulINS <= self.l_texte2, (best_cumulINS, cumulINS,coutFixeNonRepeteT2[j], coutFixeNonRepeteT2[l],tot_remp2, self.l_texte2)
2829 assert best_cumulSUP <= cumulSUP <= self.l_texte1
2830 a="""assert best_cumulREP <= cumulREP <= self.l_texte1 + self.l_texte2, (best_cumulREP, cumulREP)
2831 """
2832
2833
2834 (difference_sym, nb_blocs_diff_sym, inv_min_after, inv_max_after,
2835 mov_min_after, mov_max_after) = diffSym[(i,j)]
2836
2837
2838
2839 g_x = 0.0 - (cumulMOV+cumulINV)
2840
2841 h_x = + difference_sym
2842
2843 f_x = (1 + (g_x - h_x) / (self.l_texte1 + self.l_texte2)) / 2
2844
2845 assert 0 <= f_x <= 1, f_x
2846
2847
2848
2849 current_inv_max = best_inv_max
2850 if current_inv_max < arrayRepetT1[i]: current_inv_max = arrayRepetT1[i]
2851
2852 nb_mov_added1 = nb_mov_added2 = max_mov_added1 = max_mov_added2 = 0
2853 if i - (k+1) > 0:
2854
2855
2856 max_mov_added1 = numpy.max(arrayRepetT1[k+1:i])
2857
2858
2859
2860 nb_mov_added1 = i - (k + 1)
2861
2862 if j - (l+1) > 0:
2863
2864
2865 max_mov_added2 = numpy.max(arrayRepetT2[l+1:j])
2866
2867
2868
2869 nb_mov_added2 = j - (l + 1)
2870
2871
2872 current_mov_max = best_mov_max
2873 if current_mov_max < max_mov_added1: current_mov_max = max_mov_added1
2874 if current_mov_max < max_mov_added2: current_mov_max = max_mov_added2
2875
2876
2877
2878 nbINV = best_nbINV + 2
2879
2880 nbMOV = best_nbMOV + nb_mov_added1 + nb_mov_added2
2881 nbINS = best_nbINS + (j - l) - nb_remp2
2882 nbSUP = best_nbSUP + (i - k) - nb_remp1
2883 nbREP = best_nbREP + nb_remp1 + nb_remp2
2884
2885
2886
2887 deno = current_inv_max
2888 if deno < inv_min_after: deno = inv_min_after
2889 f_yINV = ((0.0 + cumulINV + inv_min_after) / (nbINV +2)) / deno
2890 assert 0 < f_yINV <= 1, ((k,l),(i,j),f_yINV, cumulINV, inv_min_after, nbINV,current_inv_max,arrayRepetT1[:i+1],arrayRepetT2[:j+1])
2891
2892
2893 if current_mov_max == 0: f_yMOV = 0
2894 else: f_yMOV = ((cumulMOV + mov_min_after) / (nbMOV + 2)) / (current_mov_max)
2895
2896
2897
2898 deno2 = dicoMinMax2[1][j]
2899 if deno2 < dicoMinMax2[2][j]: deno2 = dicoMinMax2[2][j]
2900 if deno2 == 0: f_yINS = 0
2901 else: f_yINS = ((cumulINS + dicoMinMax2[2][j]) / (nbINS + 1)) / (deno2)
2902
2903
2904
2905 deno1 = dicoMinMax1[1][i]
2906 if deno1 < dicoMinMax1[2][i]: deno1 = dicoMinMax1[2][i]
2907 if deno1 == 0: f_ySUP = 0
2908 else: f_ySUP = ((cumulSUP + dicoMinMax1[2][i]) / (nbSUP + 1)) / (deno1)
2909
2910
2911 if deno1+deno2 == 0: f_yREP = 0
2912 else: f_yREP = (((cumulREP + dicoMinMax2[2][j] + dicoMinMax1[2][i]) / (nbREP + 2)) /
2913 (deno1+deno2))
2914
2915 f_y = (f_yINV + f_yMOV + f_yINS + f_ySUP + f_yREP) / 5
2916
2917 assert 0 <= f_y <= 1, (f_y, (f_yINV, f_yMOV, f_yINS, f_ySUP, f_yREP))
2918
2919
2920 denominateur_z1 = ((cumulINS + dicoMinMax2[2][j]) +
2921 (cumulSUP + dicoMinMax1[2][i]) + (cumulREP + dicoMinMax2[2][j] + dicoMinMax1[2][i]) +
2922 (cumulMOV + mov_min_after))
2923 if denominateur_z1 == 0: f_z1 = 0
2924 else: f_z1 = (1.0 + cumulMOV + mov_min_after) / denominateur_z1
2925
2926 denominateur_z2 = ((cumulINS + dicoMinMax2[2][j]) +
2927 (cumulSUP + dicoMinMax1[2][i]) + (cumulREP + dicoMinMax2[2][j] + dicoMinMax1[2][i]))
2928 if denominateur_z2 == 0: f_z2 = 0
2929 else: f_z2 = (1.0 + cumulREP + dicoMinMax2[2][j] + dicoMinMax1[2][i]) / (denominateur_z2)
2930
2931 f_z0 = (0.0 + cumulINV+ inv_min_after)/(cumulINV+ inv_min_after+cumulMOV+ mov_min_after)
2932
2933
2934 f_z = (f_z0 + f_z2) / 2
2935
2936 f_temp = (1.0/3 * f_x + 1.0/3 * f_y + 1.0/3 * f_z)
2937 assert 0 <= f_temp <= 1
2938 if f_temp == 0: f_temp = 1.0 / sys.maxint
2939 f = 1.0 / f_temp
2940
2941 return (f, (k,l), (cumulINV, cumulMOV, cumulINS, cumulSUP, cumulREP,
2942 nbINV, nbMOV, nbINS, nbSUP, nbREP, current_inv_max, current_mov_max))
2943
2945 diffSym={}
2946 LH1=L1 ; LH2=L2
2947 logging.log(5,'len(LH2)= '+str(len(LH2))+' / len(LH1)= '+str(len(LH1)))
2948 len_L1 = len(LH1) ; len_L2 = len(LH2)
2949
2950 i=0
2951 for posB1 in xrange(len_L1-1,-1,-1):
2952 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,'itération LH1: '+str(i))
2953 LL1 = Mset2(LH1[posB1+1:])
2954 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done MSet1')
2955 liste_pos2 = dicoPos[2][LH1[posB1][0]]
2956 j = len(liste_pos2)-1
2957 while j >= 0:
2958 posB2 = liste_pos2[j]
2959 if j == len(liste_pos2)-1:
2960 LL2 = Mset2(LH2[posB2+1:])
2961 LL1res, inv_min1, inv_max1, mov_min1, mov_max1 = LL1.difference(LL2)
2962 LL2res, inv_min2, inv_max2, mov_min2, mov_max2 = LL2.difference(LL1)
2963 assert inv_min1 == inv_min2
2964 assert inv_max1 == inv_max2
2965
2966
2967 else:
2968 next_posB2 = liste_pos2[j+1]
2969
2970
2971
2972
2973
2974 new_LL2 = Mset2(LH2[posB2+1:next_posB2])
2975
2976 t2, inv_min_t2, inv_max_t2, mov_min_t2, mov_max_t2 = new_LL2.difference(LL1res)
2977 LL2res.union(t2)
2978 inv_min2 = min(inv_min2, inv_min_t2)
2979 inv_max2 = max(inv_max2, inv_max_t2)
2980 mov_min2 = min(mov_min2, mov_min_t2)
2981 mov_max2 = max(mov_max2, mov_max_t2)
2982
2983 LL1res, inv_min_t1, inv_max_t1, mov_min_t1, mov_max_t1 = LL1res.difference(new_LL2)
2984 inv_min1 = min(inv_min1, inv_min_t1)
2985 inv_max1 = max(inv_max1, inv_max_t1)
2986 mov_min1 = min(mov_min1, mov_min_t1)
2987 mov_max1 = max(mov_max1, mov_max_t1)
2988
2989 assert inv_min1 == inv_min2, str(inv_min1) +'/'+ str(inv_min2)
2990 assert inv_max1 == inv_max2
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009 ds2 = LL1res.valeur + LL2res.valeur
3010 nb2 = LL1res.length + LL2.length
3011 mov_min = min(mov_min1, mov_min2)
3012 mov_max = max(mov_max1, mov_max2)
3013 assert inv_min1 == inv_min2
3014 assert inv_max1 == inv_max2
3015 inv_min = inv_min1
3016 if inv_min == sys.maxint: inv_min = 0
3017
3018
3019 if mov_min == sys.maxint: mov_min = 0
3020 diffSym[(posB1,posB2)] = (ds2, nb2, inv_min, inv_max1, mov_min, mov_max)
3021 j -= 1
3022 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done itération interne')
3023 i+=1
3024
3025 return diffSym
3026
3028 """ algo GREEDY de Sahpira et Storer 2002"""
3030 """ pre: isinstance(t1,str) and isinstance(t2,str)
3031 """
3032
3033 if len(t1)==0 or len(t2)==0: return [],[],[],[]
3034 logging.log(5,"debut dep_pond niveau "+str(niveau))
3035 LResT1,LResT2 = self.compute_alignement(t1,t2)
3036 if len(LResT1)==0 or len(LResT2)==0: return [],[],[],[]
3037 texte=t1+t2
3038 debutT1=i=0 ; debutT2=len(t1)
3039 lResBC1=[] ; lResBC2=[] ; lResDEP1=[] ; lResDEP2=[]
3040
3041 while (i<min(len(LResT1),len(LResT2))):
3042 BC1,lDep1=LResT1[i]
3043 BC2,lDep2=LResT2[i]
3044
3045 if BC1 is not None: finT1=BC1[0]
3046 else: finT1=len(t1)
3047 if BC2 is not None: finT2=BC2[0]
3048 else: finT2=len(t2)
3049
3050
3051 for x in lDep1: bisect.insort_right(lResDEP1, x)
3052 for x in lDep2: bisect.insort_right(lResDEP2, x)
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084 if BC1 is not None: lResBC1.append(BC1)
3085 if BC2 is not None: lResBC2.append(BC2)
3086 i+=1
3087 if BC1 is not None: debutT1 = BC1[1]
3088 if BC2 is not None: debutT2 = BC2[1]
3089
3090 if len(LResT1)>len(LResT2):
3091 assert(len(LResT1)==len(LResT2)+1 and LResT1[-1][0] is None)
3092 lResDEP1.extend(LResT1[-1][1])
3093 elif len(LResT2)>len(LResT1):
3094 assert(len(LResT2)==len(LResT1)+1 and LResT2[-1][0] is None)
3095 lResDEP2.extend(LResT2[-1][1])
3096 else: assert(len(LResT1)==len(LResT2))
3097 logging.log(5,"fin dep_pond niveau "+str(niveau))
3098 return lResDEP1,lResDEP2,lResBC1,lResBC2
3099
3101 """ prends les 2 textes en entrée et renvoie 2 listes au format
3102 [(BC,[BDeps le précédant])] """
3103 lLCS = self._compute_lLCS(t1,t2)
3104 if len(lLCS) == 0: return [],[]
3105 lTexte = list(t1) ; lTexte.extend(list(t2))
3106 len_t1 = len(t1)
3107 while len(lLCS) > 1:
3108 (longueur,inv_len_lOcc,cle,lOcc) = lLCS.pop(0)
3109
3110 lOcc_t1 = [] ; lOcc_t2 = []
3111 for occ in lOcc:
3112 if occ >= len_t1: lOcc_t1.append(occ)
3113 else: lOcc_t2.append(occ)
3114 nb_occ_replace = min(len(lOcc_t1), len(lOcc_t2))
3115
3116 for i in xrange(nb_occ_replace):
3117 occ1 = lOcc_t1[i] ; occ2 = lOcc_t2[i]
3118 for j in xrange(occ1,occ1+longueur):
3119 lTexte[j] = (longueur,inv_len_lOcc,cle,occ1)
3120 for j in xrange(occ2,occ2+longueur):
3121 lTexte[j] = (longueur,inv_len_lOcc,cle,occ2)
3122
3123 logging.debug('transformation en une liste avec les blocs répétéss')
3124 lTexte2 = [] ; i = 0 ; new_len_t1 = 0
3125 while i < len(lTexte):
3126 item = lTexte[i]
3127 if isinstance(item, tuple):
3128 item2 = item ; i += item[0]
3129 else:
3130 item2 = (1,1,item,i)
3131 i += 1
3132 if item2[3] < len_t1: new_len_t1 += 1
3133 lTexte2.append(item2)
3134 if __debug__:
3135 longueur = 0
3136 for car in lTexte2:
3137 if isinstance(car, tuple): longueur += car[0]
3138 else: longueur += 1
3139 assert longueur == len(lTexte) == len(t1) + len(t2)
3140
3141 LResT1,LResT2 = self.compute_ed_array(lTexte2, new_len_t1)
3142
3143 return LResT1,LResT2
3144
3146 logging.debug('compute_ed')
3147
3148 t1 = texte[:len_t1] ; t2 = texte[len_t1:] ; len_t2 = len(t2)
3149 logging.debug('nb ligne = %d, nb_col = %d',len_t1,len_t2)
3150
3151 table = {(-1,-1) : (0,None)}
3152 for i in xrange(len_t1): table[(i,-1)] = i,(i-1,-1),'D'
3153 for j in xrange(len_t2): table[(-1,j)] = j,(-1,j-1),'I'
3154 for i in xrange(len_t1):
3155
3156 for j in xrange(len_t2):
3157 deletion = table[(i-1,j)]
3158 insertion = table[(i,j-1)]
3159 diagonal = table[(i-1,j-1)]
3160 if (t1[i][2] == t2[j][2]):
3161 table[(i,j)] = min((diagonal[0],(i-1,j-1),'BC'), (deletion[0]+1,(i-1,j),'D'), (insertion[0]+1,(i,j-1),'I'))
3162 else:
3163 table[(i,j)] = min((deletion[0]+1,(i-1,j),'D'), (insertion[0]+1,(i,j-1),'I'))
3164 logging.debug("ed = "+str(table[i,j]))
3165
3166
3167
3168 i = len_t1-1 ; j = len_t2-1 ; path = []
3169 while i != -1 and j != -1:
3170 cell = table[i,j]
3171 path.append(((i,j),cell[0],cell[1],cell[2]))
3172 i = cell[1][0] ; j = cell[1][1]
3173 path.reverse()
3174
3175
3176 dico_pos_t1 = {}; dico_pos_t2 = {}
3177 for (i,j),ed,pere,typ in path:
3178 if typ == 'D':
3179 item = t1[i] ; cle = item[2]
3180 try: dico_pos_t1[cle].append((i,j))
3181 except KeyError: dico_pos_t1[cle] = [(i,j)]
3182 elif typ == 'I':
3183 item = t2[i] ; cle = item[2]
3184 try: dico_pos_t2[cle].append((i,j))
3185 except KeyError: dico_pos_t2[cle] = [(i,j)]
3186
3187
3188 i = len_t1-1 ; j = len_t2-1 ; path = []
3189 while i != -1 or j != -1:
3190 cell = table[i,j] ; typ = cell[2]
3191
3192 if typ == 'D':
3193 item = t1[i] ; cle = item[2]
3194 if dico_pos_t2.has_key(cle):
3195 pos_t2 = dico_pos_t2[cle].pop()
3196 if len(dico_pos_t2[cle]) == 0: del dico_pos_t2[cle]
3197 cell_t2 = table[pos_t2]
3198 table[pos_t2] = cell_t2[0],cell_t2[1],'M'
3199 table[i,j] = cell[0],cell[1],'M'
3200 elif typ == 'I':
3201 item = t2[j] ; cle = item[2]
3202 if dico_pos_t1.has_key(cle):
3203 pos_t1 = dico_pos_t1[cle].pop()
3204 if len(dico_pos_t1[cle]) == 0: del dico_pos_t1[cle]
3205 cell_t1 = table[pos_t1]
3206 table[pos_t1] = cell_t1[0],cell_t1[1],'M'
3207 table[i,j] = cell[0],cell[1],'M'
3208 cell = table[i,j]
3209 path.append(((i,j),cell[0],cell[1],cell[2]))
3210 i = cell[1][0] ; j = cell[1][1]
3211 path.reverse()
3212
3213
3214 LRes1 = [] ; LRes2 = [] ; accuDep1 = [] ; accuDep2 = []
3215 for (i,j),ed,(i_pere,j_pere),typ in path:
3216 item1 = t1[i] ; item2 = t2[j]
3217
3218 if typ == 'BC':
3219 item1 = t1[i]
3220 LRes1.append(([item1[3],item1[3]+item1[0]],accuDep1))
3221 item2 = t2[j]
3222 LRes2.append(([item2[3],item2[3]+item2[0]],accuDep2))
3223 accuDep1 = [] ; accuDep2 = []
3224 elif typ == 'M':
3225 if i > i_pere:
3226 item1 = t1[i]
3227 accuDep1.append([item1[3],item1[3]+item1[0]])
3228 elif j > j_pere:
3229 item2 = t2[j]
3230 accuDep2.append([item2[3],item2[3]+item2[0]])
3231
3232
3233 if len(accuDep1)>0: LRes1.append((None,accuDep1))
3234 if len(accuDep2)>0: LRes2.append((None,accuDep2))
3235
3236 return LRes1, LRes2
3237
3239 logging.debug('compute_ed_array')
3240
3241 t1 = texte[:len_t1] ; t2 = texte[len_t1:] ; len_t2 = len(t2)
3242 logging.debug('nb ligne = %d, nb_col = %d',len_t1,len_t2)
3243
3244 scoreTable = Numeric.zeros((len_t1+1,len_t2+1),Numeric.Int16)
3245 pathTable = Numeric.zeros((len_t1+1,len_t2+1),Numeric.Int8)
3246
3247
3248
3249
3250
3251 for i in xrange(len_t1+1):
3252 scoreTable[i,0] = i
3253 pathTable[i,0] = 1
3254 for j in xrange(len_t2+1):
3255 scoreTable[0,j] = j ; pathTable[0,j] = 2
3256 scoreTable[0,0] = 0 ; pathTable[0,0] = 0
3257
3258 for i in xrange(1,len_t1+1):
3259 if i%100==0: logging.debug('debut ligne %d',i)
3260 for j in xrange(1,len_t2+1):
3261 deletion = scoreTable[i-1,j]
3262 insertion = scoreTable[i,j-1]
3263 diagonal = scoreTable[i-1,j-1]
3264 if (t1[i-1][2] == t2[j-1][2]):
3265 scoreTable[i,j], pathTable[i,j] = min((diagonal,0),(deletion+1,1),(insertion+1,2))
3266 else:
3267 scoreTable[i,j], pathTable[i,j] = min((deletion+1,1),(insertion+1,2))
3268 logging.debug("ed = "+str(scoreTable[i,j]))
3269
3270
3271
3272 i = len_t1 ; j = len_t2 ; path = []
3273 while i != 0 and j != 0:
3274 score = scoreTable[i,j] ; direction = pathTable[i,j]
3275 if direction == 0: prev_i = i-1 ; prev_j = j-1 ; typeDir = 'BC'
3276 elif direction == 1: prev_i = i-1 ; prev_j = j ; typeDir = 'D'
3277 else: prev_i = i ; prev_j = j-1 ; typeDir = 'I'
3278 path.append(((i,j), score, (prev_i,prev_j), typeDir))
3279 i = prev_i ; j = prev_j
3280 path.reverse()
3281
3282
3283 dico_pos_t1 = {}; dico_pos_t2 = {}
3284 for (i,j),ed,pere,typ in path:
3285 if typ == 'D':
3286 if i == 0: continue
3287 assert 1 <= i <= len(t1), str(i)+'/'+str(len(t1))
3288 item = t1[i-1] ; cle = item[2]
3289 try: dico_pos_t1[cle].append((i,j))
3290 except KeyError: dico_pos_t1[cle] = [(i,j)]
3291 elif typ == 'I':
3292 if j == 0: continue
3293 assert 1 <= j <= len(t2), str(j)+'/'+str(len(t2))
3294 item = t2[j-1] ; cle = item[2]
3295 try: dico_pos_t2[cle].append((i,j))
3296 except KeyError: dico_pos_t2[cle] = [(i,j)]
3297
3298
3299 i = len_t1 ; j = len_t2 ; path = []
3300 while i != 0 or j != 0:
3301
3302 score = scoreTable[i,j]
3303 direction = pathTable[i,j]
3304
3305 if direction == 1:
3306 item = t1[i-1] ; cle = item[2]
3307 if dico_pos_t2.has_key(cle):
3308 pos_t2 = dico_pos_t2[cle].pop()
3309 if len(dico_pos_t2[cle]) == 0: del dico_pos_t2[cle]
3310
3311
3312 pathTable[pos_t2] = 4
3313
3314 pathTable[i,j] = 3
3315 elif direction == 2:
3316 item = t2[j-1] ; cle = item[2]
3317 if dico_pos_t1.has_key(cle):
3318 pos_t1 = dico_pos_t1[cle].pop()
3319 if len(dico_pos_t1[cle]) == 0: del dico_pos_t1[cle]
3320
3321
3322 pathTable[pos_t1] = 3
3323
3324 pathTable[i,j] = 4
3325
3326
3327
3328 direction = pathTable[i,j]
3329 if direction == 0: pere = (i-1,j-1); typ='BC'
3330 elif direction == 1: pere = (i-1,j);typ='D'
3331 elif direction == 3: pere = (i-1,j);typ='M'
3332 elif direction == 2: pere = (i,j-1); typ='I'
3333 elif direction == 4: pere = (i,j-1); typ='M'
3334 path.append(((i,j),score,pere,typ))
3335 i = pere[0] ; j = pere[1]
3336 path.reverse()
3337
3338
3339 LRes1 = [] ; LRes2 = [] ; accuDep1 = [] ; accuDep2 = []
3340 for (i,j),ed,(i_pere,j_pere),typ in path:
3341 item1 = t1[i-1] ; item2 = t2[j-1]
3342
3343 if typ == 'BC':
3344 item1 = t1[i-1]
3345 LRes1.append(([item1[3],item1[3]+item1[0]],accuDep1))
3346 item2 = t2[j-1]
3347 LRes2.append(([item2[3],item2[3]+item2[0]],accuDep2))
3348 accuDep1 = [] ; accuDep2 = []
3349 elif typ == 'M':
3350 if i > i_pere:
3351 item1 = t1[i-1]
3352 accuDep1.append([item1[3],item1[3]+item1[0]])
3353 elif j > j_pere:
3354 item2 = t2[j-1]
3355 accuDep2.append([item2[3],item2[3]+item2[0]])
3356
3357
3358 if len(accuDep1)>0: LRes1.append((None,accuDep1))
3359 if len(accuDep2)>0: LRes2.append((None,accuDep2))
3360
3361 return LRes1, LRes2
3362
3364 """ Extrait des 2 textes, les 2 séquences de blocs répétés """
3365 st = suffix_tree.GeneralisedSuffixTree([t1,t2])
3366 blocs_texte = st.get_MEM_index_chaine3(self.long_min_pivots)
3367 del st
3368 blocs_texte = self.remUnique(blocs_texte,len(t1))
3369 lLCS = []
3370 for (cle,longueur),lOcc in blocs_texte.iteritems():
3371 bisect.insort_right(lLCS, (longueur,1.0/len(lOcc),cle,lOcc))
3372 return lLCS
3373
3374
3375
3376
3378
3379
3380
3382 """recherche le point précédent maximum et sa position et le plus proche du point"""
3383 pos = 0
3384 maximum = -1
3385 posMax = -1
3386 if len(arrayItem)<=0 : return -1,-1
3387 for elem in arrayItem:
3388 if elem >= maximum :
3389
3390 posMax = pos
3391 maximum = elem
3392 pos+=1
3393
3394 return self.wOuverture+maximum*self.wExtension, len(arrayItem)-posMax
3395
3396
3398
3399 self.wOuverture = 5
3400 self.wExtension = 2
3401 logging.debug('compute_ed_array')
3402 t1 = texte[:len_t1] ; t2 = texte[len_t1:] ; len_t2 = len(t2)
3403
3404 logging.debug('nb ligne = %d, nb_col = %d',len_t1,len_t2)
3405
3406 scoreTable = Numeric.zeros((len_t1+1,len_t2+1))
3407 pathTable = Numeric.zeros((len_t1+1,len_t2+1))
3408 pathTableVal = Numeric.zeros((len_t1+1,len_t2+1))
3409
3410 for i in xrange(len_t1+1):
3411 scoreTable[i,0] = i
3412 pathTable[i,0] = 1
3413 pathTableVal[i,0] = i
3414 for j in xrange(len_t2+1):
3415 scoreTable[0,j] = j ; pathTable[0,j] = 2; pathTableVal[0,j] = j
3416 scoreTable[0,0] = 0 ; pathTable[0,0] = 0 ; pathTableVal[0,0] = 0
3417
3418 for i in xrange(1,len_t1+1):
3419 logging.debug('ligne %d',i)
3420 for j in xrange(1,len_t2+1):
3421 (deletion,posDeletion) = self.__getGapMaxEtPos(scoreTable[i-1,:j])
3422 (insertion,posInsertion) = self.__getGapMaxEtPos(scoreTable[:i,j-1])
3423 diagonal = scoreTable[i-1,j-1]
3424 scoreTable[i,j],pathTable[i,j],pathTableVal[i,j] = max((deletion,1,posDeletion),(insertion,2,posInsertion),(diagonal,0,0))
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439 logging.debug("ed = "+str(scoreTable[i,j]))
3440
3441
3442
3443 i = len_t1 ; j = len_t2 ; path = []
3444 while i != 0 and j != 0:
3445
3446 score = scoreTable[i,j] ; direction = pathTable[i,j]
3447
3448 if direction == 0: prev_i = i-1 ; prev_j = j-1 ; typeDir = 'BC'
3449 elif direction == 1: prev_i = i-pathTableVal[i,j] ; prev_j = j ; typeDir = 'D'
3450 else: prev_i = i ; prev_j = j-pathTableVal[i,j] ; typeDir = 'I'
3451 path.append(((i,j), score, (prev_i,prev_j), typeDir))
3452 i = prev_i ; j = prev_j
3453 path.reverse()
3454
3455 dico_pos_t1 = {}; dico_pos_t2 = {}
3456 for (i,j),ed,pere,typ in path:
3457 if typ == 'D':
3458 item = t1[i-1] ; cle = item[2]
3459 try: dico_pos_t1[cle].append((i,j))
3460 except KeyError: dico_pos_t1[cle] = [(i,j)]
3461 elif typ == 'I':
3462 assert 0 <=i <= len(t2), str(i)+'/'+str(len(t2))
3463 item = t2[i-1] ; cle = item[2]
3464 try: dico_pos_t2[cle].append((i,j))
3465 except KeyError: dico_pos_t2[cle] = [(i,j)]
3466
3467
3468 i = len_t1 ; j = len_t2 ; path = []
3469
3470 while i != 0 or j != 0:
3471
3472 score = scoreTable[i,j]
3473 direction = pathTable[i,j]
3474
3475 if direction == 1:
3476 item = t1[i-1] ; cle = item[2]
3477 if dico_pos_t2.has_key(cle):
3478 pos_t2 = dico_pos_t2[cle].pop()
3479 if len(dico_pos_t2[cle]) == 0: del dico_pos_t2[cle]
3480
3481
3482 pathTable[pos_t2] = [4,pathTableVal[pos_t2]]
3483
3484 pathTable[i,j] = [3,pathTableVal[i,j]]
3485 elif direction == 2:
3486 item = t2[j-1] ; cle = item[2]
3487 if dico_pos_t1.has_key(cle):
3488 pos_t1 = dico_pos_t1[cle].pop()
3489 if len(dico_pos_t1[cle]) == 0: del dico_pos_t1[cle]
3490
3491
3492 pathTable[pos_t1] = [3,abs(pathTableVal[pos_t1])]
3493
3494 pathTable[i,j] = [4,abs(pathTableVal[i,j])]
3495
3496
3497
3498 direction = pathTable[i,j]
3499 if direction == 0: pere = (i-1,j-1); typ='BC'
3500 elif direction == 1: pere = (i-pathTableVal[i,j],j);typ='D'
3501 elif direction == 3: pere = (i-pathTableVal[i,j],j);typ='M'
3502 elif direction == 2: pere = (i,j-pathTableVal[i,j]); typ='I'
3503 elif direction == 4: pere = (i,j-pathTableVal[i,j]); typ='M'
3504 path.append(((i,j),score,pere,typ))
3505 i = pere[0] ; j = pere[1]
3506
3507 path.reverse()
3508
3509
3510 LRes1 = [] ; LRes2 = [] ; accuDep1 = [] ; accuDep2 = []
3511 for (i,j),ed,(i_pere,j_pere),typ in path:
3512 item1 = t1[i-1] ; item2 = t2[j-1]
3513
3514 if typ == 'BC':
3515 item1 = t1[i-1]
3516 LRes1.append(([item1[3],item1[3]+item1[0]],accuDep1))
3517 item2 = t2[j-1]
3518 LRes2.append(([item2[3],item2[3]+item2[0]],accuDep2))
3519 accuDep1 = [] ; accuDep2 = []
3520 elif typ == 'M':
3521 if i > i_pere:
3522 item1 = t1[i-1]
3523 accuDep1.append([item1[3],item1[3]+item1[0]])
3524 elif j > j_pere:
3525 item2 = t2[j-1]
3526 accuDep2.append([item2[3],item2[3]+item2[0]])
3527
3528
3529 if len(accuDep1)>0: LRes1.append((None,accuDep1))
3530 if len(accuDep2)>0: LRes2.append((None,accuDep2))
3531
3532 return LRes1, LRes2
3533
3534
3536 - def run(self, t1, t2):
3537 """ pre: isinstance(t1,str) and isinstance(t2,str)
3538 """
3539
3540 self.preTraitDiffSym = psyco.proxy(self.preTraitDiffSym)
3541
3542
3543 self.addSubDep = False
3544
3545 sepTable = string.maketrans("$",".")
3546 if '$' in t1: t1 = t1.translate(sepTable)
3547 if '$' in t2: t2 = t2.translate(sepTable)
3548 bbl = self.deplacements_pond2(t1,t2)
3549 return bbl
3550
3552 """ pre: isinstance(t1,str) and isinstance(t2,str)
3553 """
3554
3555 if len(t1)==0 or len(t2)==0: return []
3556
3557
3558
3559 logging.log(5,"debut dep_pond")
3560
3561
3562 s1,s2 = self._texteToSeqHomo(t1,t2)
3563 if __debug__:
3564 self.ass2__(s1,0,len(t1),t1)
3565 self.ass2__(s2,len(t1),len(t1)+len(t2),t1+t2)
3566
3567 if len(s1)==0 or len(s2)==0: return []
3568
3569
3570
3571
3572 LResT1,LResT2 = self._appelAstar(s1,s2,t1,t2,len(t1))
3573 assert -1 <= len(LResT1) - len(LResT2) <= 1
3574 debutT1 = posT1 = 0 ; debutT2 = posT2 = len(t1) ; bbl = []
3575
3576 for (BC1,lDep1),(BC2,lDep2) in zip(LResT1,LResT2):
3577
3578 if BC1 is not None: finT1 = BC1[0]
3579 else: finT1 = len(t1)
3580 if BC2 is not None: finT2 = BC2[0]
3581 else: finT2 = len(t1)+len(t2)
3582
3583
3584
3585 texteRec1 = t1[debutT1:finT1] ; texteRec2 = t2[debutT2-len(t1):finT2-len(t1)]
3586 bblRec = self.deplacements_pond2(texteRec1,texteRec2)
3587
3588 if __debug__:
3589 for B1Rec,B2Rec in bblRec:
3590 assert B1Rec is None or 0 <= B1Rec[1] < B1Rec[2] <= len(texteRec1), B1Rec
3591 assert B2Rec is None or len(texteRec1) <= B2Rec[1] < B2Rec[2] <= len(texteRec1)+len(texteRec2)
3592 assert posT1 == debutT1 and posT2 == debutT2
3593 for i in xrange(len(bblRec)-1,-1,-1):
3594
3595 B1,B2 = bblRec[i]
3596 if B1 is None: NB1 = None
3597 else: NB1 = (B1[0], B1[1] + posT1, B1[2] + posT1,
3598 [(d1+posT1,d2+posT1) for (d1,d2) in B1[3]])
3599 if B2 is None: NB2 = None
3600 else: NB2 = (B2[0], B2[1] + posT2-len(texteRec1), B2[2] + posT2-len(texteRec1),
3601 [(d1+posT2-len(texteRec1),d2+posT2-len(texteRec1)) for (d1,d2) in B2[3]])
3602 if __debug__:
3603 if NB1 is not None:
3604 assert posT1 <= NB1[1] < NB1[2] <= finT1
3605 for d,f in NB1[3]: assert NB1[1] <= d < f <= NB1[2] , NB1
3606 if NB2 is not None:
3607 assert posT2 <= NB2[1] < NB2[2] <= finT2 , str(posT2)+' <= '+str(NB2)+' <= '+str(finT2)
3608 for d,f in NB2[3]: NB2[1] <= d < f <= NB2[2]
3609 bblRec[i] = (NB1,NB2)
3610
3611 if __debug__:
3612 assert posT1 == debutT1 and posT2 == debutT2
3613 for B1,B2 in bblRec:
3614 assert B1 is None or posT1 <= B1[1] < B1[2] <= finT1
3615 assert B2 is None or posT2 <= B2[1] < B2[2] <= finT2, str(posT2)+' <= '+str(B2)+' <= '+str(finT2)
3616
3617 nbbl = []
3618 for B1,B2 in bblRec:
3619 if B1 is not None and (B1[0] == 'S'):
3620 assert posT1 == B1[1] and B1[1] < B1[2]
3621 nbbl.append((B1,B2))
3622 posT1 = B1[2]
3623 elif B2 is not None and (B2[0] == 'I'):
3624 assert posT2 == B2[1] and B2[1] < B2[2]
3625 nbbl.append((B1,B2))
3626 posT2 = B2[2]
3627 else:
3628 assert B1 is not None and B1[0] == 'BC' and B2 is not None and B2[0] == 'BC'
3629 assert B1[1] < B1[2] and B2[1] < B2[2]
3630 nbbl.append((B1,B2))
3631 posT1 = B1[2] ; posT2 = B2[2]
3632 else:
3633 if posT1 < finT1:
3634 nbbl.append((('S',posT1,finT1,[]),None))
3635 posT1 = finT1
3636 if posT2 < finT2:
3637 nbbl.append((None,('I',posT2,finT2,[])))
3638 posT2 = finT2
3639
3640 assert nbbl > 0
3641 if __debug__ and len(nbbl) > 1:
3642
3643 lastBB1,lastBB2 = nbbl[-1] ; lastLastBB1,lastLastBB2 = nbbl[-2]
3644 if lastBB1 is not None and lastLastBB1 is not None:
3645 assert debutT1 <= lastLastBB1[2] <= lastBB1[1] <= finT1 , lastLastBB1 + lastBB1
3646 if lastBB2 is not None and lastLastBB2 is not None:
3647 assert debutT2 <= lastLastBB2[2] <= lastBB2[1] <= finT2
3648 test1 = test2 = True
3649 for B1,B2 in nbbl:
3650 if test1 and B1 is not None: assert B1[1] == debutT1 ; test1 = False
3651 if test2 and B2 is not None: assert B2[1] == debutT2 ; test2 = False
3652 if not test1 and not test2: break
3653 test1 = test2 = True
3654 nbbl2 = nbbl[:] ; nbbl2.reverse()
3655 for B1,B2 in nbbl2:
3656 if test1 and B1 is not None: assert B1[2] == finT1 ; test1 = False
3657 if test2 and B2 is not None: assert B2[2] == finT2 ; test2 = False
3658 if not test1 and not test2: break
3659
3660 i = j = k = 0
3661 while (k < len(nbbl)):
3662 B1,B2 = nbbl[k]
3663 if B1 is not None:
3664 while (i < len(lDep1)):
3665 D1 = lDep1[i]
3666 if B1[1] <= D1[0] < D1[1] <= B1[2]:
3667
3668 bisect.insort_right(B1[3], (D1[0],D1[1]))
3669 i += 1
3670 else:
3671 assert B1[2] < D1[1]
3672 i += 1
3673 break
3674 if B2 is not None:
3675 while (j < len(lDep2)):
3676 D2 = lDep2[j]
3677 if B2[1] <= D2[0] < D2[1] <= B2[2]:
3678
3679 bisect.insort_right(B2[3], (D2[0],D2[1]))
3680 j += 1
3681 else:
3682 assert B2[2] < D2[1]
3683 j += 1
3684 break
3685 k += 1
3686 if __debug__:
3687 for B1,B2 in nbbl:
3688 if B1 is not None:
3689 assert debutT1 <= B1[1] < B1[2] <= finT1 , str(debutT1)+' <= '+str(B1)+' <= '+str(finT1)
3690 for d,f in B1[3]: assert B1[1] <= d < f <= B1[2] , B1
3691 if B2 is not None:
3692 assert debutT2 <= B2[1] < B2[2] <= finT2 , str(debutT2)+' <= '+str(B2)+' <= '+str(finT2)
3693 for d,f in B2[3]: B2[1] <= d < f <= B2[2]
3694 bbl.extend(nbbl)
3695
3696 if BC1 is not None:
3697 assert BC2 is not None
3698 bbl.append((('BC',BC1[0],BC1[1],[]),('BC',BC2[0],BC2[1],[])))
3699 posT1 = BC1[1] ; posT2 = BC2[1]
3700 debutT1 = posT1 ; debutT2 = posT2
3701
3702 assert posT1 == debutT1 and posT2 == debutT2
3703 if len(LResT1) > len(LResT2):
3704 assert(len(LResT1)==len(LResT2)+1 and LResT1[-1][0] is None)
3705 if posT1 < len(t1): bbl.append((('S',posT1,len(t1),[(d,f) for d,f in LResT1[-1][1]]),None))
3706 if posT2 < len(t1)+len(t2): bbl.append((None,('I',posT2,len(t1)+len(t2),[])))
3707 elif len(LResT2) > len(LResT1):
3708 assert(len(LResT2)==len(LResT1)+1 and LResT2[-1][0] is None)
3709 if posT1 < len(t1): bbl.append((('S',posT1,len(t1),[]),None))
3710 if posT2 < len(t1)+len(t2): bbl.append((None,('I',posT2,len(t1)+len(t2),[(d,f) for d,f in LResT2[-1][1]])))
3711 else:
3712 assert len(LResT1) == len(LResT2)
3713 if posT1 < len(t1): bbl.append((('S',posT1,len(t1),[]),None))
3714 if posT2 < len(t1)+len(t2): bbl.append((None,('I',posT2,len(t1)+len(t2),[])))
3715
3716 if __debug__:
3717
3718 self._testOrdre(bbl,len(t1))
3719 test1 = test2 = True
3720 for B1,B2 in bbl:
3721 if test1 and B1 is not None: assert B1[1] == 0 ; test1 = False
3722 if test2 and B2 is not None: assert B2[1] == len(t1) ; test2 = False
3723 if not test1 and not test2: break
3724 test1 = test2 = True
3725 bbl2 = bbl[:] ; bbl2.reverse()
3726 for B1,B2 in bbl2:
3727 if test1 and B1 is not None: assert B1[2] == len(t1),str(B1)+'/'+str(len(t1)) ; test1 = False
3728 if test2 and B2 is not None: assert B2[2] == len(t1)+len(t2) ; test2 = False
3729 if not test1 and not test2: break
3730 logging.log(5,"fin dep_pond")
3731 return bbl
3732
3734 """ Teste le bon ordre des blocs dans la liste """
3735 posT1 = 0
3736 posT2 = len_t1
3737 prevB1 = ('',0,0,[]) ; prevB2 = ('',posT2,posT2,[])
3738 for (B1,B2) in liste:
3739 if B1 is not None:
3740 assert B1[0] == 'BC' or B1[0] == 'D' or B1[0] == 'S'
3741
3742 assert posT1 == B1[1],str(posT1)+' '+str(prevB1)+' '+str(B1)
3743 prevDep = posT1
3744 for dep in B1[3]:
3745 assert B1[1] <= dep[0] <= dep[1] <= B1[2]
3746 assert prevDep <= dep[0], B1[3]
3747 prevDep = dep[0]
3748 posT1 = B1[2]
3749 prevB1 = B1
3750 if B2 is not None:
3751 assert B2[0] == 'BC' or B2[0] == 'D' or B2[0] == 'I'
3752 assert posT2 == B2[1], str(posT2)+' '+str(prevB2)+' '+str(B2)
3753 prevDep = posT2
3754 for dep in B2[3]:
3755 assert B2[1] <= dep[0] <= dep[1] <= B2[2]
3756 assert prevDep <= dep[0]
3757 prevDep = dep[0]
3758 posT2 = B2[2]
3759 prevB2 = B2
3760
3761 import contract
3762 contract.checkmod(__name__)
3763
3764 -def trace1(commande,dic_locals):
3765 import profile,pstats
3766 profile.runctx(commande,globals(), dic_locals,'c:\workspace\medite\statFile')
3767 s = pstats.Stats('c:\workspace\medite\statFile')
3768 s.sort_stats('time')
3769 s.print_stats()
3770 sys.stderr.flush()
3771 sys.stdout.flush()
3772
3773 -def trace2(commande,dic_locals):
3774 import hotshot,hotshot.stats
3775 prof = hotshot.Profile("c:\workspace\medite\statFile")
3776 benchtime = prof.runctx(commande,globals(), dic_locals)
3777 prof.close()
3778 stats = hotshot.stats.load("c:\workspace\medite\statFile")
3779 stats.strip_dirs()
3780 stats.sort_stats('cumulative', 'time', 'calls')
3781 stats.print_stats()
3782
3786
3787 if __name__ == '__main__':
3788 _test()
3789