Package medite :: Package MediteAppli :: Module temp2
[hide private]
[frames] | no frames]

Source Code for Module medite.MediteAppli.temp2

  1  # -*- coding: iso-8859-1 -*- 
  2  # Copyright 20003 - 2008: Julien Bourdaillet (julien.bourdaillet@lip6.fr), Jean-Gabriel Ganascia (jean-gabriel.ganascia@lip6.fr) 
  3  # This file is part of MEDITE. 
  4  # 
  5  #    MEDITE is free software; you can redistribute it and/or modify 
  6  #    it under the terms of the GNU General Public License as published by 
  7  #    the Free Software Foundation; either version 2 of the License, or 
  8  #    (at your option) any later version. 
  9  # 
 10  #    MEDITE is distributed in the hope that it will be useful, 
 11  #    but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  #    GNU General Public License for more details. 
 14  # 
 15  #    You should have received a copy of the GNU General Public License 
 16  #    along with Foobar; if not, write to the Free Software 
 17  #    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
 18   
19 -class AlignAstar2(Align):
20 """ texte passé en paramètre des fonctions """
21 - def calcPosCoutFixe(self, L): #, texte):
22 """ Calcule des dictionnaires des positions et des couts fixes 23 24 pre: isinstance(L,list) 25 len(L)>0 26 #post: forall([texte[x[0]:x[1]] in __return__[0].keys() for x in L]) 27 """ 28 dicoPos = {} 29 coutFixe = {} 30 cumul=0 31 for i in xrange(len(L)): 32 #chaine = texte[L[i][0]:L[i][1]] 33 #longueur = L[i][1]-L[i][0] 34 chaine, longueur = L[i] 35 try: 36 dicoPos[chaine].append(i) # ajout de sa position 37 except KeyError: 38 dicoPos[chaine] = [i] 39 coutFixe[i]=cumul #cumul des couts (leur longueur) des blocs précédents 40 cumul+=longueur #L[i][1]-L[i][0] 41 return dicoPos,coutFixe
42
43 - def deplacements_pond(self, Lv1, Lv2):
44 L1 = [] ; L2 = [] ; Lcf1 = [] ; Lcf2 = [] ; LH1 = [] ; LH2 = [] 45 for bloc in Lv1: 46 cle = hash(self.texte[bloc[0]:bloc[1]]) 47 L1.append(cle) 48 Lcf1.append((cle,bloc[1]-bloc[0])) 49 LH1.append((cle, bloc[0], bloc[1])) 50 print LH1 51 logging.debug('init L1 done') 52 for bloc in Lv2: 53 cle = hash(self.texte[bloc[0]:bloc[1]]) 54 L2.append(cle) 55 Lcf2.append((cle,bloc[1]-bloc[0])) 56 LH2.append((cle, bloc[0], bloc[1])) 57 logging.debug('init L2 done') 58 # dicoPos1 stocke pour chaque bloc de L1 ses positions dans cette liste 59 dicoPos = {} 60 coutFixe = {} 61 dicoPos[1], coutFixe[1] = self.calcPosCoutFixe(Lcf1,self.texte) 62 dicoPos[2], coutFixe[2] = self.calcPosCoutFixe(Lcf2,self.texte) 63 for cle in dicoPos[1]: assert cle in dicoPos[2] 64 for cle in dicoPos[2]: assert cle in dicoPos[1] 65 66 diffSym = self.preTraitDiffSym(LH1,LH2,self.texte,dicoPos) # pré-calcul des différences symétriques 67 best, closedSet = self.deplacements_pond_Astar(L1,L2,self.texte,diffSym,dicoPos,coutFixe) # A* 68 dicoBest1={} # dico des positions dans L1 du meilleur parcours 69 dicoBest2={} # dico des positions dans L2 du meilleur parcours 70 while best != (-1,-1): # on remonte le meilleur parcours trouvé 71 dicoBest1[best[0]]=1 72 dicoBest2[best[1]]=1 73 best=closedSet[best][1] # noeud père 74 75 LResDep=[] 76 LResBC=[] 77 for i in xrange(len(L1)): 78 if not dicoBest1.has_key(i):LResDep.append(L1[i]) 79 else:LResBC.append(L1[i]) 80 for i in xrange(len(L2)): 81 if not dicoBest2.has_key(i):LResDep.append(L2[i]) 82 else:LResBC.append(L2[i]) 83 84 del dicoPos, coutFixe, dicoBest1, dicoBest2 85 86 return LResDep,LResBC
87 88
89 - def deplacements_pond_Astar(self, L1Static, L2Static, diffSym, dicoPos, coutFixe):
90 """ implémentation de A* pour rechercher le meilleur chemin dans l'arbre des appariement entre blocs 91 Renvoie le noeud du dernier appariement """ 92 openSet={} # ensemble des sommets ouverts pour A* 93 closedSet={} # ensemble des sommets fermés pour A* 94 L1=L1Static # liste L1 courante 95 L2=L2Static 96 best=(-1,-1) 97 while True: 98 for posB1 in xrange(len(L1)): 99 #for posB2 in dicoPos[2][texte[L1[posB1][0]:L1[posB1][1]]]: 100 for posB2 in dicoPos[2][L1[posB1]]: 101 if posB2<best[1]+1: continue # cette position est hors de la liste courante 102 node=(posB1+best[0]+1,posB2) # position absolue par rapport à self.L1 et self.L2 103 cout=self.cout(best[0]+1,best[1]+1,posB1+best[0]+1,posB2,diffSym,coutFixe) 104 if closedSet.has_key(node): 105 if closedSet[node][0]>cout: 106 openSet[node]=(cout,best) 107 del closedSet[node] 108 elif openSet.has_key(node): 109 if openSet[node][0]>cout: 110 openSet[node]=(cout,best) 111 else: openSet[node]=(cout,best) 112 best=None # noeud (posL1, posL2) 113 bestValue=None # valeur (cout,pere) 114 for node in openSet: # recherche du noeud de moindre cout 115 if (best==None or openSet[node][0]<bestValue[0]): 116 best=node 117 bestValue=openSet[node] 118 L1=L1Static[best[0]+1:] # listes correspondant à ce noeud 119 L2=L2Static[best[1]+1:] 120 del openSet[best] 121 closedSet[best]=bestValue 122 if (L1==[] or L2==[] or len(L1)+len(L2)==diffSym[(best[0],best[1])][1]): # plus d'appariement possible, sortie 123 return best,closedSet
124
125 - def cout(self,d1,d2,f1,f2,diffSym,coutFixe):
126 """ calcul du cout d'un noeud """ 127 cF1=coutFixe[1][f1]-coutFixe[1][d1] 128 cF2=coutFixe[2][f2]-coutFixe[2][d2] 129 coutHeuristique=diffSym[(f1,f2)][0] 130 #coutHeuristique=getDiffSym(f1,f2,diffSym)[0] 131 return cF1+cF2+coutHeuristique # cout de l'appariement
132
133 - def getDiffSym(self,f1,f2,diffSym):
134 try: 135 return diffSym[(f1,f2)] 136 except KeyError: 137 pass
138
139 - def preTraitDiffSym(self,L1,L2,dicoPos,niveau=0):
140 diffSym={} # dico[(i,j)] stocke le cout heuristique et la longueur de la liste résultant de la différence symétrique entre L1[i+1:] et L2[j+1:] 141 LH1=L1 ; LH2=L2 142 a='''LH1 = [] ; LH2 = [] 143 for bloc in L1: 144 LH1.append((bloc[0], bloc[1]-bloc[1])) 145 logging.debug('init LH1 done') 146 for bloc in L2: 147 LH2.append((bloc[0], bloc[1]-bloc[1])) 148 logging.debug('init LH2 done')''' 149 logging.log(5,'len(LH2)= '+str(len(LH2))+' / len(LH1)= '+str(len(LH1))) 150 151 len_L1 = len(LH1) ; len_L2 = len(LH2) 152 #for posB2 in xrange(len_L2): 153 # diffSym[(len_L1,posB2)] = (0,0,[],[]) 154 # maxDiffSymL1[posB2] = len_L1 155 class Mset_(set): 156 def evaluate(self): 157 res = 0 158 for cle,val in self: 159 res += val 160 return res
161 class Mset(dict): 162 def __init__(self, liste=[]): 163 assert isinstance(liste, list) 164 self.valeur = 0 165 self.length = 0 166 for (val,deb,fin) in liste: 167 try: 168 nb,longueur = self[val] 169 self[val] = nb+1,longueur 170 except KeyError: 171 self[val] = (1, fin-deb) 172 self.valeur += fin - deb 173 self.length += 1 174 def difference(self, mset2, renvoieReste=False): 175 assert isinstance(mset2, Mset) 176 res = Mset() #; reste = MSet() 177 for cle,(nb, longueur) in self.iteritems(): 178 if cle not in mset2: 179 res[cle] = (nb, longueur) 180 res.valeur += nb * longueur 181 res.length += nb 182 else: 183 (nb2, longueur2) = mset2[cle] 184 if nb > nb2: 185 res[cle] = (nb-nb2, longueur) 186 res.valeur += (nb-nb2) * longueur 187 res.length += nb-nb2 188 return res 189 def difference_liste(self, mset2, renvoieReste=False): 190 assert isinstance(mset2, list) 191 res = Mset() #; reste = MSet() 192 for cle,(nb, longueur) in self.iteritems(): 193 if cle not in mset2: 194 res[cle] = (nb, longueur) 195 res.valeur += nb * longueur 196 res.length += nb 197 else: 198 (nb2, longueur2) = mset2[cle] 199 if nb > nb2: 200 res[cle] = (nb-nb2, longueur) 201 res.valeur += (nb-nb2) * longueur 202 res.length += nb-nb2 203 return res 204 def intersection(self, mset2): 205 assert isinstance(mset2, Mset) 206 res = Mset() 207 if self.length <= mset2.length: grand = mset2 ; petit = self 208 else: grand = self ; petit = mset2 209 for cle,(nb, longueur) in petit.iteritems(): 210 try: 211 (nb2, longueur2) = grand[cle] 212 if nb <= nb2: nbToAdd = nb 213 else: nbToAdd = nb2 214 res[cle] = (nbToAdd, longueur) 215 res.valeur += nbToAdd * longueur 216 res.length += nbToAdd 217 except KeyError: 218 pass 219 return res 220 i=0 221 for posB1 in xrange(len_L1-1,-1,-1): 222 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,'itération LH1: '+str(i)) 223 LL1 = Mset(LH1[posB1+1:]) 224 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done MSet1') 225 liste_pos2 = dicoPos[2][LH1[posB1][0]]#texte[L1[posB1][0]:L1[posB1][1]]] 226 j = len(liste_pos2)-1 227 while j >= 0: 228 if j == len(liste_pos2)-1: 229 LL2 = Mset(LH2[liste_pos2[j]+1:]) ; ds = nb = 0 230 LL1res = LL1.difference(LL2) 231 LL2res = LL2.difference(LL1) 232 else: 233 posB2 = liste_pos2[j] 234 next_posB2 = liste_pos2[j+1] 235 #(ds,nb,LL1,LL2) = diffSym[(posB1,next_posB2)] 236 #diffSym[(posB1,next_posB2)] = (ds,nb) 237 new_LL1 = Mset(LH1[posB2+1:next_posB2+1]) 238 LL1res = LL1res.difference(LL2res.intersection(new_LL1)) 239 LL2res = LL2res.difference(new_LL1) 240 241 ds2 = LL1res.valeur + LL2res.valeur 242 nb2 = LL1res.length + LL2.length 243 #ds2 = LL1res.evaluate() + LL2res.evaluate() 244 #nb2 = len(LL1res) + len(LL2res) 245 diffSym[(posB1,liste_pos2[j])] = (ds2,nb2)#,LL1res, LL2res) 246 j -= 1 247 diffSym[(posB1,liste_pos2[0])] = (ds2,nb2) 248 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done itération interne') 249 i+=1 250 return diffSym 251
252 - def preTraitDiffSymVect(self,L1,L2,dicoPos,niveau=0):
253 logging.log(5,'begin preTraitDiffSymVect') 254 diffSym={} # dico[(i,j)] stocke le cout heuristique et la longueur de la liste résultant de la différence symétrique entre L1[i+1:] et L2[j+1:] 255 LH1=L1 ; LH2=L2 256 logging.log(5,'len(LH2)= '+str(len(LH2))+' / len(LH1)= '+str(len(LH1))) 257 dic_alphabet = {} 258 for cle,deb,fin in L1: 259 dic_alphabet[cle] = fin-deb 260 for cle,deb,fin in L2: 261 dic_alphabet[cle] = fin-deb 262 taille_alphabet = len(dic_alphabet) 263 temp = [] 264 for cle in dic_alphabet: 265 bisect.insort_right(temp, cle) 266 array_pos_alphabet = numarray.array(temp) # ordre des lettres de l'alphabet 267 assert len(array_pos_alphabet) == taille_alphabet 268 array_valeur_alphabet = numarray.zeros(taille_alphabet) # poids de chaque lettre 269 for pos in xrange(taille_alphabet):# MAJ dico avec position 270 cle = array_pos_alphabet[pos] 271 val = dic_alphabet[cle] 272 #dic_alphabet[cle] = val,pos 273 dic_alphabet[cle] = pos 274 array_valeur_alphabet[pos] = val 275 276 LH1 = [cle for cle,deb,fin in LH1] 277 LHH1 = [dic_alphabet[cle] for cle in LH1] 278 #logging.debug('len(LHH1)=%d, taille_alphabet=%d,len(LHH1)*taille_alphabet=%d',len(LHH1), taille_alphabet,len(LHH1)*taille_alphabet) 279 #array_LH1 = numarray.zeros((len(LHH1),taille_alphabet),numarray.Int8) 280 #current = numarray.zeros(taille_alphabet) 281 #accu = numarray.zeros(taille_alphabet) 282 #logging.debug('i=%d',len(LHH1)) 283 #for i in xrange(len(LHH1)-1,-1,-1): 284 # if i%1000 ==0: logging.debug('i=%d',i) 285 # pos = LHH1[i] 286 # numarray.where(numarray.arange(taille_alphabet)==pos,1,0,out=current) 287 # numarray.add(accu,current,accu) 288 # array_LH1[i] = accu 289 #logging.debug('array_LH1 done') 290 291 LH2 = [cle for cle,deb,fin in LH2] 292 LHH2 = [dic_alphabet[cle] for cle in LH2] 293 #array_LH2 = numarray.zeros((len(LHH2),taille_alphabet),numarray.Int8) 294 #current = numarray.zeros(taille_alphabet) 295 #accu = numarray.zeros(taille_alphabet) 296 #for i in xrange(len(LHH2)-1,-1,-1): 297 # pos = LHH2[i] 298 # numarray.where(numarray.arange(taille_alphabet)==pos,1,0,out=current) 299 # numarray.add(accu,current,accu) 300 # array_LH2[i] = accu 301 #logging.debug('array_LH2 done') 302 logging.log(4,'LH1 = '+str(LH1)) 303 logging.log(4,'LH2 = '+str(LH2)) 304 logging.log(4,'array_pos_alphabet = '+str(array_pos_alphabet)) 305 logging.log(4,'array_valeur_alphabet = '+str(array_valeur_alphabet)) 306 logging.log(4,'dic_alphabet = '+str(dic_alphabet)) 307 #for pos in xrange(taille_alphabet): 308 # cle = array_pos_alphabet[pos] 309 # val = dic_alphabet[cle] 310 # array_valeur_alphabet[pos] = val 311 logging.log(5,'creation alphabet done') 312 313 current = numarray.zeros(taille_alphabet) 314 accu = numarray.zeros(taille_alphabet) 315 def liste_to_alphab_array(liste,array): 316 #c=0 317 #for cle,deb,fin in liste: 318 #for cle in liste: 319 #for pos in liste: 320 #val,pos = dic_alphabet[cle] 321 #array[pos] += 1 322 #array[dic_alphabet[cle]] += 1 323 # array[pos] += 1 324 #c +=1 325 numarray.logical_and(current,array_zero, current) 326 numarray.logical_and(accu,array_zero, accu) 327 for i in xrange(len(liste)-1,-1,-1): 328 pos = LHH2[i] 329 numarray.where(numarray.arange(taille_alphabet)==pos,1,0,out=current) 330 numarray.add(accu,current,accu) 331 array = accu 332 #map(lambda pos:array[pos]=array[pos]+1,liste) 333 #for p in [dic_alphabet[cle] for cle in liste]: array[p] +=1 334 #j=0 335 #for i in array: assert i>=0,i ; j+=i 336 #print Numeric.sum(array), numarray.add.reduce(array),j 337 #assert len(liste) == c 338 assert len(liste) == numarray.sum(array),str(len(liste))+' / '+str(numarray.sum(array))
339 340 def alphab_array_val_length(array): 341 val = length = 0 342 for pos in xrange(taille_alphabet): 343 nb = max(0,array[pos]) 344 length += nb 345 val += nb * array_valeur_alphabet[pos] 346 #if nb>0: assert val>0 347 #assert length == numarray.sum(array) 348 assert 0 <= length <= val 349 #if val == 1769: 350 # print liste 351 # print array 352 return val, length 353 354 len_L1 = len(LH1) ; len_L2 = len(LH2) 355 i=0 356 array_zero = numarray.zeros(taille_alphabet)#,numarray.UInt8) 357 array_inter = numarray.zeros(taille_alphabet)#,numarray.UInt8) 358 LL1 = numarray.zeros(taille_alphabet)#,numarray.UInt8) 359 LL2 = numarray.zeros(taille_alphabet)#,numarray.UInt8) 360 LL1res = numarray.zeros(taille_alphabet)#,numarray.UInt8) 361 LL2res = numarray.zeros(taille_alphabet)#,numarray.UInt8) 362 new_LL1 = numarray.zeros(taille_alphabet)#,numarray.UInt8) 363 for posB1 in xrange(len_L1-1,-1,-1): 364 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,'itération LH1: '+str(i)) 365 if i == 300: break 366 #LL1 = numarray.zeros(taille_alphabet,numarray.Int8) 367 numarray.logical_and(LL1,array_zero, LL1) 368 assert numarray.sum(LL1) == 0 369 liste_to_alphab_array(LHH1[posB1+1:], LL1) 370 #LL1 = array_LH1[posB1+1] 371 #for pos in LHH1[posB1+1:]: 372 # LL1[pos] += 1 373 #logging.log(4,'LL1 = '+str(LL1)) 374 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done MSet1') 375 #liste_pos2 = dicoPos[2][LH1[posB1][0]]#texte[L1[posB1][0]:L1[posB1][1]]] 376 liste_pos2 = dicoPos[2][LH1[posB1]] 377 j = len(liste_pos2)-1 378 while j >= 0: 379 #posB2 = liste_pos2[j] 380 #LL2 = array_LH2[posB2] 381 #liste_to_alphab_array(LHH2[posB2+1:], LL2) 382 #numarray.subtract(LL1,LL2, LL1res) 383 #numarray.subtract(LL2,LL1, LL2res) 384 if j == len(liste_pos2)-1: 385 numarray.logical_and(LL2,array_zero, LL2) 386 numarray.logical_and(LL1res,array_zero, LL1res) 387 numarray.logical_and(LL2res,array_zero, LL2res) 388 assert numarray.sum(LL2) == numarray.sum(LL1res) == numarray.sum(LL2res) == 0 389 #LL2 = 390 liste_to_alphab_array(LHH2[liste_pos2[j]+1:], LL2) ; ds = nb = 0 391 #LL2 = array_LH2[liste_pos2[j]+1] 392 #for pos in LHH2[liste_pos2[j]+1:]: 393 # LL2[pos] += 1 394 #logging.log(4,'LL2 = '+str(LL2)) 395 #LL1res = 396 numarray.subtract(LL1,LL2, LL1res) 397 #numarray.maximum(LL1res,array_zero,LL1res) 398 #LL2res = 399 numarray.subtract(LL2,LL1, LL2res) 400 #numarray.maximum(LL2res,array_zero,LL2res) 401 #logging.log(4,'LL1res = '+str(LL1res)) 402 #logging.log(4,'LL2res = '+str(LL2res)) 403 else: 404 posB2 = liste_pos2[j] 405 next_posB2 = liste_pos2[j+1] 406 numarray.logical_and(new_LL1,array_zero, new_LL1) 407 numarray.logical_and(array_inter,array_zero, array_inter) 408 assert numarray.sum(new_LL1) == numarray.sum(array_inter) == 0 409 #new_LL1 = 410 liste_to_alphab_array(LHH1[posB2+1:next_posB2+1], new_LL1) 411 #new_LL1 = array_LH1[liste_pos2[j]+1] 412 #for pos in LHH1[posB2+1:next_posB2+1]: 413 # new_LL1[pos] += 1 414 # new_LL1 = Mset(LH1[posB2+1:next_posB2+1]) 415 # LL1res = LL1res.difference(LL2res.intersection(new_LL1)) 416 # LL2res = LL2res.difference(new_LL1) 417 numarray.minimum(LL2res,new_LL1,array_inter) 418 #numarray.maximum(array_inter,array_zero,array_inter) 419 420 numarray.subtract(LL1res,array_inter, LL1res) 421 #numarray.maximum(LL1res,array_zero,LL1res) 422 423 numarray.subtract(LL2res,new_LL1, LL2res) 424 #numarray.maximum(LL2res,array_zero,LL2res) 425 #logging.log(4,'array_inter = '+str(array_inter)) 426 #logging.log(4,'LL1res = '+str(LL1res)) 427 #logging.log(4,'LL2res = '+str(LL2res)) 428 #ds2 = LL1res.valeur + LL2res.valeur 429 #nb2 = LL1res.length + LL2.length 430 d1,n1 = alphab_array_val_length(LL1res) 431 d2,n2 = alphab_array_val_length(LL2res) 432 ds2 = d1 + d2 ; nb2 = n1 + n2 433 diffSym[(posB1,liste_pos2[j])] = (ds2,nb2)#,LL1res, LL2res) 434 j -= 1 435 diffSym[(posB1,liste_pos2[0])] = (ds2,nb2) 436 if (i<10 or i>len_L1-10 or i%100==0): logging.log(5,' done itération interne') 437 i+=1 438 #logging.debug('diffSym = '+str(diffSym)) 439 return diffSym 440
441 - def preTraitDiffSym2(self,L1,L2,texte,dicoPos,niveau=0):
442 r='''class dicH(weakref.WeakKeyDictionary): 443 def __init__(self, dico=None): 444 if dico is None: 445 self.totalItem = 0 446 self.valeur = 0 447 else: 448 for cle,nbItem in dico.iteritems(): 449 self[cle] = nbItem #[pos for pos in liste_pos] 450 self.totalItem = dico.totalItem 451 self.valeur = dico.valeur''' 452 def _addBloc(bloc): 453 (cle, debut, fin) = bloc 454 try: 455 #(nbItem, valeur) = self[cle] 456 dico[cle] += 1 #(nbItem+1,valeur) 457 except KeyError: 458 dico[cle] = 1 #(1, fin-debut) #[debut] 459 dico['valeur'] += fin - debut 460 dico['totalItem'] += 1
461 def _testBloc(bloc): 462 (cle, debut, fin) = bloc 463 try: 464 #(nbItem, valeur) = self[cle] 465 # liste_pos_bloc = self[cle] 466 #liste_pos_bloc.pop() 467 #if len(liste_pos_bloc) == 0: 468 # del self[cle] 469 if diffSymCourante[cle] == 1: 470 del diffSymCourante[cle] 471 else: 472 diffSymCourante[cle] -= 1 #(nbItem-1, valeur) 473 diffSymCourante['valeur'] -= fin - debut 474 diffSymCourante['totalItem'] -= 1 475 except KeyError: 476 diffSymCourante[cle] = 1 #[debut] 477 diffSymCourante['valeur'] += fin - debut 478 diffSymCourante['totalItem'] += 1 479 480 diffSym={} # dico[(i,j)] stocke le cout heuristique et la longueur de la liste résultant 481 # de la différence symétrique entre L1[i+1:] et L2[j+1:] 482 LH1 = [] ; LH2 = [] 483 for bloc in L1: 484 LH1.append((hash(texte[bloc[0]:bloc[1]]), bloc[0], bloc[1])) 485 logging.debug('init LH1 done') 486 for bloc in L2: 487 LH2.append((hash(texte[bloc[0]:bloc[1]]), bloc[0], bloc[1])) 488 logging.debug('init LH2 done') 489 490 dico = {'valeur':0, 'totalItem':0} #{} #dicH() # 491 #dico.valeur = 0 ; dico.totalItem = 0 492 #for i in xrange(2,len(LH1)-1): 493 # _addBloc(LH1[i+1]) 494 #assert dico['totalItem'] == len(L1)-3 495 ld=i=0 496 logging.debug('len(LH2)= '+str(len(LH2))+' / len(LH1)= '+str(len(LH1))) 497 for pos2 in xrange(len(LH2)-1,-1,-1): 498 if (i<10 or i>len(LH2)-10 or i%10==0): logging.debug('itération LH2: '+str(i));k=True 499 else:k=False 500 if pos2 < (len(LH2)-1): 501 _addBloc(LH2[pos2+1]) 502 assert dico['totalItem'] == ld + 1 ; ld+=1 503 #print 'totalItem=',dico#['totalItem'] 504 if pos2 > 0: diffSymCourante = weakref.WeakKeyDictionary(dico.copy()) 505 else: diffSymCourante = dico 506 logging.debug(' copie dico done') 507 j=0 508 for pos1 in xrange(len(LH1)-1,-1,-1): 509 #if k and (j<10 or j>len(LH1)-10 or j%10==0): logging.debug(' itération LH1: '+str(j)) 510 if pos1 < (len(LH1)-1): _testBloc(LH1[pos1+1]) 511 diffSym[(pos1,pos2)] = (diffSymCourante['valeur'],diffSymCourante['totalItem']) 512 j+=1 513 logging.debug(' itération interne done') 514 i+=1 515 516 a='''for pos2 in xrange(len(LH2)-1,-1,-1): 517 if pos2 < len(LH2)-1: _testBloc(LH2[pos2+1], True) 518 diffSymCourante = dico.copy() 519 prev_taille = diffSymCourante['totalItem'] 520 for pos1 in xrange(len(LH1)): 521 if pos1 < len(LH1)-1: _testBloc(LH1[pos1+1]) 522 diffSym[(pos1,pos2)] = (diffSymCourante['valeur'],diffSymCourante['totalItem']) 523 #assert diffSymCourante['totalItem'] < prev_taille, str(diffSymCourante['totalItem'])+' '+str(prev_taille)''' 524 assert len(diffSym) == len(L1) * len(L2), str(len(diffSym))+' '+str(len(L1))+' '+str(len(L2)) 525 #str1 = '' ; str2 = '' 526 #for i in xrange(len(L1)): 527 # str1 += str(diffSym[(i,0)]) +' ' 528 # str2 += str(diffSym[(i,len(LH2)-1)]) +' ' 529 #print str1 530 #print str2 531 return diffSym 532 533
534 - def preTraitDiffSym__(self,L1,L2,texte,dicoPos,niveau=0):
535 def _addBloc(bloc): 536 cle = hash(texte[bloc[0]:bloc[1]]) 537 try: 538 dico[cle].append(bloc[0]) 539 except KeyError: 540 dico[cle] = [bloc[0]] 541 dico['valeur'] += bloc[1] - bloc[0] 542 dico['nbItem'] += 1
543 diffSym={} # dico[(i,j)] stocke le cout heuristique et la longueur de la liste résultant 544 # de la différence symétrique entre L1[i+1:] et L2[j+1:] 545 L = {1:L1, 2:L2} 546 tabDiffSym = {} 547 dico = {'valeur':0, 'nbItem':0} 548 tabDiffSym[(len(L1),-1)] = {} 549 for i in xrange(len(L1)-1,-1,-1): 550 _addBloc(L1[i]) 551 dico_courant = dico.copy() 552 tabDiffSym[(i,-1)] = dico_courant 553 assert dico['nbItem'] == len(L1) 554 555 tabDiffSym[(len(L1),-1)] = {'valeur':0, 'nbItem':0} ; bloc = {} 556 for j in xrange(len(L2)-1,-1,-1): 557 bloc[2] = L2[j] 558 for i in xrange(len(L1)-1,-1,-1): 559 bloc[1] = L1[i] 560 diffSymCourante = tabDiffSym[(i+1,-1)] 561 #print diffSymCourante 562 for k in [1,2]: 563 cle = hash(texte[bloc[k][0]:bloc[k][1]]) 564 try: 565 liste_pos_bloc = diffSymCourante[cle] 566 try: 567 liste_pos_bloc.pop() 568 except IndexError: 569 del diffSymCourante[cle] 570 diffSymCourante['valeur'] -= bloc[k][1] - bloc[k][0] 571 diffSymCourante['nbItem'] -= 1 572 except KeyError: 573 diffSymCourante[cle] = [bloc[k][0]] 574 diffSymCourante['valeur'] += bloc[k][1] - bloc[k][0] 575 diffSymCourante['nbItem'] += 1 576 tabDiffSym[(i,0)] = diffSymCourante 577 diffSym[(i,j)] = (diffSymCourante['valeur'],diffSymCourante['nbItem']) 578 for i in xrange(len(L1)): 579 tabDiffSym[(i,-1)] = tabDiffSym[(i,0)] 580 del tabDiffSym[(i,0)] 581 dico = tabDiffSym[(len(L1),-1)] 582 _addBloc(L2[j]) 583 tabDiffSym[(len(L1),-1)] = dico 584 assert len(tabDiffSym) == len(L1)+1 585 assert len(diffSym) == len(L1) * len(L2),str(len(diffSym))+str(len(L1))+str(len(L2)) 586 return diffSym 587
588 - def preTraitDiffSym_(self,L1,L2,texte,dicoPos,niveau=0):
589 """ Précalcul de toutes les différences symétriques possibles. 590 Ainsi chacune est calculée une seule fois et non un nombre combinatoire de fois si on faisait le calcul à la demande 591 pre: forall([texte[x[0]:x[1]] in dicoPos[1].keys() for x in L1]) 592 forall([texte[x[0]:x[1]] in dicoPos[2].keys() for x in L1]) 593 forall([texte[x[0]:x[1]] in dicoPos[1].keys() for x in L2]) 594 forall([texte[x[0]:x[1]] in dicoPos[2].keys() for x in L2]) 595 len(self.tool.difference(L1,L2))>0 596 """ 597 diffSym={} # dico[(i,j)] stocke le cout heuristique et la longueur de la liste résultant 598 # de la différence symétrique entre L1[i+1:] et L2[j+1:] 599 posB1=0 600 for B1 in L1: 601 for posB2 in dicoPos[2][texte[B1[0]:B1[1]]]: 602 L=self.difference_symetrique(L1[posB1+1:],L2[posB2+1:],texte,posB1,posB2,dicoPos) 603 cout=0 604 for B in L: 605 cout += B[1]-B[0] 606 diffSym[(posB1,posB2)]=(cout,len(L)) 607 posB1+=1 608 assert len(diffSym) <= len(L1) * len(L2),str(len(diffSym))+' '+str(len(L1))+' '+str(len(L2)) 609 return diffSym
610
611 - def difference_symetrique(self,L1,L2,texte,deb1,deb2,dicoPos):
612 """ différence symétrique entre 2 listes de blocs 613 pre: forall([texte[x[0]:x[1]] in dicoPos[1].iterkeys() for x in L1]) 614 forall([texte[x[0]:x[1]] in dicoPos[2].iterkeys() for x in L1]) 615 forall([texte[x[0]:x[1]] in dicoPos[1].iterkeys() for x in L2]) 616 forall([texte[x[0]:x[1]] in dicoPos[2].iterkeys() for x in L2]) 617 len(self.tool.difference(L1,L2))>0 618 forall(L1, lambda x:texte[x[0]:x[1]] in dicoPos[1].iterkeys()) 619 forall(L1, lambda x:texte[x[0]:x[1]] in dicoPos[2].iterkeys()) 620 forall(L2, lambda x:texte[x[0]:x[1]] in dicoPos[1].iterkeys()) 621 forall(L2, lambda x:texte[x[0]:x[1]] in dicoPos[2].iterkeys()) 622 """ 623 if (len(L1)==0 and len(L2)==0): return [] 624 elif (len(L1)==0): return L2 625 elif (len(L2)==0): return L1 626 LRes=[] 627 for B1 in L1: 628 found=False 629 for posB2 in dicoPos[2][texte[B1[0]:B1[1]]]: 630 if posB2>=deb2+1: found=True 631 if not found: LRes.append(B1) 632 for B2 in L2: 633 found=False 634 for posB1 in dicoPos[1][texte[B2[0]:B2[1]]]: 635 if posB1>=deb1+1: found=True 636 if not found: LRes.append(B2) 637 #sys.stderr.write("\ndiff_sym L1="+str(L1)+"\nL2="+str(L2)+"\n="+str(LRes)+"\n") 638 return LRes
639
640 -def _appelAstar(self,L1,L2,texte1,texte2,lt1):
641 """ Alignement A* entre les 2 séquences 642 pre: isinstance(L1,list) and isinstance(L2,list) and isinstance(texte1,str) and isinstance(texte2,str) 643 post: (len(__return__[0])==len(__return__[1])) or (len(__return__[0])==len(__return__[1])+1) or \ 644 (len(__return__[0])+1==len(__return__[1]))""" 645 # dicoPos1 stocke pour chaque bloc de L1 ses positions dans cette liste 646 logging.log(5,"debut _appelAstar") 647 LC1 = [] ; LC2 = [] ; Lcf1 = [] ; Lcf2 = [] ; LH1 = [] ; LH2 = [] 648 for bloc in L1: 649 cle = hash(texte1[bloc[0]:bloc[1]]) 650 LC1.append(cle) 651 Lcf1.append((cle,bloc[1]-bloc[0])) 652 LH1.append((cle, bloc[0], bloc[1])) 653 #print LH1 654 logging.log(5,'init L1 done') 655 for bloc in L2: 656 cle = hash(texte2[bloc[0]-lt1:bloc[1]-lt1]) 657 LC2.append(cle) 658 Lcf2.append((cle,bloc[1]-bloc[0])) 659 LH2.append((cle, bloc[0], bloc[1])) 660 logging.log(5,'init L2 done') 661 dicoPos = {} 662 coutFixe = {} 663 dicoPos[1], coutFixe[1] = self.calcPosCoutFixe(Lcf1) 664 dicoPos[2], coutFixe[2] = self.calcPosCoutFixe(Lcf2) 665 if __debug__: 666 for cle in dicoPos[1]: assert cle in dicoPos[2], str([texte1[d:f] for d,f in L1])+' '+str([texte2[d-lt1:f-lt1] for d,f in L2]) 667 for cle in dicoPos[2]: assert cle in dicoPos[1] 668 dic = {} ; tri = [] 669 for cle,deb,fin in LH1: 670 try: 671 val,lpos = dic[cle] 672 lpos.append((deb,fin)) 673 dic[cle] = val+1, lpos 674 except KeyError: dic[cle] = 1 , [(deb,fin)] 675 for cle,(nb,lpos) in dic.iteritems(): 676 bisect.insort_right(tri,(nb,lpos)) 677 for nb,lpos in tri[-20:]: 678 deb = lpos[0][0] ; fin =lpos[0][1] 679 logging.debug('LH1: nb = '+str(nb)+' / '+texte1[deb:fin]) 680 dic = {} ; tri = [] 681 for cle,deb,fin in LH2: 682 try: 683 val,lpos = dic[cle] 684 lpos.append((deb,fin)) 685 dic[cle] = val+1, lpos 686 except KeyError: dic[cle] = 1 , [(deb,fin)] 687 for cle,(nb,lpos) in dic.iteritems(): 688 bisect.insort_right(tri,(nb,lpos)) 689 for nb,lpos in tri[-20:]: 690 deb = lpos[0][0] ; fin =lpos[0][1] 691 logging.debug('LH2: nb = '+str(nb)+' / '+texte2[deb-lt1:fin-lt1]) 692 693 tri = [] 694 for cle,liste in dicoPos[1].iteritems(): 695 bisect.insort_right(tri,len(liste)) 696 logging.debug('tri liste1 = '+str(tri)) 697 logging.debug("longueur liste1 = %d, moyenne = %.2f, mediane = %d",len(tri),+(0.0+sum(tri))/len(tri),tri[len(tri)/2]) 698 tri = [] 699 for cle,liste in dicoPos[2].iteritems(): 700 bisect.insort_right(tri,len(liste)) 701 logging.debug('tri liste2 = '+str(tri)) 702 logging.debug('longueur liste2 = %d, moyenne = %.2f, mediane = %d',len(tri),(0.0+sum(tri))/len(tri),tri[len(tri)/2]) 703 logging.log(5,"fin calcPosCoutFixe") 704 #psyco.bind(self.preTraitDiffSym) 705 diffSym = self.preTraitDiffSym(LH1,LH2,dicoPos) # pré-calcul des différences symétriques 706 #logging.debug(LH1) 707 #logging.debug(LH2) 708 #logging.debug(dicoPos) 709 #diffSym = self.preTraitDiffSymVect(LH1,LH2,dicoPos) # pré-calcul des différences symétriques 710 #trace('diffSym = self.preTraitDiffSymVect(LH1,LH2,dicoPos)',locals()) 711 logging.log(5,"fin preTraitDiffSym") 712 713 #for cle,val in diffSym.iteritems(): 714 # assert diffSym2.has_key(cle) 715 # assert val == diffSym2[cle],str(val)+'/'+str(cle)+'/'+str(diffSym2[cle]) 716 #for cle,val in diffSym2.iteritems(): 717 # assert diffSym.has_key(cle) 718 # assert val == diffSym[cle],str(val)+'/'+str(cle)+'/'+str(diffSym[cle]) 719 best, closedSet = self.deplacements_pond_Astar(LC1,LC2,diffSym,dicoPos,coutFixe) # A* 720 logging.log(5,"fin deplacements_pond_Astar") 721 dicoBest={1:{}, 2:{}} # dico des positions dans L1 et L2 du meilleur parcours 722 while best != (-1,-1): # on remonte le meilleur parcours trouvé 723 dicoBest[1][best[0]]=1 724 dicoBest[2][best[1]]=1 725 best=closedSet[best][1] # noeud père 726 727 s1=self.__makeLRes(dicoBest[1],L1) 728 s2=self.__makeLRes(dicoBest[2],L2) 729 logging.debug('s1 = '+str(s1)) 730 logging.debug('s2 = '+str(s2)) 731 if __debug__: 732 s11=[] 733 s12=[] 734 for i in xrange(len(s1)): 735 s11.append(s1[i][0]) 736 s12.extend(s1[i][1]) 737 if s11[-1] is None: s11=s11[:-1] 738 s21=[] 739 s22=[] 740 for i in xrange(len(s2)): 741 s21.append(s2[i][0]) 742 s22.extend(s2[i][1]) 743 if s21[-1] is None: s21=s21[:-1] 744 self.ass2__(s11,0,lt1,texte1) 745 self.ass2__(s12,0,lt1,texte1) 746 texte=texte1+texte2 747 self.ass2__(s21,lt1,len(texte1)+len(texte2),texte) 748 self.ass2__(s22,lt1,len(texte1)+len(texte2),texte) 749 self.ass2__(s11+s21,0,len(texte1)+len(texte2),texte) 750 self.ass2__(s12+s22,0,len(texte1)+len(texte2),texte) 751 del dicoPos, coutFixe, dicoBest 752 logging.log(5,"fin _appelAstar") 753 return s1,s2
754