Package medite :: Package MediteAppli :: Package test :: Module test_suffix_tree
[hide private]
[frames] | no frames]

Source Code for Module medite.MediteAppli.test.test_suffix_tree

  1  # -*- coding: iso-8859-1 -*- 
  2  # Copyright 20003 - 2008: Julien Bourdaillet (julien.bourdaillet@lip6.fr), Jean-Gabriel Ganascia (jean-gabriel.ganascia@lip6.fr) 
  3  # This file is part of MEDITE. 
  4  # 
  5  #    MEDITE is free software; you can redistribute it and/or modify 
  6  #    it under the terms of the GNU General Public License as published by 
  7  #    the Free Software Foundation; either version 2 of the License, or 
  8  #    (at your option) any later version. 
  9  # 
 10  #    MEDITE is distributed in the hope that it will be useful, 
 11  #    but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  #    GNU General Public License for more details. 
 14  # 
 15  #    You should have received a copy of the GNU General Public License 
 16  #    along with Foobar; if not, write to the Free Software 
 17  #    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
 18   
 19  # enable contract checking 
 20  #import contract,suffix_tree 
 21  #contract.checkmod(suffix_tree) 
 22  import psyco,bisect,string,logging,os,os.path, time,unittest 
 23  import contract 
 24  import numpy.numarray as numarray 
 25  import MediteAppli.suffix_tree, test_data 
 26  contract.checkmod(MediteAppli.suffix_tree) 
 27   
28 -class TestSuffixTree(unittest.TestCase):
29 """Teste les SuffixTree"""
30 - def setUp(self):
31 td = test_data.Test_Data() 32 self.texte = "cacaoccaocaococaica" #td._readFile('final.fr.txt')#td._readFile('Alth1v1.txt') #+td._readFile('Alth1v2.txt')+td._readFile('Condorcet_Esimpr2.txt')# 33 # "cacaocaoco" "cacaoccaocaococaica" "self.texte1Zself.texte2" 34 sequences = self.texte 35 #sequences = "cacao" 36 a = time.time() 37 self.st = MediteAppli.suffix_tree.SuffixTree(sequences) 38 b = time.time() 39 logging.debug('construction st done') 40 print 'temps construction = ',b-a 41 #print self.st 42 c = d = e = 0 43 for n in self.st.generate_leaves(): c += 1 44 for n in self.st.generate_inner_nodes(): d += 1 45 for n in self.st.generate_post_order_nodes(): e += 1 46 print 'size=',len(self.texte),'/ leaves=',c,'/ inner=',d,'/ all=',e 47 a = time.time() 48 self.LL = self.st.get_MEM() 49 #trace('self.LL = self.st.get_MEM()',locals()) 50 b = time.time() 51 print 'temps MEM = ',b-a 52 for size in self.LL: 53 x = '' 54 for pos in self.LL[size]: 55 x += self.texte[pos:pos+size] +' '
56 #print x 57 #print self.st.root.__str__(self.texte, short=True)
58 - def test1(self):
59 #self.assert_(isinstance(self.st,MediteAppli.suffix_tree.SuffixTree)) 60 pass
61 #print self.LL 62 #self.assert_(len(self.LL)>0) 63
64 -class TestGeneralisedSuffixTree(unittest.TestCase):
65 """Teste les GeneralisedSuffixTree"""
66 - def setUp(self):
67 self.td = test_data.Test_Data() 68 #sequences = [td._readFile('Alth1v1.txt'),td._readFile('Alth1v2.txt')]#+td._readFile('Condorcet_Esimpr2.txt')] 69 #sequences = [td._readFile('final.fr200.txt'),td._readFile('final-en2fr200.txt')] 70 #sequences = [td._readFile('chedidtapuscrit.txt'),td._readFile('chedidcor.txt')] 71 #sequences = [td._readFile('CB_experience.txt'),td._readFile('CB_Texte_de_1857.txt')] 72 #sequences = ["self.texte1", "self.texte2"] 73 #sequences = ["cacaoccaocaococaica", "cacaocaoco"] 74 self.texte1 = self.td._readFile('CB_experience.txt') ; self.texte2 = self.td._readFile('CB_Texte_de_1857.txt')
75 #texte1 = td._readFile('bovary1.txt') ; texte2 = td._readFile('bovary2.txt') 76 #texte1 = "le cat is vert. vraiment vert" ; texte2 = "le cat is bleu. it is vert" 77 #texte1 = td._readFile('chedidtapuscrit.txt') ; texte2 = td._readFile('chedidcor.txt') 78 #texte1 = td._readFile('IP_823.txt') ; texte2 = td._readFile('Macadam.txt') 79 #self.texte1 = td._self.readFile('Condorcet_Esimpr1.txt') ; self.texte2 = self.td._readFile('Condorcet_Esimpr2.txt') 80 #print len(texte1),len(texte2) 81
82 - def test(self):
83 texte1 = self.texte1.lower() ; texte2 = self.texte2.lower() 84 sepTable = string.maketrans("çéèàùâêîôûäëïöüÿÇÉÈÀÙÂÊÎÔÛÄËÏÖÜ","ceeauaeiouaeiouyCEEAUAEIOUAEIOU") 85 texte1 = texte1.translate(sepTable) 86 texte2 = texte2.translate(sepTable) 87 sepTable2 = string.maketrans(""" !\r,\n:\t;-?"'`’()""","................") 88 texte1 = texte1.translate(sepTable2) 89 texte2 = texte2.translate(sepTable2) 90 self.dictPosSuppression={} 91 self.tabNbSuppression={} 92 texte1,texte2=self.preTraitSuppSep(texte1,texte2) 93 lt1 = len(texte1) ; lt2 = len(texte2) 94 95 sequences = [texte1,texte2] 96 logging.debug('debut') 97 a = time.time() 98 #psyco.bind(MediteAppli.suffix_tree.GeneralisedSuffixTree) 99 #self.st = MediteAppli.suffix_tree.GeneralisedSuffixTree(sequences) 100 psyco.bind(MediteAppli.suffix_tree.GeneralisedSuffixTree) 101 self.st = MediteAppli.suffix_tree.GeneralisedSuffixTree(sequences) 102 b = time.time() 103 logging.debug('construction st done') 104 #self.LL = self.st.shared_substrings3(1) 105 #for chaine,lpos in self.LL.iteritems(): 106 # if 'mouvements' in chaine: 107 # print '1',chaine,lpos 108 #print self.LL 109 #trace('self.LL = self.st.shared_substrings3(1)',locals()) 110 #trace('self.LL2 = self.st.get_MEM_index_chaine()',locals()) 111 #psyco.bind(self.st.get_MEM_index_chaine3) 112 #f = psyco.proxy(self.st.get_MEM_index_chaine3) 113 self.LL2 = self.st.get_MEM_index_chaine3(just_keep_words=True) 114 #print self.st.root.__str__(sequences[0]+sequences[1],charOnly='.') 115 #for chaine,lpos in self.LL2.iteritems(): 116 # if 'mouvements' in chaine: 117 # print '2',chaine,lpos 118 #print self.LL2 119 c = time.time() 120 #print len(self.LL2), len(self.LL2) 121 122 #print self.LL 123 logging.debug('temps construction = %f',b-a) 124 logging.debug('temps MEM = %f',c-b) 125 #c = d = e = 0 126 #for n in self.st.generate_leaves(): c += 1 127 #for n in self.st.generate_inner_nodes(): d += 1 128 #for n in self.st.generate_post_order_nodes(): e += 1 129 #logging.debug('size='+str(len(sequences[0])+len(sequences[1]))+'/ leaves='+str(c)+'/ inner='+str(d)+'/ all='+str(e)) 130 131 # affichage des SMEM 132 top = [] 133 for (cle,longueur),liste_pos in self.LL2.iteritems(): 134 nb = len(liste_pos) ; chaine = texte1[liste_pos[0]:liste_pos[0]+longueur] 135 assert cle == hash(chaine) 136 bisect.insort_left(top,(nb,chaine,liste_pos)) 137 cumul_len = cumul_2 = cumul_3 = 0.0 138 top.reverse() 139 for x in top: 140 t = x[0]*len(x[1]) 141 cumul_len += t 142 if x[0] <= 2: cumul_2 += t 143 else: cumul_3 += t 144 #logging.debug(str(x)+' / cumul % texte total='+str(100*cumul_len/(lt1+lt2))) 145 logging.debug('cumul_2 % texte total='+str(100*cumul_2/(lt1+lt2))) 146 logging.debug('cumul_2 % texte répété='+str(100*cumul_2/(cumul_2+cumul_3))) 147 logging.debug('cumul_3 % texte total='+str(100*cumul_3/(lt1+lt2))) 148 logging.debug('cumul_3 % texte répété='+str(100*cumul_3/(cumul_2+cumul_3))) 149 logging.debug('cumul_2+cumul_3 % texte total='+str(100*(cumul_2+cumul_3)/(lt1+lt2))) 150 151 #affichage des mots 152 texte = texte1+texte2 153 liste = texte.split('.') 154 dic = {} 155 for i in liste: 156 try: 157 dic[i] += 1 158 except KeyError: 159 dic[i] = 1 160 n=len(liste) ; liste = [] 161 for mot,nb in dic.iteritems(): 162 bisect.insort_right(liste,(nb,mot)) 163 liste.reverse() 164 cumul = cumul_2 = cumul_3 = 0.0 ; nb_mot = 0 #; nb_mot1 = nb_mot2 = 0 165 for (nb,mot) in liste: 166 nb_mot += nb 167 t = len(mot)*nb 168 if nb > 2: cumul_3 += t 169 elif nb == 2: cumul_2 += t 170 cumul += t 171 #logging.debug(str((nb,mot))+' / cumul % texte total='+str(100*cumul/(lt1+lt2))) 172 assert n == nb_mot 173 logging.debug('cumul_2 % texte total='+str(100*cumul_2/(lt1+lt2))) 174 logging.debug('cumul_2 % texte répété='+str(100*cumul_2/(cumul_2+cumul_3))) 175 logging.debug('cumul_3 % texte total='+str(100*cumul_3/(lt1+lt2))) 176 logging.debug('cumul_3 % texte répété='+str(100*cumul_3/(cumul_2+cumul_3))) 177 logging.debug('cumul_2+cumul_3 % texte total='+str(100*(cumul_2+cumul_3)/(lt1+lt2))) 178 #logging.debug('nb_mot = '+str(nb_mot)) 179 180 # affichage des mots de + de 3 carac 181 cumul = cumul_2 = cumul_3 = cumul_nb = 0.0 182 ch_excel = [] ; ch_nb = [] 183 stopWords = ['avec','dans','pour','sous','mais','encore','quand','cette','notre', 'votre', 'leur', 'leurs','donc','puis', 184 'comme', 'lorsque', 'puisque', 'quand', 'quoique', 'afin', 'ainsi', 'avant', 'comme', 185 'aucun', 'aucune', 'autre', 'certain', 'certaine', 'certaines', 'certains', 'chaque', 'plusieurs', "quelqu'",' quelque', 'quelques' , 186 'telle', 'telles', 'tous', 'tout', 'toute', 'toutes', 'lequel', 'laquelle', 'lesquels', 'lesquelles', 187 'duquel', 'dequels', 'dequelles', 'auquel', 'auxquels', 'auxquelles', 188 'quel', 'quelle', 'quelles', 'quels', 189 'elle', 'nous', 'vous','elles', 190 'celui', 'celle', 'ceux', 'celles', 'ceci', 'cela', 191 'aucun', 'aucune', 'autre', 'autrui', 'certains', 'chacun', 'quiconque', 'rien'] 192 #for (nb,mot) in liste: 193 # if len(mot)>3: 194 # cumul_nb += n 195 ch4 = [] ; i = 0 196 global NOMB,CUM 197 for (nb,mot) in liste: 198 if len(mot)>3 and mot not in stopWords: 199 t = len(mot)*nb 200 if nb > 2: cumul_3 += t 201 elif nb == 2: cumul_2 += t 202 cumul += t 203 logging.debug(str((nb,mot))+' / cumul % texte total='+str(100*cumul/(lt1+lt2))) 204 ch_excel.append(str(nb)+';') 205 ch_nb.append(nb) 206 cumul_nb += nb 207 ch4.append(str(round(100*cumul/(lt1+lt2),2))+';') 208 if i< 200: 209 CUM[i] += round(100*cumul/(lt1+lt2),2) ; i += 1 210 NOMB += 1 211 ch_nb2 = [(0.0+nb)/(cumul_2+cumul_3) for nb in ch_nb] 212 ch_nb3 = [str(nb)+';' for nb in ch_nb2] 213 logging.debug('cumul_2 % texte total='+str(100*cumul_2/(lt1+lt2))) 214 logging.debug('cumul_2 % texte répété='+str(100*cumul_2/(cumul_2+cumul_3))) 215 logging.debug('cumul_3 % texte total='+str(100*cumul_3/(lt1+lt2))) 216 logging.debug('cumul_3 % texte répété='+str(100*cumul_3/(cumul_2+cumul_3))) 217 logging.debug('cumul_2+cumul_3 % texte total='+str(100*(cumul_2+cumul_3)/(lt1+lt2))) 218 logging.debug('chaine excel = '+''.join(ch_excel)) 219 logging.debug('chaine excel2 = '+''.join(ch_nb3)) 220 logging.debug('nb total = %d / nb moyen = %f / nb médian = %d',cumul_nb,cumul_nb/len(liste),ch_nb[len(ch_nb)/2])
221 #logging.critical(''.join(ch4))
222 - def test1(self):
223 self.assert_(isinstance(self.st,MediteAppli.suffix_tree.GeneralisedSuffixTree))
224 #print self.LL 225 #self.assert_(len(self.LL)>0) 226
227 - def preTraitSuppSep(self,texte1,texte2):
228 """ Prétraitement supprimant plusieurs séparateurs se suivant et n'en laissant qu'un 229 Prend en entrée texte1 et texte2 et les renvoie traités """ 230 debut=0 231 j=0 232 comptSuppression=0 233 res=[] 234 for texte in [texte1,texte2]: 235 i=0 236 lTexte=[] 237 while i<len(texte)-1: 238 if texte[i]=='.' and texte[i+1]=='.': 239 comptSuppression+=1 240 # position du texte original ou un séparateur a été supprimé 241 self.dictPosSuppression[i+debut]='.' 242 else: 243 lTexte.append(texte[i]) 244 # à chaque position du nouveau texte est associé le nb de suppression 245 # de séparateur jusqu'à cette position 246 self.tabNbSuppression[j]=comptSuppression 247 j+=1 248 i+=1 249 lTexte.append(texte[i]) 250 self.tabNbSuppression[j]=comptSuppression 251 j+=1 252 debut=i+1 253 res.append(''.join(lTexte)) # chaine 254 self.tabNbSuppression[j]=comptSuppression 255 return res[0],res[1]
256
257 - def postTraitSuppSep2(self,listeOcc):
258 res=[] 259 for (occ1,occ2) in listeOcc: 260 Nocc1 = occ1+self.tabNbSuppression[occ1] 261 Nocc2 = occ2+self.tabNbSuppression[occ2] 262 res.append((Nocc1,Nocc2)) 263 return res
264
265 -class TestGeneralisedSuffixTreeChedid(TestGeneralisedSuffixTree):
266 """Teste les GeneralisedSuffixTree"""
267 - def setUp(self):
268 self.td = test_data.Test_Data() 269 self.texte1 = self.td._readFile('chedidtapuscrit.txt') ; self.texte2 = self.td._readFile('chedidcor.txt') 270 self.texte1 = self.texte1[:8000] ; self.texte2 = self.texte2[:8000]
271
272 -class TestGeneralisedSuffixTreeBrousse(TestGeneralisedSuffixTree):
273 """Teste les GeneralisedSuffixTree"""
274 - def setUp(self):
275 self.td = test_data.Test_Data() 276 self.texte1 = self.td._readFile('IP_823.txt') ; self.texte2 = self.td._readFile('Macadam.txt') 277 self.texte1 = self.texte1[:8000] ; self.texte2 = self.texte2[:8000]
278
279 -class TestGeneralisedSuffixTreeCondo(TestGeneralisedSuffixTree):
280 """Teste les GeneralisedSuffixTree"""
281 - def setUp(self):
282 self.td = test_data.Test_Data() 283 self.texte1 = self.td._readFile('CONDORCET_IMPOT_1.txt') ; self.texte2 = self.td._readFile('CONDORCET_IMPOT_10.txt') 284 self.texte1 = self.texte1[:8000] ; self.texte2 = self.texte2[:8000]
285
286 -class TestGeneralisedSuffixTreeCondo2(TestGeneralisedSuffixTree):
287 """Teste les GeneralisedSuffixTree"""
288 - def setUp(self):
289 self.td = test_data.Test_Data() 290 self.texte1 = self.td._readFile('Condorcet_Esimpr1.txt') ; self.texte2 = self.td._readFile('Condorcet_Esimpr2.txt') 291 self.texte1 = self.texte1[:8000] ; self.texte2 = self.texte2[:8000]
292
293 -class TestGeneralisedSuffixTreeAlthu(TestGeneralisedSuffixTree):
294 """Teste les GeneralisedSuffixTree"""
295 - def setUp(self):
296 self.td = test_data.Test_Data() 297 self.texte1 = self.td._readFile('Alth1v1.txt') ; self.texte2 = self.td._readFile('Alth1v2.txt') 298 self.texte1 = self.texte1[:8000] ; self.texte2 = self.texte2[:8000]
299
300 -class TestGeneralisedSuffixTreeBernon(TestGeneralisedSuffixTree):
301 """Teste les GeneralisedSuffixTree"""
302 - def setUp(self):
303 self.td = test_data.Test_Data() 304 self.texte1 = self.td._readFile('bernon1.txt') ; self.texte2 = self.td._readFile('bernon5-18_5.txt') 305 self.texte1 = self.texte1[:8000] ; self.texte2 = self.texte2[:8000]
306
307 -class TestGeneralisedSuffixTreeCupidon(TestGeneralisedSuffixTree):
308 """Teste les GeneralisedSuffixTree"""
309 - def setUp(self):
310 self.td = test_data.Test_Data() 311 self.texte1 = self.td._readFile('dactylo-1.txt') ; self.texte2 = self.td._readFile('dactylo-11-rouge.txt') 312 self.texte1 = self.texte1[:8000] ; self.texte2 = self.texte2[:8000]
313
314 -class TestGeneralisedSuffixTreeRamuz1(TestGeneralisedSuffixTree):
315 """Teste les GeneralisedSuffixTree"""
316 - def setUp(self):
317 self.td = test_data.Test_Data() 318 self.texte1 = self.td._readFile('Tout-vieux orig.txt') ; self.texte2 = self.td._readFile('tout-vieux1909.txt') 319 self.texte1 = self.texte1[:8000] ; self.texte2 = self.texte2[:8000]
320
321 -def _test():
322 logging.basicConfig(level=logging.DEBUG, 323 format='%(asctime)s %(levelname)s %(message)s', 324 datefmt='%H:%M:%S') 325 suite1 = unittest.makeSuite(TestSuffixTree) 326 suite2 = unittest.TestSuite() 327 #suite2.addTest(TestGeneralisedSuffixTree('test')) 328 suite2.addTest(TestGeneralisedSuffixTreeChedid('test')) 329 suite2.addTest(TestGeneralisedSuffixTreeBrousse('test')) 330 suite2.addTest(TestGeneralisedSuffixTreeCondo('test')) 331 suite2.addTest(TestGeneralisedSuffixTreeCondo2('test')) 332 suite2.addTest(TestGeneralisedSuffixTreeAlthu('test')) 333 suite2.addTest(TestGeneralisedSuffixTreeBernon('test')) 334 suite2.addTest(TestGeneralisedSuffixTreeCupidon('test')) 335 suite2.addTest(TestGeneralisedSuffixTreeRamuz1('test')) 336 suite = unittest.TestSuite(( suite2)) 337 #suite = unittest.TestSuite((suite1, suite2)) 338 unittest.TextTestRunner(verbosity=2).run(suite)
339
340 -def trace(commande,dic_locals):
341 import profile,pstats,sys 342 #profile.runctx(commande,globals(), locals(),'Z:\workspace\medite\statFile') 343 profile.runctx(commande,globals(), dic_locals,'c:\workspace\medite\statFile') 344 s = pstats.Stats('c:\workspace\medite\statFile') 345 s.sort_stats('time') 346 s.print_stats() 347 sys.stderr.flush() 348 sys.stdout.flush()
349 350 CUM = numarray.zeros(200,numarray.Float) 351 NOMB = 0 352 if __name__ == '__main__': 353 logging.basicConfig(level=5,#logging.DEBUG,#INFO, 354 format='%(asctime)s %(levelname)s %(message)s', 355 #datefmt='%H:%M:%S', 356 filename=os.path.join(os.getcwd(),'log.txt'), 357 filemode='w') 358 console = logging.StreamHandler() 359 console.setLevel(logging.INFO) 360 log = logging.FileHandler(filename=os.path.join(os.getcwd(),'log_MA.txt'),mode='w') 361 log.setLevel(logging.CRITICAL) 362 log.setFormatter(logging.Formatter(''))#%(asctime)s %(levelname)s %(message)s')) 363 logging.getLogger('').addHandler(log) 364 _test() 365 print CUM 366 a = numarray.around(CUM / NOMB,2) ; ch = [] 367 for nb in a: 368 ch.append(str(nb)+';') 369 370 logging.critical(''.join(ch)) 371