# -*- coding: iso-8859-1 -*-
# Copyright 20003 - 2008: Julien Bourdaillet (julien.bourdaillet@lip6.fr), Jean-Gabriel Ganascia (jean-gabriel.ganascia@lip6.fr)
# This file is part of MEDITE.
#
#    MEDITE is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    MEDITE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with Foobar; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import os.path
import string

try:
    import suffix_tree
except ImportError:
    # Project module (not standard library).  Only count_terms() needs it, so
    # the file helpers and interval helpers stay importable without it.
    suffix_tree = None

# Public helpers.  The manual demo functions (test, test_liste, test_bbl) are
# left out so that a star-import does not drag them along.
__all__ = [
    "count_terms",
    "affiche_dic",
    "lit_fichier",
    "dif_intervalles",
    "soustr_l_intervallesAA",
    "soustr_l_intervallesBB",
    "extract",
]

# Accented letters (Latin-1 code points) and, position by position, their
# unaccented ASCII replacements.  Escapes are used instead of literal
# characters so the table does not depend on the encoding this file is
# saved in.
_ACCENTS = ("\xe7\xe9\xe8\xe0\xf9\xe2\xea\xee\xf4\xfb\xe4\xeb\xef\xf6\xfc\xff"
            "\xc7\xc9\xc8\xc0\xd9\xc2\xca\xce\xd4\xdb\xc4\xcb\xcf\xd6\xdc")
_SANS_ACCENTS = "ceeauaeiouaeiouyCEEAUAEIOUAEIOU"

# Typographic apostrophe: byte 0x92 (cp1252) for Python 2 byte strings,
# U+2019 for Python 3 text strings.
_APOSTROPHE = "\x92" if str is bytes else u"\u2019"

# Every character treated as a separator when noSep is requested.
_SEPARATEURS = " !\r,\n:\t;-?\"'`" + _APOSTROPHE + "(.)"


def _table_traduction(source, cible):
    """Build a one-to-one translation table for ``str.translate``.

    Uses ``string.maketrans`` when it exists (Python 2) and
    ``str.maketrans`` otherwise (Python 3).
    """
    maketrans = getattr(string, "maketrans", None) or str.maketrans
    return maketrans(source, cible)


def count_terms(sequence1, sequence2="", noCasse=False, noSep=False, noAccent=False):
    """Count the repeated tokens found in one or two strings and print them.

    The tokens are the substrings reported by ``get_MEM(5)`` on a suffix tree
    built over ``sequence1`` (or over both sequences when ``sequence2`` is not
    empty).  NOTE(review): 5 is presumably a minimal match length -- confirm
    against the suffix_tree module.

    sequence1, sequence2 -- the texts to analyse
    noCasse  -- when true, ignore case (both texts are lower-cased)
    noSep    -- when true, replace every separator character by a space
    noAccent -- when true, replace accented letters by unaccented ones

    Nothing is returned; the ranking is printed by affiche_dic().
    Raises ImportError when the suffix_tree module is not available.
    """
    if suffix_tree is None:
        raise ImportError("count_terms requires the suffix_tree module")

    if noAccent:
        table = _table_traduction(_ACCENTS, _SANS_ACCENTS)
        sequence1 = sequence1.translate(table)
        sequence2 = sequence2.translate(table)
    if noCasse:
        sequence1 = sequence1.lower()
        sequence2 = sequence2.lower()
    if noSep:
        # One space per separator: both strings must have the same length.
        table = _table_traduction(_SEPARATEURS, " " * len(_SEPARATEURS))
        sequence1 = sequence1.translate(table)
        sequence2 = sequence2.translate(table)

    # Positions returned by the tree index into the concatenated text.
    texte = sequence1 + sequence2
    if not sequence2:
        st = suffix_tree.SuffixTree(sequence1)
    else:
        st = suffix_tree.GeneralisedSuffixTree([sequence1, sequence2])
    dic_MEM = st.get_MEM(5)  # used as {length: [positions]}
    lToken = [texte[pos:pos + longueur]
              for longueur in dic_MEM
              for pos in dic_MEM[longueur]]

    # Number of occurrences of each token, indexed by token.
    dicToken = {}
    for token in lToken:
        dicToken[token] = dicToken.get(token, 0) + 1

    # Tokens grouped by their number of occurrences.
    classement = {}
    for token, count in dicToken.items():
        classement.setdefault(count, []).append(token)
    affiche_dic(classement)


def affiche_dic(dic):
    """Print a dictionary with numeric keys, largest key first.

    Each line shows the key, the number of items stored under it and the
    items themselves.  Two totals follow: ``nbItem`` (sum of key * number of
    items) and ``tailleTexte`` (sum of the lengths of all items).

    The dictionary is only read: it is no longer emptied as a side effect,
    and keys that are all negative no longer raise KeyError.

    pre: isinstance(dic, dict)
    """
    nb_item = taille_texte = 0
    for cle in sorted(dic, reverse=True):
        elements = dic[cle]
        print("%s %s %s" % (cle, len(elements), elements))
        taille_texte += sum(len(element) for element in elements)
        nb_item += cle * len(elements)
    print("nbItem =  %s" % nb_item)
    print("tailleTexte =  %s" % taille_texte)


def lit_fichier(fichier):
    """Return the whole content of the file named ``fichier`` as one string."""
    # The context manager closes the handle even if reading fails.
    with open(fichier) as f:
        return f.read()


def test():
    """Manual check: run count_terms on two sample texts with every option on.

    The paths are relative to the current directory and are built with
    os.path.join so that they are valid on every platform.
    """
    dossier = os.path.join("GLOBAL", "Bernard", "experiences", "exp")
    count_terms(lit_fichier(os.path.join(dossier, "experience.txt")),
                lit_fichier(os.path.join(dossier, "Texte_de_1857.txt")),
                True, True, True)


def dif_intervalles(L, C):
    """Remove the interval ``C`` from the list of intervals ``L``.

    L -- list of intervals, each one indexable as [start, end]
    C -- the interval to remove, indexable as [start, end]

    Returns a new list.  Intervals that do not touch ``C`` are kept as the
    same objects; intervals overlapping ``C`` are replaced by the part(s)
    left on either side of it; intervals lying inside ``C`` are dropped.
    ``L`` itself is not modified.
    """
    debut, fin = C[0], C[1]
    NL = []
    for couple in L:
        gauche, droite = couple[0], couple[1]
        if gauche < debut:
            if droite >= debut:
                # Part left of C, plus the part right of C if couple spans it.
                NL.append([gauche, debut])
                if droite > fin:
                    NL.append([fin, droite])
        elif droite > fin and gauche <= fin:
            # couple starts inside C and ends after it.
            NL.append([fin, droite])
        if gauche > fin or droite < debut:
            # No overlap with C at all.
            NL.append(couple)
    return NL


def soustr_l_intervallesAA(P, L):
    """Subtract every interval of ``L`` from the list of intervals ``P``.

    Returns the resulting list (``P`` itself when ``L`` is empty).  ``L`` is
    only iterated over: it is no longer emptied as a side effect.
    """
    for intervalle in L:
        P = dif_intervalles(P, intervalle)
    return P


def soustr_l_intervallesBB(P, L):
    """Same contract as soustr_l_intervallesAA (kept as a separate name)."""
    return soustr_l_intervallesAA(P, L)


def test_liste():
    """Manual demonstration of string/list identity, slicing and intervals.

    Everything is printed; nothing is returned or asserted.
    """
    def montre(valeur):
        # Print a value followed by the id of the object holding it.
        print("%s %s" % (valeur, id(valeur)))

    def reaffecte(chaine):
        # Rebinding the parameter does not affect the caller's variable.
        montre(chaine)
        chaine = "qwerty"
        montre(chaine)

    def modifie(liste):
        # Item assignment changes the caller's list in place.
        montre(liste)
        liste[1] = '333'
        montre(liste)

    a = "azerty"
    montre(a)
    reaffecte(a)
    montre(a)

    b = list(a)
    montre(b)
    modifie(b)
    montre(b)

    c = b[:]
    montre(c)
    d = b[2:4]
    montre(d)
    b = [x * 2 for x in b]
    montre(b)

    # Slice assignment with a shorter, then a longer, replacement.
    e = list("azerty")
    e[3:6] = ['rr']
    print(e)
    nombres = [1, 2, 3, 4, 5, 6]
    nombres[3:4] = [7, 8, 9]
    print(nombres)

    # Emptying a list by rebinding to a slice...
    reste = [1, 2, 3, 4, 5, 6]
    while len(reste) > 0:
        reste = reste[1:]
        print(reste)
    print(reste)

    # ...and by popping in place.
    reste = [1, 2, 3, 4, 5, 6]
    while len(reste) > 0:
        reste.pop(0)
        print(reste)
    print(reste)

    h = [[10, 100]]
    i = [[20, 30], [50, 65], [70, 98]]
    # Expected output: [[10, 20], [30, 50], [65, 70], [98, 100]]
    print(soustr_l_intervallesBB(h, i))

    a = [1, 2, 3, 4]
    a[2:3] = [5, 6]
    print(a)


def test_bbl():
    """Manual demonstration that tuples stored in several lists are shared.

    Prints three lists and the ids of their items, mutates the lists held
    inside the tuples, then prints everything again.
    """
    def print_id(liste):
        print([id(x) for x in liste])

    a = ('bc', 1, 3, [(1, 2)])
    b = ('d', 20, 30, [])
    c = ('bc', 50, 60, [(51, 55), (56, 57), (58, 59)])
    bbl = [a, b, c]
    bcl = [a, c]
    dl = [b]

    def montre_tout():
        print(bbl)
        print(bcl)
        print(dl)
        print_id(bbl)
        print_id(bcl)
        print_id(dl)

    montre_tout()
    bbl.pop(0)
    # The lists held by the tuples are mutated in place.
    b[3].extend([(21, 25), (26, 28)])
    c[3][1:2] = []
    montre_tout()


def extract(source=None, destination=None, nb_lignes=100000):
    """Copy the first ``nb_lignes`` lines of ``source`` into ``destination``.

    source      -- input file; defaults to MediteAppli/test/train.f.trans
                   under the current directory
    destination -- output file; defaults to MediteAppli/test/trans100000.txt
                   under the current directory
    nb_lignes   -- maximum number of lines copied (default 100000)

    Called without arguments it behaves as before.
    """
    dossier = os.path.join(os.getcwd(), "MediteAppli", "test")
    if source is None:
        source = os.path.join(dossier, "train.f.trans")
    if destination is None:
        destination = os.path.join(dossier, "trans100000.txt")

    # Read first so that a missing source does not create an empty output.
    lignes = lit_fichier(source).splitlines(True)
    with open(destination, 'w') as f:
        f.writelines(lignes[:nb_lignes])


if __name__ == '__main__':
    # test_liste() can be run here instead for the list/interval demo.
    extract()