# -*- coding: iso-8859-1 -*- # Copyright 20003 - 2008: Julien Bourdaillet (julien.bourdaillet@lip6.fr), Jean-Gabriel Ganascia (jean-gabriel.ganascia@lip6.fr) # This file is part of MEDITE. # # MEDITE is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # MEDITE is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Foobar; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #---------------------------------------------------------------------------- # Nom: decodage.py # Objet: 2 méthodes (redondantes) permettant de décoder,de façon heuristique, # un fichier ou une chaine sans en connaitre le codage # # Auteurs: Julien Bourdaillet # # Cree: Nov 2004 # #---------------------------------------------------------------------------- import sys, codecs, re, locale import Donnees.resultatAppli if __debug__: import contract contract.checkmod(__name__) def readEncodedFile(chemin, encodage): f = codecs.open(chemin,'r',encodage) s = f.read() f.close() return s def decode_file_heuristically_(chemin): """Try interpreting s using several possible encodings. The return value is a three-element tuple. The first element is either an ASCII string or a Unicode object. The second element is 1 if the decoder had to punt and delete some characters from the input to successfully generate a Unicode object.""" try: s = readEncodedFile(chemin,'ascii') x = unicode(s, "ascii") # if it's ascii, we're done return s, 0, "ascii" except (UnicodeError,TypeError): encodings = ["iso-8859-1","cp1252","iso-8859-15","utf-8"] # if the default encoding is not ascii it's a good thing to try if sys.getdefaultencoding() != "ascii": encodings.insert(0, sys.getdefaultencoding()) for enc in encodings: # Most of the characters between 0x80 and 0x9F are displayable # in cp1252 but are control characters in iso-8859-1. Skip # iso-8859-1 if they are found, even though the unicode() call # might well succeed. if (enc in ("iso-8859-15", "iso-8859-1")): try: s = readEncodedFile(chemin,enc) if (re.search(r"[\x80-\x9f]", s) is not None): continue except (UnicodeError,TypeError):#,err: continue # Characters in the given range are more likely to be # symbols used in iso-8859-15, so even though unicode() # may accept such strings with those encodings, skip them. if (enc in ("iso-8859-1", "cp1252")): try: s = readEncodedFile(chemin,enc) if (re.search(r"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", s) is not None): continue except (UnicodeError,TypeError),err: continue try: s = readEncodedFile(chemin,enc) if not isinstance(s,unicode): x = unicode(s, enc) else: x = s except (UnicodeError,TypeError),err: pass else: if x.encode(enc) == s: return x, 0, enc # nothing worked perfectly - try again, but use the "ignore" parameter # and return the longest result output = list() for enc in encodings: try: s = readEncodedFile(chemin,enc) if not isinstance(s,unicode): output.append((unicode(s,enc,'ignore'),enc)) else: output.append((s,enc)) except (UnicodeError,TypeError),err: continue output = [(len(x), x) for x in output] output.sort() x, enc = output[-1][1] return x, 1, enc def decode_string_heuristically_(s, enc=None, denc=sys.getdefaultencoding()): """Try interpreting s using several possible encodings. The return value is a three-element tuple. The first element is either an ASCII string or a Unicode object. The second element is 1 if the decoder had to punt and delete some characters from the input to successfully generate a Unicode object.""" #if isinstance(s, unicode): # return s, 0, "utf-8" try: x = unicode(s, "ascii") # if it's ascii, we're done return s, 0, "ascii" except UnicodeError: encodings = ["iso-8859-1","cp1252","iso-8859-15","utf-8"] # if the default encoding is not ascii it's a good thing to try if denc != "ascii": encodings.insert(0, denc) # always try any caller-provided encoding first if enc: encodings.insert(0, enc) for enc in encodings: # Most of the characters between 0x80 and 0x9F are displayable # in cp1252 but are control characters in iso-8859-1. Skip # iso-8859-1 if they are found, even though the unicode() call # might well succeed. if (enc in ("iso-8859-15", "iso-8859-1") and re.search(r"[\x80-\x9f]", s) is not None): continue # Characters in the given range are more likely to be # symbols used in iso-8859-15, so even though unicode() # may accept such strings with those encodings, skip them. if (enc in ("iso-8859-1", "cp1252") and re.search(r"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", s) is not None): continue try: x = unicode(s, enc) except UnicodeError: pass else: if x.encode(enc) == s: return x, 0, enc # nothing worked perfectly - try again, but use the "ignore" parameter # and return the longest result output = [(unicode(s, enc, "ignore"), enc) for enc in encodings] output = [(len(x), x) for x in output] output.sort() x, enc = output[-1][1] return x, 1, enc def loadFile(chemin, renvoieEnc=False): (x,num,encodage) = decode_file_heuristically(chemin) #print encodage texte = x #readEncodedFile(chemin,encodage) #print type(texte) if renvoieEnc: return texte,encodage else: return texte#.encode(encodage) class mapperUnicodeUtf(object): """Map chaque position d'une chaine UTF-8 en leur position Unicode Chaque caractère de la chaine unicode est converti en UTF-8 et on compte leur nombre. """ def __init__(self, texteUni, encodage=None): """ linéaire par rapport à la longueur de la chaine unicode pre: isinstance(texteUni,unicode) """ assert isinstance(texteUni,unicode) self.texteUni = texteUni #if encodage is None: # chaine,x,enc = decode_string_heuristically(self.texteUni) # self.encodage = enc # print chaine.encode(enc),len(chaine),len(chaine.encode(enc)),len(self.texteUni) #else: self.encodage = encodage self.mapUtf2Uni = [] self.mapUni2Utf = [] posUni = posUtf = 0 # pour chaque caractère unicode for charUni in self.texteUni: self.mapUni2Utf.append(posUtf) # pour chaque octet correspondant à son encodage for j in xrange(len(charUni.encode(self.encodage))): self.mapUtf2Uni.append(posUni) posUtf += 1 #print self.mapUtf2Uni,len(self.mapUtf2Uni),enc posUni += 1 self.mapUtf2Uni.append(posUni) #print self.mapUtf2Uni,len(self.mapUtf2Uni),self.encodage def utfToUni(self,pos): """accès en O(1)""" #print pos,self.mapUtf2Uni[pos] return self.mapUtf2Uni[pos] def uniToUtf(self,pos): """accès en O(1)""" return self.mapUni2Utf[pos] def __convertResultat(self, item, utf2uni=True): """Traite type list, tuple et int normal""" if isinstance(item, list): return map(self.__convertResultat, item) elif isinstance(item, tuple): return (self.__convertResultat(item[0]),self. __convertResultat(item[1])) else: if utf2uni: return self.mapUtf2Uni[item] else: return self.mapUni2Utf[item] def utfToUniResultat(self, res): return self.convertResultat_(res,utf2uni=True) def uniToUtfResultat(self, res): return self.convertResultat_(res,utf2uni=False) def convertResultat_(self, res, utf2uni=True): """Converti les index d'un Resultat d'un format à l'autre Si utf2uni= True conversion de utf vers uni Si faux de unichr vers utf""" assert isinstance(res, Donnees.resultatAppli.Resultat) return Donnees.resultatAppli.Resultat( self.__convertResultat(res.getListeInsertions(),utf2uni), self.__convertResultat(res.getListeSuppressions(),utf2uni), self.__convertResultat(res.getListeDeplacements(),utf2uni), self.__convertResultat(res.getListeRemplacements(),utf2uni), self.__convertResultat(res.getLgSource(),utf2uni), res.getTextesConcatenes(), self.__convertResultat(res.getBlocsCommuns(),utf2uni), self.__convertResultat(res.getPairesBlocsDeplaces(),utf2uni)) import locale #def guess_encoding(data): def decode_string_heuristically(data):#s, enc=None, denc=sys.getdefaultencoding()): """ Given a byte string, attempt to decode it. Tries the standard 'UTF8' and 'latin-1' encodings, Plus several gathered from locale information. The calling program *must* first call locale.setlocale(locale.LC_ALL, '') If successful it returns (decoded_unicode, successful_encoding) If unsuccessful it raises a ``UnicodeError`` """ successful_encoding = None # we make 'utf-8' the first encoding encodings = ['utf-8'] # # next we add anything we can learn from the locale try: encodings.append(locale.nl_langinfo(locale.CODESET)) except AttributeError: pass try: encodings.append(locale.getlocale()[1]) except (AttributeError, IndexError): pass try: encodings.append(locale.getdefaultlocale()[1]) except (AttributeError, IndexError): pass # # we try 'latin-1' last encodings.extend(['latin-1']) for enc in encodings: # some of the locale calls # may have returned None if not enc: continue try: decoded = unicode(data, enc) successful_encoding = enc except (UnicodeError, LookupError): pass else: break if not successful_encoding: raise UnicodeError( 'Unable to decode input data. Tried the following encodings: %s.' % ', '.join([repr(enc) for enc in encodings if enc])) else: return decoded, None, successful_encoding #def guess_file_encoding(chemin): def decode_file_heuristically(chemin): f = open(chemin,'r') s = f.read() f.close() return decode_string_heuristically(s)