medite.Utile.decodage

Source Code for Module medite.Utile.decodage

1 # -*- coding: iso-8859-1 -*- 2 # Copyright 2003 - 2008: Julien Bourdaillet (julien.bourdaillet@lip6.fr), Jean-Gabriel Ganascia (jean-gabriel.ganascia@lip6.fr) 3 # This file is part of MEDITE. 4 # 5 # MEDITE is free software; you can redistribute it and/or modify 6 # it under the terms of the GNU General Public License as published by 7 # the Free Software Foundation; either version 2 of the License, or 8 # (at your option) any later version. 9 # 10 # MEDITE is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU General Public License for more details. 14 # 15 # You should have received a copy of the GNU General Public License 16 # along with MEDITE; if not, write to the Free Software 17 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 #---------------------------------------------------------------------------- 19 # Nom: decodage.py 20 # Objet: 2 méthodes (redondantes) permettant de décoder,de façon heuristique, 21 # un fichier ou une chaine sans en connaitre le codage 22 # 23 # Auteurs: Julien Bourdaillet 24 # 25 # Cree: Nov 2004 26 # 27 #---------------------------------------------------------------------------- 28 29 import sys, codecs, re, locale 30 import Donnees.resultatAppli 31 if __debug__: 32 import contract 33 contract.checkmod(__name__) 34

def readEncodedFile(chemin, encodage):
    """Read the whole file `chemin` and return it decoded with `encodage`.

    Parameters:
        chemin: path of the file to read.
        encodage: codec name handed to codecs.open().

    Returns the decoded content of the file as one string.

    Errors raised by codecs.open() or read() (unreadable file, unknown
    codec, bytes that are invalid for `encodage`) are propagated to the
    caller.
    """
    f = codecs.open(chemin, 'r', encodage)
    try:
        # read() decodes while reading and may fail half-way; the handle
        # must be released in that case too
        return f.read()
    finally:
        f.close()

40

def decode_file_heuristically_(chemin):
    """Guess the encoding of the file `chemin` and return its decoded text.

    The raw bytes are read once, then tried against 'ascii' and against a
    short list of candidate encodings (the interpreter default encoding
    first when it is not ascii, then iso-8859-1, cp1252, iso-8859-15 and
    utf-8).

    Returns a 3-tuple (texte, perte, encodage):
        texte: the decoded content;
        perte: 0 when one candidate decoded the file cleanly, 1 when every
            candidate was rejected and undecodable bytes were dropped;
        encodage: name of the encoding that produced `texte`.

    Errors raised while opening or reading the file are propagated.
    """
    f = open(chemin, 'rb')
    try:
        brut = f.read()
    finally:
        f.close()

    try:
        # if it's ascii, we're done
        return brut.decode("ascii"), 0, "ascii"
    except UnicodeError:
        pass

    encodings = ["iso-8859-1", "cp1252", "iso-8859-15", "utf-8"]
    # if the default encoding is not ascii it's a good thing to try
    defaut = sys.getdefaultencoding()
    if defaut != "ascii":
        encodings.insert(0, defaut)

    for enc in encodings:
        try:
            texte = brut.decode(enc)
        except UnicodeError:
            continue

        # Most of the characters between 0x80 and 0x9F are displayable
        # in cp1252 but are control characters in iso-8859-1.  Skip
        # iso-8859-1 (and iso-8859-15) if they are found, even though the
        # decoding itself succeeded.
        if (enc in ("iso-8859-15", "iso-8859-1") and
                re.search(r"[\x80-\x9f]", texte) is not None):
            continue

        # Characters in the given range are more likely to be symbols
        # used in iso-8859-15, so even though these encodings accept
        # them, skip the candidate.
        if (enc in ("iso-8859-1", "cp1252") and
                re.search(r"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]",
                          texte) is not None):
            continue

        # the text is already decoded here, so a strict decode that
        # survived both filters is the success criterion
        return texte, 0, enc

    # nothing worked perfectly - try again, but use the "ignore" parameter
    # and return the longest result (first candidate wins on a tie)
    meilleur = None
    for enc in encodings:
        candidat = brut.decode(enc, "ignore")
        # compare the length of the text, not of the (text, enc) pair
        if meilleur is None or len(candidat) > len(meilleur[0]):
            meilleur = (candidat, enc)
    return meilleur[0], 1, meilleur[1]

def decode_string_heuristically_(s, enc=None, denc=sys.getdefaultencoding()):
    """Try interpreting the byte string `s` using several encodings.

    Parameters:
        s: the byte string to decode.
        enc: optional encoding supplied by the caller; tried first.
        denc: default encoding, tried right after `enc` unless it is
            "ascii".  Its default value is computed once, when the
            function is defined.

    Returns a 3-tuple (texte, perte, encodage):
        texte: `s` itself, unchanged, when it is pure ascii; otherwise
            the decoded unicode object;
        perte: 1 if the decoder had to punt and delete some characters
            from the input to generate a unicode object, else 0;
        encodage: name of the encoding that was used.
    """
    try:
        unicode(s, "ascii")
    except UnicodeError:
        pass
    else:
        # if it's ascii, we're done
        return s, 0, "ascii"

    encodings = ["iso-8859-1", "cp1252", "iso-8859-15", "utf-8"]
    # if the default encoding is not ascii it's a good thing to try
    if denc != "ascii":
        encodings.insert(0, denc)
    # always try any caller-provided encoding first
    if enc:
        encodings.insert(0, enc)

    for candidat in encodings:
        # Most of the characters between 0x80 and 0x9F are displayable
        # in cp1252 but are control characters in iso-8859-1.  Skip
        # iso-8859-1 if they are found, even though the unicode() call
        # might well succeed.
        if (candidat in ("iso-8859-15", "iso-8859-1") and
                re.search(r"[\x80-\x9f]", s) is not None):
            continue

        # Characters in the given range are more likely to be
        # symbols used in iso-8859-15, so even though unicode()
        # may accept such strings with those encodings, skip them.
        if (candidat in ("iso-8859-1", "cp1252") and
                re.search(r"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", s) is not None):
            continue

        try:
            x = unicode(s, candidat)
        except UnicodeError:
            continue
        # accept the candidate only if it round-trips to the input bytes
        if x.encode(candidat) == s:
            return x, 0, candidat

    # nothing worked perfectly - try again, but use the "ignore" parameter
    # and return the longest result (first candidate wins on a tie)
    meilleur = None
    for candidat in encodings:
        texte = unicode(s, candidat, "ignore")
        # compare the length of the text, not of the (text, enc) pair
        if meilleur is None or len(texte) > len(meilleur[0]):
            meilleur = (texte, candidat)
    return meilleur[0], 1, meilleur[1]

162

def loadFile(chemin, renvoieEnc=False):
    """Decode the file `chemin` with decode_file_heuristically().

    Returns the decoded text alone, or the pair (text, encoding name)
    when `renvoieEnc` is true.
    """
    # the middle element of the triple is not used here
    texte, _, encodage = decode_file_heuristically(chemin)
    if not renvoieEnc:
        return texte
    return texte, encodage

172

class mapperUnicodeUtf(object):
    """Two-way index map between a unicode string and its encoded bytes.

    Every character of the unicode text is encoded on its own and the
    number of bytes it yields is counted, which gives two tables:
        mapUni2Utf[i]: offset of the first byte of character i;
        mapUtf2Uni[j]: index of the character that byte j belongs to.
    Both tables carry one extra final entry (total byte length and total
    character count respectively) so that end-exclusive positions can be
    converted as well.
    """

    def __init__(self, texteUni, encodage=None):
        """Build both tables in a single pass over `texteUni`.

        Parameters:
            texteUni: the unicode text to index.
            encodage: codec used to measure each character; None selects
                'utf-8'.

        pre: isinstance(texteUni,unicode) """
        # NOTE(review): the 'pre:' clause above looks like a contract
        # consumed by contract.checkmod() at import time; keep it verbatim.
        assert isinstance(texteUni, unicode)
        self.texteUni = texteUni
        if encodage is None:
            # encode(None) is rejected with a TypeError, so the default
            # value needs a real codec
            encodage = 'utf-8'
        self.encodage = encodage
        self.mapUtf2Uni = []
        self.mapUni2Utf = []
        posUtf = 0
        # for each unicode character
        for posUni, charUni in enumerate(self.texteUni):
            self.mapUni2Utf.append(posUtf)
            # one entry per byte of its encoded form
            largeur = len(charUni.encode(self.encodage))
            self.mapUtf2Uni.extend([posUni] * largeur)
            posUtf += largeur
        # end-of-text entries
        self.mapUtf2Uni.append(len(self.texteUni))
        self.mapUni2Utf.append(posUtf)

    def utfToUni(self, pos):
        """Return the character index for byte offset `pos` (list lookup)."""
        return self.mapUtf2Uni[pos]

    def uniToUtf(self, pos):
        """Return the byte offset for character index `pos` (list lookup)."""
        return self.mapUni2Utf[pos]

    def __convertResultat(self, item, utf2uni=True):
        """Convert the positions held by `item`, recursively.

        A list is converted element by element, a tuple through its first
        two elements, and anything else is used as an index into the
        table selected by `utf2uni` (byte -> character when true,
        character -> byte when false).
        """
        if isinstance(item, list):
            # the direction must follow the recursion, otherwise nested
            # positions are always converted byte -> character
            return [self.__convertResultat(elt, utf2uni) for elt in item]
        if isinstance(item, tuple):
            return (self.__convertResultat(item[0], utf2uni),
                    self.__convertResultat(item[1], utf2uni))
        if utf2uni:
            return self.mapUtf2Uni[item]
        return self.mapUni2Utf[item]

    def utfToUniResultat(self, res):
        """Return a copy of `res` with byte offsets turned into character
        indexes."""
        return self.convertResultat_(res, utf2uni=True)

    def uniToUtfResultat(self, res):
        """Return a copy of `res` with character indexes turned into byte
        offsets."""
        return self.convertResultat_(res, utf2uni=False)

    def convertResultat_(self, res, utf2uni=True):
        """Convert the indexes of a Resultat from one format to the other.

        If `utf2uni` is true the conversion goes from byte offsets to
        character indexes, otherwise from character indexes to byte
        offsets.  A new Resultat is built; the concatenated texts are
        passed through unchanged.
        """
        assert isinstance(res, Donnees.resultatAppli.Resultat)
        convertir = self.__convertResultat
        return Donnees.resultatAppli.Resultat(
            convertir(res.getListeInsertions(), utf2uni),
            convertir(res.getListeSuppressions(), utf2uni),
            convertir(res.getListeDeplacements(), utf2uni),
            convertir(res.getListeRemplacements(), utf2uni),
            convertir(res.getLgSource(), utf2uni),
            res.getTextesConcatenes(),
            convertir(res.getBlocsCommuns(), utf2uni),
            convertir(res.getPairesBlocsDeplaces(), utf2uni))

247 248 import locale 249 #def guess_encoding(data):

def decode_string_heuristically(data):
    """Decode the byte string `data`, guessing its encoding.

    Candidates are tried in this order: 'utf-8', the encodings reported
    by the locale module, then 'latin-1'.  The first candidate that
    decodes `data` without error wins.

    The calling program *must* first call
        locale.setlocale(locale.LC_ALL, '')

    Returns a 3-tuple (decoded_unicode, None, successful_encoding).  The
    middle element is always None; presumably it keeps the tuple shaped
    like the one returned by the older decode_*_heuristically_ helpers.

    Raises UnicodeError when no candidate can decode `data`.
    """
    # we make 'utf-8' the first encoding
    encodings = ['utf-8']

    # next we add anything we can learn from the locale
    sondes = (
        lambda: locale.nl_langinfo(locale.CODESET),
        lambda: locale.getlocale()[1],
        lambda: locale.getdefaultlocale()[1],
    )
    for sonde in sondes:
        try:
            encodings.append(sonde())
        except (AttributeError, IndexError, ValueError):
            # AttributeError: query not available on this platform;
            # ValueError: the locale module could not parse the locale name
            pass

    # we try 'latin-1' last
    encodings.append('latin-1')

    for enc in encodings:
        # some of the locale calls may have returned None
        if not enc:
            continue
        try:
            decoded = unicode(data, enc)
        except (UnicodeError, LookupError):
            # invalid bytes for this codec, or codec unknown to Python
            continue
        return decoded, None, enc

    raise UnicodeError(
        'Unable to decode input data. Tried the following encodings: %s.'
        % ', '.join([repr(enc) for enc in encodings if enc]))

def decode_file_heuristically(chemin):
    """Read the file `chemin` and decode it with
    decode_string_heuristically().

    Returns whatever decode_string_heuristically() returns for the
    content of the file.  Errors raised while opening or reading the file
    are propagated.
    """
    # NOTE(review): the file is opened in text mode ('r'); on platforms
    # that translate line endings the decoder does not see the raw bytes
    # of the file -- confirm whether 'rb' is wanted.
    f = open(chemin, 'r')
    try:
        s = f.read()
    finally:
        # release the handle even when read() fails
        f.close()
    return decode_string_heuristically(s)