1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 import sys, codecs, re, locale
30 import Donnees.resultatAppli
31 if __debug__:
32 import contract
33 contract.checkmod(__name__)
34
# NOTE(review): the def line was absent from the reviewed excerpt; the name and
# parameter order are taken from the readEncodedFile(chemin, enc) calls made
# elsewhere in this file -- confirm against the complete source.
def readEncodedFile(chemin, encodage):
    """Return the whole content of the file chemin decoded with encodage.

    chemin   -- path of the file to read.
    encodage -- codec name handed to codecs.open().

    Any error raised while opening or decoding (IOError, LookupError,
    UnicodeError) propagates to the caller.
    """
    f = codecs.open(chemin, 'r', encodage)
    try:
        # Callers probe candidate codecs and expect decoding to fail often,
        # so the handle has to be released on the error path as well.
        return f.read()
    finally:
        f.close()
40
# NOTE(review): the def line was absent from the reviewed excerpt; the name and
# signature are taken from the decode_file_heuristically(chemin) call made
# elsewhere in this file -- confirm against the complete source.
def decode_file_heuristically(chemin):
    """Decode the file chemin by trying several possible encodings.

    The file is read once, as raw bytes.  Pure ASCII content is reported as
    "ascii".  Otherwise the candidate encodings are tried in order and the
    first one that decodes the bytes and encodes the result back to exactly
    the same bytes is chosen.

    The return value is a three-element tuple (text, punted, encoding):
    text is the decoded content, punted is 0 for a lossless decode and 1 if
    the decoder had to punt and delete some bytes from the input, and
    encoding is the name of the codec that produced text.

    Errors raised while opening or reading the file propagate unchanged.
    """
    f = open(chemin, 'rb')
    try:
        raw = f.read()
    finally:
        f.close()

    try:
        # if it's ascii, we're done
        return raw.decode("ascii"), 0, "ascii"
    except UnicodeError:
        pass

    encodings = ["iso-8859-1", "cp1252", "iso-8859-15", "utf-8"]
    # a non-ascii interpreter default encoding is tried before the others
    if sys.getdefaultencoding() != "ascii":
        encodings.insert(0, sys.getdefaultencoding())

    # Bytes 0x80-0x9f are C1 control codes in iso-8859-1 and iso-8859-15, so
    # their presence rules those two encodings out.
    has_c1 = re.search(br"[\x80-\x9f]", raw) is not None
    # These bytes are the positions where iso-8859-15 differs from
    # iso-8859-1; their presence rules out iso-8859-1 and cp1252.
    has_latin9 = re.search(br"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", raw) is not None

    for enc in encodings:
        if has_c1 and enc in ("iso-8859-15", "iso-8859-1"):
            continue
        if has_latin9 and enc in ("iso-8859-1", "cp1252"):
            continue
        try:
            text = raw.decode(enc)
        except UnicodeError:
            continue
        # Compare bytes with bytes: only an exact round trip is lossless.
        if text.encode(enc) == raw:
            return text, 0, enc

    # Nothing decoded cleanly: drop the undecodable bytes and keep the
    # longest result (ties are broken by text, then by encoding name).
    ranked = sorted((len(text), text, enc)
                    for text, enc in [(raw.decode(enc, "ignore"), enc)
                                      for enc in encodings])
    length, text, enc = ranked[-1]
    return text, 1, enc
108
# NOTE(review): the def line was absent from the reviewed excerpt.  The name
# comes from the decode_string_heuristically(s) call made elsewhere in this
# file; the default values of enc and denc are inferred from how the body uses
# them -- confirm against the complete source.
def decode_string_heuristically(s, enc=None, denc=sys.getdefaultencoding()):
    """Try interpreting the byte string s using several possible encodings.

    s    -- the byte string to decode.
    enc  -- optional encoding to try before all the others.
    denc -- default encoding, tried right after enc unless it is "ascii".

    The return value is a three-element tuple.  The first element is either
    s itself (when it is pure ASCII) or a Unicode object.  The second element
    is 1 if the decoder had to punt and delete some characters from the input
    to successfully generate a Unicode object, 0 otherwise.  The third
    element is the name of the encoding that was used.

    An enc or denc naming an unknown codec is skipped instead of raising
    LookupError.
    """
    try:
        s.decode("ascii")
    except UnicodeError:
        pass
    else:
        # if it's ascii, we're done
        return s, 0, "ascii"

    encodings = ["iso-8859-1", "cp1252", "iso-8859-15", "utf-8"]
    if denc != "ascii":
        encodings.insert(0, denc)
    if enc:
        encodings.insert(0, enc)

    # Both scans depend only on s, so they are done once, not per encoding.
    # Bytes 0x80-0x9f are C1 control codes in iso-8859-1 and iso-8859-15.
    has_c1 = re.search(br"[\x80-\x9f]", s) is not None
    # Positions where iso-8859-15 differs from iso-8859-1.
    has_latin9 = re.search(br"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", s) is not None

    for candidate in encodings:
        if has_c1 and candidate in ("iso-8859-15", "iso-8859-1"):
            continue
        if has_latin9 and candidate in ("iso-8859-1", "cp1252"):
            continue
        try:
            text = s.decode(candidate)
        except (UnicodeError, LookupError):
            continue
        # Accept only an exact round trip back to the input bytes.
        if text.encode(candidate) == s:
            return text, 0, candidate

    # Nothing decoded cleanly: drop the undecodable bytes and keep the
    # longest result (ties are broken by text, then by encoding name).
    decoded = []
    for candidate in encodings:
        try:
            decoded.append((s.decode(candidate, "ignore"), candidate))
        except LookupError:
            continue
    ranked = sorted((len(text), text, name) for text, name in decoded)
    length, text, name = ranked[-1]
    return text, 1, name
162
164 (x,num,encodage) = decode_file_heuristically(chemin)
165
166 texte = x
167
168 if renvoieEnc:
169 return texte,encodage
170 else:
171 return texte
172
174 """Map chaque position d'une chaine UTF-8 en leur position Unicode
175
176 Chaque caractère de la chaine unicode est converti en UTF-8 et on compte
177 leur nombre. """
def __init__(self, texteUni, encodage=None):
    """Build the position tables between texteUni and its encoded form.

    Runs in time linear in the length of the unicode string: every character
    is encoded on its own and the number of bytes it yields is counted.

    texteUni -- the unicode text to index.
    encodage -- codec used to encode each character; None (the default)
                selects "utf-8".

    After construction, mapUni2Utf[i] is the byte offset at which character
    i starts, and mapUtf2Uni[j] is the index of the character that byte j
    belongs to.  mapUtf2Uni carries one extra final entry, len(texteUni),
    for the position just past the last byte.

    pre: isinstance(texteUni,unicode) """
    assert isinstance(texteUni, unicode)
    if encodage is None:
        # The former default was passed straight to unicode.encode(), which
        # rejects None; fall back to the encoding this mapper is meant for.
        encodage = 'utf-8'
    self.texteUni = texteUni
    self.encodage = encodage
    self.mapUtf2Uni = []
    self.mapUni2Utf = []
    posUtf = 0
    for posUni, charUni in enumerate(texteUni):
        self.mapUni2Utf.append(posUtf)
        nbOctets = len(charUni.encode(encodage))
        # every byte of this character maps back to the same character index
        self.mapUtf2Uni.extend([posUni] * nbOctets)
        posUtf += nbOctets
    # end-of-text sentinel for the byte -> character direction
    # NOTE(review): mapUni2Utf gets no matching sentinel -- confirm whether
    # looking up the position len(texteUni) is ever needed in that direction.
    self.mapUtf2Uni.append(len(texteUni))
203
204
206 """accès en O(1)"""
207
208 return self.mapUtf2Uni[pos]
209
211 """accès en O(1)"""
212 return self.mapUni2Utf[pos]
213
# NOTE(review): the def line was absent from the reviewed excerpt.  The name
# and parameter order come from the self.__convertResultat(..., utf2uni) calls
# made elsewhere in this class; the default value of utf2uni is an assumption
# -- confirm against the complete source.
def __convertResultat(self, item, utf2uni=True):
    """Convert the positions held in item from one indexing to the other.

    Handles lists, tuples and plain integers:
      - a list is converted element by element and returned as a list;
      - a tuple is treated as a pair: its first two elements are converted;
      - anything else is used as an index into the lookup table.

    utf2uni -- if true, positions are looked up in mapUtf2Uni, otherwise in
               mapUni2Utf.  The flag is applied at every nesting level.
    """
    if utf2uni:
        table = self.mapUtf2Uni
    else:
        table = self.mapUni2Utf

    def convert(value):
        # The table is fixed once above, so nested values are always
        # converted in the same direction as the outermost call.
        if isinstance(value, list):
            return [convert(element) for element in value]
        if isinstance(value, tuple):
            return (convert(value[0]), convert(value[1]))
        return table[value]

    return convert(item)
225
228
231
233 """Converti les index d'un Resultat d'un format à l'autre
234
235 Si utf2uni= True conversion de utf vers uni
236 Si faux de unichr vers utf"""
237 assert isinstance(res, Donnees.resultatAppli.Resultat)
238 return Donnees.resultatAppli.Resultat(
239 self.__convertResultat(res.getListeInsertions(),utf2uni),
240 self.__convertResultat(res.getListeSuppressions(),utf2uni),
241 self.__convertResultat(res.getListeDeplacements(),utf2uni),
242 self.__convertResultat(res.getListeRemplacements(),utf2uni),
243 self.__convertResultat(res.getLgSource(),utf2uni),
244 res.getTextesConcatenes(),
245 self.__convertResultat(res.getBlocsCommuns(),utf2uni),
246 self.__convertResultat(res.getPairesBlocsDeplaces(),utf2uni))
247
248 import locale
249
251 """
252 Given a byte string, attempt to decode it.
253 Tries the standard 'UTF8' and 'latin-1' encodings,
254 Plus several gathered from locale information.
255
256 The calling program *must* first call
257 locale.setlocale(locale.LC_ALL, '')
258
259 If successful it returns
260 (decoded_unicode, successful_encoding)
261 If unsuccessful it raises a ``UnicodeError``
262 """
263 successful_encoding = None
264
265 encodings = ['utf-8']
266
267
268 try:
269 encodings.append(locale.nl_langinfo(locale.CODESET))
270 except AttributeError:
271 pass
272 try:
273 encodings.append(locale.getlocale()[1])
274 except (AttributeError, IndexError):
275 pass
276 try:
277 encodings.append(locale.getdefaultlocale()[1])
278 except (AttributeError, IndexError):
279 pass
280
281
282 encodings.extend(['latin-1'])
283 for enc in encodings:
284
285
286 if not enc:
287 continue
288 try:
289 decoded = unicode(data, enc)
290 successful_encoding = enc
291
292 except (UnicodeError, LookupError):
293 pass
294 else:
295 break
296 if not successful_encoding:
297 raise UnicodeError(
298 'Unable to decode input data. Tried the following encodings: %s.'
299 % ', '.join([repr(enc) for enc in encodings if enc]))
300 else:
301 return decoded, None, successful_encoding
302
303
305 f = open(chemin,'r')
306 s = f.read()
307 f.close()
308 return decode_string_heuristically(s)
309