# -*- coding: iso-8859-1 -*-
# Copyright 20003 - 2008: Julien Bourdaillet (julien.bourdaillet@lip6.fr), Jean-Gabriel Ganascia (jean-gabriel.ganascia@lip6.fr)
# This file is part of MEDITE.
#
#    MEDITE is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    MEDITE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with Foobar; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#----------------------------------------------------------------------------
# Nom:          decodage.py
# Objet:        2 méthodes (redondantes) permettant de décoder,de façon heuristique,
#               un fichier ou une chaine sans en connaitre le codage    
#
# Auteurs:      Julien Bourdaillet
#
# Cree:         Nov 2004
#        
#----------------------------------------------------------------------------

import sys, codecs, re, locale
import Donnees.resultatAppli
if __debug__:
    import contract
    contract.checkmod(__name__)

def readEncodedFile(chemin, encodage):
        f = codecs.open(chemin,'r',encodage)
        s = f.read()
        f.close()
        return s
        
def decode_file_heuristically_(chemin): 
        """Try interpreting s using several possible encodings.
        The return value is a three-element tuple.  The first element is either an
        ASCII string or a Unicode object.  The second element is 1
        if the decoder had to punt and delete some characters from the input
        to successfully generate a Unicode object."""
        try:
            s = readEncodedFile(chemin,'ascii')
            x = unicode(s, "ascii")
            # if it's ascii, we're done
            return s, 0, "ascii"
        except (UnicodeError,TypeError):
            encodings = ["iso-8859-1","cp1252","iso-8859-15","utf-8"]
            # if the default encoding is not ascii it's a good thing to try
            if sys.getdefaultencoding() != "ascii": encodings.insert(0, sys.getdefaultencoding())
            for enc in encodings:
                # Most of the characters between 0x80 and 0x9F are displayable
                # in cp1252 but are control characters in iso-8859-1.  Skip
                # iso-8859-1 if they are found, even though the unicode() call
                # might well succeed.
                if (enc in ("iso-8859-15", "iso-8859-1")):
                    try:
                        s = readEncodedFile(chemin,enc)
                        if (re.search(r"[\x80-\x9f]", s) is not None):
                            continue
                    except (UnicodeError,TypeError):#,err:
                        continue 

                # Characters in the given range are more likely to be 
                # symbols used in iso-8859-15, so even though unicode()
                # may accept such strings with those encodings, skip them.
                if (enc in ("iso-8859-1", "cp1252")):
                    try:
                        s = readEncodedFile(chemin,enc)
                        if (re.search(r"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", s) is not None):
                            continue
                    except (UnicodeError,TypeError),err:
                        continue
                
                try:
                    s = readEncodedFile(chemin,enc)
                    if not isinstance(s,unicode):
                        x = unicode(s, enc)
                    else:
                        x = s
                except (UnicodeError,TypeError),err:
                    pass 
                else:
                    if x.encode(enc) == s:
                        return x, 0, enc

            # nothing worked perfectly - try again, but use the "ignore" parameter
            # and return the longest result
            output = list()
            for enc in encodings:
                try:
                    s = readEncodedFile(chemin,enc)
                    if not isinstance(s,unicode):
                        output.append((unicode(s,enc,'ignore'),enc))
                    else:
                        output.append((s,enc))
                except (UnicodeError,TypeError),err:   
                    continue
            output = [(len(x), x) for x in output]
            output.sort()
            x, enc = output[-1][1]
            return x, 1, enc
            
def decode_string_heuristically_(s, enc=None, denc=sys.getdefaultencoding()):
    """Try interpreting s using several possible encodings.
    The return value is a three-element tuple.  The first element is either an
    ASCII string or a Unicode object.  The second element is 1
    if the decoder had to punt and delete some characters from the input
    to successfully generate a Unicode object."""
    #if isinstance(s, unicode):
    #    return s, 0, "utf-8"
    try:
        x = unicode(s, "ascii")
        # if it's ascii, we're done
        return s, 0, "ascii"
    except UnicodeError:
        encodings = ["iso-8859-1","cp1252","iso-8859-15","utf-8"]
        # if the default encoding is not ascii it's a good thing to try
        if denc != "ascii": encodings.insert(0, denc)
        # always try any caller-provided encoding first
        if enc: encodings.insert(0, enc)
        for enc in encodings:

            # Most of the characters between 0x80 and 0x9F are displayable
            # in cp1252 but are control characters in iso-8859-1.  Skip
            # iso-8859-1 if they are found, even though the unicode() call
            # might well succeed.

            if (enc in ("iso-8859-15", "iso-8859-1") and
                re.search(r"[\x80-\x9f]", s) is not None):
                continue

            # Characters in the given range are more likely to be 
            # symbols used in iso-8859-15, so even though unicode()
            # may accept such strings with those encodings, skip them.

            if (enc in ("iso-8859-1", "cp1252") and
                re.search(r"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", s) is not None):
                continue
            
                
            try:
                x = unicode(s, enc)
            except UnicodeError:
                pass
            else:
                if x.encode(enc) == s:
                    return x, 0, enc

        # nothing worked perfectly - try again, but use the "ignore" parameter
        # and return the longest result
        output = [(unicode(s, enc, "ignore"), enc) for enc in encodings]
        output = [(len(x), x) for x in output]
        output.sort()
        x, enc = output[-1][1]
        return x, 1, enc
        
def loadFile(chemin, renvoieEnc=False):
        (x,num,encodage) = decode_file_heuristically(chemin)
        #print encodage
        texte = x #readEncodedFile(chemin,encodage)
        #print type(texte)
        if renvoieEnc:
            return texte,encodage
        else:            
            return texte#.encode(encodage)

class mapperUnicodeUtf(object):
    """Map chaque position d'une chaine UTF-8 en leur position Unicode
    
    Chaque caractère de la chaine unicode est converti en UTF-8 et on compte
    leur nombre. """ 
    def __init__(self, texteUni, encodage=None):
        """
        linéaire par rapport à la longueur de la chaine unicode
        pre: isinstance(texteUni,unicode) """
        assert isinstance(texteUni,unicode)
        self.texteUni = texteUni
        #if encodage is None:
        #    chaine,x,enc = decode_string_heuristically(self.texteUni)
        #    self.encodage = enc
        #    print chaine.encode(enc),len(chaine),len(chaine.encode(enc)),len(self.texteUni)
        #else:            
        self.encodage = encodage
        self.mapUtf2Uni = []
        self.mapUni2Utf = []
        posUni = posUtf = 0
        # pour chaque caractère unicode
        for charUni in self.texteUni:
            self.mapUni2Utf.append(posUtf)
            # pour chaque octet correspondant à son encodage 
            for j in xrange(len(charUni.encode(self.encodage))):
                self.mapUtf2Uni.append(posUni)
                posUtf += 1
                #print self.mapUtf2Uni,len(self.mapUtf2Uni),enc
            posUni += 1     
        self.mapUtf2Uni.append(posUni)            
        #print self.mapUtf2Uni,len(self.mapUtf2Uni),self.encodage            
                
    def utfToUni(self,pos):
        """accès en O(1)"""
        #print pos,self.mapUtf2Uni[pos]
        return self.mapUtf2Uni[pos]
    
    def uniToUtf(self,pos):
        """accès en O(1)"""
        return self.mapUni2Utf[pos]
        
    def __convertResultat(self, item, utf2uni=True):
        """Traite type list, tuple et int normal"""
        if isinstance(item, list):
            return map(self.__convertResultat, item)
        elif isinstance(item, tuple):
            return (self.__convertResultat(item[0]),self. __convertResultat(item[1]))
        else:
            if utf2uni:
                return  self.mapUtf2Uni[item]
            else:
                return  self.mapUni2Utf[item]
            
    def utfToUniResultat(self, res):
        return self.convertResultat_(res,utf2uni=True)
        
    def uniToUtfResultat(self, res):
        return self.convertResultat_(res,utf2uni=False)
        
    def convertResultat_(self, res, utf2uni=True):
        """Converti les index d'un Resultat d'un format à l'autre
        
        Si utf2uni= True conversion de utf vers uni
        Si faux de unichr vers utf"""
        assert isinstance(res, Donnees.resultatAppli.Resultat)        
        return Donnees.resultatAppli.Resultat(
            self.__convertResultat(res.getListeInsertions(),utf2uni),
            self.__convertResultat(res.getListeSuppressions(),utf2uni),
            self.__convertResultat(res.getListeDeplacements(),utf2uni),
            self.__convertResultat(res.getListeRemplacements(),utf2uni),
            self.__convertResultat(res.getLgSource(),utf2uni),
            res.getTextesConcatenes(),
            self.__convertResultat(res.getBlocsCommuns(),utf2uni),
            self.__convertResultat(res.getPairesBlocsDeplaces(),utf2uni))
            
import locale
#def guess_encoding(data):
def decode_string_heuristically(data):#s, enc=None, denc=sys.getdefaultencoding()):
    """
    Given a byte string, attempt to decode it.
    Tries the standard 'UTF8' and 'latin-1' encodings,
    Plus several gathered from locale information.

    The calling program *must* first call 
        locale.setlocale(locale.LC_ALL, '')

    If successful it returns 
        (decoded_unicode, successful_encoding)
    If unsuccessful it raises a ``UnicodeError``
    """
    successful_encoding = None
    # we make 'utf-8' the first encoding
    encodings = ['utf-8']
    #
    # next we add anything we can learn from the locale
    try:
        encodings.append(locale.nl_langinfo(locale.CODESET))
    except AttributeError:
        pass
    try:
        encodings.append(locale.getlocale()[1])
    except (AttributeError, IndexError):
        pass
    try:
        encodings.append(locale.getdefaultlocale()[1])
    except (AttributeError, IndexError):
        pass
    #
    # we try 'latin-1' last
    encodings.extend(['latin-1'])
    for enc in encodings:
        # some of the locale calls 
        # may have returned None
        if not enc:
            continue
        try:
            decoded = unicode(data, enc)
            successful_encoding = enc

        except (UnicodeError, LookupError):
            pass
        else:
            break
    if not successful_encoding:
         raise UnicodeError(
        'Unable to decode input data.  Tried the following encodings: %s.'
        % ', '.join([repr(enc) for enc in encodings if enc]))
    else:
         return decoded, None, successful_encoding    
         
#def guess_file_encoding(chemin):  
def decode_file_heuristically(chemin):     
    f = open(chemin,'r')
    s = f.read()
    f.close()
    return decode_string_heuristically(s)