1616import re
1717import sys
1818
19+ from lib .core .data import conf
20+ from lib .core .data import kb
1921from lib .core .settings import INVALID_UNICODE_PRIVATE_AREA
2022from lib .core .settings import IS_WIN
23+ from lib .core .settings import NULL
2124from lib .core .settings import PICKLE_PROTOCOL
2225from lib .core .settings import SAFE_HEX_MARKER
2326from lib .core .settings import UNICODE_ENCODING
@@ -89,6 +92,12 @@ def singleTimeWarnMessage(message): # Cross-referenced function
8992 sys .stdout .write ("\n " )
9093 sys .stdout .flush ()
9194
def filterNone(values):  # Cross-referenced function
    """
    Placeholder for a function whose implementation is not in this module

    As written here, the call unconditionally raises NotImplementedError.
    NOTE(review): presumably rebound at start-up to the real implementation
    defined elsewhere - confirm. getUnicode() in this module passes it a
    tuple of candidate encodings (some possibly None) and iterates over the
    result.
    """

    raise NotImplementedError()
97+
def isListLike(value):  # Cross-referenced function
    """
    Placeholder for a function whose implementation is not in this module

    As written here, the call unconditionally raises NotImplementedError.
    NOTE(review): presumably rebound at start-up to the real implementation
    defined elsewhere - confirm. getUnicode() in this module uses its result
    to decide whether a value should be converted element by element.
    """

    raise NotImplementedError()
100+
92101def stdoutencode (data ):
93102 retVal = data
94103
@@ -146,7 +155,7 @@ def decodeHex(value, binary=True):
146155 retVal = value
147156
148157 if isinstance (value , six .binary_type ):
149- value = value . decode ( UNICODE_ENCODING )
158+ value = getText ( value )
150159
151160 if value .lower ().startswith ("0x" ):
152161 value = value [2 :]
@@ -250,6 +259,50 @@ def getOrds(value):
250259
251260 return [_ if isinstance (_ , int ) else ord (_ ) for _ in value ]
252261
def getUnicode(value, encoding=None, noneToNull=False):
    """
    Return the unicode representation of the supplied value:

    >>> getUnicode('test') == u'test'
    True
    >>> getUnicode(1) == u'1'
    True

    Behaviour per input type:
      * None (only when noneToNull is set) -> NULL
      * text -> returned unchanged
      * bytes -> decoded with the first candidate encoding that works (the
        order of candidates is chosen heuristically from the content)
      * list-like -> list with every item converted recursively
      * anything else -> six.text_type(value)

    An explicitly given encoding is always tried first. Candidates that fail
    to decode the value, or that name an unknown codec, are skipped
    """

    if noneToNull and value is None:
        return NULL

    if isinstance(value, six.text_type):
        return value
    elif isinstance(value, six.binary_type):
        # Page encoding is taken into account only if there is an original page
        pageEncoding = kb.get("pageEncoding") if kb.get("originalPage") else None
        confEncoding = conf.get("encoding")
        fsEncoding = sys.getfilesystemencoding()

        # Heuristics (if encoding not explicitly specified)
        candidates = filterNone((encoding, pageEncoding, confEncoding, UNICODE_ENCODING, fsEncoding))
        if all(_ in value for _ in (b'<', b'>')):
            # Markup-like content - default order (page encoding first) is kept
            pass
        elif any(_ in value for _ in (b":\\", b'/', b'.')) and b'\n' not in value:
            # Single line looking like a file path - filesystem encoding goes first
            candidates = filterNone((encoding, fsEncoding, pageEncoding, UNICODE_ENCODING, confEncoding))
        elif confEncoding and b'\n' not in value:
            # Single line with user provided encoding - that one goes first
            candidates = filterNone((encoding, confEncoding, pageEncoding, fsEncoding, UNICODE_ENCODING))

        for candidate in candidates:
            try:
                return six.text_type(value, candidate)
            except (UnicodeDecodeError, LookupError):
                # LookupError: unknown encoding name - must not abort the
                # conversion while other candidates are still available
                pass

        try:
            return six.text_type(value, encoding or pageEncoding or UNICODE_ENCODING)
        except (UnicodeDecodeError, LookupError):
            # Note: "reversible" is not a built-in codec error handler
            # NOTE(review): presumably registered elsewhere through codecs.register_error() - confirm
            return six.text_type(value, UNICODE_ENCODING, errors="reversible")
    elif isListLike(value):
        # Note: result is always a list, regardless of the original container type
        return [getUnicode(_, encoding, noneToNull) for _ in value]
    else:
        try:
            return six.text_type(value)
        except UnicodeDecodeError:
            return six.text_type(str(value), errors="ignore")  # encoding ignored for non-basestring instances
305+
253306def getText (value ):
254307 """
255308 Returns textual value of a given value (Note: not necessary Unicode on Python2)
@@ -263,7 +316,7 @@ def getText(value):
263316 retVal = value
264317
265318 if isinstance (value , six .binary_type ):
266- retVal = value . decode ( UNICODE_ENCODING )
319+ retVal = getUnicode ( value )
267320
268321 if six .PY2 :
269322 try :
0 commit comments