@@ -60,6 +60,7 @@ typedef struct {
6060 int ordered_attributes ; /* Return attributes as a list. */
6161 int specified_attributes ; /* Report only specified attributes. */
6262 int in_callback ; /* Is a callback active? */
63+ PyObject * intern ; /* Dictionary to intern strings */
6364 PyObject * * handlers ;
6465} xmlparseobject ;
6566
@@ -123,7 +124,7 @@ set_error(xmlparseobject *self)
123124 Returns None if str is a null pointer. */
124125
125126static PyObject *
126- conv_string_to_unicode (XML_Char * str )
127+ conv_string_to_unicode (const XML_Char * str )
127128{
128129 /* XXX currently this code assumes that XML_Char is 8-bit,
129130 and hence in UTF-8. */
@@ -132,8 +133,7 @@ conv_string_to_unicode(XML_Char *str)
132133 Py_INCREF (Py_None );
133134 return Py_None ;
134135 }
135- return PyUnicode_DecodeUTF8 ((const char * )str ,
136- strlen ((const char * )str ),
136+ return PyUnicode_DecodeUTF8 (str , strlen (str ),
137137 "strict" );
138138}
139139
@@ -155,7 +155,7 @@ conv_string_len_to_unicode(const XML_Char *str, int len)
155155 Returns None if str is a null pointer. */
156156
157157static PyObject *
158- conv_string_to_utf8 (XML_Char * str )
158+ conv_string_to_utf8 (const XML_Char * str )
159159{
160160 /* XXX currently this code assumes that XML_Char is 8-bit,
161161 and hence in UTF-8. */
@@ -164,7 +164,7 @@ conv_string_to_utf8(XML_Char *str)
164164 Py_INCREF (Py_None );
165165 return Py_None ;
166166 }
167- return PyString_FromString (( const char * ) str );
167+ return PyString_FromString (str );
168168}
169169
170170static PyObject *
@@ -275,6 +275,25 @@ call_with_frame(PyCodeObject *c, PyObject* func, PyObject* args)
275275 ? conv_string_to_unicode : conv_string_to_utf8)
276276#endif
277277
278+ static PyObject *
279+ string_intern (xmlparseobject * self , const char * str )
280+ {
281+ PyObject * result = STRING_CONV_FUNC (str );
282+ PyObject * value ;
283+ if (!self -> intern )
284+ return result ;
285+ value = PyDict_GetItem (self -> intern , result );
286+ if (!value ) {
287+ if (PyDict_SetItem (self -> intern , result , result ) == 0 )
288+ return result ;
289+ else
290+ return NULL ;
291+ }
292+ Py_INCREF (value );
293+ Py_DECREF (result );
294+ return value ;
295+ }
296+
278297static void
279298my_StartElementHandler (void * userData ,
280299 const XML_Char * name , const XML_Char * * atts )
@@ -307,7 +326,7 @@ my_StartElementHandler(void *userData,
307326 return ;
308327 }
309328 for (i = 0 ; i < max ; i += 2 ) {
310- PyObject * n = STRING_CONV_FUNC ( (XML_Char * ) atts [i ]);
329+ PyObject * n = string_intern ( self , (XML_Char * ) atts [i ]);
311330 PyObject * v ;
312331 if (n == NULL ) {
313332 flag_error (self );
@@ -336,7 +355,7 @@ my_StartElementHandler(void *userData,
336355 Py_DECREF (v );
337356 }
338357 }
339- args = Py_BuildValue ("(O&N )" , STRING_CONV_FUNC , name , container );
358+ args = Py_BuildValue ("(NN )" , string_intern ( self , name ) , container );
340359 if (args == NULL ) {
341360 Py_DECREF (container );
342361 return ;
@@ -394,13 +413,13 @@ my_##NAME##Handler PARAMS {\
394413
395414VOID_HANDLER (EndElement ,
396415 (void * userData , const XML_Char * name ),
397- ("(O& )" , STRING_CONV_FUNC , name ))
416+ ("(N )" , string_intern ( self , name ) ))
398417
399418VOID_HANDLER (ProcessingInstruction ,
400419 (void * userData ,
401420 const XML_Char * target ,
402421 const XML_Char * data ),
403- ("(O&O& )" ,STRING_CONV_FUNC , target , STRING_CONV_FUNC ,data ))
422+ ("(NO& )" , string_intern ( self , target ) , STRING_CONV_FUNC ,data ))
404423
405424#ifndef Py_USING_UNICODE
406425VOID_HANDLER (CharacterData ,
@@ -421,10 +440,10 @@ VOID_HANDLER(UnparsedEntityDecl,
421440 const XML_Char * systemId ,
422441 const XML_Char * publicId ,
423442 const XML_Char * notationName ),
424- ("(O&O&O&O&O& )" ,
425- STRING_CONV_FUNC , entityName , STRING_CONV_FUNC , base ,
426- STRING_CONV_FUNC , systemId , STRING_CONV_FUNC , publicId ,
427- STRING_CONV_FUNC , notationName ))
443+ ("(NNNNN )" ,
444+ string_intern ( self , entityName ), string_intern ( self , base ) ,
445+ string_intern ( self , systemId ), string_intern ( self , publicId ) ,
446+ string_intern ( self , notationName ) ))
428447
429448#ifndef Py_USING_UNICODE
430449VOID_HANDLER (EntityDecl ,
@@ -437,11 +456,12 @@ VOID_HANDLER(EntityDecl,
437456 const XML_Char * systemId ,
438457 const XML_Char * publicId ,
439458 const XML_Char * notationName ),
440- ("O&iNO&O&O&O& " ,
441- STRING_CONV_FUNC , entityName , is_parameter_entity ,
459+ ("NiNNNNN " ,
460+ string_intern ( self , entityName ) , is_parameter_entity ,
442461 conv_string_len_to_utf8 (value , value_length ),
443- STRING_CONV_FUNC ,base , STRING_CONV_FUNC ,systemId ,
444- STRING_CONV_FUNC ,publicId , STRING_CONV_FUNC ,notationName ))
462+ string_intern (self , base ), string_intern (self , systemId ),
463+ string_intern (self , publicId ),
464+ string_intern (self , notationName )))
445465#else
446466VOID_HANDLER (EntityDecl ,
447467 (void * userData ,
@@ -453,13 +473,14 @@ VOID_HANDLER(EntityDecl,
453473 const XML_Char * systemId ,
454474 const XML_Char * publicId ,
455475 const XML_Char * notationName ),
456- ("O&iNO&O&O&O& " ,
457- STRING_CONV_FUNC , entityName , is_parameter_entity ,
476+ ("NiNNNNN " ,
477+ string_intern ( self , entityName ) , is_parameter_entity ,
458478 (self -> returns_unicode
459479 ? conv_string_len_to_unicode (value , value_length )
460480 : conv_string_len_to_utf8 (value , value_length )),
461- STRING_CONV_FUNC ,base , STRING_CONV_FUNC ,systemId ,
462- STRING_CONV_FUNC ,publicId , STRING_CONV_FUNC ,notationName ))
481+ string_intern (self , base ), string_intern (self , systemId ),
482+ string_intern (self , publicId ),
483+ string_intern (self , notationName )))
463484#endif
464485
465486VOID_HANDLER (XmlDecl ,
@@ -473,7 +494,7 @@ VOID_HANDLER(XmlDecl,
473494
474495static PyObject *
475496conv_content_model (XML_Content * const model ,
476- PyObject * (* conv_string )(XML_Char * ))
497+ PyObject * (* conv_string )(const XML_Char * ))
477498{
478499 PyObject * result = NULL ;
479500 PyObject * children = PyTuple_New (model -> numchildren );
@@ -514,17 +535,17 @@ VOID_HANDLER(ElementDecl,
514535 (void * userData ,
515536 const XML_Char * name ,
516537 XML_Content * model ),
517- ("O&O &" ,
518- STRING_CONV_FUNC , name ,
538+ ("NO &" ,
539+ string_intern ( self , name ) ,
519540 (self -> returns_unicode ? conv_content_model_unicode
520541 : conv_content_model_utf8 ),model ))
521542#else
522543VOID_HANDLER (ElementDecl ,
523544 (void * userData ,
524545 const XML_Char * name ,
525546 XML_Content * model ),
526- ("O&O &" ,
527- STRING_CONV_FUNC , name , conv_content_model_utf8 ,model ))
547+ ("NO &" ,
548+ string_intern ( self , name ) , conv_content_model_utf8 ,model ))
528549#endif
529550
530551VOID_HANDLER (AttlistDecl ,
@@ -534,8 +555,8 @@ VOID_HANDLER(AttlistDecl,
534555 const XML_Char * att_type ,
535556 const XML_Char * dflt ,
536557 int isrequired ),
537- ("(O&O&O &O&i)" ,
538- STRING_CONV_FUNC , elname , STRING_CONV_FUNC , attname ,
558+ ("(NNO &O&i)" ,
559+ string_intern ( self , elname ), string_intern ( self , attname ) ,
539560 STRING_CONV_FUNC ,att_type , STRING_CONV_FUNC ,dflt ,
540561 isrequired ))
541562
@@ -545,24 +566,25 @@ VOID_HANDLER(NotationDecl,
545566 const XML_Char * base ,
546567 const XML_Char * systemId ,
547568 const XML_Char * publicId ),
548- ("(O&O&O&O& )" ,
549- STRING_CONV_FUNC , notationName , STRING_CONV_FUNC , base ,
550- STRING_CONV_FUNC , systemId , STRING_CONV_FUNC , publicId ))
569+ ("(NNNN )" ,
570+ string_intern ( self , notationName ), string_intern ( self , base ) ,
571+ string_intern ( self , systemId ), string_intern ( self , publicId ) ))
551572
552573VOID_HANDLER (StartNamespaceDecl ,
553574 (void * userData ,
554575 const XML_Char * prefix ,
555576 const XML_Char * uri ),
556- ("(O&O&)" , STRING_CONV_FUNC ,prefix , STRING_CONV_FUNC ,uri ))
577+ ("(NN)" ,
578+ string_intern (self , prefix ), string_intern (self , uri )))
557579
558580VOID_HANDLER (EndNamespaceDecl ,
559581 (void * userData ,
560582 const XML_Char * prefix ),
561- ("(O& )" , STRING_CONV_FUNC , prefix ))
583+ ("(N )" , string_intern ( self , prefix ) ))
562584
563585VOID_HANDLER (Comment ,
564- (void * userData , const XML_Char * prefix ),
565- ("(O&)" , STRING_CONV_FUNC ,prefix ))
586+ (void * userData , const XML_Char * data ),
587+ ("(O&)" , STRING_CONV_FUNC ,data ))
566588
567589VOID_HANDLER (StartCdataSection ,
568590 (void * userData ),
@@ -605,9 +627,9 @@ RC_HANDLER(int, ExternalEntityRef,
605627 const XML_Char * systemId ,
606628 const XML_Char * publicId ),
607629 int rc = 0 ;,
608- ("(O&O&O&O& )" ,
609- STRING_CONV_FUNC ,context , STRING_CONV_FUNC , base ,
610- STRING_CONV_FUNC , systemId , STRING_CONV_FUNC , publicId ),
630+ ("(O&NNN )" ,
631+ STRING_CONV_FUNC ,context , string_intern ( self , base ) ,
632+ string_intern ( self , systemId ), string_intern ( self , publicId ) ),
611633 rc = PyInt_AsLong (rv );, rc ,
612634 XML_GetUserData (parser ))
613635
@@ -617,8 +639,8 @@ VOID_HANDLER(StartDoctypeDecl,
617639 (void * userData , const XML_Char * doctypeName ,
618640 const XML_Char * sysid , const XML_Char * pubid ,
619641 int has_internal_subset ),
620- ("(O&O&O&i )" , STRING_CONV_FUNC , doctypeName ,
621- STRING_CONV_FUNC , sysid , STRING_CONV_FUNC , pubid ,
642+ ("(NNNi )" , string_intern ( self , doctypeName ) ,
643+ string_intern ( self , sysid ), string_intern ( self , pubid ) ,
622644 has_internal_subset ))
623645
624646VOID_HANDLER (EndDoctypeDecl , (void * userData ), ("()" ))
@@ -856,6 +878,8 @@ xmlparse_ExternalEntityParserCreate(xmlparseobject *self, PyObject *args)
856878 new_parser -> itself = XML_ExternalEntityParserCreate (self -> itself , context ,
857879 encoding );
858880 new_parser -> handlers = 0 ;
881+ new_parser -> intern = self -> intern ;
882+ Py_XINCREF (new_parser -> intern );
859883#ifdef Py_TPFLAGS_HAVE_GC
860884 PyObject_GC_Track (new_parser );
861885#else
@@ -988,7 +1012,7 @@ XML_Encoding * info)
9881012#endif
9891013
9901014static PyObject *
991- newxmlparseobject (char * encoding , char * namespace_separator )
1015+ newxmlparseobject (char * encoding , char * namespace_separator , PyObject * intern )
9921016{
9931017 int i ;
9941018 xmlparseobject * self ;
@@ -1022,6 +1046,8 @@ newxmlparseobject(char *encoding, char *namespace_separator)
10221046 else {
10231047 self -> itself = XML_ParserCreate (encoding );
10241048 }
1049+ self -> intern = intern ;
1050+ Py_XINCREF (self -> intern );
10251051#ifdef Py_TPFLAGS_HAVE_GC
10261052 PyObject_GC_Track (self );
10271053#else
@@ -1074,6 +1100,7 @@ xmlparse_dealloc(xmlparseobject *self)
10741100 }
10751101 free (self -> handlers );
10761102 }
1103+ Py_XDECREF (self -> intern );
10771104#if PY_MAJOR_VERSION == 1 && PY_MINOR_VERSION < 6
10781105 /* Code for versions before 1.6 */
10791106 free (self );
@@ -1118,6 +1145,16 @@ xmlparse_getattr(xmlparseobject *self, char *name)
11181145 return PyInt_FromLong ((long ) self -> returns_unicode );
11191146 if (strcmp (name , "specified_attributes" ) == 0 )
11201147 return PyInt_FromLong ((long ) self -> specified_attributes );
1148+ if (strcmp (name , "intern" ) == 0 ) {
1149+ if (self -> intern == NULL ) {
1150+ Py_INCREF (Py_None );
1151+ return Py_None ;
1152+ }
1153+ else {
1154+ Py_INCREF (self -> intern );
1155+ return self -> intern ;
1156+ }
1157+ }
11211158
11221159 handlernum = handlername2int (name );
11231160
@@ -1138,6 +1175,7 @@ xmlparse_getattr(xmlparseobject *self, char *name)
11381175 PyList_Append (rc , PyString_FromString ("ordered_attributes" ));
11391176 PyList_Append (rc , PyString_FromString ("returns_unicode" ));
11401177 PyList_Append (rc , PyString_FromString ("specified_attributes" ));
1178+ PyList_Append (rc , PyString_FromString ("intern" ));
11411179
11421180 return rc ;
11431181 }
@@ -1221,6 +1259,8 @@ static int
12211259xmlparse_clear (xmlparseobject * op )
12221260{
12231261 clear_handlers (op , 0 );
1262+ Py_XDECREF (op -> intern );
1263+ op -> intern = 0 ;
12241264 return 0 ;
12251265}
12261266#endif
@@ -1275,10 +1315,14 @@ pyexpat_ParserCreate(PyObject *notused, PyObject *args, PyObject *kw)
12751315{
12761316 char * encoding = NULL ;
12771317 char * namespace_separator = NULL ;
1278- static char * kwlist [] = {"encoding" , "namespace_separator" , NULL };
1279-
1280- if (!PyArg_ParseTupleAndKeywords (args , kw , "|zz:ParserCreate" , kwlist ,
1281- & encoding , & namespace_separator ))
1318+ PyObject * intern = NULL ;
1319+ PyObject * result ;
1320+ int intern_decref = 0 ;
1321+ static char * kwlist [] = {"encoding" , "namespace_separator" ,
1322+ "intern" , NULL };
1323+
1324+ if (!PyArg_ParseTupleAndKeywords (args , kw , "|zzO:ParserCreate" , kwlist ,
1325+ & encoding , & namespace_separator , & intern ))
12821326 return NULL ;
12831327 if (namespace_separator != NULL
12841328 && strlen (namespace_separator ) > 1 ) {
@@ -1287,7 +1331,26 @@ pyexpat_ParserCreate(PyObject *notused, PyObject *args, PyObject *kw)
12871331 " character, omitted, or None" );
12881332 return NULL ;
12891333 }
1290- return newxmlparseobject (encoding , namespace_separator );
1334+ /* Explicitly passing None means no interning is desired.
1335+ Not passing anything means that a new dictionary is used. */
1336+ if (intern == Py_None )
1337+ intern = NULL ;
1338+ else if (intern == NULL ) {
1339+ intern = PyDict_New ();
1340+ if (!intern )
1341+ return NULL ;
1342+ intern_decref = 1 ;
1343+ }
1344+ else if (!PyDict_Check (intern )) {
1345+ PyErr_SetString (PyExc_TypeError , "intern must be a dictionary" );
1346+ return NULL ;
1347+ }
1348+
1349+ result = newxmlparseobject (encoding , namespace_separator , intern );
1350+ if (intern_decref ) {
1351+ Py_DECREF (intern );
1352+ }
1353+ return result ;
12911354}
12921355
12931356PyDoc_STRVAR (pyexpat_ErrorString__doc__ ,
0 commit comments