@@ -688,86 +688,112 @@ static PyObject*
688688getline_via_fgets (FILE * fp )
689689{
690690/* INITBUFSIZE is the maximum line length that lets us get away with the fast
691- * no-realloc path. get_line uses 100 for its initial size, but isn't trying
692- * to avoid reallocs. Under MSVC 6, and using files with lines all under 100
693- * chars long, dropping this from 200 to 100 bought less than 1% speedup.
694- * Since many kinds of log files have lines exceeding 100 chars, the tiny
695- * slowdown from using 200 is more than offset by the large speedup for such
696- * log files.
697- * INCBUFSIZE is the amount by which we grow the buffer, if INITBUFSIZE isn't
698- * enough. It doesn't much matter what this set to.
691+ * no-realloc, one-fgets()-call path. Boosting it isn't free, because we have
692+ * to fill this much of the buffer with a known value in order to figure out
693+ * how much of the buffer fgets() overwrites. So if INITBUFSIZE is larger
694+ * than "most" lines, we waste time filling unused buffer slots. 100 is
695+ * surely adequate for most people's email archives, chewing over source code,
696+ * etc -- "regular old text files".
697+ * MAXBUFSIZE is the maximum line length that lets us get away with the less
698+ * fast (but still zippy) no-realloc, two-fgets()-call path. See above for
699+ * cautions about boosting that. 300 was chosen because the worst real-life
700+ * text-crunching job reported on Python-Dev was a mail-log crawler where over
701+ * half the lines were 254 chars.
702+ * INCBUFSIZE is the amount by which we grow the buffer, if MAXBUFSIZE isn't
703+ * enough. It doesn't much matter what this is set to: we only get here for
704+ * absurdly long lines anyway.
699705 */
700- #define INITBUFSIZE 200
706+ #define INITBUFSIZE 100
707+ #define MAXBUFSIZE 300
701708#define INCBUFSIZE 1000
709+ char * p ; /* temp */
710+ char buf [MAXBUFSIZE ];
702711 PyObject * v ; /* the string object result */
703- size_t total_v_size ; /* total # chars in v's buffer */
704712 char * pvfree ; /* address of next free slot */
705713 char * pvend ; /* address one beyond last free slot */
706- char * p ; /* temp */
707- char buf [ INITBUFSIZE ];
714+ size_t nfree ; /* # of free buffer slots; pvend-pvfree */
715+ size_t total_v_size ; /* total # of slots in buffer */
708716
709717 /* Optimize for normal case: avoid _PyString_Resize if at all
710- * possible via first reading into auto buf.
718+ * possible via first reading into stack buffer "buf".
711719 */
712- Py_BEGIN_ALLOW_THREADS
713- memset (buf , '\n ', INITBUFSIZE );
714- p = fgets (buf , INITBUFSIZE , fp );
715- Py_END_ALLOW_THREADS
720+ total_v_size = INITBUFSIZE ; /* start small and pray */
721+ pvfree = buf ;
722+ for (;;) {
723+ Py_BEGIN_ALLOW_THREADS
724+ pvend = buf + total_v_size ;
725+ nfree = pvend - pvfree ;
726+ memset (pvfree , '\n' , nfree );
727+ p = fgets (pvfree , nfree , fp );
728+ Py_END_ALLOW_THREADS
716729
717- if (p == NULL) {
718- clearerr (fp );
719- if (PyErr_CheckSignals ())
720- return NULL ;
721- v = PyString_FromStringAndSize ("" , 0 );
722- return v ;
723- }
724- /* fgets read *something* */
725- p = memchr (buf , '\n' , INITBUFSIZE );
726- if (p != NULL ) {
727- /* Did the \n come from fgets or from us?
728- * Since fgets stops at the first \n, and then writes \0, if
729- * it's from fgets a \0 must be next. But if that's so, it
730- * could not have come from us, since the \n's we filled the
731- * buffer with have only more \n's to the right.
732- */
733- pvend = buf + INITBUFSIZE ;
734- if (p + 1 < pvend && * (p + 1 ) == '\0' ) {
735- /* It's from fgets: we win! In particular, we
736- * haven't done any mallocs yet, and can build the
737- * final result on the first try.
730+ if (p == NULL) {
731+ clearerr (fp );
732+ if (PyErr_CheckSignals ())
733+ return NULL ;
734+ v = PyString_FromStringAndSize (buf , pvfree - buf );
735+ return v ;
736+ }
737+ /* fgets read *something* */
738+ p = memchr (pvfree , '\n' , nfree );
739+ if (p != NULL ) {
740+ /* Did the \n come from fgets or from us?
741+ * Since fgets stops at the first \n, and then writes
742+ * \0, if it's from fgets a \0 must be next. But if
743+ * that's so, it could not have come from us, since
744+ * the \n's we filled the buffer with have only more
745+ * \n's to the right.
738746 */
739- v = PyString_FromStringAndSize (buf , p - buf + 1 );
747+ if (p + 1 < pvend && * (p + 1 ) == '\0' ) {
748+ /* It's from fgets: we win! In particular,
749+ * we haven't done any mallocs yet, and can
750+ * build the final result on the first try.
751+ */
752+ ++ p ; /* include \n from fgets */
753+ }
754+ else {
755+ /* Must be from us: fgets didn't fill the
756+ * buffer and didn't find a newline, so it
757+ * must be the last and newline-free line of
758+ * the file.
759+ */
760+ assert (p > pvfree && * (p - 1 ) == '\0' );
761+ -- p ; /* don't include \0 from fgets */
762+ }
763+ v = PyString_FromStringAndSize (buf , p - buf );
740764 return v ;
741765 }
742- /* Must be from us: fgets didn't fill the buffer and didn't
743- * find a newline, so it must be the last and newline-free
744- * line of the file.
766+ /* yuck: fgets overwrote all the newlines, i.e. the entire
767+ * buffer. So this line isn't over yet, or maybe it is but
768+ * we're exactly at EOF. If we haven't already, try using the
769+ * rest of the stack buffer.
745770 */
746- assert (p > buf && * (p - 1 ) == '\0' );
747- v = PyString_FromStringAndSize (buf , p - buf - 1 );
748- return v ;
771+ assert (* (pvend - 1 ) == '\0' );
772+ if (pvfree == buf ) {
773+ pvfree = pvend - 1 ; /* overwrite trailing null */
774+ total_v_size = MAXBUFSIZE ;
775+ }
776+ else
777+ break ;
749778 }
750- /* yuck: fgets overwrote all the newlines, i.e. the entire buffer.
751- * So this line isn't over yet, or maybe it is but we're exactly at
752- * EOF; in either case, we're tired <wink> .
779+
780+ /* The stack buffer isn't big enough; malloc a string object and read
781+ * into its buffer.
753782 */
754- assert (buf [INITBUFSIZE - 1 ] == '\0' );
755- total_v_size = INITBUFSIZE + INCBUFSIZE ;
783+ total_v_size = MAXBUFSIZE + INCBUFSIZE ;
756784 v = PyString_FromStringAndSize ((char * )NULL , (int )total_v_size );
757785 if (v == NULL )
758786 return v ;
759787 /* copy over everything except the last null byte */
760- memcpy (BUF (v ), buf , INITBUFSIZE - 1 );
761- pvfree = BUF (v ) + INITBUFSIZE - 1 ;
788+ memcpy (BUF (v ), buf , MAXBUFSIZE - 1 );
789+ pvfree = BUF (v ) + MAXBUFSIZE - 1 ;
762790
763791 /* Keep reading stuff into v; if it ever ends successfully, break
764792 * after setting p one beyond the end of the line. The code here is
765793 * very much like the code above, except reads into v's buffer; see
766794 * the code above for detailed comments about the logic.
767795 */
768796 for (;;) {
769- size_t nfree ;
770-
771797 Py_BEGIN_ALLOW_THREADS
772798 pvend = BUF (v ) + total_v_size ;
773799 nfree = pvend - pvfree ;
@@ -814,6 +840,7 @@ getline_via_fgets(FILE *fp)
814840 _PyString_Resize (& v , p - BUF (v ));
815841 return v ;
816842#undef INITBUFSIZE
843+ #undef MAXBUFSIZE
817844#undef INCBUFSIZE
818845}
819846#endif /* ifdef USE_FGETS_IN_GETLINE */
0 commit comments