1+ import { markdownToTxt } from 'markdown-to-txt' ;
2+
13const toHalfWidthAndCleanSpace = ( str : string ) : string => {
2- return str
4+ return markdownToTxt ( str )
35 . replaceAll ( / [ \uFF01 - \uFF5E ] / g, ( ch ) => String . fromCharCode ( ch . charCodeAt ( 0 ) - 0xFE_E0 ) )
46 . replaceAll ( '\u3000' , ' ' )
57 . replaceAll ( '。' , '.' )
@@ -22,32 +24,42 @@ const toHalfWidthAndCleanSpace = (str: string): string => {
2224 . replaceAll ( / \s + / g, ' ' ) ;
2325} ;
2426
/**
 * Splits text into chunks of at most roughly `chunkSize` characters.
 *
 * The input is first normalized with `toHalfWidthAndCleanSpace`. The result is
 * split on `'\n'` into paragraphs, and consecutive paragraphs are packed into
 * the same chunk (joined by `'\n'`) while they fit. A paragraph that is longer
 * than `chunkSize` is broken into sentences at `.`, `!` and `?`, and the
 * sentences are packed (joined by a single space) the same way.
 *
 * A single sentence longer than `chunkSize` is not split further, so a chunk
 * can exceed `chunkSize` in that case.
 *
 * NOTE(review): the visible tail of `toHalfWidthAndCleanSpace` collapses every
 * whitespace run to a single space, which would leave no `'\n'` to split on —
 * confirm whether the paragraph path is ever taken in practice.
 *
 * @param text - Raw input text.
 * @param chunkSize - Target maximum chunk length in characters (default 100).
 * @returns The non-empty, trimmed chunks in their original order.
 */
export const splitTextIntoSegments = (text: string, chunkSize: number = 100): string[] => {
  const normalized = toHalfWidthAndCleanSpace(text);

  // Either a (possibly empty) run of text closed by terminators, or a final
  // run with no terminator. Together the two alternatives cover every
  // character, so trailing unterminated text is kept instead of being dropped.
  const sentencePattern = /[^!.?]*[!.?]+|[^!.?]+/g;

  const chunks: string[] = [];
  let currentChunk = '';

  // Emits the pending chunk (when it has visible content) and starts a new one.
  const flush = (): void => {
    const trimmed = currentChunk.trim();
    if (trimmed) {
      chunks.push(trimmed);
    }
    currentChunk = '';
  };

  // Adds `piece` to the pending chunk, flushing first when piece plus the
  // one-character separator would push the chunk past `chunkSize`.
  const append = (piece: string, separator: string): void => {
    if (currentChunk.length > 0 && currentChunk.length + piece.length + 1 > chunkSize) {
      flush();
    }
    currentChunk += (currentChunk ? separator : '') + piece;
  };

  for (const paragraph of normalized.split('\n')) {
    if (paragraph.length <= chunkSize) {
      append(paragraph, '\n');
      continue;
    }

    // An oversized paragraph never shares a chunk with the preceding text.
    flush();

    for (const rawSentence of paragraph.match(sentencePattern) ?? []) {
      // Trim before measuring so the fit check matches what is actually appended.
      const sentence = rawSentence.trim();
      if (sentence) {
        append(sentence, ' ');
      }
    }
  }

  flush();

  return chunks;
};
0 commit comments