11use crate :: trap;
2+ use globset:: { Glob , GlobSetBuilder } ;
23use rayon:: prelude:: * ;
3- use std:: collections:: HashMap ;
4- use std:: ffi:: { OsStr , OsString } ;
54use std:: fs:: File ;
65use std:: io:: BufRead ;
76use std:: path:: { Path , PathBuf } ;
@@ -13,7 +12,7 @@ pub struct LanguageSpec {
1312 pub prefix : & ' static str ,
1413 pub ts_language : tree_sitter:: Language ,
1514 pub node_types : & ' static str ,
16- pub file_extensions : Vec < OsString > ,
15+ pub file_globs : Vec < String > ,
1716}
1817
1918pub struct Extractor {
@@ -83,16 +82,23 @@ impl Extractor {
8382 schemas. push ( schema) ;
8483 }
8584
86- // Construct a map from file extension -> LanguageSpec
87- let mut file_extension_language_mapping: HashMap < & OsStr , Vec < usize > > = HashMap :: new ( ) ;
88- for ( i, lang) in self . languages . iter ( ) . enumerate ( ) {
89- for ( j, _ext) in lang. file_extensions . iter ( ) . enumerate ( ) {
90- let indexes = file_extension_language_mapping
91- . entry ( & lang. file_extensions [ j] )
92- . or_default ( ) ;
93- indexes. push ( i) ;
85+ // Construct a single globset containing all language globs,
86+ // and a mapping from glob index to language index.
87+ let ( globset, glob_language_mapping) = {
88+ let mut builder = GlobSetBuilder :: new ( ) ;
89+ let mut glob_lang_mapping = vec ! [ ] ;
90+ for ( i, lang) in self . languages . iter ( ) . enumerate ( ) {
91+ for glob_str in & lang. file_globs {
92+ let glob = Glob :: new ( glob_str) . expect ( "invalid glob" ) ;
93+ builder. add ( glob) ;
94+ glob_lang_mapping. push ( i) ;
95+ }
9496 }
95- }
97+ (
98+ builder. build ( ) . expect ( "failed to build globset" ) ,
99+ glob_lang_mapping,
100+ )
101+ } ;
96102
97103 let lines: std:: io:: Result < Vec < String > > =
98104 std:: io:: BufReader :: new ( file_list) . lines ( ) . collect ( ) ;
@@ -108,33 +114,42 @@ impl Extractor {
108114 let source = std:: fs:: read ( & path) ?;
109115 let mut trap_writer = trap:: Writer :: new ( ) ;
110116
111- match path. extension ( ) {
112- None => {
113- tracing:: error!( ?path, "No extension found, skipping file." ) ;
114- }
115- Some ( ext) => {
116- if let Some ( indexes) = file_extension_language_mapping. get ( ext) {
117- for i in indexes {
118- let lang = & self . languages [ * i] ;
119- crate :: extractor:: extract (
120- lang. ts_language ,
121- lang. prefix ,
122- & schemas[ * i] ,
123- & mut diagnostics_writer,
124- & mut trap_writer,
125- & path,
126- & source,
127- & [ ] ,
128- ) ;
129- std:: fs:: create_dir_all ( src_archive_file. parent ( ) . unwrap ( ) ) ?;
130- std:: fs:: copy ( & path, & src_archive_file) ?;
131- write_trap ( & self . trap_dir , & path, & trap_writer, trap_compression) ?;
132- }
133- } else {
134- tracing:: warn!( ?path, "No language matches path, skipping file." ) ;
117+ let matches = globset. matches ( & path) ;
118+ if matches. is_empty ( ) {
119+ tracing:: error!( ?path, "No matching language found, skipping file." ) ;
120+ } else {
121+ let mut languages_processed = {
122+ // No known extractor uses more than 8 languages.
123+ let mut v = Vec :: with_capacity ( 8 ) ;
124+ for _ in & self . languages {
125+ v. push ( false ) ;
135126 }
127+ v
128+ } ;
129+
130+ for m in matches {
131+ let i = glob_language_mapping[ m] ;
132+ if languages_processed[ i] {
133+ continue ;
134+ }
135+ languages_processed[ i] = true ;
136+ let lang = & self . languages [ i] ;
137+
138+ crate :: extractor:: extract (
139+ lang. ts_language ,
140+ lang. prefix ,
141+ & schemas[ i] ,
142+ & mut diagnostics_writer,
143+ & mut trap_writer,
144+ & path,
145+ & source,
146+ & [ ] ,
147+ ) ;
148+ std:: fs:: create_dir_all ( src_archive_file. parent ( ) . unwrap ( ) ) ?;
149+ std:: fs:: copy ( & path, & src_archive_file) ?;
150+ write_trap ( & self . trap_dir , & path, & trap_writer, trap_compression) ?;
136151 }
137- } ;
152+ }
138153 Ok ( ( ) ) as std:: io:: Result < ( ) >
139154 } )
140155 . expect ( "failed to extract files" ) ;
0 commit comments