Merge #3347

bors[bot] · f3r10 · ManyTheFish · web-flow · commit 39407885c2ea · 2023-02-21T10:52:13.000Z
3347: Enhance language detection r=irevoire a=ManyTheFish ## Summary Some completely unrelated languages can share the same characters. In Meilisearch, we detect languages using `whatlang`, which works well on large texts but fails on small search queries, leading to bad segmentation and normalization of the query. This PR now stores the languages detected during indexing in order to reduce the list of languages that can be detected during search. ## Detail - Create a 19th database mapping each detected script and language to the documents in which it is detected - Fill the newly created database during indexing - Create an allow-list from this database and pass it to Charabia - Add a test ensuring that a Japanese request containing only kanji is detected as Japanese and not Chinese ## Related issues Fixes #2403 Fixes #3513 Co-authored-by: f3r10 <frledesma@outlook.com> Co-authored-by: ManyTheFish <many@meilisearch.com> Co-authored-by: Many the fish <many@meilisearch.com>
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs
@@ -148,6 +148,28 @@ async fn simple_search() {
         .await;
 }
 
+#[cfg(feature = "default")]
+#[actix_rt::test]
+async fn test_kanji_language_detection() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    // Three documents written in three different scripts: Latin, Japanese and Hebrew.
+    let payload = json!([
+        { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+        { "id": 1, "title": "東京のお寿司。" },
+        { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
+    ]);
+    index.add_documents(payload, None).await;
+    index.wait_task(0).await;
+
+    // A kanji-only query must succeed and return exactly one hit.
+    let query = json!({"q": "東京"});
+    index
+        .search(query, |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            assert_eq!(response["hits"].as_array().unwrap().len(), 1);
+        })
+        .await;
+}
+
 #[actix_rt::test]
 async fn search_multiple_params() {
     let server = Server::new().await;
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
@@ -16,7 +16,7 @@ bimap = { version = "0.6.2", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "1.0.1"
 byteorder = "1.4.3"
-charabia = { version = "0.7.0", default-features = false }
+charabia = { version = "0.7.1", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.6"
 deserr = "0.5.0"
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
@@ -5,6 +5,7 @@ mod field_id_word_count_codec;
 mod obkv_codec;
 mod roaring_bitmap;
 mod roaring_bitmap_length;
+mod script_language_codec;
 mod str_beu32_codec;
 mod str_ref;
 mod str_str_u8_codec;
@@ -19,5 +20,6 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
 pub use self::roaring_bitmap_length::{
     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
 };
+pub use self::script_language_codec::ScriptLanguageCodec;
 pub use self::str_beu32_codec::StrBEU32Codec;
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs
@@ -0,0 +1,38 @@
+use std::borrow::Cow;
+use std::str;
+
+use charabia::{Language, Script};
+
+/// Heed codec for `(Script, Language)` keys.
+///
+/// A key is encoded as the script name, a single `0` byte, then the language name.
+pub struct ScriptLanguageCodec;
+
+impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
+    type DItem = (Script, Language);
+
+    /// Decodes a `<script name>\0<language name>` key into a `(Script, Language)` pair.
+    ///
+    /// Returns `None` when the separator byte is missing or when either part is not
+    /// valid UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // Split on the first `0` byte only: everything after it is the language name.
+        let mut parts = bytes.splitn(2, |byte| *byte == 0);
+        let script_bytes = parts.next()?;
+        // No second part means there was no separator at all.
+        let language_bytes = parts.next()?;
+
+        let script = Script::from_name(str::from_utf8(script_bytes).ok()?);
+        let language = Language::from_name(str::from_utf8(language_bytes).ok()?);
+
+        Some((script, language))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
+    type EItem = (Script, Language);
+
+    /// Encodes a `(Script, Language)` pair as `<script name>\0<language name>`.
+    fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
+        // The single `0` byte separates the two names.
+        let separator: &[u8] = &[0];
+        let key = [script.name().as_bytes(), separator, lan.name().as_bytes()].concat();
+
+        Some(Cow::Owned(key))
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
@@ -4,6 +4,7 @@ use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
 
+use charabia::{Language, Script};
 use heed::flags::Flags;
 use heed::types::*;
 use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
@@ -18,7 +19,7 @@ use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::StrRefCodec;
+use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec};
 use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
@@ -83,6 +84,7 @@ pub mod db_name {
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
     pub const DOCUMENTS: &str = "documents";
+    pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
 }
 
 #[derive(Clone)]
@@ -122,6 +124,9 @@ pub struct Index {
     /// Maps the position of a word prefix with all the docids where this prefix appears.
     pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
 
+    /// Maps the script and language with all the docids that corresponds to it.
+    pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
+
     /// Maps the facet field id and the docids for which this field exists
     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
 
@@ -148,7 +153,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(18);
+        options.max_dbs(19);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
@@ -159,6 +164,7 @@ impl Index {
         let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
+        let script_language_docids = env.create_database(Some(SCRIPT_LANGUAGE_DOCIDS))?;
         let word_prefix_pair_proximity_docids =
             env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
         let prefix_word_pair_proximity_docids =
@@ -186,6 +192,7 @@ impl Index {
             exact_word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
+            script_language_docids,
             word_prefix_pair_proximity_docids,
             prefix_word_pair_proximity_docids,
             word_position_docids,
@@ -1187,6 +1194,38 @@ impl Index {
     pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result<bool> {
         self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS)
     }
+
+    /* script language docids */
+
+    /// Retrieves the ids of the documents stored under the given `(Script, Language)` key,
+    /// with the soft-deleted documents removed.
+    ///
+    /// Returns `None` when the key is absent from the database.
+    pub fn script_language_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        key: &(Script, Language),
+    ) -> heed::Result<Option<RoaringBitmap>> {
+        let soft_deleted = self.soft_deleted_documents_ids(rtxn)?;
+        match self.script_language_docids.get(rtxn, key)? {
+            Some(docids) => Ok(Some(docids - soft_deleted)),
+            None => Ok(None),
+        }
+    }
+
+    /// Returns, for every script stored in `script_language_docids`, the languages that
+    /// still have at least one document that is not soft-deleted.
+    pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Script, Vec<Language>>> {
+        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
+
+        let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
+        for sl in self.script_language_docids.iter(rtxn)? {
+            let ((script, language), docids) = sl?;
+
+            // Keep only the languages that contain at least one document which is not
+            // soft-deleted.
+            if !soft_deleted_documents.is_superset(&docids) {
+                // Single lookup through the entry API instead of `get_mut` then `insert`.
+                script_language.entry(script).or_default().push(language);
+            }
+        }
+
+        Ok(script_language)
+    }
 }
 
 #[cfg(test)]
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
@@ -152,6 +152,11 @@ impl<'a> Search<'a> {
                     tokbuilder.stop_words(stop_words);
                 }
 
+                let script_lang_map = self.index.script_language(self.rtxn)?;
+                if !script_lang_map.is_empty() {
+                    tokbuilder.allow_list(&script_lang_map);
+                }
+
                 let tokenizer = tokbuilder.build();
                 let tokens = tokenizer.tokenize(query);
                 builder
@@ -446,6 +451,28 @@ mod test {
     use super::*;
     use crate::index::tests::TempIndex;
 
+    #[cfg(feature = "default")]
+    #[test]
+    fn test_kanji_language_detection() {
+        let index = TempIndex::new();
+
+        // Three documents written in three different scripts: Latin, Japanese and Hebrew.
+        index
+            .add_documents(documents!([
+                { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+                { "id": 1, "title": "東京のお寿司。" },
+                { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
+            ]))
+            .unwrap();
+
+        // The search only reads from the index, so a read transaction is enough.
+        let txn = index.read_txn().unwrap();
+        let mut search = Search::new(&txn, &index);
+
+        // A kanji-only query must return the Japanese document and nothing else.
+        search.query("東京");
+        let SearchResult { documents_ids, .. } = search.execute().unwrap();
+
+        assert_eq!(documents_ids, vec![1]);
+    }
+
     #[test]
     fn test_is_authorized_typos() {
         let index = TempIndex::new();
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
@@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             word_position_docids,
             field_id_word_count_docids,
             word_prefix_position_docids,
+            script_language_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
             facet_id_exists_docids,
@@ -82,6 +83,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_position_docids.clear(self.wtxn)?;
         field_id_word_count_docids.clear(self.wtxn)?;
         word_prefix_position_docids.clear(self.wtxn)?;
+        script_language_docids.clear(self.wtxn)?;
         facet_id_f64_docids.clear(self.wtxn)?;
         facet_id_exists_docids.clear(self.wtxn)?;
         facet_id_string_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
@@ -243,6 +243,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             facet_id_string_docids: _,
             field_id_docid_facet_f64s: _,
             field_id_docid_facet_strings: _,
+            script_language_docids,
             facet_id_exists_docids,
             documents,
         } = self.index;
@@ -499,6 +500,22 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             .execute(self.wtxn)?;
         }
 
+        // Remove the documents ids from the script language database.
+        let mut iter = script_language_docids.iter_mut(self.wtxn)?;
+        while let Some((key, mut docids)) = iter.next().transpose()? {
+            let previous_len = docids.len();
+            docids -= &self.to_delete_docids;
+            if docids.is_empty() {
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.del_current()? };
+            } else if docids.len() != previous_len {
+                let key = key.to_owned();
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.put_current(&key, &docids)? };
+            }
+        }
+
+        drop(iter);
         // We delete the documents ids that are under the facet field id values.
         remove_docids_from_facet_id_exists_docids(
             self.wtxn,
@@ -1166,4 +1183,52 @@ mod tests {
         stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard);
         stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft);
     }
+
+    /// Indexes documents in several scripts, deletes document `"1"` with the given
+    /// strategy, and checks the docids returned for the `(Script::Cj, Language::Cmn)` key
+    /// before and after the deletion.
+    fn stored_detected_script_and_language_should_not_return_deleted_documents_(
+        deletion_strategy: DeletionStrategy,
+    ) {
+        use charabia::{Language, Script};
+
+        let cj_cmn = (Script::Cj, Language::Cmn);
+
+        let index = TempIndex::new();
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+                { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
+                { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
+                { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
+                { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
+                { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
+            ]))
+            .unwrap();
+
+        // Before the deletion, documents 1 and 5 are expected under the key.
+        let mut expected_before = RoaringBitmap::new();
+        expected_before.push(1);
+        expected_before.push(5);
+        let stored_before =
+            index.script_language_documents_ids(&wtxn, &cj_cmn).unwrap().unwrap_or_default();
+        assert_eq!(stored_before, expected_before);
+
+        delete_documents(&mut wtxn, &index, &["1"], deletion_strategy);
+        wtxn.commit().unwrap();
+
+        // After the deletion, only document 5 must remain, whatever the strategy.
+        let mut expected_after = RoaringBitmap::new();
+        expected_after.push(5);
+        let rtxn = index.read_txn().unwrap();
+        let stored_after =
+            index.script_language_documents_ids(&rtxn, &cj_cmn).unwrap().unwrap_or_default();
+        assert_eq!(stored_after, expected_after);
+    }
+
+    #[test]
+    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
+        // Run the same scenario once per deletion strategy, hard deletion first.
+        for strategy in [DeletionStrategy::AlwaysHard, DeletionStrategy::AlwaysSoft] {
+            stored_detected_script_and_language_should_not_return_deleted_documents_(strategy);
+        }
+    }
 }
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -1,9 +1,9 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};
 
-use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
+use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;
 
@@ -13,6 +13,8 @@ use crate::{
     absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
 };
 
+/// Maps each `(Script, Language)` pair to a bitmap of document ids.
+pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
+
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
 ///
@@ -25,12 +27,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     searchable_fields: &Option<HashSet<FieldId>>,
     stop_words: Option<&fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
+) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
 
     let mut documents_ids = RoaringBitmap::new();
+    let mut script_language_pair = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         concat_u32s_array,
@@ -70,6 +73,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
                     for (index, token) in tokens {
+                        if let Some(language) = token.language {
+                            let script = token.script;
+                            let entry = script_language_pair
+                                .entry((script, language))
+                                .or_insert_with(RoaringBitmap::new);
+                            entry.push(document_id);
+                        }
                         let token = token.lemma().trim();
                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                             key_buffer.truncate(mem::size_of::<u32>());
@@ -88,7 +98,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         }
     }
 
-    sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
+    sorter_into_reader(docid_word_positions_sorter, indexer)
+        .map(|reader| (documents_ids, reader, script_language_pair))
 }
 
 /// Transform a JSON value into a string that can be indexed.
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs