Merge #660

bors[bot] · f3r10 · web-flow · commit 0417cd6768ce · 2022-10-19T17:20:40.000Z
660: Store detected language per document during indexing r=ManyTheFish a=f3r10 # Pull Request ## Related issue Fixes #[646](meilisearch/milli#646) ## What does this PR do? - [x] create a new database - [x] implementation - [x] update this database during indexing - [x] implementation - [x] tests - [x] update this database during deletion - [x] implementation - [x] tests ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: f3r10 <frledesma@outlook.com>
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
@@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "1.0.1"
 byteorder = "1.4.3"
-charabia = { version = "0.6.0", default-features = false }
+charabia = { git = "https://github.com/meilisearch/charabia.git", branch = "impl-serde-on-enums", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.6"
 either = "1.8.0"
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
@@ -4,6 +4,7 @@ mod field_id_word_count_codec;
 mod obkv_codec;
 mod roaring_bitmap;
 mod roaring_bitmap_length;
+mod script_language_codec;
 mod str_beu32_codec;
 mod str_str_u8_codec;
 
@@ -14,5 +15,6 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
 pub use self::roaring_bitmap_length::{
     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
 };
+pub use self::script_language_codec::ScriptLanguageCodec;
 pub use self::str_beu32_codec::StrBEU32Codec;
 pub use self::str_str_u8_codec::{StrStrU8Codec, UncheckedStrStrU8Codec};
diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs
@@ -0,0 +1,37 @@
+use std::borrow::Cow;
+use std::str;
+
+use charabia::{Language, Script};
+
+/// Codec for `(Script, Language)` keys, stored as `<script name> 0x00 <language name>`.
+pub struct ScriptLanguageCodec;
+
+impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
+    type DItem = (Script, Language);
+
+    /// Returns `None` when the NUL separator is missing or a name is not valid UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // Split on the first NUL and drop it, so the language name does not start with `\0`.
+        let mut parts = bytes.splitn(2, |b| *b == 0);
+        let (s_bytes, l_bytes) = (parts.next()?, parts.next()?);
+        let script = Script::from_name(str::from_utf8(s_bytes).ok()?);
+        let language = Language::from_name(str::from_utf8(l_bytes).ok()?);
+        Some((script, language))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
+    type EItem = (Script, Language);
+
+    /// Encodes the pair as `<script name> 0x00 <language name>`.
+    fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
+        let (script_name, lan_name) = (script.name(), lan.name());
+
+        let mut key = Vec::with_capacity(script_name.len() + 1 + lan_name.len());
+        key.extend_from_slice(script_name.as_bytes());
+        key.push(0);
+        key.extend_from_slice(lan_name.as_bytes());
+
+        Some(Cow::Owned(key))
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
@@ -4,6 +4,7 @@ use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
 
+use charabia::{Language, Script};
 use heed::flags::Flags;
 use heed::types::*;
 use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
@@ -17,6 +18,7 @@ use crate::heed_codec::facet::{
     FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
     FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec,
 };
+use crate::heed_codec::ScriptLanguageCodec;
 use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
@@ -80,6 +82,7 @@ pub mod db_name {
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
     pub const DOCUMENTS: &str = "documents";
+    pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
 }
 
 #[derive(Clone)]
@@ -117,6 +120,9 @@ pub struct Index {
     /// Maps the position of a word prefix with all the docids where this prefix appears.
     pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
 
+    /// Maps each (script, language) pair to the docids of the documents in which it was detected.
+    pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
+
     /// Maps the facet field id and the docids for which this field exists
     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
 
@@ -138,7 +144,7 @@ impl Index {
     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(17);
+        options.max_dbs(18);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
@@ -149,6 +155,7 @@ impl Index {
         let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
+        let script_language_docids = env.create_database(Some(SCRIPT_LANGUAGE_DOCIDS))?;
         let word_prefix_pair_proximity_docids =
             env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
         let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
@@ -174,6 +181,7 @@ impl Index {
             exact_word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
+            script_language_docids,
             word_prefix_pair_proximity_docids,
             word_position_docids,
             word_prefix_position_docids,
@@ -1185,6 +1193,18 @@ impl Index {
     pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result<bool> {
         self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS)
     }
+
+    /* script language docids */
+    /// Docids stored under `key`, minus the soft-deleted ones; `None` if the key is absent.
+    pub fn script_language_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        key: &(Script, Language),
+    ) -> heed::Result<Option<RoaringBitmap>> {
+        let soft_deleted = self.soft_deleted_documents_ids(rtxn)?;
+        let stored = self.script_language_docids.get(rtxn, key)?;
+        Ok(stored.map(|docids| docids - soft_deleted))
+    }
 }
 
 #[cfg(test)]
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
@@ -28,6 +28,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             word_position_docids,
             field_id_word_count_docids,
             word_prefix_position_docids,
+            script_language_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
             facet_id_exists_docids,
@@ -69,6 +70,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_position_docids.clear(self.wtxn)?;
         field_id_word_count_docids.clear(self.wtxn)?;
         word_prefix_position_docids.clear(self.wtxn)?;
+        script_language_docids.clear(self.wtxn)?;
         facet_id_f64_docids.clear(self.wtxn)?;
         facet_id_exists_docids.clear(self.wtxn)?;
         facet_id_string_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
@@ -185,6 +185,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             word_prefix_pair_proximity_docids,
             word_position_docids,
             word_prefix_position_docids,
+            script_language_docids,
             facet_id_f64_docids,
             facet_id_exists_docids,
             facet_id_string_docids,
@@ -440,6 +441,22 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?;
         }
 
+        // Remove the documents ids from the script language docids database.
+        let mut iter = script_language_docids.iter_mut(self.wtxn)?;
+        while let Some((key, mut docids)) = iter.next().transpose()? {
+            let previous_len = docids.len();
+            docids -= &self.to_delete_docids;
+            if docids.is_empty() {
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.del_current()? };
+            } else if docids.len() != previous_len {
+                let key = key.to_owned();
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.put_current(&key, &docids)? };
+            }
+        }
+
+        drop(iter);
         // We delete the documents ids that are under the facet field id values.
         remove_docids_from_facet_field_id_docids(
             self.wtxn,
@@ -1087,4 +1104,33 @@ mod tests {
 
         wtxn.commit().unwrap();
     }
+
+    #[test]
+    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
+        use charabia::{Language, Script};
+        let index = TempIndex::new();
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+                { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
+                { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
+                { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
+                { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
+                { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
+            ]))
+            .unwrap();
+
+        delete_documents(&mut wtxn, &index, &["1"]);
+        wtxn.commit().unwrap();
+
+        // After deleting document "1", only docid 5 is expected to remain under (Cj, Cmn).
+        let rtxn = index.read_txn().unwrap();
+        let cmn_key = (Script::Cj, Language::Cmn);
+        let remaining = index.script_language_documents_ids(&rtxn, &cmn_key).unwrap().unwrap();
+        let expected: RoaringBitmap = [5].iter().collect();
+        assert_eq!(remaining, expected);
+    }
 }
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -1,9 +1,9 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};
 
-use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
+use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;
 
@@ -25,12 +25,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     searchable_fields: &Option<HashSet<FieldId>>,
     stop_words: Option<&fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
+) -> Result<(RoaringBitmap, grenad::Reader<File>, HashMap<(Script, Language), RoaringBitmap>)> {
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
 
     let mut documents_ids = RoaringBitmap::new();
+    let mut script_language_pair = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         concat_u32s_array,
@@ -70,6 +71,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
                     for (index, token) in tokens {
+                        if let Some(language) = token.language {
+                            let script = token.script;
+                            let entry = script_language_pair
+                                .entry((script, language))
+                                .or_insert_with(RoaringBitmap::new);
+                            entry.push(document_id);
+                        }
                         let token = token.lemma().trim();
                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                             key_buffer.truncate(mem::size_of::<u32>());
@@ -88,7 +96,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         }
     }
 
-    sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
+    sorter_into_reader(docid_word_positions_sorter, indexer)
+        .map(|reader| (documents_ids, reader, script_language_pair))
 }
 
 /// Transform a JSON value into a string that can be indexed.
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
@@ -253,13 +253,14 @@ fn send_and_extract_flattened_documents_data(
     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
-                let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions(
-                    flattened_documents_chunk.clone(),
-                    indexer.clone(),
-                    searchable_fields,
-                    stop_words.as_ref(),
-                    max_positions_per_attributes,
-                )?;
+                let (documents_ids, docid_word_positions_chunk, script_language_pair) =
+                    extract_docid_word_positions(
+                        flattened_documents_chunk.clone(),
+                        indexer.clone(),
+                        searchable_fields,
+                        stop_words.as_ref(),
+                        max_positions_per_attributes,
+                    )?;
 
                 // send documents_ids to DB writer
                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));
@@ -270,6 +271,9 @@ fn send_and_extract_flattened_documents_data(
                 let _ = lmdb_writer_sx
                     .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
 
+                let _ =
+                    lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
+
                 Ok(docid_word_positions_chunk)
             },
             || {
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
@@ -1503,11 +1503,11 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
 
         // Only the first document should match.
-        let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len();
+        let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len();
         assert_eq!(count, 1);
 
         // Only the second document should match.
-        let count = index.word_docids.get(&rtxn, "包").unwrap().unwrap().len();
+        let count = index.word_docids.get(&rtxn, "xiǎo").unwrap().unwrap().len();
         assert_eq!(count, 1);
 
         let mut search = crate::Search::new(&rtxn, &index);
@@ -1759,4 +1759,31 @@ mod tests {
         let words_fst = index.words_fst(&rtxn).unwrap();
         assert!(!words_fst.contains(&long_word));
     }
+
+    #[cfg(feature = "default")]
+    #[test]
+    fn store_detected_script_and_language_per_document_during_indexing() {
+        use charabia::{Language, Script};
+        let index = TempIndex::new();
+        index
+            .add_documents(documents!([
+                { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+                { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
+                { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
+                { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
+                { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
+                { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
+            ]))
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        // Japanese: only docid 3 is expected under (Cj, Jpn).
+        let jpn = index.script_language_documents_ids(&rtxn, &(Script::Cj, Language::Jpn));
+        let expected_jpn = [3].iter().collect();
+        assert_eq!(jpn.unwrap().unwrap(), expected_jpn);
+        // Chinese: docids 1 and 5 are expected under (Cj, Cmn).
+        let cmn = index.script_language_documents_ids(&rtxn, &(Script::Cj, Language::Cmn));
+        let expected_cmn = [1, 5].iter().collect();
+        assert_eq!(cmn.unwrap().unwrap(), expected_cmn);
+    }
 }
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,8 +1,10 @@
 use std::borrow::Cow;
+use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io;
 
+use charabia::{Language, Script};
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::{BytesDecode, RwTxn};
@@ -37,6 +39,7 @@ pub(crate) enum TypedChunk {
     FieldIdFacetNumberDocids(grenad::Reader<File>),
     FieldIdFacetExistsDocids(grenad::Reader<File>),
     GeoPoints(grenad::Reader<File>),
+    ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
 }
 
 /// Write typed chunk in the corresponding LMDB database of the provided index.
@@ -229,6 +232,25 @@ pub(crate) fn write_typed_chunk_into_index(
             index.put_geo_rtree(wtxn, &rtree)?;
             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
         }
+        TypedChunk::ScriptLanguageDocids(hash_pair) => {
+            let mut buffer = Vec::new();
+            for (key, value) in hash_pair {
+                buffer.clear();
+                let final_value = match index.script_language_docids.get(wtxn, &key)? {
+                    Some(db_values) => {
+                        let mut db_value_buffer = Vec::new();
+                        serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
+                        let mut new_value_buffer = Vec::new();
+                        serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
+                        merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
+                        let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?;
+                        merged_db_values
+                    }
+                    None => value,
+                };
+                index.script_language_docids.put(wtxn, &key, &final_value)?;
+            }
+        }
     }
 
     Ok((RoaringBitmap::new(), is_merged_database))