Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 0417cd6

Browse files
bors[bot] and f3r10 authored
Merge #660
660: Store detected language per document during indexing r=ManyTheFish a=f3r10 # Pull Request ## Related issue Fixes #[646](meilisearch/milli#646) ## What does this PR do? - [x] create a new database - [x] implementation - [x] update this database during indexing - [x] implementation - [x] tests - [x] update this database during deletion - [x] implementation - [x] tests ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: f3r10 <[email protected]>
2 parents 19b2326 + 1cb974e commit 0417cd6

File tree

10 files changed

+184
-15
lines changed

10 files changed

+184
-15
lines changed

milli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] }
99
bincode = "1.3.3"
1010
bstr = "1.0.1"
1111
byteorder = "1.4.3"
12-
charabia = { version = "0.6.0", default-features = false }
12+
charabia = { git = "https://github.com/meilisearch/charabia.git", branch = "impl-serde-on-enums", default-features = false }
1313
concat-arrays = "0.1.2"
1414
crossbeam-channel = "0.5.6"
1515
either = "1.8.0"

milli/src/heed_codec/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ mod field_id_word_count_codec;
44
mod obkv_codec;
55
mod roaring_bitmap;
66
mod roaring_bitmap_length;
7+
mod script_language_codec;
78
mod str_beu32_codec;
89
mod str_str_u8_codec;
910

@@ -14,5 +15,6 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
1415
pub use self::roaring_bitmap_length::{
1516
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
1617
};
18+
pub use self::script_language_codec::ScriptLanguageCodec;
1719
pub use self::str_beu32_codec::StrBEU32Codec;
1820
pub use self::str_str_u8_codec::{StrStrU8Codec, UncheckedStrStrU8Codec};
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
use std::borrow::Cow;
2+
use std::str;
3+
4+
use charabia::{Language, Script};
5+
6+
/// Heed codec for `(Script, Language)` keys.
///
/// A key is stored as the script name, a single `0` byte, then the language name.
pub struct ScriptLanguageCodec;
7+
8+
impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
9+
type DItem = (Script, Language);
10+
11+
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
12+
let sep = bytes.iter().position(|b| *b == 0)?;
13+
let (s_bytes, l_bytes) = bytes.split_at(sep);
14+
let script = str::from_utf8(s_bytes).ok()?;
15+
let script_name = Script::from_name(script);
16+
let lan = str::from_utf8(l_bytes).ok()?;
17+
let lan_name = Language::from_name(lan);
18+
19+
Some((script_name, lan_name))
20+
}
21+
}
22+
23+
impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
24+
type EItem = (Script, Language);
25+
26+
fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
27+
let script_name = script.name().as_bytes();
28+
let lan_name = lan.name().as_bytes();
29+
30+
let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len() + 1);
31+
bytes.extend_from_slice(script_name);
32+
bytes.push(0);
33+
bytes.extend_from_slice(lan_name);
34+
35+
Some(Cow::Owned(bytes))
36+
}
37+
}

milli/src/index.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use std::fs::File;
44
use std::mem::size_of;
55
use std::path::Path;
66

7+
use charabia::{Language, Script};
78
use heed::flags::Flags;
89
use heed::types::*;
910
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
@@ -17,6 +18,7 @@ use crate::heed_codec::facet::{
1718
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
1819
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec,
1920
};
21+
use crate::heed_codec::ScriptLanguageCodec;
2022
use crate::{
2123
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
2224
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
@@ -80,6 +82,7 @@ pub mod db_name {
8082
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
8183
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
8284
pub const DOCUMENTS: &str = "documents";
85+
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
8386
}
8487

8588
#[derive(Clone)]
@@ -117,6 +120,9 @@ pub struct Index {
117120
/// Maps the position of a word prefix with all the docids where this prefix appears.
118121
pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
119122

123+
/// Maps the script and language with all the docids that corresponds to it.
124+
pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
125+
120126
/// Maps the facet field id and the docids for which this field exists
121127
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
122128

@@ -138,7 +144,7 @@ impl Index {
138144
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
139145
use db_name::*;
140146

141-
options.max_dbs(17);
147+
options.max_dbs(18);
142148
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
143149

144150
let env = options.open(path)?;
@@ -149,6 +155,7 @@ impl Index {
149155
let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
150156
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
151157
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
158+
let script_language_docids = env.create_database(Some(SCRIPT_LANGUAGE_DOCIDS))?;
152159
let word_prefix_pair_proximity_docids =
153160
env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
154161
let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
@@ -174,6 +181,7 @@ impl Index {
174181
exact_word_prefix_docids,
175182
docid_word_positions,
176183
word_pair_proximity_docids,
184+
script_language_docids,
177185
word_prefix_pair_proximity_docids,
178186
word_position_docids,
179187
word_prefix_position_docids,
@@ -1185,6 +1193,18 @@ impl Index {
11851193
pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result<bool> {
11861194
self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS)
11871195
}
1196+
1197+
/* script language docids */
1198+
/// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any.
1199+
pub fn script_language_documents_ids(
1200+
&self,
1201+
rtxn: &RoTxn,
1202+
key: &(Script, Language),
1203+
) -> heed::Result<Option<RoaringBitmap>> {
1204+
let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
1205+
let doc_ids = self.script_language_docids.get(rtxn, key)?;
1206+
Ok(doc_ids.map(|ids| ids - soft_deleted_documents))
1207+
}
11881208
}
11891209

11901210
#[cfg(test)]

milli/src/update/clear_documents.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
2828
word_position_docids,
2929
field_id_word_count_docids,
3030
word_prefix_position_docids,
31+
script_language_docids,
3132
facet_id_f64_docids,
3233
facet_id_string_docids,
3334
facet_id_exists_docids,
@@ -69,6 +70,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
6970
word_position_docids.clear(self.wtxn)?;
7071
field_id_word_count_docids.clear(self.wtxn)?;
7172
word_prefix_position_docids.clear(self.wtxn)?;
73+
script_language_docids.clear(self.wtxn)?;
7274
facet_id_f64_docids.clear(self.wtxn)?;
7375
facet_id_exists_docids.clear(self.wtxn)?;
7476
facet_id_string_docids.clear(self.wtxn)?;

milli/src/update/delete_documents.rs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
185185
word_prefix_pair_proximity_docids,
186186
word_position_docids,
187187
word_prefix_position_docids,
188+
script_language_docids,
188189
facet_id_f64_docids,
189190
facet_id_exists_docids,
190191
facet_id_string_docids,
@@ -440,6 +441,22 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
440441
self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?;
441442
}
442443

444+
// Remove the documents ids from the field id word count database.
445+
let mut iter = script_language_docids.iter_mut(self.wtxn)?;
446+
while let Some((key, mut docids)) = iter.next().transpose()? {
447+
let previous_len = docids.len();
448+
docids -= &self.to_delete_docids;
449+
if docids.is_empty() {
450+
// safety: we don't keep references from inside the LMDB database.
451+
unsafe { iter.del_current()? };
452+
} else if docids.len() != previous_len {
453+
let key = key.to_owned();
454+
// safety: we don't keep references from inside the LMDB database.
455+
unsafe { iter.put_current(&key, &docids)? };
456+
}
457+
}
458+
459+
drop(iter);
443460
// We delete the documents ids that are under the facet field id values.
444461
remove_docids_from_facet_field_id_docids(
445462
self.wtxn,
@@ -1087,4 +1104,33 @@ mod tests {
10871104

10881105
wtxn.commit().unwrap();
10891106
}
1107+
1108+
#[test]
fn stored_detected_script_and_language_should_not_return_deleted_documents() {
    use charabia::{Language, Script};

    let index = TempIndex::new();
    let mut wtxn = index.write_txn().unwrap();

    // Index documents written in several scripts and languages.
    index
        .add_documents_using_wtxn(
            &mut wtxn,
            documents!([
                { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
                { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
                { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
                { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
                { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
                { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
            ]),
        )
        .unwrap();

    // Delete one of the documents in the same write transaction, then commit.
    delete_documents(&mut wtxn, &index, &["1"]);
    wtxn.commit().unwrap();

    // Only docid 5 is expected to remain under the (Cj, Cmn) key.
    let rtxn = index.read_txn().unwrap();
    let remaining_docids = index
        .script_language_documents_ids(&rtxn, &(Script::Cj, Language::Cmn))
        .unwrap()
        .unwrap();
    let expected_docids: RoaringBitmap = [5].iter().collect();
    assert_eq!(remaining_docids, expected_docids);
}
10901136
}

milli/src/update/index_documents/extract/extract_docid_word_positions.rs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
use std::collections::HashSet;
1+
use std::collections::{HashMap, HashSet};
22
use std::convert::TryInto;
33
use std::fs::File;
44
use std::{io, mem, str};
55

6-
use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
6+
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
77
use roaring::RoaringBitmap;
88
use serde_json::Value;
99

@@ -25,12 +25,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
2525
searchable_fields: &Option<HashSet<FieldId>>,
2626
stop_words: Option<&fst::Set<&[u8]>>,
2727
max_positions_per_attributes: Option<u32>,
28-
) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
28+
) -> Result<(RoaringBitmap, grenad::Reader<File>, HashMap<(Script, Language), RoaringBitmap>)> {
2929
let max_positions_per_attributes = max_positions_per_attributes
3030
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
3131
let max_memory = indexer.max_memory_by_thread();
3232

3333
let mut documents_ids = RoaringBitmap::new();
34+
let mut script_language_pair = HashMap::new();
3435
let mut docid_word_positions_sorter = create_sorter(
3536
grenad::SortAlgorithm::Stable,
3637
concat_u32s_array,
@@ -70,6 +71,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
7071
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
7172

7273
for (index, token) in tokens {
74+
if let Some(language) = token.language {
75+
let script = token.script;
76+
let entry = script_language_pair
77+
.entry((script, language))
78+
.or_insert_with(RoaringBitmap::new);
79+
entry.push(document_id);
80+
}
7381
let token = token.lemma().trim();
7482
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
7583
key_buffer.truncate(mem::size_of::<u32>());
@@ -88,7 +96,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
8896
}
8997
}
9098

91-
sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
99+
sorter_into_reader(docid_word_positions_sorter, indexer)
100+
.map(|reader| (documents_ids, reader, script_language_pair))
92101
}
93102

94103
/// Transform a JSON value into a string that can be indexed.

milli/src/update/index_documents/extract/mod.rs

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -253,13 +253,14 @@ fn send_and_extract_flattened_documents_data(
253253
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
254254
rayon::join(
255255
|| {
256-
let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions(
257-
flattened_documents_chunk.clone(),
258-
indexer.clone(),
259-
searchable_fields,
260-
stop_words.as_ref(),
261-
max_positions_per_attributes,
262-
)?;
256+
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
257+
extract_docid_word_positions(
258+
flattened_documents_chunk.clone(),
259+
indexer.clone(),
260+
searchable_fields,
261+
stop_words.as_ref(),
262+
max_positions_per_attributes,
263+
)?;
263264

264265
// send documents_ids to DB writer
265266
let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));
@@ -270,6 +271,9 @@ fn send_and_extract_flattened_documents_data(
270271
let _ = lmdb_writer_sx
271272
.send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
272273

274+
let _ =
275+
lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
276+
273277
Ok(docid_word_positions_chunk)
274278
},
275279
|| {

milli/src/update/index_documents/mod.rs

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1503,11 +1503,11 @@ mod tests {
15031503
let rtxn = index.read_txn().unwrap();
15041504

15051505
// Only the first document should match.
1506-
let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len();
1506+
let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len();
15071507
assert_eq!(count, 1);
15081508

15091509
// Only the second document should match.
1510-
let count = index.word_docids.get(&rtxn, "").unwrap().unwrap().len();
1510+
let count = index.word_docids.get(&rtxn, "xiǎo").unwrap().unwrap().len();
15111511
assert_eq!(count, 1);
15121512

15131513
let mut search = crate::Search::new(&rtxn, &index);
@@ -1759,4 +1759,31 @@ mod tests {
17591759
let words_fst = index.words_fst(&rtxn).unwrap();
17601760
assert!(!words_fst.contains(&long_word));
17611761
}
1762+
1763+
#[cfg(feature = "default")]
#[test]
fn store_detected_script_and_language_per_document_during_indexing() {
    use charabia::{Language, Script};

    let index = TempIndex::new();

    // Index documents written in several scripts and languages.
    index
        .add_documents(documents!([
            { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
            { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
            { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
            { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
            { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
            { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
        ]))
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    // Docids stored under the (Cj, Jpn) key.
    let japanese_key = (Script::Cj, Language::Jpn);
    let japanese_docids =
        index.script_language_documents_ids(&rtxn, &japanese_key).unwrap().unwrap();
    let expected_japanese_docids = [3].iter().collect();
    assert_eq!(japanese_docids, expected_japanese_docids);

    // Docids stored under the (Cj, Cmn) key.
    let mandarin_key = (Script::Cj, Language::Cmn);
    let mandarin_docids =
        index.script_language_documents_ids(&rtxn, &mandarin_key).unwrap().unwrap();
    let expected_mandarin_docids = [1, 5].iter().collect();
    assert_eq!(mandarin_docids, expected_mandarin_docids);
}
17621789
}

milli/src/update/index_documents/typed_chunk.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
use std::borrow::Cow;
2+
use std::collections::HashMap;
23
use std::convert::TryInto;
34
use std::fs::File;
45
use std::io;
56

7+
use charabia::{Language, Script};
68
use grenad::MergerBuilder;
79
use heed::types::ByteSlice;
810
use heed::{BytesDecode, RwTxn};
@@ -37,6 +39,7 @@ pub(crate) enum TypedChunk {
3739
FieldIdFacetNumberDocids(grenad::Reader<File>),
3840
FieldIdFacetExistsDocids(grenad::Reader<File>),
3941
GeoPoints(grenad::Reader<File>),
42+
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
4043
}
4144

4245
/// Write typed chunk in the corresponding LMDB database of the provided index.
@@ -229,6 +232,25 @@ pub(crate) fn write_typed_chunk_into_index(
229232
index.put_geo_rtree(wtxn, &rtree)?;
230233
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
231234
}
235+
TypedChunk::ScriptLanguageDocids(hash_pair) => {
236+
let mut buffer = Vec::new();
237+
for (key, value) in hash_pair {
238+
buffer.clear();
239+
let final_value = match index.script_language_docids.get(wtxn, &key)? {
240+
Some(db_values) => {
241+
let mut db_value_buffer = Vec::new();
242+
serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
243+
let mut new_value_buffer = Vec::new();
244+
serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
245+
merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
246+
let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?;
247+
merged_db_values
248+
}
249+
None => value,
250+
};
251+
index.script_language_docids.put(wtxn, &key, &final_value)?;
252+
}
253+
}
232254
}
233255

234256
Ok((RoaringBitmap::new(), is_merged_database))

0 commit comments

Comments
 (0)