Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 3940788

Browse files
bors[bot]f3r10ManyTheFish
authored
Merge #3347
3347: Enhance language detection r=irevoire a=ManyTheFish ## Summary Some completely unrelated languages can share the same characters. In Meilisearch, we detect languages using `whatlang`, which works well on large texts but fails on short search queries, leading to bad segmentation and normalization of the query. This PR stores the languages detected during indexing in order to narrow the list of languages that can be detected during search. ## Detail - Create a 19th database mapping each detected (script, language) pair to the documents in which it was detected - Fill the newly created database during indexing - Create an allow-list from this database and pass it to Charabia - Add a test ensuring that a Japanese request containing only kanji is detected as Japanese and not Chinese ## Related issues Fixes #2403 Fixes #3513 Co-authored-by: f3r10 <[email protected]> Co-authored-by: ManyTheFish <[email protected]> Co-authored-by: Many the fish <[email protected]>
2 parents a3e41ba + bbecab8 commit 3940788

File tree

13 files changed

+364
-90
lines changed

13 files changed

+364
-90
lines changed

Cargo.lock

Lines changed: 92 additions & 76 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

meilisearch/tests/search/mod.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,28 @@ async fn simple_search() {
148148
.await;
149149
}
150150

151+
#[cfg(feature = "default")]
152+
#[actix_rt::test]
153+
async fn test_kanji_language_detection() {
154+
let server = Server::new().await;
155+
let index = server.index("test");
156+
157+
let documents = json!([
158+
{ "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
159+
{ "id": 1, "title": "東京のお寿司。" },
160+
{ "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
161+
]);
162+
index.add_documents(documents, None).await;
163+
index.wait_task(0).await;
164+
165+
index
166+
.search(json!({"q": "東京"}), |response, code| {
167+
assert_eq!(code, 200, "{}", response);
168+
assert_eq!(response["hits"].as_array().unwrap().len(), 1);
169+
})
170+
.await;
171+
}
172+
151173
#[actix_rt::test]
152174
async fn search_multiple_params() {
153175
let server = Server::new().await;

milli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ bimap = { version = "0.6.2", features = ["serde"] }
1616
bincode = "1.3.3"
1717
bstr = "1.0.1"
1818
byteorder = "1.4.3"
19-
charabia = { version = "0.7.0", default-features = false }
19+
charabia = { version = "0.7.1", default-features = false }
2020
concat-arrays = "0.1.2"
2121
crossbeam-channel = "0.5.6"
2222
deserr = "0.5.0"

milli/src/heed_codec/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ mod field_id_word_count_codec;
55
mod obkv_codec;
66
mod roaring_bitmap;
77
mod roaring_bitmap_length;
8+
mod script_language_codec;
89
mod str_beu32_codec;
910
mod str_ref;
1011
mod str_str_u8_codec;
@@ -19,5 +20,6 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
1920
pub use self::roaring_bitmap_length::{
2021
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
2122
};
23+
pub use self::script_language_codec::ScriptLanguageCodec;
2224
pub use self::str_beu32_codec::StrBEU32Codec;
2325
pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
use std::borrow::Cow;
2+
use std::str;
3+
4+
use charabia::{Language, Script};
5+
6+
pub struct ScriptLanguageCodec;
7+
8+
impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
9+
type DItem = (Script, Language);
10+
11+
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
12+
let sep = bytes.iter().position(|b| *b == 0)?;
13+
let (s_bytes, l_bytes) = bytes.split_at(sep);
14+
let script = str::from_utf8(s_bytes).ok()?;
15+
let script_name = Script::from_name(script);
16+
let lan = str::from_utf8(l_bytes).ok()?;
17+
// skip '\0' byte between the two strings.
18+
let lan_name = Language::from_name(&lan[1..]);
19+
20+
Some((script_name, lan_name))
21+
}
22+
}
23+
24+
impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
25+
type EItem = (Script, Language);
26+
27+
fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
28+
let script_name = script.name().as_bytes();
29+
let lan_name = lan.name().as_bytes();
30+
31+
let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len() + 1);
32+
bytes.extend_from_slice(script_name);
33+
bytes.push(0);
34+
bytes.extend_from_slice(lan_name);
35+
36+
Some(Cow::Owned(bytes))
37+
}
38+
}

milli/src/index.rs

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use std::fs::File;
44
use std::mem::size_of;
55
use std::path::Path;
66

7+
use charabia::{Language, Script};
78
use heed::flags::Flags;
89
use heed::types::*;
910
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
@@ -18,7 +19,7 @@ use crate::heed_codec::facet::{
1819
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
1920
FieldIdCodec, OrderedF64Codec,
2021
};
21-
use crate::heed_codec::StrRefCodec;
22+
use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec};
2223
use crate::{
2324
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
2425
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
@@ -83,6 +84,7 @@ pub mod db_name {
8384
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
8485
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
8586
pub const DOCUMENTS: &str = "documents";
87+
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
8688
}
8789

8890
#[derive(Clone)]
@@ -122,6 +124,9 @@ pub struct Index {
122124
/// Maps the position of a word prefix with all the docids where this prefix appears.
123125
pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
124126

127+
/// Maps each (script, language) pair to all the docids of the documents in which it was detected.
128+
pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
129+
125130
/// Maps the facet field id and the docids for which this field exists
126131
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
127132

@@ -148,7 +153,7 @@ impl Index {
148153
) -> Result<Index> {
149154
use db_name::*;
150155

151-
options.max_dbs(18);
156+
options.max_dbs(19);
152157
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
153158

154159
let env = options.open(path)?;
@@ -159,6 +164,7 @@ impl Index {
159164
let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
160165
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
161166
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
167+
let script_language_docids = env.create_database(Some(SCRIPT_LANGUAGE_DOCIDS))?;
162168
let word_prefix_pair_proximity_docids =
163169
env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
164170
let prefix_word_pair_proximity_docids =
@@ -186,6 +192,7 @@ impl Index {
186192
exact_word_prefix_docids,
187193
docid_word_positions,
188194
word_pair_proximity_docids,
195+
script_language_docids,
189196
word_prefix_pair_proximity_docids,
190197
prefix_word_pair_proximity_docids,
191198
word_position_docids,
@@ -1187,6 +1194,38 @@ impl Index {
11871194
pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result<bool> {
11881195
self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS)
11891196
}
1197+
1198+
/* script language docids */
1199+
/// Retrieves all the document ids associated with the given (Script, Language) key, excluding soft-deleted documents; returns `None` if the key has no entry.
1200+
pub fn script_language_documents_ids(
1201+
&self,
1202+
rtxn: &RoTxn,
1203+
key: &(Script, Language),
1204+
) -> heed::Result<Option<RoaringBitmap>> {
1205+
let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
1206+
let doc_ids = self.script_language_docids.get(rtxn, key)?;
1207+
Ok(doc_ids.map(|ids| ids - soft_deleted_documents))
1208+
}
1209+
1210+
pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Script, Vec<Language>>> {
1211+
let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
1212+
1213+
let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
1214+
for sl in self.script_language_docids.iter(rtxn)? {
1215+
let ((script, language), docids) = sl?;
1216+
1217+
// keep only languages that have at least one document that is not soft-deleted.
1218+
if !soft_deleted_documents.is_superset(&docids) {
1219+
if let Some(languages) = script_language.get_mut(&script) {
1220+
(*languages).push(language);
1221+
} else {
1222+
script_language.insert(script, vec![language]);
1223+
}
1224+
}
1225+
}
1226+
1227+
Ok(script_language)
1228+
}
11901229
}
11911230

11921231
#[cfg(test)]

milli/src/search/mod.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,11 @@ impl<'a> Search<'a> {
152152
tokbuilder.stop_words(stop_words);
153153
}
154154

155+
let script_lang_map = self.index.script_language(self.rtxn)?;
156+
if !script_lang_map.is_empty() {
157+
tokbuilder.allow_list(&script_lang_map);
158+
}
159+
155160
let tokenizer = tokbuilder.build();
156161
let tokens = tokenizer.tokenize(query);
157162
builder
@@ -446,6 +451,28 @@ mod test {
446451
use super::*;
447452
use crate::index::tests::TempIndex;
448453

454+
#[cfg(feature = "default")]
455+
#[test]
456+
fn test_kanji_language_detection() {
457+
let index = TempIndex::new();
458+
459+
index
460+
.add_documents(documents!([
461+
{ "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
462+
{ "id": 1, "title": "東京のお寿司。" },
463+
{ "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
464+
]))
465+
.unwrap();
466+
467+
let txn = index.write_txn().unwrap();
468+
let mut search = Search::new(&txn, &index);
469+
470+
search.query("東京");
471+
let SearchResult { documents_ids, .. } = search.execute().unwrap();
472+
473+
assert_eq!(documents_ids, vec![1]);
474+
}
475+
449476
#[test]
450477
fn test_is_authorized_typos() {
451478
let index = TempIndex::new();

milli/src/update/clear_documents.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
3030
word_position_docids,
3131
field_id_word_count_docids,
3232
word_prefix_position_docids,
33+
script_language_docids,
3334
facet_id_f64_docids,
3435
facet_id_string_docids,
3536
facet_id_exists_docids,
@@ -82,6 +83,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
8283
word_position_docids.clear(self.wtxn)?;
8384
field_id_word_count_docids.clear(self.wtxn)?;
8485
word_prefix_position_docids.clear(self.wtxn)?;
86+
script_language_docids.clear(self.wtxn)?;
8587
facet_id_f64_docids.clear(self.wtxn)?;
8688
facet_id_exists_docids.clear(self.wtxn)?;
8789
facet_id_string_docids.clear(self.wtxn)?;

milli/src/update/delete_documents.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
243243
facet_id_string_docids: _,
244244
field_id_docid_facet_f64s: _,
245245
field_id_docid_facet_strings: _,
246+
script_language_docids,
246247
facet_id_exists_docids,
247248
documents,
248249
} = self.index;
@@ -499,6 +500,22 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
499500
.execute(self.wtxn)?;
500501
}
501502

503+
// Remove the documents ids from the script language database.
504+
let mut iter = script_language_docids.iter_mut(self.wtxn)?;
505+
while let Some((key, mut docids)) = iter.next().transpose()? {
506+
let previous_len = docids.len();
507+
docids -= &self.to_delete_docids;
508+
if docids.is_empty() {
509+
// safety: we don't keep references from inside the LMDB database.
510+
unsafe { iter.del_current()? };
511+
} else if docids.len() != previous_len {
512+
let key = key.to_owned();
513+
// safety: we don't keep references from inside the LMDB database.
514+
unsafe { iter.put_current(&key, &docids)? };
515+
}
516+
}
517+
518+
drop(iter);
502519
// We delete the documents ids that are under the facet field id values.
503520
remove_docids_from_facet_id_exists_docids(
504521
self.wtxn,
@@ -1166,4 +1183,52 @@ mod tests {
11661183
stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard);
11671184
stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft);
11681185
}
1186+
1187+
fn stored_detected_script_and_language_should_not_return_deleted_documents_(
1188+
deletion_strategy: DeletionStrategy,
1189+
) {
1190+
use charabia::{Language, Script};
1191+
let index = TempIndex::new();
1192+
let mut wtxn = index.write_txn().unwrap();
1193+
index
1194+
.add_documents_using_wtxn(
1195+
&mut wtxn,
1196+
documents!([
1197+
{ "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
1198+
{ "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
1199+
{ "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
1200+
{ "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
1201+
{ "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
1202+
{ "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
1203+
]))
1204+
.unwrap();
1205+
1206+
let key_cmn = (Script::Cj, Language::Cmn);
1207+
let cj_cmn_docs =
1208+
index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
1209+
let mut expected_cj_cmn_docids = RoaringBitmap::new();
1210+
expected_cj_cmn_docids.push(1);
1211+
expected_cj_cmn_docids.push(5);
1212+
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
1213+
1214+
delete_documents(&mut wtxn, &index, &["1"], deletion_strategy);
1215+
wtxn.commit().unwrap();
1216+
1217+
let rtxn = index.read_txn().unwrap();
1218+
let cj_cmn_docs =
1219+
index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
1220+
let mut expected_cj_cmn_docids = RoaringBitmap::new();
1221+
expected_cj_cmn_docids.push(5);
1222+
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
1223+
}
1224+
1225+
#[test]
1226+
fn stored_detected_script_and_language_should_not_return_deleted_documents() {
1227+
stored_detected_script_and_language_should_not_return_deleted_documents_(
1228+
DeletionStrategy::AlwaysHard,
1229+
);
1230+
stored_detected_script_and_language_should_not_return_deleted_documents_(
1231+
DeletionStrategy::AlwaysSoft,
1232+
);
1233+
}
11691234
}

milli/src/update/index_documents/extract/extract_docid_word_positions.rs

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
use std::collections::HashSet;
1+
use std::collections::{HashMap, HashSet};
22
use std::convert::TryInto;
33
use std::fs::File;
44
use std::{io, mem, str};
55

6-
use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
6+
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
77
use roaring::RoaringBitmap;
88
use serde_json::Value;
99

@@ -13,6 +13,8 @@ use crate::{
1313
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
1414
};
1515

16+
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
17+
1618
/// Extracts each word and the positions where it appears and
1719
/// prefixes it with the document id.
1820
///
@@ -25,12 +27,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
2527
searchable_fields: &Option<HashSet<FieldId>>,
2628
stop_words: Option<&fst::Set<&[u8]>>,
2729
max_positions_per_attributes: Option<u32>,
28-
) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
30+
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
2931
let max_positions_per_attributes = max_positions_per_attributes
3032
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
3133
let max_memory = indexer.max_memory_by_thread();
3234

3335
let mut documents_ids = RoaringBitmap::new();
36+
let mut script_language_pair = HashMap::new();
3437
let mut docid_word_positions_sorter = create_sorter(
3538
grenad::SortAlgorithm::Stable,
3639
concat_u32s_array,
@@ -70,6 +73,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
7073
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
7174

7275
for (index, token) in tokens {
76+
if let Some(language) = token.language {
77+
let script = token.script;
78+
let entry = script_language_pair
79+
.entry((script, language))
80+
.or_insert_with(RoaringBitmap::new);
81+
entry.push(document_id);
82+
}
7383
let token = token.lemma().trim();
7484
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
7585
key_buffer.truncate(mem::size_of::<u32>());
@@ -88,7 +98,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
8898
}
8999
}
90100

91-
sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
101+
sorter_into_reader(docid_word_positions_sorter, indexer)
102+
.map(|reader| (documents_ids, reader, script_language_pair))
92103
}
93104

94105
/// Transform a JSON value into a string that can be indexed.

0 commit comments

Comments
 (0)