Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit bbe827e

Browse files
authored
Add LRU example (#225)
* add lru example * fix text is not cleared * refactor * fmt * fix for jlabel
1 parent 5fe24f7 commit bbe827e

File tree

7 files changed

+258
-1
lines changed

7 files changed

+258
-1
lines changed

Cargo.lock

Lines changed: 74 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

examples/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,9 @@ Simple example of binary executable using jpreprocess.
77
## example-wasm
88

99
Example of using jpreprocess from WebAssembly.
10+
11+
## example-lru
12+
13+
Example of reducing the memory usage of jpreprocess by fetching word entries from the dictionary file on demand (with an LRU cache).
14+
15+
It also serves as an example of implementing `DictionaryFetcher`.

examples/example-lru/Cargo.toml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[package]
2+
name = "example-lru"
3+
edition = "2021"
4+
5+
version.workspace = true
6+
description.workspace = true
7+
license.workspace = true
8+
repository.workspace = true
9+
keywords.workspace = true
10+
categories.workspace = true
11+
rust-version.workspace = true
12+
13+
[dependencies]
14+
jpreprocess.workspace = true
15+
jpreprocess-core.workspace = true
16+
jpreprocess-dictionary.workspace = true
17+
lindera-tokenizer = { version = "0.27" }
18+
19+
lru = "0.12.2"

examples/example-lru/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# example-lru
2+
3+
Example of reducing the memory usage of jpreprocess by fetching word entries from the dictionary file on demand (with an LRU cache).
4+
5+
It also serves as an example of implementing `DictionaryFetcher`.

examples/example-lru/src/main.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#[cfg(not(target_family = "wasm"))]
2+
mod storage_fetcher;
3+
4+
#[cfg(not(target_family = "wasm"))]
5+
fn main() -> Result<(), Box<dyn std::error::Error>> {
6+
use jpreprocess::*;
7+
use std::path::PathBuf;
8+
9+
use crate::storage_fetcher::StorageFetcher;
10+
11+
let path = match std::env::args().nth(1).map(PathBuf::from) {
12+
Some(s) if s.is_dir() => s,
13+
_ => {
14+
eprintln!("Please specify a valid path to dictionary");
15+
std::process::exit(-1);
16+
}
17+
};
18+
19+
let fetcher = StorageFetcher::new(&path)?;
20+
let dictionary = SystemDictionaryConfig::File(path).load()?;
21+
22+
let jpreprocess = JPreprocess::with_dictionary_fetcher(fetcher, dictionary, None);
23+
24+
let mut text = String::new();
25+
while std::io::stdin().read_line(&mut text).is_ok() {
26+
let jpcommon_label = jpreprocess.extract_fullcontext(&text)?;
27+
let string_labels: Vec<_> = jpcommon_label.iter().map(ToString::to_string).collect();
28+
println!("{}", string_labels.join("\n"));
29+
text.clear();
30+
}
31+
32+
Ok(())
33+
}
34+
35+
// Empty entry point used on wasm targets, where the `storage_fetcher` module
// and the real `main` are compiled out by their `cfg` attributes.
#[cfg(target_family = "wasm")]
36+
fn main() {}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
use std::{
2+
fs::File,
3+
io::{Read, Seek},
4+
num::NonZeroUsize,
5+
path::Path,
6+
sync::Mutex,
7+
};
8+
9+
use jpreprocess_core::{error::JPreprocessErrorKind, word_entry::WordEntry, JPreprocessResult};
10+
use jpreprocess_dictionary::{default::WordDictionaryMode, DictionaryFetcher};
11+
use lindera_tokenizer::token::Token;
12+
use lru::LruCache;
13+
14+
/// `DictionaryFetcher` implementation that serves word entries from a
/// mutex-guarded `CachedStorage`.
pub struct StorageFetcher {
15+
// Wrapped in a `Mutex` because `DictionaryFetcher::get_word` takes `&self`
// while `CachedStorage::get_word` needs `&mut self`.
inner: Mutex<CachedStorage>,
16+
}
17+
18+
impl StorageFetcher {
19+
pub fn new<P: AsRef<Path>>(dir: P) -> Result<Self, std::io::Error> {
20+
Ok(Self {
21+
inner: Mutex::new(CachedStorage::new(dir)?),
22+
})
23+
}
24+
}
25+
26+
impl DictionaryFetcher for StorageFetcher {
27+
fn get_word(&self, token: &Token) -> JPreprocessResult<WordEntry> {
28+
if token.word_id.is_unknown() {
29+
return Ok(WordEntry::default());
30+
}
31+
32+
let mut g = self.inner.lock().unwrap();
33+
g.get_word(token.word_id.0)
34+
}
35+
}
36+
37+
/// File-backed word storage with an in-memory LRU cache of decoded entries.
struct CachedStorage {
38+
// Built in `new` from the metadata bytes at the start of `dict.words`;
// used in `get_word` to obtain the deserializer for entry bytes.
mode: WordDictionaryMode,
39+
// Open handle to `dict.wordsidx`; `read_u32_range` reads `u32` offsets
// from it at byte position `index * 4`.
index_file: File,
40+
// Open handle to `dict.words`, from which the serialized entry bytes
// are read.
words_file: File,
41+
// Decoded entries keyed by word index; capacity is set in `new`.
cache: LruCache<u32, WordEntry>,
42+
}
43+
44+
impl CachedStorage {
45+
pub fn new<P: AsRef<Path>>(dir: P) -> Result<Self, std::io::Error> {
46+
let mut index = File::open(dir.as_ref().join("dict.wordsidx"))?;
47+
let mut words = File::open(dir.as_ref().join("dict.words"))?;
48+
49+
let mut index_buf = vec![0u8; 4];
50+
index.read_exact(&mut index_buf)?;
51+
let start = u32::from_be_bytes([index_buf[0], index_buf[1], index_buf[2], index_buf[3]]);
52+
53+
let mut identifier_buf = vec![0u8; start as usize];
54+
words.read_exact(&mut identifier_buf)?;
55+
56+
let mode = WordDictionaryMode::from_metadata(String::from_utf8(identifier_buf).ok());
57+
58+
Ok(Self {
59+
mode,
60+
index_file: index,
61+
words_file: words,
62+
cache: LruCache::new(NonZeroUsize::new(1000).unwrap()),
63+
})
64+
}
65+
66+
pub fn get_word(&mut self, index: u32) -> JPreprocessResult<WordEntry> {
67+
if let Some(word) = self.cache.get(&index) {
68+
println!("Word #{} found in cache", index);
69+
return Ok(word.clone());
70+
}
71+
println!("Word #{} not found in cache", index);
72+
73+
let bytes = self
74+
.get_bytes(index)
75+
.map_err(|err| JPreprocessErrorKind::Io.with_error(err))?;
76+
let entry = self.mode.into_serializer().deserialize(&bytes)?;
77+
self.cache.push(index, entry.clone());
78+
79+
Ok(entry)
80+
}
81+
82+
fn get_bytes(&mut self, index: u32) -> Result<Vec<u8>, std::io::Error> {
83+
let (start, end) = self.read_u32_range(index)?;
84+
85+
self.words_file
86+
.seek(std::io::SeekFrom::Start(start as u64))?;
87+
if let Some(end) = end {
88+
let mut word_buf = vec![0u8; (end - start) as usize];
89+
self.words_file.read_exact(&mut word_buf)?;
90+
Ok(word_buf)
91+
} else {
92+
let mut word_buf = Vec::new();
93+
self.words_file.read_to_end(&mut word_buf)?;
94+
Ok(word_buf)
95+
}
96+
}
97+
fn read_u32_range(&mut self, index: u32) -> Result<(u32, Option<u32>), std::io::Error> {
98+
self.index_file
99+
.seek(std::io::SeekFrom::Start((index as u64) * 4))?;
100+
let mut index_buf = vec![0u8; 8];
101+
if self.index_file.read_exact(&mut index_buf).is_ok() {
102+
let start =
103+
u32::from_le_bytes([index_buf[0], index_buf[1], index_buf[2], index_buf[3]]);
104+
let end = u32::from_le_bytes([index_buf[4], index_buf[5], index_buf[6], index_buf[7]]);
105+
return Ok((start, Some(end)));
106+
}
107+
108+
self.index_file
109+
.seek(std::io::SeekFrom::Start((index as u64) * 4))?;
110+
let mut index_buf = vec![0u8; 4];
111+
self.index_file.read_exact(&mut index_buf)?;
112+
113+
let start = u32::from_le_bytes([index_buf[0], index_buf[1], index_buf[2], index_buf[3]]);
114+
Ok((start, None))
115+
}
116+
}

examples/example-simple/src/main.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
1919
while std::io::stdin().read_line(&mut text).is_ok() {
2020
let jpcommon_label = jpreprocess.extract_fullcontext(&text)?;
2121
let string_labels: Vec<_> = jpcommon_label.iter().map(ToString::to_string).collect();
22-
println!("{}", string_labels.join("\n"))
22+
println!("{}", string_labels.join("\n"));
23+
text.clear();
2324
}
2425

2526
Ok(())

0 commit comments

Comments
 (0)