|
| 1 | +use std::{ |
| 2 | + fs::File, |
| 3 | + io::{Read, Seek}, |
| 4 | + num::NonZeroUsize, |
| 5 | + path::Path, |
| 6 | + sync::Mutex, |
| 7 | +}; |
| 8 | + |
| 9 | +use jpreprocess_core::{error::JPreprocessErrorKind, word_entry::WordEntry, JPreprocessResult}; |
| 10 | +use jpreprocess_dictionary::{default::WordDictionaryMode, DictionaryFetcher}; |
| 11 | +use lindera_tokenizer::token::Token; |
| 12 | +use lru::LruCache; |
| 13 | + |
| 14 | +pub struct StorageFetcher { |
| 15 | + inner: Mutex<CachedStorage>, |
| 16 | +} |
| 17 | + |
| 18 | +impl StorageFetcher { |
| 19 | + pub fn new<P: AsRef<Path>>(dir: P) -> Result<Self, std::io::Error> { |
| 20 | + Ok(Self { |
| 21 | + inner: Mutex::new(CachedStorage::new(dir)?), |
| 22 | + }) |
| 23 | + } |
| 24 | +} |
| 25 | + |
| 26 | +impl DictionaryFetcher for StorageFetcher { |
| 27 | + fn get_word(&self, token: &Token) -> JPreprocessResult<WordEntry> { |
| 28 | + if token.word_id.is_unknown() { |
| 29 | + return Ok(WordEntry::default()); |
| 30 | + } |
| 31 | + |
| 32 | + let mut g = self.inner.lock().unwrap(); |
| 33 | + g.get_word(token.word_id.0) |
| 34 | + } |
| 35 | +} |
| 36 | + |
| 37 | +struct CachedStorage { |
| 38 | + mode: WordDictionaryMode, |
| 39 | + index_file: File, |
| 40 | + words_file: File, |
| 41 | + cache: LruCache<u32, WordEntry>, |
| 42 | +} |
| 43 | + |
| 44 | +impl CachedStorage { |
| 45 | + pub fn new<P: AsRef<Path>>(dir: P) -> Result<Self, std::io::Error> { |
| 46 | + let mut index = File::open(dir.as_ref().join("dict.wordsidx"))?; |
| 47 | + let mut words = File::open(dir.as_ref().join("dict.words"))?; |
| 48 | + |
| 49 | + let mut index_buf = vec![0u8; 4]; |
| 50 | + index.read_exact(&mut index_buf)?; |
| 51 | + let start = u32::from_be_bytes([index_buf[0], index_buf[1], index_buf[2], index_buf[3]]); |
| 52 | + |
| 53 | + let mut identifier_buf = vec![0u8; start as usize]; |
| 54 | + words.read_exact(&mut identifier_buf)?; |
| 55 | + |
| 56 | + let mode = WordDictionaryMode::from_metadata(String::from_utf8(identifier_buf).ok()); |
| 57 | + |
| 58 | + Ok(Self { |
| 59 | + mode, |
| 60 | + index_file: index, |
| 61 | + words_file: words, |
| 62 | + cache: LruCache::new(NonZeroUsize::new(1000).unwrap()), |
| 63 | + }) |
| 64 | + } |
| 65 | + |
| 66 | + pub fn get_word(&mut self, index: u32) -> JPreprocessResult<WordEntry> { |
| 67 | + if let Some(word) = self.cache.get(&index) { |
| 68 | + println!("Word #{} found in cache", index); |
| 69 | + return Ok(word.clone()); |
| 70 | + } |
| 71 | + println!("Word #{} not found in cache", index); |
| 72 | + |
| 73 | + let bytes = self |
| 74 | + .get_bytes(index) |
| 75 | + .map_err(|err| JPreprocessErrorKind::Io.with_error(err))?; |
| 76 | + let entry = self.mode.into_serializer().deserialize(&bytes)?; |
| 77 | + self.cache.push(index, entry.clone()); |
| 78 | + |
| 79 | + Ok(entry) |
| 80 | + } |
| 81 | + |
| 82 | + fn get_bytes(&mut self, index: u32) -> Result<Vec<u8>, std::io::Error> { |
| 83 | + let (start, end) = self.read_u32_range(index)?; |
| 84 | + |
| 85 | + self.words_file |
| 86 | + .seek(std::io::SeekFrom::Start(start as u64))?; |
| 87 | + if let Some(end) = end { |
| 88 | + let mut word_buf = vec![0u8; (end - start) as usize]; |
| 89 | + self.words_file.read_exact(&mut word_buf)?; |
| 90 | + Ok(word_buf) |
| 91 | + } else { |
| 92 | + let mut word_buf = Vec::new(); |
| 93 | + self.words_file.read_to_end(&mut word_buf)?; |
| 94 | + Ok(word_buf) |
| 95 | + } |
| 96 | + } |
| 97 | + fn read_u32_range(&mut self, index: u32) -> Result<(u32, Option<u32>), std::io::Error> { |
| 98 | + self.index_file |
| 99 | + .seek(std::io::SeekFrom::Start((index as u64) * 4))?; |
| 100 | + let mut index_buf = vec![0u8; 8]; |
| 101 | + if self.index_file.read_exact(&mut index_buf).is_ok() { |
| 102 | + let start = |
| 103 | + u32::from_le_bytes([index_buf[0], index_buf[1], index_buf[2], index_buf[3]]); |
| 104 | + let end = u32::from_le_bytes([index_buf[4], index_buf[5], index_buf[6], index_buf[7]]); |
| 105 | + return Ok((start, Some(end))); |
| 106 | + } |
| 107 | + |
| 108 | + self.index_file |
| 109 | + .seek(std::io::SeekFrom::Start((index as u64) * 4))?; |
| 110 | + let mut index_buf = vec![0u8; 4]; |
| 111 | + self.index_file.read_exact(&mut index_buf)?; |
| 112 | + |
| 113 | + let start = u32::from_le_bytes([index_buf[0], index_buf[1], index_buf[2], index_buf[3]]); |
| 114 | + Ok((start, None)) |
| 115 | + } |
| 116 | +} |
0 commit comments