Thanks to visit codestin.com
Credit goes to lib.rs

16 releases (8 breaking)

Uses new Rust 2024

0.9.0 Jul 6, 2025
0.8.0 Feb 27, 2024
0.7.3 Jan 19, 2023
0.6.2 Jul 16, 2022

#432 in Text processing

Codestin Search App Codestin Search App Codestin Search App

745 downloads per month

MIT license

140KB
4K SLoC

ultra-nlp

Install

cargo add ultra-nlp

Usage

ngrams

let text = "你好世界";

let result = ngrams(text, 2);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["你好", "好世", "世界"]
);

extract_consecutive_chinese_chars

let text = "foo中文bar字符baz";

let result = extract_consecutive_chinese_chars(text);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["中文", "字符"]
);

extract_consecutive_letters

let text = "foo中文,bar,字符baz";

let result = extract_consecutive_letters(text);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["foo中文", "bar", "字符baz"]
);

cedarwood(slow, low memory usage)

Ignore unmatched contents

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::Ignore
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec!["南京", "南京市", "市长", "长江", "大桥"]
);

Keep unmatched contents as chars

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::KeepAsChars
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
    ]
);

Keep unmatched contents as words

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::KeepAsWords
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
    ]
);

daachorse(fast, high memory usage)

Ignore unmatched contents

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::Ignore);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
      "南京", "南京市", "市长", "长江", "大桥",
    ]
);

Keep unmatched contents as chars

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsChars);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
    ]
);

Keep unmatched contents as words

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsWords);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
    ]
);

Dependencies

~2.2–3.5MB
~57K SLoC