Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
65 changes: 52 additions & 13 deletions src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ import com.github.pemistahl.lingua.api.Language.UNKNOWN
import com.github.pemistahl.lingua.api.Language.VIETNAMESE
import com.github.pemistahl.lingua.api.Language.YORUBA
import com.github.pemistahl.lingua.internal.Alphabet
import com.github.pemistahl.lingua.internal.Constant
import com.github.pemistahl.lingua.internal.Constant.MULTIPLE_WHITESPACE
import com.github.pemistahl.lingua.internal.Constant.NUMBERS
import com.github.pemistahl.lingua.internal.Constant.PUNCTUATION
Expand Down Expand Up @@ -202,7 +203,7 @@ class LanguageDetector internal constructor(
for (unigram in unigramLanguageModel.ngrams) {
val probability = lookUpNgramProbability(language, unigram)
if (probability > 0) {
unigramCounts.incrementCounter(language)
unigramCounts.incrementCounter(language, 1)
}
}
}
Expand Down Expand Up @@ -235,46 +236,56 @@ class LanguageDetector internal constructor(
var isMatch = false
for ((alphabet, language) in alphabetsSupportingExactlyOneLanguage) {
if (alphabet.matches(character)) {
wordLanguageCounts.incrementCounter(language)
wordLanguageCounts.incrementCounter(language, 1)
isMatch = true
}
}
if (!isMatch) {
when {
Alphabet.HAN.matches(character) -> wordLanguageCounts.incrementCounter(CHINESE)
JAPANESE_CHARACTER_SET.matches(character) -> wordLanguageCounts.incrementCounter(JAPANESE)
Alphabet.HAN.matches(character) -> wordLanguageCounts.incrementCounter(CHINESE, 1)
JAPANESE_CHARACTER_SET.matches(character) -> wordLanguageCounts.incrementCounter(JAPANESE, 1)
Alphabet.LATIN.matches(character) ||
Alphabet.CYRILLIC.matches(character) ||
Alphabet.DEVANAGARI.matches(character) ->
languagesWithUniqueCharacters.filter {
it.uniqueCharacters?.contains(character) ?: false
}.forEach {
wordLanguageCounts.incrementCounter(it)
wordLanguageCounts.incrementCounter(it, 1)
}
}
}
}

if (wordLanguageCounts.isEmpty()) {
totalLanguageCounts.incrementCounter(UNKNOWN)
totalLanguageCounts.incrementCounter(UNKNOWN, 1)
} else if (wordLanguageCounts.size == 1) {
val language = wordLanguageCounts.toList().first().first
if (language in languages) {
totalLanguageCounts.incrementCounter(language)
val logogramWordSizeOptional = logogramWordCountIfExist(language, word)
if (logogramWordSizeOptional > 0) {
totalLanguageCounts.incrementCounter(language, logogramWordSizeOptional)
} else {
totalLanguageCounts.incrementCounter(language, 1)
}
} else {
totalLanguageCounts.incrementCounter(UNKNOWN)
totalLanguageCounts.incrementCounter(UNKNOWN, 1)
}
} else if (wordLanguageCounts.containsKey(CHINESE) && wordLanguageCounts.containsKey(JAPANESE)) {
totalLanguageCounts.incrementCounter(JAPANESE)
val logogramWordSizeOptional = logogramWordCountIfExist(JAPANESE, word)
if (logogramWordSizeOptional > 0) {
totalLanguageCounts.incrementCounter(JAPANESE, logogramWordSizeOptional)
} else {
totalLanguageCounts.incrementCounter(JAPANESE, 1)
}
} else {
val sortedWordLanguageCounts = wordLanguageCounts.toList().sortedByDescending { it.second }
val (mostFrequentLanguage, firstCharCount) = sortedWordLanguageCounts[0]
val (_, secondCharCount) = sortedWordLanguageCounts[1]

if (firstCharCount > secondCharCount && mostFrequentLanguage in languages) {
totalLanguageCounts.incrementCounter(mostFrequentLanguage)
totalLanguageCounts.incrementCounter(mostFrequentLanguage, 1)
} else {
totalLanguageCounts.incrementCounter(UNKNOWN)
totalLanguageCounts.incrementCounter(UNKNOWN, 1)
}
}
}
Expand Down Expand Up @@ -310,7 +321,7 @@ class LanguageDetector internal constructor(
for (word in words) {
for (alphabet in Alphabet.values()) {
if (alphabet.matches(word)) {
detectedAlphabets.incrementCounter(alphabet)
detectedAlphabets.incrementCounter(alphabet, 1)
break
}
}
Expand All @@ -328,7 +339,7 @@ class LanguageDetector internal constructor(
for ((characters, languages) in CHARS_TO_LANGUAGES_MAPPING) {
if (word.containsAnyOf(characters)) {
for (language in languages) {
languageCounts.incrementCounter(language)
languageCounts.incrementCounter(language, 1)
}
break
}
Expand Down Expand Up @@ -419,6 +430,34 @@ class LanguageDetector internal constructor(

/**
 * Builds the hash code from [languages] and [minimumRelativeDistance],
 * weighting the hash of [languages] by 31.
 */
override fun hashCode(): Int {
    val languagesHash = languages.hashCode()
    val distanceHash = minimumRelativeDistance.hashCode()
    return 31 * languagesHash + distanceHash
}

/**
 * Estimates how many words [word] contains when [language] is written with logograms.
 *
 * Every character matched by all alphabets of [language] counts as one word. Every
 * uninterrupted run of characters matched by none of those alphabets counts as one
 * additional word. A character matched by some but not all alphabets is ignored and
 * neither starts nor ends such a run.
 *
 * NOTE(review): for a language with more than one alphabet a character has to match
 * every alphabet to be counted, which looks too strict - confirm this is intended.
 *
 * @param language the language whose alphabets decide which characters are logograms
 * @param word the text to inspect, examined one `Char` at a time
 * @return the estimated word count, or 0 when [language] is not listed in
 * `Constant.LANGUAGES_SUPPORTING_LOGOGRAMS`
 */
internal fun logogramWordCountIfExist(language: Language, word: String): Int {
    if (language !in Constant.LANGUAGES_SUPPORTING_LOGOGRAMS) return 0

    val alphabets = language.alphabets
    var count = 0
    // True while the most recent classified character belonged to none of the alphabets,
    // i.e. a run of foreign characters has been seen but not yet counted as a word.
    var foreignRunPending = false

    for (symbol in word) {
        val character = symbol.toString()
        if (alphabets.all { it.matches(character) }) {
            // One word for the logogram itself, plus one for the foreign run it terminates.
            count += if (foreignRunPending) 2 else 1
            foreignRunPending = false
        } else if (alphabets.none { it.matches(character) }) {
            foreignRunPending = true
        }
    }

    // A foreign run reaching the end of the text still counts as one word.
    return if (foreignRunPending) count + 1 else count
}

internal companion object {
private val NO_LETTER = Regex("^[^\\p{L}]+$")
private val JAPANESE_CHARACTER_SET = try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@

package com.github.pemistahl.lingua.internal

import com.github.pemistahl.lingua.api.Language

/** Regular expressions and language groups shared across the library's internals. */
internal object Constant {

    /** Matches one character of the Unicode punctuation category (`\p{P}`). */
    val PUNCTUATION: Regex = "\\p{P}".toRegex()

    /** Matches one character of the Unicode number category (`\p{N}`). */
    val NUMBERS: Regex = "\\p{N}".toRegex()

    /** Matches a run of one or more whitespace characters. */
    val MULTIPLE_WHITESPACE: Regex = "\\s+".toRegex()

    /** Languages treated as being written with logograms. */
    val LANGUAGES_SUPPORTING_LOGOGRAMS: List<Language> = listOf(
        Language.CHINESE,
        Language.JAPANESE,
        Language.KOREAN
    )
}
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ internal data class TrainingDataLanguageModel(
val textSlice = lowerCasedLine.slice(i until i + ngramLength)
if (regex.matches(textSlice)) {
val ngram = Ngram(textSlice)
absoluteFrequencies.incrementCounter(ngram)
absoluteFrequencies.incrementCounter(ngram, 1)
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@

package com.github.pemistahl.lingua.internal.util.extension

/**
 * Adds [amount] to the counter stored under [key]; a missing key is treated as 0.
 *
 * [amount] defaults to 1, so both call forms `incrementCounter(key)` and
 * `incrementCounter(key, n)` are supported by this single definition.
 *
 * @param key the entry whose counter is increased
 * @param amount the value added to the current count
 */
internal fun <T> MutableMap<T, Int>.incrementCounter(key: T, amount: Int = 1) {
    this[key] = getOrDefault(key, 0) + amount
}
Original file line number Diff line number Diff line change
Expand Up @@ -103,31 +103,42 @@ class LanguageDetectorTest {

@MockK
private lateinit var unigramLanguageModelForEnglish: TrainingDataLanguageModel

@MockK
private lateinit var bigramLanguageModelForEnglish: TrainingDataLanguageModel

@MockK
private lateinit var trigramLanguageModelForEnglish: TrainingDataLanguageModel

@MockK
private lateinit var quadrigramLanguageModelForEnglish: TrainingDataLanguageModel

@MockK
private lateinit var fivegramLanguageModelForEnglish: TrainingDataLanguageModel

@MockK
private lateinit var unigramLanguageModelForGerman: TrainingDataLanguageModel

@MockK
private lateinit var bigramLanguageModelForGerman: TrainingDataLanguageModel

@MockK
private lateinit var trigramLanguageModelForGerman: TrainingDataLanguageModel

@MockK
private lateinit var quadrigramLanguageModelForGerman: TrainingDataLanguageModel

@MockK
private lateinit var fivegramLanguageModelForGerman: TrainingDataLanguageModel

// language model mocks for test data

@MockK
private lateinit var unigramTestDataLanguageModel: TestDataLanguageModel

@MockK
private lateinit var trigramTestDataLanguageModel: TestDataLanguageModel

@MockK
private lateinit var quadrigramTestDataLanguageModel: TestDataLanguageModel

Expand Down Expand Up @@ -946,4 +957,39 @@ class LanguageDetectorTest {
)
}
}

// Checks the word count that logogramWordCountIfExist reports for texts that mix
// Chinese characters with Latin-script words.
@ParameterizedTest
@CsvSource(
    "上海大学是一个好大学this, CHINESE, 11",
    "this, CHINESE, 1",
    "上海Test是一个好大学Test, CHINESE, 10",
)
fun `assert that language of logogram wordCount if exist`(word: String, language: Language, wordSize: Int) {
    val computedWordSize = detectorForAllLanguages.logogramWordCountIfExist(language, word)

    assertThat(computedWordSize).isEqualTo(wordSize)
}

// Checks which language detectLanguageOf reports for texts that mix Chinese
// characters with Latin-script words.
// NOTE(review): this test shares its name with the word-count test although it
// exercises language detection - consider giving it a more descriptive name.
@ParameterizedTest
@CsvSource(
    "上海大学是一个好大学this is a test, CHINESE",
    "this is a test, ENGLISH",
    "上海Test是一个好大学Test, CHINESE",
)
fun `assert that language of logogram wordCount if exist`(word: String, expectedLanguage: Language) {
    val detectedLanguage = detectorForAllLanguages.detectLanguageOf(word)

    assertThat(detectedLanguage).isEqualTo(expectedLanguage)
}
}