-
Notifications
You must be signed in to change notification settings - Fork 92
Expand file tree
/
Copy pathdocument.jl
More file actions
103 lines (85 loc) · 3.07 KB
/
document.jl
File metadata and controls
103 lines (85 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
@testset "Document" begin
    # Exercises DocumentMetadata, the four document types (StringDocument,
    # FileDocument, TokenDocument, NGramDocument), the generic `Document`
    # constructor, `top_terms`, and `ngram_complexity`.

    # --- DocumentMetadata -------------------------------------------------
    dmeta = TextAnalysis.DocumentMetadata(
        Languages.English(),
        "test title",
        "test author",
        "test time",
        Dict(:k1 => "v1", :k2 => "v2"),
    )
    # One assertion per field so that a failure names the offending field
    # instead of reporting a single opaque `false`.
    @test dmeta.language == Languages.English()
    @test dmeta.title == "test title"
    @test dmeta.author == "test author"
    @test dmeta.timestamp == "test time"
    @test get(dmeta.custom, :k1, "") == "v1"
    @test get(dmeta.custom, :k2, "") == "v2"

    # mutability: the `custom` field can be reassigned after construction
    dmeta.custom = nothing
    @test isnothing(dmeta.custom)

    # --- Construction from raw text / file --------------------------------
    sample_text1 = "This is a string"
    sample_text2 = "This is also a string"
    sample_file = joinpath(@__DIR__, "data", "poem.txt")

    sd = StringDocument(sample_text1)
    fd = FileDocument(sample_file)
    td = TokenDocument(sample_text1)
    ngd = NGramDocument(sample_text1)
    # These two constructors were previously only smoke-tested (result
    # discarded); assert the resulting types explicitly.
    @test isa(td, TokenDocument)
    @test isa(ngd, NGramDocument)

    # --- text / text! round trip ------------------------------------------
    @test isequal(text(sd), sample_text1)
    text!(sd, sample_text2)
    @test isequal(text(sd), sample_text2)
    text!(sd, sample_text1)
    @test isequal(text(sd), sample_text1)

    # Whole-vector `==` rather than `all(a .== b)`: a length mismatch then
    # fails the test instead of throwing or passing through broadcasting.
    @test tokens(sd) == ["This", "is", "a", "string"]

    # Every word of the sample text must appear as a unigram key.
    unigrams = ngrams(sd, 1)
    for word in ("This", "is", "a", "string")
        @test word in keys(unigrams)
    end
    # NOTE(review): 16 matches the character count of `sample_text1`.
    @test length(sd) == 16

    # --- Explicit construction of each document type ----------------------
    hamlet_text = "To be or not to be..."
    sd = StringDocument(hamlet_text)
    @test isa(sd, StringDocument)
    @test isequal(text(sd), hamlet_text)

    @test isa(fd, FileDocument)
    @test length(text(fd)) > 0

    my_tokens = ["To", "be", "or", "not", "to", "be..."]
    td = TokenDocument(my_tokens)
    @test isa(td, TokenDocument)
    @test tokens(td) == my_tokens

    my_ngrams = Dict{String,Int}(
        "To" => 1,
        "be" => 2,
        "or" => 1,
        "not" => 1,
        "to" => 1,
        "be..." => 1,
    )
    ngd = NGramDocument(my_ngrams)
    @test isa(ngd, NGramDocument)
    @test "To" in keys(ngrams(ngd))

    # --- Test top features ------------------------------------------------
    # `sd` is still the hamlet StringDocument built above.
    top = top_terms(sd, 5)
    @test [pair.first for pair in top] == ["be", "To", "not", "or", "to"]
    @test [pair.second for pair in top] == [2, 1, 1, 1, 1]
    @test top_terms(sd, 2) == ["be" => 2, "To" => 1]

    # --- Every document type accepts a plain string -----------------------
    @test isa(StringDocument(hamlet_text), StringDocument)
    @test isa(TokenDocument(hamlet_text), TokenDocument)
    @test isa(NGramDocument(hamlet_text), NGramDocument)

    # --- Generic `Document` constructor picks the type from its argument --
    d = Document("To be or not to be...")
    @test isa(d, StringDocument)
    d = Document(sample_file)
    @test isa(d, FileDocument)
    d = Document(["To", "be", "or", "not", "to", "be..."])
    @test isa(d, TokenDocument)
    ng = Dict{String,Int}("a" => 1, "b" => 3)
    d = Document(ng)
    @test isa(d, NGramDocument)
    @test isequal(length(Document("this is text")), 12)

    # --- NGramDocument creation with multiple ngram complexity ------------
    # N: n-gram size arguments splatted into the constructor (including
    #    none at all and non-`Int` integer types);
    # C: the `ngram_complexity` expected for each entry of N;
    # L: the expected number of stored n-grams for `sample_text1`.
    let N = ((), (2,), (Int32(2),), (1, 2), (Int32(1), Int16(2))),
        C = (1, 2, 2, [1, 2], [1, 2]),
        L = (4, 3, 3, 7, 7)

        for (n, c, l) in zip(N, C, L)
            ngd = NGramDocument(sample_text1, n...)
            @test ngram_complexity(ngd) == c
            @test length(ngd.ngrams) == l
        end
    end
end