From aaea693a8862739a889a2316f32094f5efa00ffd Mon Sep 17 00:00:00 2001 From: James Alster Date: Sat, 7 Mar 2026 11:17:16 +0000 Subject: [PATCH 1/7] Add top_features functions --- Project.toml | 2 ++ docs/src/features.md | 19 +++++++++++++++++++ src/TextAnalysis.jl | 2 ++ src/corpus.jl | 11 +++++++++++ src/document.jl | 9 +++++++++ src/dtm.jl | 13 +++++++++++++ test/corpus.jl | 3 +++ test/document.jl | 7 +++++++ test/dtm.jl | 10 ++++++++++ test/runtests.jl | 1 + 10 files changed, 77 insertions(+) diff --git a/Project.toml b/Project.toml index 441da011..731365f4 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -29,6 +30,7 @@ DelimitedFiles = "1" DocStringExtensions = "0.9" JSON = "0.21, 1" Languages = "0.4" +OrderedCollections = "1.8.1" ProgressMeter = "1" Snowball = "0.1" Statistics = "1" diff --git a/docs/src/features.md b/docs/src/features.md index eb95e6db..a6b8bcba 100644 --- a/docs/src/features.md +++ b/docs/src/features.md @@ -102,6 +102,25 @@ julia> hash_dtv(crps[1]) 0 0 0 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 0 0 ``` +## Top Features + +We can use the function `top_features()` to quickly view the top features of a `Document`, `DocumentTermMatrix` or `Corpus`. + +```julia +julia> top_features(m) # or `top_features(crps)` +OrderedCollections.OrderedDict{String, Int64} with 6 entries: + "To" => 2 + "be" => 2 + "become" => 2 + "not" => 2 + "or" => 2 + "to" => 2 +julia> top_features(m, 2) +2-element Vector{String}: + "To" + "be" +``` + ## TF (Term Frequency) Often we need to find out what proportion of a document is contributed by each term. This can be done using the term frequency function: diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 2d763b6b..59bc26b9 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -3,6 +3,7 @@ using SparseArrays using Printf using LinearAlgebra using StatsBase: countmap, addcounts! +using OrderedCollections: OrderedDict using Languages using WordTokenizers using Snowball @@ -54,6 +55,7 @@ export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity export tf!, tf_idf!, bm_25!, lda! export remove_patterns!, remove_patterns export prune! +export top_features export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles diff --git a/src/corpus.jl b/src/corpus.jl index 9d3b273b..9fd2f639 100644 --- a/src/corpus.jl +++ b/src/corpus.jl @@ -298,3 +298,14 @@ function standardize!(crps::Corpus, ::Type{T}) where {T<:AbstractDocument} crps.documents[i] = convert(T, crps.documents[i]) end end + +############################################################################## +# +# top_features() methods +# +############################################################################## + +top_features(lx::Dict{String,Int}) = sort!(OrderedDict(lx); byvalue=true, rev=true) +top_features(lx::Dict{String,Int}, n::Int) = first(keys(top_features(lx)), n) +top_features(crps::Corpus) = top_features(lexicon(crps)) +top_features(crps::Corpus, n::Int) = top_features(lexicon(crps), n) \ No newline at end of file diff --git a/src/document.jl b/src/document.jl index e933f9a7..cb2de50a 100644 --- a/src/document.jl +++ b/src/document.jl @@ -398,3 +398,12 @@ Base.convert(::Type{NGramDocument}, d::NGramDocument) = d ############################################################################## Base.getindex(d::AbstractDocument, term::AbstractString) = ngrams(d)[term] + +############################################################################## +# +# top_features() methods +# +############################################################################## + +top_features(d::AbstractDocument) = sort!(OrderedDict(countmap(tokens(d))); byvalue=true, rev=true) +top_features(d::AbstractDocument, n::Int) = first(keys(top_features(d)), n) \ No newline at end of file diff --git a/src/dtm.jl b/src/dtm.jl index 35c9cc7c..4e812050 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -440,3 +440,16 @@ function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where dtm1 end + +""" + top_features(x) + top_features(x, n) + +Return terms sorted in descending frequency. With `n`, return only the top `n` terms. +Accepts a `Corpus`, `AbstractDocument`, lexicon `Dict`, or `DocumentTermMatrix`. +""" +top_features(D::DocumentTermMatrix, n::Int) = first(keys(top_features(D)), n) +function top_features(D::DocumentTermMatrix) + counts = vec(sum(D.dtm; dims=1)) + return sort!(OrderedDict(zip(D.terms, counts)); byvalue=true, rev=true) +end \ No newline at end of file diff --git a/test/corpus.jl b/test/corpus.jl index 044c89a6..d2fe0e93 100644 --- a/test/corpus.jl +++ b/test/corpus.jl @@ -39,6 +39,9 @@ update_lexicon!(crps) answer = Dict("1" => 2, "2" => 1, "4" => 1) + @test top_features(crps) == top_features(crps[1]) + @test top_features(crps, 1) == top_features(crps[1], 1) + @test answer == lexicon(crps) end diff --git a/test/document.jl b/test/document.jl index 8ffa3ef3..e936f813 100644 --- a/test/document.jl +++ b/test/document.jl @@ -66,6 +66,13 @@ @test isa(ngd, NGramDocument) @test "To" in keys(ngrams(ngd)) + # Test top features + top = top_features(sd) + @test top isa OrderedDict + @test collect(keys(top)) == ["be", "or", "not", "to", "To"] + @test collect(values(top)) == [2, 1, 1, 1, 1] + @test top_features(sd, 2) == ["be", "or"] + sd = StringDocument(hamlet_text) td = TokenDocument(hamlet_text) ngd = NGramDocument(hamlet_text) diff --git a/test/dtm.jl b/test/dtm.jl index 0a2f01fd..f6fdba76 100644 --- a/test/dtm.jl +++ b/test/dtm.jl @@ -109,4 +109,14 @@ @test dtm2.terms == ["five", "four", "three", "two"] @test size(dtm2.dtm) == (2, 4) @test sum(dtm2.dtm, dims=(1,)) == [1 2 2 1] + + # Test top_features + crps3 = Corpus([FileDocument(sample_file)]) + update_lexicon!(crps3) + m3 = DocumentTermMatrix(crps3) + top = top_features(m3) + top5 = top_features(m3, 5) + @test top isa OrderedDict + @test top5 == first(keys(top), 5) == [",", "thou", "And", "and", ";"] + @test first(values(top), 5) == [29, 6, 5, 5, 3] end diff --git a/test/runtests.jl b/test/runtests.jl index 3ea9e016..471b95d6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,7 @@ using Languages using TextAnalysis using WordTokenizers using Serialization +using OrderedCollections: OrderedDict tests = [ "coom.jl" From be5838732ec42c93f50ebcf5c6d08836ef4357ac Mon Sep 17 00:00:00 2001 From: James Alster Date: Sat, 7 Mar 2026 12:11:23 +0000 Subject: [PATCH 2/7] OrderedCollections version compatibility for Julia 1.6 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 731365f4..6294e2e8 100644 --- a/Project.toml +++ b/Project.toml @@ -30,7 +30,7 @@ DelimitedFiles = "1" DocStringExtensions = "0.9" JSON = "0.21, 1" Languages = "0.4" -OrderedCollections = "1.8.1" +OrderedCollections = "1.7.0" ProgressMeter = "1" Snowball = "0.1" Statistics = "1" From dbe1f57d3ec623014cdc05eb46097df58433805b Mon Sep 17 00:00:00 2001 From: James Alster Date: Sat, 7 Mar 2026 12:34:36 +0000 Subject: [PATCH 3/7] enforce alphabetic sorting --- src/corpus.jl | 2 +- src/document.jl | 2 +- src/dtm.jl | 2 +- test/document.jl | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/corpus.jl b/src/corpus.jl index 9fd2f639..7ca4c6af 100644 --- a/src/corpus.jl +++ b/src/corpus.jl @@ -305,7 +305,7 @@ end # ############################################################################## -top_features(lx::Dict{String,Int}) = sort!(OrderedDict(lx); byvalue=true, rev=true) +top_features(lx::Dict{String,Int}) = sort!(sort!(OrderedDict(lx)); byvalue=true, rev=true) # double sort for key then value order top_features(lx::Dict{String,Int}, n::Int) = first(keys(top_features(lx)), n) top_features(crps::Corpus) = top_features(lexicon(crps)) top_features(crps::Corpus, n::Int) = top_features(lexicon(crps), n) \ No newline at end of file diff --git a/src/document.jl b/src/document.jl index cb2de50a..f5dc73c4 100644 --- a/src/document.jl +++ b/src/document.jl @@ -405,5 +405,5 @@ Base.getindex(d::AbstractDocument, term::AbstractString) = ngrams(d)[term] # ############################################################################## -top_features(d::AbstractDocument) = sort!(OrderedDict(countmap(tokens(d))); byvalue=true, rev=true) +top_features(d::AbstractDocument) = sort!(sort!(OrderedDict(countmap(tokens(d)))); byvalue=true, rev=true) # double sort for key and value order top_features(d::AbstractDocument, n::Int) = first(keys(top_features(d)), n) \ No newline at end of file diff --git a/src/dtm.jl b/src/dtm.jl index 4e812050..1f66633e 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -451,5 +451,5 @@ Accepts a `Corpus`, `AbstractDocument`, lexicon `Dict`, or `DocumentTermMatrix`. top_features(D::DocumentTermMatrix, n::Int) = first(keys(top_features(D)), n) function top_features(D::DocumentTermMatrix) counts = vec(sum(D.dtm; dims=1)) - return sort!(OrderedDict(zip(D.terms, counts)); byvalue=true, rev=true) + return sort!(sort!(OrderedDict(zip(D.terms, counts))); byvalue=true, rev=true) # double sort for key and value order end \ No newline at end of file diff --git a/test/document.jl b/test/document.jl index e936f813..22523a2c 100644 --- a/test/document.jl +++ b/test/document.jl @@ -69,9 +69,9 @@ # Test top features top = top_features(sd) @test top isa OrderedDict - @test collect(keys(top)) == ["be", "or", "not", "to", "To"] + @test collect(keys(top)) == ["be", "To", "not", "or", "to"] @test collect(values(top)) == [2, 1, 1, 1, 1] - @test top_features(sd, 2) == ["be", "or"] + @test top_features(sd, 2) == ["be", "To"] sd = StringDocument(hamlet_text) td = TokenDocument(hamlet_text) From 7cf6d2b9c7256c489638c8360424238e533b8243 Mon Sep 17 00:00:00 2001 From: James Alster Date: Sat, 7 Mar 2026 12:35:57 +0000 Subject: [PATCH 4/7] improve docs --- src/dtm.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dtm.jl b/src/dtm.jl index 1f66633e..54d6a27a 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -447,6 +447,7 @@ end Return terms sorted in descending frequency. With `n`, return only the top `n` terms. Accepts a `Corpus`, `AbstractDocument`, lexicon `Dict`, or `DocumentTermMatrix`. +Ties are sorted alphabetically. """ top_features(D::DocumentTermMatrix, n::Int) = first(keys(top_features(D)), n) function top_features(D::DocumentTermMatrix) From a3bc21f8b911b784a4f9bab09193deb29545d310 Mon Sep 17 00:00:00 2001 From: James Alster Date: Tue, 17 Mar 2026 12:47:16 +0000 Subject: [PATCH 5/7] incorporate comments --- docs/src/features.md | 9 ++------- src/corpus.jl | 13 +++++++++---- src/document.jl | 9 +++++++-- src/dtm.jl | 12 +++++++----- test/corpus.jl | 1 - test/document.jl | 5 ++--- test/dtm.jl | 7 +++---- 7 files changed, 30 insertions(+), 26 deletions(-) diff --git a/docs/src/features.md b/docs/src/features.md index a6b8bcba..b50adb52 100644 --- a/docs/src/features.md +++ b/docs/src/features.md @@ -104,21 +104,16 @@ julia> hash_dtv(crps[1]) ## Top Features -We can use the function `top_features()` to quickly view the top features of a `Document`, `DocumentTermMatrix` or `Corpus`. +We can use the function `top_features(x, n)` to quickly view the top features of a `Document`, `DocumentTermMatrix` or `Corpus`. ```julia -julia> top_features(m) # or `top_features(crps)` +julia> top_features(m, 5) OrderedCollections.OrderedDict{String, Int64} with 6 entries: "To" => 2 "be" => 2 "become" => 2 "not" => 2 "or" => 2 - "to" => 2 -julia> top_features(m, 2) -2-element Vector{String}: - "To" - "be" ``` ## TF (Term Frequency) diff --git a/src/corpus.jl b/src/corpus.jl index 7ca4c6af..53f69eb1 100644 --- a/src/corpus.jl +++ b/src/corpus.jl @@ -305,7 +305,12 @@ end # ############################################################################## -top_features(lx::Dict{String,Int}) = sort!(sort!(OrderedDict(lx)); byvalue=true, rev=true) # double sort for key then value order -top_features(lx::Dict{String,Int}, n::Int) = first(keys(top_features(lx)), n) -top_features(crps::Corpus) = top_features(lexicon(crps)) -top_features(crps::Corpus, n::Int) = top_features(lexicon(crps), n) \ No newline at end of file +function top_features(lx::Dict{String,Int}, ::Val{N}) where {N} + D_pairs = collect(pairs(lx)) + n = min(N, length(D_pairs)) + idx = partialsortperm(D_pairs, 1:n, by = p -> (-p.second, p.first)) + OrderedDict(D_pairs[idx]) +end +top_features(lx::Dict{String,Int}, n::Int) = first.(top_features(lx), Val(n)) +top_features(crps::Corpus, n::Int) = top_features(lexicon(crps), Val(n)) +#top_features(crps::Corpus) = top_features(lexicon(crps)) \ No newline at end of file diff --git a/src/document.jl b/src/document.jl index f5dc73c4..58604d78 100644 --- a/src/document.jl +++ b/src/document.jl @@ -405,5 +405,10 @@ Base.getindex(d::AbstractDocument, term::AbstractString) = ngrams(d)[term] # ############################################################################## -top_features(d::AbstractDocument) = sort!(sort!(OrderedDict(countmap(tokens(d)))); byvalue=true, rev=true) # double sort for key and value order -top_features(d::AbstractDocument, n::Int) = first(keys(top_features(d)), n) \ No newline at end of file +function top_features(d::AbstractDocument, ::Val{N}) where {N} + D_pairs = collect(pairs(countmap(tokens(d)))) + n = min(N, length(D_pairs)) + idx = partialsortperm(D_pairs, 1:n; by = p -> (-p.second, p.first)) + OrderedDict(D_pairs[idx]) +end +top_features(d::AbstractDocument, n::Int) = top_features(d, Val(n)) \ No newline at end of file diff --git a/src/dtm.jl b/src/dtm.jl index 54d6a27a..702fc45c 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -449,8 +449,10 @@ Return terms sorted in descending frequency. With `n`, return only the top `n` t Accepts a `Corpus`, `AbstractDocument`, lexicon `Dict`, or `DocumentTermMatrix`. Ties are sorted alphabetically. """ -top_features(D::DocumentTermMatrix, n::Int) = first(keys(top_features(D)), n) -function top_features(D::DocumentTermMatrix) - counts = vec(sum(D.dtm; dims=1)) - return sort!(sort!(OrderedDict(zip(D.terms, counts))); byvalue=true, rev=true) # double sort for key and value order -end \ No newline at end of file +function top_features(D::DocumentTermMatrix, ::Val{N}) where {N} + counts = @view(sum(D.dtm; dims=1)[1, :]) + n = min(N, length(counts)) + idx = partialsortperm(counts, 1:n; rev=true) + OrderedDict(zip(D.terms[idx], counts[idx])) +end +top_features(D::DocumentTermMatrix, n::Int) = top_features(D, Val(n)) \ No newline at end of file diff --git a/test/corpus.jl b/test/corpus.jl index d2fe0e93..212ffd12 100644 --- a/test/corpus.jl +++ b/test/corpus.jl @@ -39,7 +39,6 @@ update_lexicon!(crps) answer = Dict("1" => 2, "2" => 1, "4" => 1) - @test top_features(crps) == top_features(crps[1]) @test top_features(crps, 1) == top_features(crps[1], 1) @test answer == lexicon(crps) diff --git a/test/document.jl b/test/document.jl index 22523a2c..f3955070 100644 --- a/test/document.jl +++ b/test/document.jl @@ -67,11 +67,10 @@ @test "To" in keys(ngrams(ngd)) # Test top features - top = top_features(sd) - @test top isa OrderedDict + top = top_features(sd, 5) @test collect(keys(top)) == ["be", "To", "not", "or", "to"] @test collect(values(top)) == [2, 1, 1, 1, 1] - @test top_features(sd, 2) == ["be", "To"] + @test top_features(sd, 2) == OrderedDict("be" => 2, "To" => 1) sd = StringDocument(hamlet_text) td = TokenDocument(hamlet_text) diff --git a/test/dtm.jl b/test/dtm.jl index f6fdba76..45f31527 100644 --- a/test/dtm.jl +++ b/test/dtm.jl @@ -114,9 +114,8 @@ crps3 = Corpus([FileDocument(sample_file)]) update_lexicon!(crps3) m3 = DocumentTermMatrix(crps3) - top = top_features(m3) top5 = top_features(m3, 5) - @test top isa OrderedDict - @test top5 == first(keys(top), 5) == [",", "thou", "And", "and", ";"] - @test first(values(top), 5) == [29, 6, 5, 5, 3] + @test top5 isa OrderedDict + @test collect(keys(top5)) == [",", "thou", "And", "and", ";"] + @test collect(values(top5)) == [29, 6, 5, 5, 3] end From 7ba28a81ae0edf1b5bc5fa12b09708266cc1defe Mon Sep 17 00:00:00 2001 From: James Alster Date: Tue, 17 Mar 2026 12:58:08 +0000 Subject: [PATCH 6/7] rename top_features to top_terms --- docs/src/features.md | 4 ++-- src/TextAnalysis.jl | 2 +- src/corpus.jl | 12 ++++++------ src/document.jl | 7 ++++--- src/dtm.jl | 8 ++++---- test/corpus.jl | 2 +- test/document.jl | 4 ++-- test/dtm.jl | 4 ++-- 8 files changed, 22 insertions(+), 21 deletions(-) diff --git a/docs/src/features.md b/docs/src/features.md index b50adb52..2e2ba289 100644 --- a/docs/src/features.md +++ b/docs/src/features.md @@ -104,10 +104,10 @@ julia> hash_dtv(crps[1]) ## Top Features -We can use the function `top_features(x, n)` to quickly view the top features of a `Document`, `DocumentTermMatrix` or `Corpus`. +We can use the function `top_terms(x, n)` to quickly view the top features of a `Document`, `DocumentTermMatrix` or `Corpus`. ```julia -julia> top_features(m, 5) +julia> top_terms(m, 5) OrderedCollections.OrderedDict{String, Int64} with 6 entries: "To" => 2 "be" => 2 diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 59bc26b9..99d3a3d3 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -55,7 +55,7 @@ export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity export tf!, tf_idf!, bm_25!, lda! export remove_patterns!, remove_patterns export prune! -export top_features +export top_terms export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles diff --git a/src/corpus.jl b/src/corpus.jl index 53f69eb1..4374ada7 100644 --- a/src/corpus.jl +++ b/src/corpus.jl @@ -301,16 +301,16 @@ end ############################################################################## # -# top_features() methods +# top_terms() methods # ############################################################################## -function top_features(lx::Dict{String,Int}, ::Val{N}) where {N} +function top_terms(lx::Dict{String,Int}, ::Val{N}) where {N} D_pairs = collect(pairs(lx)) n = min(N, length(D_pairs)) - idx = partialsortperm(D_pairs, 1:n, by = p -> (-p.second, p.first)) + # Count decreasing, break ties alphabetically + idx = partialsortperm(D_pairs, 1:n, by = p -> (-p.second, p.first)) OrderedDict(D_pairs[idx]) end -top_features(lx::Dict{String,Int}, n::Int) = first.(top_features(lx), Val(n)) -top_features(crps::Corpus, n::Int) = top_features(lexicon(crps), Val(n)) -#top_features(crps::Corpus) = top_features(lexicon(crps)) \ No newline at end of file +top_terms(lx::Dict{String,Int}, n::Int) = top_terms(lx, Val(n)) +top_terms(crps::Corpus, n::Int) = top_terms(lexicon(crps), Val(n)) \ No newline at end of file diff --git a/src/document.jl b/src/document.jl index 58604d78..bf14373f 100644 --- a/src/document.jl +++ b/src/document.jl @@ -401,14 +401,15 @@ Base.getindex(d::AbstractDocument, term::AbstractString) = ngrams(d)[term] ############################################################################## # -# top_features() methods +# top_terms() methods # ############################################################################## -function top_features(d::AbstractDocument, ::Val{N}) where {N} +function top_terms(d::AbstractDocument, ::Val{N}) where {N} D_pairs = collect(pairs(countmap(tokens(d)))) n = min(N, length(D_pairs)) + # Count decreasing, break ties alphabetically idx = partialsortperm(D_pairs, 1:n; by = p -> (-p.second, p.first)) OrderedDict(D_pairs[idx]) end -top_features(d::AbstractDocument, n::Int) = top_features(d, Val(n)) \ No newline at end of file +top_terms(d::AbstractDocument, n::Int) = top_terms(d, Val(n)) \ No newline at end of file diff --git a/src/dtm.jl b/src/dtm.jl index 702fc45c..5df36261 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -442,17 +442,17 @@ function merge!(dtm1::DocumentTermMatrix{T}, dtm2::DocumentTermMatrix{T}) where end """ - top_features(x) - top_features(x, n) + top_terms(x) + top_terms(x, n) Return terms sorted in descending frequency. With `n`, return only the top `n` terms. Accepts a `Corpus`, `AbstractDocument`, lexicon `Dict`, or `DocumentTermMatrix`. Ties are sorted alphabetically. """ -function top_features(D::DocumentTermMatrix, ::Val{N}) where {N} +function top_terms(D::DocumentTermMatrix, ::Val{N}) where {N} counts = @view(sum(D.dtm; dims=1)[1, :]) n = min(N, length(counts)) idx = partialsortperm(counts, 1:n; rev=true) OrderedDict(zip(D.terms[idx], counts[idx])) end -top_features(D::DocumentTermMatrix, n::Int) = top_features(D, Val(n)) \ No newline at end of file +top_terms(D::DocumentTermMatrix, n::Int) = top_terms(D, Val(n)) \ No newline at end of file diff --git a/test/corpus.jl b/test/corpus.jl index 212ffd12..3495697b 100644 --- a/test/corpus.jl +++ b/test/corpus.jl @@ -39,7 +39,7 @@ update_lexicon!(crps) answer = Dict("1" => 2, "2" => 1, "4" => 1) - @test top_features(crps, 1) == top_features(crps[1], 1) + @test top_terms(crps, 1) == top_terms(crps[1], 1) @test answer == lexicon(crps) end diff --git a/test/document.jl b/test/document.jl index f3955070..4af83db3 100644 --- a/test/document.jl +++ b/test/document.jl @@ -67,10 +67,10 @@ @test "To" in keys(ngrams(ngd)) # Test top features - top = top_features(sd, 5) + top = top_terms(sd, 5) @test collect(keys(top)) == ["be", "To", "not", "or", "to"] @test collect(values(top)) == [2, 1, 1, 1, 1] - @test top_features(sd, 2) == OrderedDict("be" => 2, "To" => 1) + @test top_terms(sd, 2) == OrderedDict("be" => 2, "To" => 1) sd = StringDocument(hamlet_text) td = TokenDocument(hamlet_text) diff --git a/test/dtm.jl b/test/dtm.jl index 45f31527..8292152b 100644 --- a/test/dtm.jl +++ b/test/dtm.jl @@ -110,11 +110,11 @@ @test size(dtm2.dtm) == (2, 4) @test sum(dtm2.dtm, dims=(1,)) == [1 2 2 1] - # Test top_features + # Test top_terms crps3 = Corpus([FileDocument(sample_file)]) update_lexicon!(crps3) m3 = DocumentTermMatrix(crps3) - top5 = top_features(m3, 5) + top5 = top_terms(m3, 5) @test top5 isa OrderedDict @test collect(keys(top5)) == [",", "thou", "And", "and", ";"] @test collect(values(top5)) == [29, 6, 5, 5, 3] From fc76728629f15614a19d5526d93d06d68d9761bc Mon Sep 17 00:00:00 2001 From: James Alster Date: Tue, 17 Mar 2026 13:09:41 +0000 Subject: [PATCH 7/7] Fix dtm method for top_terms --- src/dtm.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/dtm.jl b/src/dtm.jl index 5df36261..b2ab553e 100644 --- a/src/dtm.jl +++ b/src/dtm.jl @@ -451,8 +451,10 @@ Ties are sorted alphabetically. """ function top_terms(D::DocumentTermMatrix, ::Val{N}) where {N} counts = @view(sum(D.dtm; dims=1)[1, :]) - n = min(N, length(counts)) - idx = partialsortperm(counts, 1:n; rev=true) - OrderedDict(zip(D.terms[idx], counts[idx])) + D_pairs = D.terms .=> counts + n = min(N, length(D_pairs)) + # Count decreasing, break ties alphabetically + idx = partialsortperm(D_pairs, 1:n; by = p -> (-p.second, p.first)) + OrderedDict(D_pairs[idx]) end top_terms(D::DocumentTermMatrix, n::Int) = top_terms(D, Val(n)) \ No newline at end of file