Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port to Julia 0.7/1.0 #28

Merged
merged 4 commits into from
Sep 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
language: julia

os:
- osx
- linux
- osx

julia:
- 0.6
- 0.7
- 1.0
- nightly

notifications:
email: false

script:
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
- julia -e 'Pkg.clone(pwd()); Pkg.build("FreqTables"); Pkg.test("FreqTables"; coverage=true)';
- julia -e 'using Pkg; Pkg.clone(pwd()); Pkg.build("FreqTables"); Pkg.test("FreqTables"; coverage=true)';

after_success:
- julia -e 'cd(Pkg.dir("FreqTables")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
- julia -e 'using Pkg; cd(Pkg.dir("FreqTables")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

[![Build Status](https://travis-ci.org/nalimilan/FreqTables.jl.svg?branch=master)](https://travis-ci.org/nalimilan/FreqTables.jl)
[![Coverage Status](https://coveralls.io/repos/nalimilan/FreqTables.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/nalimilan/FreqTables.jl?branch=master)
[![FreqTables](http://pkg.julialang.org/badges/FreqTables_0.6.svg)](http://pkg.julialang.org/?pkg=FreqTables&ver=0.6)
[![FreqTables](http://pkg.julialang.org/badges/FreqTables_1.0.svg)](http://pkg.julialang.org/?pkg=FreqTables&ver=1.0)

This package allows computing one- or multi-way frequency tables (a.k.a. contingency or pivot tables) from
any type of vector or array. It includes support for [`CategoricalArray`](https://github.com/JuliaData/CategoricalArrays.jl)
Expand Down
4 changes: 2 additions & 2 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
julia 0.6
NamedArrays
julia 0.7
NamedArrays 0.9.1
CategoricalArrays 0.3.0
DataFrames 0.11.0
32 changes: 16 additions & 16 deletions src/freqtable.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import Base.ht_keyindex

# Cf. https://github.com/JuliaStats/StatsBase.jl/issues/135
immutable UnitWeights <: AbstractVector{Int}
end
struct UnitWeights <: AbstractVector{Int} end
Base.getindex(w::UnitWeights, ::Integer...) = 1
Base.getindex(w::UnitWeights, ::AbstractVector) = w

Expand All @@ -15,11 +14,11 @@ Base.@pure vectypes(T) = Tuple{map(U -> Vector{U}, T.parameters)...}
function _freqtable(x::Tuple,
skipmissing::Bool = false,
weights::AbstractVector{<:Real} = UnitWeights(),
subset::Union{Void, AbstractVector{Int}, AbstractVector{Bool}} = nothing)
subset::Union{Nothing, AbstractVector{Int}, AbstractVector{Bool}} = nothing)
n = length(x)
n == 0 && throw(ArgumentError("at least one argument must be provided"))

if !isa(subset, Void)
if !isa(subset, Nothing)
x = map(y -> y[subset], x)
weights = weights[subset]
end
Expand Down Expand Up @@ -50,12 +49,12 @@ function _freqtable(x::Tuple,
end

if skipmissing
filter!((k, v) -> !any(ismissing, k), d)
filter!(p -> !any(ismissing, p[1]), d)
end

keyvec = collect(keys(d))

dimnames = Vector{Vector}(n)
dimnames = Vector{Vector}(undef, n)
for i in 1:n
s = Set{vtypes.parameters[i]}()
for j in 1:length(keyvec)
Expand All @@ -76,7 +75,7 @@ function _freqtable(x::Tuple,
na = NamedArray(a, tuple(dimnames...)::vectypes(vtypes), ntuple(i -> "Dim$i", n))

for (k, v) in d
na[k...] = v
na[Name.(k)...] = v
end

na
Expand All @@ -85,23 +84,25 @@ end
freqtable(x::AbstractVector...;
skipmissing::Bool = false,
weights::AbstractVector{<:Real} = UnitWeights(),
subset::Union{Void, AbstractVector{Int}, AbstractVector{Bool}} = nothing) =
subset::Union{Nothing, AbstractVector{Int}, AbstractVector{Bool}} = nothing) =
_freqtable(x, skipmissing, weights, subset)

# Internal function needed for now so that n is inferred
function _freqtable(x::NTuple{n, AbstractCategoricalVector}, skipmissing::Bool = false,
weights::AbstractVector{<:Real} = UnitWeights(),
subset::Union{Void, AbstractVector{Int}, AbstractVector{Bool}} = nothing) where n
subset::Union{Nothing, AbstractVector{Int}, AbstractVector{Bool}} = nothing) where n
n == 0 && throw(ArgumentError("at least one argument must be provided"))

if !isa(subset, Void)
if !isa(subset, Nothing)
x = map(y -> y[subset], x)
weights = weights[subset]
end

len = map(length, x)
miss = map(v -> eltype(v) >: Missing, x)
lev = map(v -> eltype(v) >: Missing && !skipmissing ? [levels(v); missing] : levels(v), x)
lev = map(x) do v
eltype(v) >: Missing && !skipmissing ? [levels(v); missing] : allowmissing(levels(v))
end
dims = map(length, lev)
# First entry is for missing values (only correct and used if present)
ord = map((v, d) -> Int[d; CategoricalArrays.order(v.pool)], x, dims)
Expand All @@ -121,7 +122,7 @@ function _freqtable(x::NTuple{n, AbstractCategoricalVector}, skipmissing::Bool =
missingpossible = any(miss)

@inbounds for i in 1:len[1]
ref = x[1].refs[i]
ref = x[1].refs[i]
el = ord[1][ref + 1]
anymiss = missingpossible & (ref <= 0)

Expand All @@ -141,7 +142,7 @@ end

freqtable(x::AbstractCategoricalVector...; skipmissing::Bool = false,
weights::AbstractVector{<:Real} = UnitWeights(),
subset::Union{Void, AbstractVector{Int}, AbstractVector{Bool}} = nothing) =
subset::Union{Nothing, AbstractVector{Int}, AbstractVector{Bool}} = nothing) =
_freqtable(x, skipmissing, weights, subset)

function freqtable(d::AbstractDataFrame, x::Symbol...; args...)
Expand Down Expand Up @@ -214,14 +215,13 @@ julia> sum(pt, (1, 2))

```
"""

prop(tbl::AbstractArray{<:Number}) = tbl / sum(tbl)

function prop(tbl::AbstractArray{<:Number,N}, margin::Integer...) where N
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should decide whether `margin` should become a keyword argument and, if so, whether we should keep this name.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point - maybe `dims` as a keyword argument?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, maybe. However, the meaning is a bit different, and indeed the argument isn't passed to dims as-is. Maybe that's not an issue though.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can always fix it later, as it is non-breaking.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. #30

lo, hi = extrema(margin)
(lo < 1 || hi > N) && throw(ArgumentError("margin must be a valid dimension"))
tbl ./ sum(tbl, tuple(setdiff(1:N, margin)...))
tbl ./ sum(tbl, dims=tuple(setdiff(1:N, margin)...)::NTuple{N-length(margin),Int})
end

prop(tbl::NamedArray{<:Number}, margin::Integer...) =
NamedArray(prop(array(tbl), margin...), tbl.dicts, tbl.dimnames)
NamedArray(prop(convert(Array, tbl), margin...), tbl.dicts, tbl.dimnames)
53 changes: 27 additions & 26 deletions test/freqtable.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
using FreqTables
using Base.Test
using Test

x = repeat(["a", "b", "c", "d"], outer=[100]);
# Values not in order to test discrepancy between index and levels with CategoricalArray
Expand All @@ -8,7 +8,7 @@ y = repeat(["D", "C", "A", "B"], inner=[10], outer=[10]);
tab = @inferred freqtable(x)
@test tab == [100, 100, 100, 100]
@test names(tab) == [["a", "b", "c", "d"]]
@test prop(tab) == [0.25, 0.25, 0.25, 0.25]
@test @inferred prop(tab) == [0.25, 0.25, 0.25, 0.25]
tab = @inferred freqtable(y)
@test tab == [100, 100, 100, 100]
@test names(tab) == [["A", "B", "C", "D"]]
Expand Down Expand Up @@ -41,7 +41,7 @@ pt = @inferred prop(tab, 1, 2)
1.0 1.0 1.0 1.0]

tbl = @inferred prop(rand(5, 5, 5, 5), 1, 2)
sumtbl = sum(tbl, (3,4))
sumtbl = sum(tbl, dims=(3,4))
@test all(x -> x ≈ 1.0, sumtbl)

@test_throws MethodError prop()
Expand All @@ -51,9 +51,9 @@ sumtbl = sum(tbl, (3,4))
@test_throws ArgumentError prop([1,2,3], 2)
@test_throws ArgumentError prop([1,2,3], 0)

tab =freqtable(x, y,
subset=1:20,
weights=repeat([1, .5], outer=[10]))
tab = @inferred freqtable(x, y,
subset=1:20,
weights=repeat([1, .5], outer=[10]))
@test tab == [2.0 3.0
1.0 1.5
3.0 2.0
Expand Down Expand Up @@ -85,36 +85,35 @@ tab = @inferred freqtable(cx, cy)
20 30 30 20]
@test names(tab) == [["a", "b", "c", "d"], ["A", "B", "C", "D"]]

tab =freqtable(cx, cy,
subset=1:20,
weights=repeat([1, .5], outer=[10]))
tab = @inferred freqtable(cx, cy,
subset=1:20,
weights=repeat([1, .5], outer=[10]))
@test tab == [0.0 0.0 2.0 3.0
0.0 0.0 1.0 1.5
0.0 0.0 3.0 2.0
0.0 0.0 1.5 1.0]
@test names(tab) == [["a", "b", "c", "d"], ["A", "B", "C", "D"]]


using Missings
const ≅ = isequal
mx = Array{Union{String, Missing}}(x)
my = Array{Union{String, Missing}}(y)
mx[1] = missing
my[[1, 10, 20, 400]] = missing
my[[1, 10, 20, 400]] .= missing

mcx = categorical(mx)
mcy = categorical(my)

tab = freqtable(mx)
tabc = freqtable(mcx)
tab = @inferred freqtable(mx)
tabc = @inferred freqtable(mcx)
@test tab == tabc == [99, 100, 100, 100, 1]
@test names(tab) ≅ names(tabc) ≅ [["a", "b", "c", "d", missing]]
tab = freqtable(my)
tabc = freqtable(mcy)
tab = @inferred freqtable(my)
tabc = @inferred freqtable(mcy)
@test tab == tabc == [100, 99, 99, 98, 4]
@test names(tab) ≅ names(tabc) ≅ [["A", "B", "C", "D", missing]]
tab = freqtable(mx, my)
tabc = freqtable(mcx, mcy)
tab = @inferred freqtable(mx, my)
tabc = @inferred freqtable(mcx, mcy)
@test tab == tabc == [30 20 20 29 0;
30 20 20 29 1;
20 30 30 20 0;
Expand All @@ -124,16 +123,16 @@ tabc = freqtable(mcx, mcy)
["A", "B", "C", "D", missing]]


tab = freqtable(mx, skipmissing=true)
tabc = freqtable(mcx, skipmissing=true)
tab = @inferred freqtable(mx, skipmissing=true)
tabc = @inferred freqtable(mcx, skipmissing=true)
@test tab == tabc == [99, 100, 100, 100]
@test names(tab) ≅ names(tabc) ≅ [["a", "b", "c", "d"]]
tab = freqtable(my, skipmissing=true)
tabc = freqtable(mcy, skipmissing=true)
tab = @inferred freqtable(my, skipmissing=true)
tabc = @inferred freqtable(mcy, skipmissing=true)
@test names(tab) ≅ names(tabc) ≅ [["A", "B", "C", "D"]]
@test tab == tabc == [100, 99, 99, 98]
tab = freqtable(mx, my, skipmissing=true)
tabc = freqtable(mcx, mcy, skipmissing=true)
tab = @inferred freqtable(mx, my, skipmissing=true)
tabc = @inferred freqtable(mcx, mcy, skipmissing=true)
@test tab == tabc == [30 20 20 29;
30 20 20 29;
20 30 30 20;
Expand All @@ -143,7 +142,9 @@ tabc = freqtable(mcx, mcy, skipmissing=true)
using DataFrames, CSV

for docat in [false, true]
iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"), categorical=docat);
iris = CSV.read(joinpath(dirname(pathof(DataFrames)), "../test/data/iris.csv"),
DataFrame,
categorical=docat, allowmissing=:none);
if docat
iris[:LongSepal] = categorical(iris[:SepalLength] .> 5.0)
else
Expand All @@ -167,8 +168,8 @@ for docat in [false, true]
end

# Issue #5
@test freqtable([Set(1), Set(2)]) == [1, 1]
@test freqtable([Set(1), Set(2)], [Set(1), Set(2)]) == eye(2)
@test @inferred freqtable([Set(1), Set(2)]) == [1, 1]
@test @inferred freqtable([Set(1), Set(2)], [Set(1), Set(2)]) == [1 0; 0 1]

@test_throws ArgumentError freqtable()
@test_throws ArgumentError freqtable(DataFrame())