diff --git a/src/TimeDataFrames.jl b/src/TimeDataFrames.jl index 68160b18a7db23f78d6e635e0a6d85a2df3fbd7e..e20a8e41caca9a4c16ded0f12ba507fd67fbe826 100644 --- a/src/TimeDataFrames.jl +++ b/src/TimeDataFrames.jl @@ -4,7 +4,9 @@ using CSV using DataFrames using ExtendedDates -export TimeDataFrame, innerjoin, outerjoin +import Base: eachcol, eachrow, show + +export TimeDataFrame, periods, firstperiod, lastperiod, dataframe, innerjoin, outerjoin, lag, lead, ncol, nrow mutable struct TimeDataFrame data::DataFrame @@ -29,7 +31,7 @@ end function TimeDataFrame(filename::String, firstperiod::T) where T <: ExtendedDates.SimpleDate data = DataFrame(CSV.File(filename)) continuous = true - periods = range(firstperiod, size(data, 1)) + periods = range(firstperiod, length=size(data, 1), step = typeof(firstperiod - firstperiod)(1)) TimeDataFrame(data, periods, true) end @@ -261,6 +263,7 @@ function Base.copy(bc::Base.Broadcast.Broadcasted{TimeDataFrameStyle}) end end colnames = unique!([_names(df) for df in bcf.args if df isa TimeDataFrame]) + @show colnames if length(colnames) != 1 wrongnames = setdiff(union(colnames...), intersect(colnames...)) if isempty(wrongnames) @@ -297,9 +300,6 @@ Base.ndims(::Type{<:TimeDataFrame}) = 2 index(df::TimeDataFrame) = getfield(getfield(df, :data), :colindex) _columns(df::TimeDataFrame) = getfield(getfield(df, :data), :columns) -# note: these type assertions are required to pass tests -ncol(df::TimeDataFrame) = length(index(df)) -nrow(df::TimeDataFrame) = ncol(df) > 0 ? length(_columns(df)[1])::Int : 0 import Base.size Base.size(df::TimeDataFrame) = (nrow(df), ncol(df)) function Base.size(df::TimeDataFrame, i::Integer) @@ -338,4 +338,93 @@ function outerjoin(d1::TimeDataFrame, d2::TimeDataFrame) return TimeDataFrame(sort!(DataFrames.outerjoin(data1, data2, on=:Column1),1), union(periods1, periods2), true) end +function Base.show(io::IO, tdf::TimeDataFrame) + df = getfield(tdf, :data) + dfcopy = copy(df) + periods = getfield(tdf, :periods) + insertcols!(dfcopy, 1, :Periods => periods) + show(io, dfcopy, show_row_number = false, eltypes = false, summary = false) +end + +function Base.show(tdf::TimeDataFrame) + df = getfield(tdf, :data) + dfcopy = copy(df) + periods = getfield(tdf, :periods) + insertcols!(dfcopy, 1, :Periods => periods) + show(dfcopy, show_row_number = false, eltypes = false, summary = false) +end + +function Base.isequal(tdf1::TimeDataFrame, tdf2::TimeDataFrame) + isequal(getfield(tdf1, :data), getfield(tdf2, :data)) || return false + isequal(getfield(tdf1, :periods), getfield(tdf2, :periods)) || return false + getfield(tdf1, :continuous) == getfield(tdf2, :continuous) || return false + return true +end + +""" + copy(tdf::TimeDataFrame; copycols::Bool=true) + +Copy time data frame `tdf`. +If `copycols=true` (the default), return a new `TimeDataFrame` holding +copies of column vectors in `tdf`. +If `copycols=false`, return a new `TimeDataFrame` sharing column vectors with `tdf`. +""" +function Base.copy(tdf::TimeDataFrame; copycols::Bool=true) + return TimeDataFrame(copy(dataframe(tdf), copycols=copycols), copy(periods(tdf)), continuous(tdf)) +end +############################################################################## +## +## Equality +## +############################################################################## + +function Base.:(==)(tdf1::TimeDataFrame, tdf2::TimeDataFrame) + return isequal(tdf1, tdf2) +end + +#= +function Base.isequal(tdf1::TimeDataFrame, tdf2::TimeDataFrame) + size(tdf1, 2) == size(tdf2, 2) || return false + isequal(index(tdf1), index(tdf2)) || return false + isequal(continuous(tdf1), continuous(tdf2)) || return false + isequal(periods(tdf1), periods(tdf2)) || return false + isequal(dataframe(tdf1), dataframe(tdf2)) || return false + return true +end +=# +""" + isapprox(tdf1::TimeDataFrame, tdf2::TimeDataFrame; + rtol::Real=atol>0 ? 0 : √eps, atol::Real=0, + nans::Bool=false, norm::Function=norm) + +Inexact equality comparison. `tdf1` and `tdf2` must have the same size, column names and periods. +Return `true` if `isapprox` with given keyword arguments +applied to all pairs of columns stored in `df1` and `df2` returns `true`. +""" +function Base.isapprox(tdf1::TimeDataFrame, tdf2::TimeDataFrame; + atol::Real=0, rtol::Real=atol>0 ? 0 : √eps(), + nans::Bool=false, norm::Function=norm) + if size(tdf1) != size(tdf2) + throw(DimensionMismatch("dimensions must match: a has dims " * + "$(size(tdf1)), b has dims $(size(tdf2))")) + end + if !isequal(index(tdf1), index(tdf2)) + throw(ArgumentError("column names of passed time data frames do not match")) + end + if !isequal(periods(tdf1), periods(tdf2)) + throw(ArgumentError("periods of passed time data frames do not match")) + end + return isapprox(dataframe(tdf1), dataframe(tdf2), atol=atol, rtol=rtol, nans=nans, norm=norm) +end + +Base.Matrix(tdf::TimeDataFrame) = Matrix(dataframe(tdf)) +Base.Matrix{T}(tdf::TimeDataFrame) where T = Matrix{T}(dataframe(tdf)) +Base.Array(tdf::TimeDataFrame) = Array(dataframe(tdf)) +Base.Array{T}(tdf::TimeDataFrame) where T = Array{T}(dataframe(tdf)) + +include("accessors.jl") +export continuous, dataframe, firstperiod, lastperiod, periods +include("dataframe_functions.jl") +include("timeseries_functions.jl") +export lag, lead, align! end # module diff --git a/src/accessors.jl b/src/accessors.jl new file mode 100644 index 0000000000000000000000000000000000000000..5005789363772ad28fa345ea0a1587ad44897132 --- /dev/null +++ b/src/accessors.jl @@ -0,0 +1,28 @@ +""" + periods(tdf::TimeDataFrame) +returns the periods of the TimeDataFrame +""" +periods(tdf::TimeDataFrame) = getfield(tdf, :periods) + +""" + firstperiod(tdf::TimeDataFrame) +returns the first period of the TimeDataFrame +""" +firstperiod(tdf::TimeDataFrame) = periods(tdf)[1] + +""" + lasttperiod(tdf::TimeDataFrame) +returns the last period of the TimeDataFrame +""" +lastperiod(tdf::TimeDataFrame) = periods(tdf)[end] + +""" + dataframe(tdf::TimeDataFrame) +returns the DataFrame inside the TimeDataFrame +""" +dataframe(tdf::TimeDataFrame) = getfield(tdf, :data) +""" + continuous(tdf::TimeDataFrame) +returns whether the TimeDataFrame is continuous +""" +continuous(tdf::TimeDataFrame) = getfield(tdf, :continuous) diff --git a/src/dataframe_functions.jl b/src/dataframe_functions.jl new file mode 100644 index 0000000000000000000000000000000000000000..c975b71c8ef686b55609353a2044b878f7e6f9be --- /dev/null +++ b/src/dataframe_functions.jl @@ -0,0 +1,4 @@ +Base.eachcol(tdf::TimeDataFrame) = eachcol(dataframe(tdf)) +Base.eachrow(tdf::TimeDataFrame) = eachrow(dataframe(tdf)) +ncol(tdf::TimeDataFrame) = DataFrames.ncol(dataframe(tdf)) +nrow(tdf::TimeDataFrame) = DataFrames.nrow(dataframe(tdf)) diff --git a/src/timeseries_functions.jl b/src/timeseries_functions.jl new file mode 100644 index 0000000000000000000000000000000000000000..2b280de3a92a45189d61f7cc8dbfa0b93fee5b0e --- /dev/null +++ b/src/timeseries_functions.jl @@ -0,0 +1,87 @@ +function TimeDataFrame(f, tdf::TimeDataFrame, args...) + df = dataframe(tdf) + f1(x) = f(x, args) + return TimeDataFrame(map(f, eachcol(df)), names(df), firstperiod(tdf)) +end + +function lag(x::AbstractVector{T}, k::Int64) where T + n = length(x) + y = Vector{Union{T, Missing}}(undef, n) + view(y, 1:k) .= missing + view(y, k+1:n) .= view(x, 1:n-k) + return y +end + + +lag(tdf::TimeDataFrame, k::Int64) = TimeDataFrame(lag, tdf, k) +lag(x) = lag(x, 1) + +function lead(x::AbstractVector{T}, k::Int64) where T + n = length(x) + y = Vector{Union{T, Missing}}(undef, n) + view(y, 1:n-k) .= view(x, k+1:n) + view(y, n-k+1:n) .= missing + return y +end + +lead(tdf::TimeDataFrame, k::Int64) = TimeDataFrame(lead, tdf, k) +lead(x) = lead(x, 1) + +function addmissingfirst!(tdf, k::DatePeriod) + kv = k.value + tmp = Vector{Union{Float64, Missing}}(undef, size(tdf,1) + kv) + for c in eachcol(tdf) + tmp .= append!(c, repeat([missing], kv)) + c .= circshift(tmp, kv) + end +end + +function addmissingend!(tdf, k::DatePeriod) + kv = k.value + for c in eachcol(tdf) + append!(c, repeat([missing], kv)) + end +end + +function addperiodsfirst!(periods, k::DatePeriod) + kv = k.value + append!(periods, + collect(range(periods[1] - typeof(k)(1), length = kv, step=-typeof(k)(1)))) + tmp = circshift(periods, kv) + periods .= tmp +end + +addperiodsend!(periods, k::DatePeriod) = + append!(periods, + collect(range(periods[end] + typeof(k)(1), length = k.value, step=typeof(k)(1)))) + +function align!(tdf, tdfs...) + periods = TimeDataFrames.periods(tdf) + p1 = periods[1] + p2 = periods[end] + tp = typeof(periods[1]) + for t in tdfs + periods1 = TimeDataFrames.periods(t) + if typeof(periods1[1]) != tp + error("TimeDataFrames must have the same frequency") + end + p1 = min(p1, periods1[1]) + p2 = max(p2, periods1[end]) + end + zeroperiod = periods[1] - periods[1] + for t in push!([tdfs...], tdf) + periods1 = TimeDataFrames.periods(t) + m = periods1[1] - p1 + if m > zeroperiod + allowmissing!(dataframe(t)) + addmissingfirst!(t, m) + addperiodsfirst!(periods1, m) + end + m = p2 - periods1[end] + if m > zeroperiod + allowmissing!(dataframe(t)) + addmissingend!(t, m) + addperiodsend!(periods1, m) + end + end +end