Add dbnomics to dseries conversion routine, closes Dynare/dseries#44

198023a2 · Dóra Kocsis · 5ae52153 · 198023a2 · 198023a2 · 198023a2
Commit 198023a2 authored 4 years ago by Dóra Kocsis
--- a/src/initialize_dseries_class.m
+++ b/src/initialize_dseries_class.m
@@ -23,7 +23,8 @@ dseries_src_path_s = strsplit(dseries_src_root, filesep());
 isstandalone = ~isequal(dseries_src_path_s(end-3:end), {'matlab', 'modules', 'dseries', 'src'}) & isempty(which('dynare'));

 % Set the subfolders to be added in the path.
-p = {'read'; ...
+p = {'mdbnomics2dseries'; ...
+     'read'; ...
     'utilities/is'; ...
     'utilities/op'; ...
     'utilities/convert'; ...
@@ -31,6 +32,7 @@ p = {'read'; ...
     'utilities/insert'; ...
     'utilities/file'; ...
     'utilities/from'; ...
+     'utilities/get'; ...
     'utilities/print'; ...
     'utilities/variables'; ...
     'utilities/cumulate'; ...

--- a/src/mdbnomics2dseries/mdbnomics2dseries.m
+++ b/src/mdbnomics2dseries/mdbnomics2dseries.m
+function ds = mdbnomics2dseries(varargin)   % --*-- Unitary tests --*--
+
+% Given a cell array from the mdbnomics library, it returns a dseries object.
+%
+% INPUTS
+%
+% - Exactly one argument must be provided:
+%  + varargin{1}  [cell]        A T*N array of data. The first row holds the column
+%                               names, the remaining rows hold the observations. The
+%                               columns 'x_frequency', 'dataset_code', 'series_code',
+%                               'original_period', 'period' and 'value' are required.
+%
+% OUTPUTS
+% - ds [dseries]
+%
+% REMARKS
+% - An error is thrown if the input is missing, is not a cell array, is empty, has no
+%   observation row, lacks a required column, or mixes several frequencies.
+
+% Copyright (C) 2020 Dynare Team
+%
+% This code is free software: you can redistribute it and/or modify
+% it under the terms of the GNU General Public License as published by
+% the Free Software Foundation, either version 3 of the License, or
+% (at your option) any later version.
+%
+% Dynare dseries submodule is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+% GNU General Public License for more details.
+%
+% You should have received a copy of the GNU General Public License
+% along with Dynare.  If not, see <http://www.gnu.org/licenses/>.
+
+switch nargin
+    case 0
+        % An input is mandatory.
+        error('mdbnomics2dseries:WrongInputArguments', 'Input must be non empty!');
+    case 1
+        raw = varargin{1};
+        % Without this check a non cell input would leave ds unassigned.
+        if ~iscell(raw)
+            error('mdbnomics2dseries:WrongInputArguments', 'Input must be a cell array!');
+        end
+        if isempty(raw)
+            error('mdbnomics2dseries:WrongInputArguments', 'Input must be non empty!');
+        end
+        % The first row is the header, at least one row of observations must follow.
+        if size(raw, 1) < 2
+            error('mdbnomics2dseries:WrongInputArguments', 'Input must contain a header row and at least one row of data!');
+        end
+        o.data = raw(2:end,:);
+        o.cols = raw(1,:);
+        % Locate the columns used by the conversion routine.
+        col_idx = {'x_frequency', 'dataset_code', 'series_code', 'original_period', 'period', 'value'};
+        for ii = 1:size(col_idx,2)
+            position = find(strcmp(col_idx{ii}, o.cols));
+            if isempty(position)
+                error('mdbnomics2dseries:WrongInputArguments', 'Column ''%s'' is missing from the input!', col_idx{ii});
+            end
+            o.col_idx.(col_idx{ii}) = position;
+        end
+        % Check if database has multiple frequencies
+        if size(unique(o.data(:, o.col_idx.x_frequency)),1) > 1
+            error('mdbnomics2dseries:DatabaseCheck: The database, that you are trying to convert, contains multiple frequencies. Currently, this type of dseries conversion is not supported. Please select a section of your database with uniform frequency.');
+        end
+        ds = convert_mdbnomics(o);
+    otherwise
+        error('mdbnomics2dseries:WrongInputArguments', 'Too many input arguments! Please check the manual.')
+end
+end
--- a/src/utilities/convert/convert_mdbnomics.m
+++ b/src/utilities/convert/convert_mdbnomics.m
+function ds = convert_mdbnomics(o)
+
+% Builds a dseries object from a pre-processed mdbnomics table.
+%
+% INPUTS
+% - o         [struct]         Struct with fields: data, cols, col_idx
+%                               + data     [cell]    observations, one row per observation,
+%                               + cols     [cell]    column names,
+%                               + col_idx  [struct]  positions in cols of 'x_frequency', 'dataset_code',
+%                                                    'series_code', 'original_period', 'period' and 'value'.
+%
+% OUTPUTS
+% - ds        [dseries]
+%
+% REMARKS
+% - Series are located with cumulated series lengths, so the rows of a given series
+%   (and of a given dataset) must be contiguous in o.data.
+% - All the columns of o.data, except columns 7 to 10, are attached to the variables as tags.
+
+% Copyright (C) 2020 Dynare Team
+%
+% This code is free software: you can redistribute it and/or modify
+% it under the terms of the GNU General Public License as published by
+% the Free Software Foundation, either version 3 of the License, or
+% (at your option) any later version.
+%
+% Dynare dates submodule is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+% GNU General Public License for more details.
+%
+% You should have received a copy of the GNU General Public License
+% along with Dynare.  If not, see <http://www.gnu.org/licenses/>.
+
+if ~isstruct(o)
+    error('mdbnomics2dseries::convert_mdbnomics: The input argument must be a struct!');
+end
+
+% Without observations the tagging step below would hit undefined variables.
+if isempty(o.data)
+    error('mdbnomics2dseries::convert_mdbnomics: The input data is empty!');
+end
+
+% Initialize dseries
+ds = dseries();
+
+% Check for multiple datasets
+dataset_codes = unique(o.data(:,o.col_idx.dataset_code),'stable');
+
+% Convert mdbnomics to dseries
+for ii = 1:length(dataset_codes)
+    % Slice data for dataset
+    ds_dataset = o.data(strcmp(o.data(:,o.col_idx.dataset_code),dataset_codes{ii}),:);
+    series_codes = unique(ds_dataset(:,o.col_idx.series_code),'stable');
+    % Get list of variable names (any character that is not alphanumeric becomes an underscore)
+    list_of_names = cellfun(@(x)regexprep(x, '[^a-zA-Z0-9]', '_'), series_codes, 'UniformOutput', false);
+    % Get dataset values
+    ds_dataset_values = cell2mat(ds_dataset(:,o.col_idx.value));
+    % Get length of series (number of rows carrying each series code)
+    series_length = cellfun(@(x)sum(strcmp(x, ds_dataset(:,o.col_idx.series_code))), series_codes);
+    % Get starting value indices (relative to the rows of ds_dataset)
+    dataset_start_val = cumsum([1, series_length(1:end-1)']);
+
+    % Check if dataset starting date is uniform. The indices are relative to the
+    % current dataset, so the dates must be read from ds_dataset and not from o.data.
+    starting_dates = ds_dataset(dataset_start_val, o.col_idx.original_period);
+    freq = ds_dataset{1,o.col_idx.x_frequency};
+    if length(unique(starting_dates)) > 1
+        % Build dseries object by series
+        dataset = dseries();
+        for s = 1:length(starting_dates)
+            % Get dseries date format from dataset
+            dseries_date = get_series_start_date(freq, starting_dates{s});
+            % Transform dataset into dseries
+            data_series = ds_dataset_values(dataset_start_val(s):dataset_start_val(s)+series_length(s)-1);
+            series = dseries(data_series, dseries_date, list_of_names{s});
+            dataset = [dataset series];
+        end
+    else
+        % Pad values with NaN when series length in the same dataset is unequal
+        if size(series_length, 1) > 1 && length(unique(series_length)) > 1
+            val_ = mat2cell(ds_dataset_values, series_length');
+            ds_dataset_values = cell2mat(cellfun(@(x)cat(1, x, nan(max(series_length)-length(x),1)), val_, 'UniformOutput', false));
+        end
+
+        % Reshape dseries input data (one column per series)
+        data_dataset = reshape(ds_dataset_values, max(series_length), size(series_codes, 1));
+
+        % Get dseries date format from dataset (original period of the earliest observation)
+        starting_date = min(datetime(ds_dataset(:,o.col_idx.period), 'InputFormat', 'yyyy-MM-dd', 'Format', 'yyyy-MM-dd'));
+        original_period = ds_dataset(strcmp(ds_dataset(:,o.col_idx.period),string(starting_date)),o.col_idx.original_period);
+        dseries_date = get_series_start_date(freq, original_period{1});
+        % Transform dataset into dseries
+        dataset = dseries(data_dataset, dseries_date, list_of_names);
+    end
+    % Append initial dseries object
+    ds = [ds dataset];
+end
+
+% Add tags to the variables. With a single dataset the names and lengths computed in
+% the loop above are reused, otherwise they are recomputed over the whole table.
+if length(dataset_codes) > 1
+    series_codes = unique(o.data(:,o.col_idx.series_code),'stable');
+    list_of_names = cellfun(@(x)regexprep(x, '[^a-zA-Z0-9]', '_'), series_codes, 'UniformOutput', false);
+    series_length = cellfun(@(x)sum(strcmp(x, o.data(:,o.col_idx.series_code))), series_codes);
+end
+
+% Select relevant column indices (ignore columns: 'original_period', 'period', 'original_value', 'value')
+% NOTE(review): the ignored columns are assumed to sit in positions 7 to 10 -- confirm against the mdbnomics output.
+col_idx = [1:6,11:size(o.data,2)];
+tag_names = cellfun(@(x)regexprep(x, '[^a-zA-Z0-9]','_'), o.cols(col_idx), 'UniformOutput', false);
+% First row of each series in o.data, the tag values are read from that row.
+data_start_val = cumsum([1, series_length(1:end-1)']);
+tag_data = o.data(data_start_val,col_idx);
+
+for ii = 1:length(tag_names)
+    tag(ds, tag_names{ii});
+    for jj = 1:length(list_of_names)
+        tag(ds, tag_names{ii}, list_of_names{jj}, tag_data{jj,ii});
+    end
+end
+end
--- a/src/utilities/get/get_series_start_date.m
+++ b/src/utilities/get/get_series_start_date.m
+function series_start_date = get_series_start_date(frequency, original_period)   % --*-- Unitary tests --*--
+% Given the frequency and the original period of a series obtained from the mdbnomics
+% library, it returns the corresponding date as a string in dseries format.
+%
+% INPUTS
+% - frequency         [string]         Dataset frequency: monthly, quarterly, bi-annual, annual
+% - original_period   [string]         Series original period (e.g. '1997-01', '1938-Q4', '1997-S2', '1997')
+%
+% OUTPUTS
+% - series_start_date [string]         Date in dseries format (e.g. '1997M1', '1938Q4', '1997H2', '1997Y')
+%
+% REMARKS
+% - An error is thrown if the frequency is not supported, or if a sub-annual frequency
+%   is given with an original period that has no sub-period.
+
+% Copyright (C) 2020 Dynare Team
+%
+% This code is free software: you can redistribute it and/or modify
+% it under the terms of the GNU General Public License as published by
+% the Free Software Foundation, either version 3 of the License, or
+% (at your option) any later version.
+%
+% Dynare dates submodule is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+% GNU General Public License for more details.
+%
+% You should have received a copy of the GNU General Public License
+% along with Dynare.  If not, see <http://www.gnu.org/licenses/>.
+
+% Extract the groups of digits: the first one is the year, the second one (if any) the sub-period.
+date_ext = regexp(original_period, '\d*', 'Match');
+
+% Sub-annual frequencies read date_ext{2}, fail with a clear message if it is missing.
+if any(strcmp(frequency, {'monthly', 'quarterly', 'bi-annual', 'bi-monthly'})) && numel(date_ext) < 2
+    error('mdbnomics2dseries::get_series_start_date: The original period does not contain a year and a sub-period!');
+end
+
+switch frequency
+    case 'monthly'
+        % Leading zeros are removed from the month ('01' -> '1').
+        series_start_date = [date_ext{1} 'M' regexprep(date_ext{2},'\<0*','')];
+    case 'quarterly'
+        series_start_date = [date_ext{1} 'Q' date_ext{2}];
+    case {'bi-annual', 'bi-monthly'}
+        % NOTE(review): 'bi-monthly' is mapped to the half-yearly code 'H' -- confirm this is intended.
+        series_start_date = [date_ext{1} 'H' date_ext{2}];
+    case 'annual'
+        series_start_date = [original_period 'Y'];
+    otherwise
+        error('mdbnomics2dseries::get_series_start_date: The frequency of the dataset is currently unsupported!');
+end
+end
+
+%@test:1
+%$ try
+%$     str = get_series_start_date('monthly','1997-01');
+%$     t(1) = true;
+%$ catch
+%$     t(1) = false;
+%$ end
+%$
+%$ if t(1)
+%$     t(2) = dassert(str, '1997M1');
+%$ end
+%$
+%$ T = all(t);
+%@eof:1
+
+%@test:2
+%$ try
+%$     str = get_series_start_date('quarterly','1938-Q4');
+%$     t(1) = true;
+%$ catch
+%$     t(1) = false;
+%$ end
+%$
+%$ if t(1)
+%$     t(2) = dassert(str, '1938Q4');
+%$ end
+%$
+%$ T = all(t);
+%@eof:2
+
+%@test:3
+%$ try
+%$     str = get_series_start_date('bi-annual','1997-S2');
+%$     t(1) = true;
+%$ catch
+%$     t(1) = false;
+%$ end
+%$
+%$ if t(1)
+%$     t(2) = dassert(str, '1997H2');
+%$ end
+%$
+%$ T = all(t);
+%@eof:3
+
+%@test:4
+%$ try
+%$     str = get_series_start_date('annual','1997');
+%$     t(1) = true;
+%$ catch
+%$     t(1) = false;
+%$ end
+%$
+%$ if t(1)
+%$     t(2) = dassert(str, '1997Y');
+%$ end
+%$
+%$ T = all(t);
+%@eof:4