From 144139317fe7a703be19e5426a7b8c677be636c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=C3=B3ra=20Kocsis?= <dora@dynare.org>
Date: Wed, 25 Mar 2020 17:48:38 +0100
Subject: [PATCH] add subroutines to filter series, closes #2

---
 src/fetch_series_by_api_link.m | 97 ++++++++++++++++++++++------------
 src/filter_series.m            | 17 ++++++
 src/flatten_editor_series.m    | 24 ++++++++
 src/iter_filtered_series.m     | 45 +++++++++++++++
 4 files changed, 149 insertions(+), 34 deletions(-)
 create mode 100644 src/filter_series.m
 create mode 100644 src/flatten_editor_series.m
 create mode 100644 src/iter_filtered_series.m

diff --git a/src/fetch_series_by_api_link.m b/src/fetch_series_by_api_link.m
index 0c94fb1..f473ab2 100644
--- a/src/fetch_series_by_api_link.m
+++ b/src/fetch_series_by_api_link.m
@@ -43,49 +43,68 @@ else
     end
     ordered_columns_names = [common_columns, dimensions_codes_columns_names, dimensions_labels_columns_names];
 
+    if ~isempty(p.Results.dbnomics_filters)
+        filtered_series_list = filter_series(series_list, p.Results.dbnomics_filters,p.Results.editor_api_base_url);
+        % Append common column names with period_middle_day and filtered columns for the final DataFrame
+        idx_period = find(strcmp(ordered_columns_names, "period"));
+        idx_value = find(strcmp(ordered_columns_names, "value"));
+        ordered_columns_names = [ordered_columns_names{1:idx_period}, "period_middle_day", ordered_columns_names{idx_period+1:idx_value}, "filtered", ordered_columns_names{idx_value+1:end}];
+        % Append series_list with the filtered series
+        series_list = [series_list, filtered_series_list];
+    end
+
     df = cell(length(series_list{1}.value)*length(series_list)+1,length(ordered_columns_names));
     for col = 1:length(ordered_columns_names)
         col_ = ordered_columns_names{col};
         df{1,col} = col_;
     end
-    series_length=0;
+
+    series_length=0;
     % Flatten series received from the API (rename some keys of JSON result to match DataFrame organization)
     for ii = 1:length(series_list)
-        flat_series = flatten_dbnomics_series(series_list{ii});
-        % Add dimensions labels to flat_series
-        complete_dataset_code = [flat_series.provider_code '_' flat_series.dataset_code];
-        dataset_dimensions = datasets_dimensions.(complete_dataset_code);
-        if isfield(dataset_dimensions, 'dimensions_labels')
-            dataset_dimensions_labels = dataset_dimensions.dimensions_labels;
-        else
-            dataset_dimensions_labels = struct();
-            for jj = 1:length(dataset_dimensions.dimensions_codes_order)
-                dataset_dimensions_labels.(dataset_dimensions.dimensions_codes_order{jj}) = [dataset_dimensions.dimensions_codes_order{jj} '_label'];
-            end
-        end
-        % Add dimensions values labels to current series
-        if isfield(dataset_dimensions, 'dimensions_values_labels')
-            dimension_codes = intersect(dimensions_codes_columns_names, string(fieldnames(dataset_dimensions_labels)')); %string(fieldnames(dataset_dimensions_labels)');
-            for jj = 1:length(dimension_codes)
-                series_code = regexprep(flat_series.series_code,'[^a-zA-Z0-9]','_');
-                dimension_label = dataset_dimensions_labels.(dimension_codes{jj});
-                flat_series.labels{jj, 1} = dimension_label;
-                dimension_value_code = regexprep(series_dims_by_dataset_code.(complete_dataset_code).(series_code).(dimension_codes{jj}),'[^a-zA-Z0-9]','_');
-                if isstrprop(dimension_value_code(1), 'digit') %MATLAB doesn't allow struct fieldnames to start with a digit
-                    dimension_value_code = strcat('x', dimension_value_code);
+        if ~isfield(series_list{ii}, 'filtered')
+            flat_series = flatten_dbnomics_series(series_list{ii});
+            % Add dimensions labels to flat_series
+            complete_dataset_code = [flat_series.provider_code '_' flat_series.dataset_code];
+            dataset_dimensions = datasets_dimensions.(complete_dataset_code);
+            if isfield(dataset_dimensions, 'dimensions_labels')
+                dataset_dimensions_labels = dataset_dimensions.dimensions_labels;
+            else
+                dataset_dimensions_labels = struct();
+                for jj = 1:length(dataset_dimensions.dimensions_codes_order)
+                    dataset_dimensions_labels.(dataset_dimensions.dimensions_codes_order{jj}) = [dataset_dimensions.dimensions_codes_order{jj} '_label'];
                 end
-                try
-                    flat_series.labels{jj, 2} = dataset_dimensions.dimensions_values_labels.(dimension_codes{jj}).(dimension_value_code);
-                catch
-                    for it = 1:size(dataset_dimensions.dimensions_values_labels.(dimension_codes{jj}), 1)
-                        tmp = regexprep(dataset_dimensions.dimensions_values_labels.(dimension_codes{jj}){it}{1}, '[^a-zA-Z0-9]', '_');
-                        if strcmp(tmp, dimension_value_code)
-                            flat_series.labels{jj, 2} = dataset_dimensions.dimensions_values_labels.(dimension_codes{jj}){it}{2};
+            end
+            % Add dimensions values labels to current series
+            if isfield(dataset_dimensions, 'dimensions_values_labels')
+                dimension_codes = intersect(dimensions_codes_columns_names, string(fieldnames(dataset_dimensions_labels)')); %string(fieldnames(dataset_dimensions_labels)');
+                for jj = 1:length(dimension_codes)
+                    series_code = regexprep(flat_series.series_code,'[^a-zA-Z0-9]','_');
+                    dimension_label = dataset_dimensions_labels.(dimension_codes{jj});
+                    flat_series.labels{jj, 1} = dimension_label;
+                    dimension_value_code = regexprep(series_dims_by_dataset_code.(complete_dataset_code).(series_code).(dimension_codes{jj}),'[^a-zA-Z0-9]','_');
+                    if isstrprop(dimension_value_code(1), 'digit') %MATLAB doesn't allow struct fieldnames to start with a digit
+                        dimension_value_code = strcat('x', dimension_value_code);
+                    end
+                    try
+                        flat_series.labels{jj, 2} = dataset_dimensions.dimensions_values_labels.(dimension_codes{jj}).(dimension_value_code);
+                    catch
+                        for it = 1:size(dataset_dimensions.dimensions_values_labels.(dimension_codes{jj}), 1)
+                            tmp = regexprep(dataset_dimensions.dimensions_values_labels.(dimension_codes{jj}){it}{1}, '[^a-zA-Z0-9]', '_');
+                            if strcmp(tmp, dimension_value_code)
+                                flat_series.labels{jj, 2} = dataset_dimensions.dimensions_values_labels.(dimension_codes{jj}){it}{2};
+                            end
                         end
                     end
                 end
             end
+            if ~isempty(p.Results.dbnomics_filters)
+                flat_series.filtered = false;
+            end
+        else
+            flat_series = series_list{ii};
         end
+
         % Create final cell array
         for col = 1:length(ordered_columns_names)
             col_ = ordered_columns_names{col};
@@ -93,14 +112,24 @@ else
                 if strcmp(col_, 'original_value') || strcmp(col_,'value') || strcmp(col_, 'original_period') || strcmp(col_, 'period')
                     df{series_length+jj+1,col} = flat_series.(col_){jj};
                 elseif any(strcmp(col_,dimensions_labels_columns_names))
-                    if ~any(strcmp(col_,flat_series.labels))
-                        df{series_length+jj+1,col} = NaN;
+                    if isfield(flat_series, 'labels')
+                        if ~any(strcmp(col_,flat_series.labels))
+                            df{series_length+jj+1,col} = NaN;
+                        else
+                            idx = find(strcmp(flat_series.labels, col_));
+                            df{series_length+jj+1,col} = flat_series.labels{idx,2};
+                        end
                     else
-                        idx = find(strcmp(flat_series.labels, col_));
-                        df{series_length+jj+1,col} = flat_series.labels{idx,2};
+                        df{series_length+jj+1,col} = NaN;
                     end
                 elseif any(strcmp(col_, dimensions_codes_columns_names)) && ~any(strcmp(col_,string(fieldnames(flat_series)')))
                     df{series_length+jj+1,col} = NaN;
+                elseif strcmp(col_, 'period_middle_day')
+                    if isfield(flat_series, 'period_middle_day')
+                        df{series_length+jj+1,col} = flat_series.(col_){jj};
+                    else
+                        df{series_length+jj+1,col} = NaN;
+                    end
                 else
                     df{series_length+jj+1,col} = flat_series.(col_);
                 end
diff --git a/src/filter_series.m b/src/filter_series.m
new file mode 100644
index 0000000..5b1fcac
--- /dev/null
+++ b/src/filter_series.m
@@ -0,0 +1,17 @@
+function filtered_series = filter_series(series_list, dbnomics_filters, editor_api_base_url)
+%FILTER_SERIES Apply Time Series Editor filters to a list of series.
+%   filtered_series = FILTER_SERIES(series_list, dbnomics_filters, editor_api_base_url)
+%   appends 'apply' to editor_api_base_url (adding a '/' separator when the
+%   base URL does not already end with one) and returns the result of
+%   iter_filtered_series called with that endpoint URL.
+
+% endsWith accepts char vectors and string scalars and does not fail on an
+% empty URL, unlike indexing the last character with (end).
+if ~endsWith(editor_api_base_url, '/')
+    editor_api_base_url = strcat(editor_api_base_url, '/');
+end
+
+apply_endpoint_url = strcat(editor_api_base_url,'apply');
+filtered_series = iter_filtered_series(series_list, dbnomics_filters, apply_endpoint_url);
+end
+
diff --git a/src/flatten_editor_series.m b/src/flatten_editor_series.m
new file mode 100644
index 0000000..2fa5b29
--- /dev/null
+++ b/src/flatten_editor_series.m
@@ -0,0 +1,24 @@
+function series = flatten_editor_series(series, dbnomics_series)
+% Adapt Time Series Editor series attributes to ease DataFrame construction.
+%
+% series          - series struct taken from the editor response
+% dbnomics_series - series that was posted to the editor; its provider,
+%                   dataset and series identifiers are copied to the output
+series = normalize_period(series);
+series = normalize_value(series);
+
+% Reverse of the x_frequency -> frequency renaming done by iter_filtered_series when posting.
+series.x_frequency = series.frequency;
+series = rmfield(series, 'frequency');
+orig_fields = ["provider_code", "dataset_code", "dataset_name"];
+for ii = 1:length(orig_fields)
+    series.(orig_fields{ii}) = dbnomics_series.(orig_fields{ii});
+end
+
+series.series_code = [dbnomics_series.series_code '_filtered'];
+if isfield(dbnomics_series, 'series_name')
+    series.series_name = [dbnomics_series.series_name, ' (filtered)'];
+end
+% Flag read by fetch_series_by_api_link to skip flatten_dbnomics_series for this series.
+series.filtered = true;
+end
\ No newline at end of file
diff --git a/src/iter_filtered_series.m b/src/iter_filtered_series.m
new file mode 100644
index 0000000..4008dfb
--- /dev/null
+++ b/src/iter_filtered_series.m
@@ -0,0 +1,45 @@
+function filtered_series_list = iter_filtered_series(series_list, dbnomics_filters, apply_endpoint_url)
+%ITER_FILTERED_SERIES Post series to the Time Series Editor and collect the filtered series.
+%   series_list        - cell array of series structs providing the fields
+%                        x_frequency, period_start_day and value
+%   dbnomics_filters   - JSON text inserted verbatim as the "filters" member of the request
+%   apply_endpoint_url - URL the requests are posted to
+%
+%   Returns a 1-by-N cell array; cell k receives the flattened filter result
+%   matching series_list{k}. Raises an error when a request fails.
+editor_apply_endpoint_nb_series_per_post = 100;
+opts = weboptions('ContentType','json', 'MediaType','application/json', 'RequestMethod','POST');
+series_fields = ["x_frequency", "period_start_day", "value"];
+posted_series_fields = ["frequency", "period_start_day", "value"];
+
+nb_series = numel(series_list);
+% Allocated once, before the loop, so that every batch contributes to the output.
+filtered_series_list = cell(1, nb_series);
+
+% Post at most editor_apply_endpoint_nb_series_per_post series per request.
+for first = 1:editor_apply_endpoint_nb_series_per_post:nb_series
+    last = min(first + editor_apply_endpoint_nb_series_per_post - 1, nb_series);
+    batch = series_list(first:last);
+    posted_series_list = cell(1, numel(batch));
+    for ss = 1:numel(batch)
+        posted_series = struct();
+        for ii = 1:length(posted_series_fields)
+            posted_series.(posted_series_fields{ii}) = batch{ss}.(series_fields{ii});
+        end
+        posted_series_list{ss} = posted_series;
+    end
+
+    json_request = sprintf('{"filters":%s,"series":%s}', dbnomics_filters, jsonencode(posted_series_list));
+    try
+        response = webwrite(apply_endpoint_url, json_request, opts);
+    catch err
+        % Keep the original message but report the underlying cause as well.
+        error('Invalid response from Time Series Editor (JSON expected): %s', err.message);
+    end
+
+    for ii = 1:length(response.filter_results)
+        % Results are matched to the posted series by position within the batch.
+        filtered_series_list{first + ii - 1} = flatten_editor_series(response.filter_results(ii).series, batch{ii});
+    end
+end
+end
\ No newline at end of file
-- 
GitLab