diff --git a/src/fetch_series.m b/src/fetch_series.m
new file mode 100644
index 0000000000000000000000000000000000000000..7fce64f5a511a86a7b4f7c91384e22bc5ed5d948
--- /dev/null
+++ b/src/fetch_series.m
@@ -0,0 +1,96 @@
+function df = fetch_series(varargin)
+% Download time series from DBnomics.
+%
+% If given, the 'dimensions' parameter must be a JSON string mapping each dimension code to a
+% list of dimension value codes, like so: '{"freq": ["A", "M"], "country": ["FR"]}'.
+%
+% If given, 'series_code' must be a string. It can be a series code (one series), or a "mask" (many series):
+% - remove a constraint on a dimension, for example `M..PCPIEC_WT`;
+% - enumerate many values for a dimension, separated by a '+', for example `M.FR+DE.PCPIEC_WT`;
+% - combine these possibilities many times in the same SDMX filter.
+%
+% If the rightmost dimension value code is removed, then the final '.' can be removed too: `A.FR.` = `A.FR`.
+%
+% If given, the 'series_ids' parameter must be a non-empty array of series IDs.
+% A series ID is a string formatted like `provider_code/dataset_code/series_code`.
+%
+% If 'max_nb_series' is not given, a default value of 50 series will be used.
+%
+% Returns a cell array.
+%
+% Examples:
+%
+% - fetch one series:
+%   fetch_series('provider_code', "IMF", 'dataset_code', "CPI", 'series_code', "M.FR+DE.PCPIEC_IX+PCPIA_IX")
+%   fetch_series('provider_code', "IMF", 'dataset_code', "CPI", 'series_code', ".FR.PCPIEC_WT")
+%   fetch_series('provider_code', "IMF", 'dataset_code', "CPI", 'series_code', "M..PCPIEC_IX+PCPIA_IX")
+%
+% - fetch all the series of a dataset:
+%   fetch_series('provider_code', "AMECO", 'dataset_code', "UVGD", 'max_nb_series', 500);
+%
+% - fetch many series from different datasets:
+%   fetch_series('series_ids', ["AMECO/ZUTN/EA19.1.0.0.0.ZUTN", "AMECO/ZUTN/DNK.1.0.0.0.ZUTN", "IMF/CPI/A.AT.PCPIT_IX"])
+%
+% - fetch many series from the same dataset, searching by dimension:
+%   fetch_series('provider_code', "AMECO", 'dataset_code', "ZUTN", 'dimensions', '{"geo": ["dnk"]}')
+%
+
+default_api_base_url = 'https://api.db.nomics.world/v22/';
+p = inputParser;
+p.addParameter('provider_code', '', @isstring);
+p.addParameter('dataset_code', '', @isstring);
+p.addParameter('series_code', '', @isstring); % string arrays should be accepted as well
+p.addParameter('dimensions', @ischar); % if no dimensions are specified, provider_code & dataset_code MUST BE GIVEN
+p.addParameter('series_ids', @iscellstr);
+p.addParameter('max_nb_series', NaN, @isnumeric);
+p.addParameter('api_base_url', default_api_base_url, @isstring);
+p.addParameter('dbnomics_filters', '', @ischar);
+p.KeepUnmatched = false;
+p.parse(varargin{:});
+
+% p.Results is read-only, so work on a writable copy of the parsed arguments.
+args = p.Results;
+
+args.api_base_url = char(args.api_base_url);
+if ~endsWith(args.api_base_url, '/')
+    args.api_base_url = [args.api_base_url '/'];
+end
+
+if isempty(args.dataset_code)
+    if iscell(args.provider_code)
+        args.series_ids = args.provider_code;
+        args.provider_code = '';
+    elseif isstring(args.provider_code)
+        args.series_ids = {args.provider_code};
+        args.provider_code = '';
+    end
+end
+
+series_base_url = [args.api_base_url 'series'];
+
+if ~isstruct(args.dimensions) && ~isstring(args.series_code) && ~isstring(args.series_ids)
+    if ~isstring(args.provider_code) || ~isstring(args.dataset_code)
+        error('When you don''t use dimensions, you must specify provider_code and dataset_code.');
+    end
+    api_link = sprintf('%s/%s/%s?observations=1', series_base_url, args.provider_code, args.dataset_code);
+end
+
+if ischar(args.dimensions)
+    if ~isstring(args.provider_code) || ~isstring(args.dataset_code)
+        error('When you use dimensions, you must specify provider_code and dataset_code.');
+    end
+    api_link = sprintf('%s/%s/%s?observations=1&dimensions=%s', series_base_url, args.provider_code, args.dataset_code, args.dimensions); % dimensions is expected to be a JSON-encoded string already
+end
+
+if isstring(args.series_code)
+    if ~isstring(args.provider_code) || ~isstring(args.dataset_code)
+        error('When you use series_code, you must specify provider_code and dataset_code.');
+    end
+    api_link = sprintf('%s/%s/%s/%s?observations=1', series_base_url, args.provider_code, args.dataset_code, args.series_code);
+end
+
+if isstring(args.series_ids)
+    if isstring(args.provider_code) || isstring(args.dataset_code)
+        error('When you use series_ids, you must not specify provider_code nor dataset_code.');
+    end
+    api_link = sprintf('%s?observations=1&series_ids=%s', series_base_url, strjoin(args.series_ids, ','));
+end
+
+df = fetch_series_by_api_link(api_link, args.dbnomics_filters, args.max_nb_series);
+end
diff --git a/src/fetch_series_by_api_link.m b/src/fetch_series_by_api_link.m
new file mode 100644
index 0000000000000000000000000000000000000000..d1da92a84eedf17ae4e9eaaa1f59b11bd3e0f7bf
--- /dev/null
+++ b/src/fetch_series_by_api_link.m
@@ -0,0 +1,96 @@
+function df = fetch_series_by_api_link(api_link, varargin)
+% Fetch series given an "API link" URL.
+% "API link" URLs can be found on the DBnomics web site (https://db.nomics.world/) on dataset or series pages, using the "Download" buttons.
+%
+% Example:
+%   fetch_series_by_api_link("https://api.db.nomics.world/v22/series?provider_code=AMECO&dataset_code=ZUTN")
+
+p = inputParser;
+p.addRequired('api_link');
+p.addOptional('dbnomics_filters', '', @ischar);
+p.addOptional('max_nb_series', NaN, @isnumeric);
+p.KeepUnmatched = false;
+p.parse(api_link, varargin{:});
+
+[datasets_dimensions, series_dims_by_dataset_code, series_list] = iter_series_info(api_link, p.Results.max_nb_series);
+
+if isempty(series_list)
+    df = {};
+else
+    common_columns = ["x_frequency", "provider_code", "dataset_code", "dataset_name", "series_code", "series_name", "original_period", "period", "original_value", "value"];
+    % Compute dimensions_labels_columns_names and dimensions_codes_columns_names
+    dimensions_labels_columns_names = [];
+    dimensions_codes_columns_names = [];
+
+    dataset_codes = fieldnames(datasets_dimensions);
+    for ii = 1:length(dataset_codes)
+        dataset_dimensions = datasets_dimensions.(dataset_codes{ii});
+        dimensions_codes_columns_names = [dimensions_codes_columns_names, string(dataset_dimensions.dimensions_codes_order')];
+        for jj = 1:length(dataset_dimensions.dimensions_codes_order)
+            if isfield(dataset_dimensions, 'dimensions_labels') && isfield(dataset_dimensions, 'dimensions_values_labels')
+                dimensions_labels_columns_names_dataset{jj} = dataset_dimensions.dimensions_labels.(dataset_dimensions.dimensions_codes_order{jj});
+            else
+                if isfield(dataset_dimensions, 'dimensions_values_labels')
+                    dimensions_labels_columns_names_dataset{jj} = [dataset_dimensions.dimensions_codes_order{jj} '_label'];
+                end
+            end
+        end
+        dimensions_labels_columns_names = [dimensions_labels_columns_names, string(dimensions_labels_columns_names_dataset)];
+    end
+    ordered_columns_names = [common_columns, dimensions_codes_columns_names, dimensions_labels_columns_names];
+
+    df = cell(length(series_list{1}.value)*length(series_list)+1, length(ordered_columns_names));
+    for col = 1:length(ordered_columns_names)
+        col_ = ordered_columns_names{col};
+        df{1, col} = col_;
+    end
+    series_length = 0;
+    % Flatten series received from the API (rename some keys of the JSON result to match the cell array organization)
+    for ii = 1:length(series_list)
+        flat_series = flatten_dbnomics_series(series_list{ii});
+        % Add dimensions labels to flat_series
+        complete_dataset_code = [flat_series.provider_code '_' flat_series.dataset_code];
+        dataset_dimensions = datasets_dimensions.(complete_dataset_code);
+        if isfield(dataset_dimensions, 'dimensions_labels')
+            dataset_dimensions_labels = dataset_dimensions.dimensions_labels;
+        else
+            dataset_dimensions_labels = struct();
+            for jj = 1:length(dataset_dimensions.dimensions_codes_order)
+                dataset_dimensions_labels.(dataset_dimensions.dimensions_codes_order{jj}) = [dataset_dimensions.dimensions_codes_order{jj} '_label'];
+            end
+        end
+        % Add dimensions values labels to the current series
+        if isfield(dataset_dimensions, 'dimensions_values_labels')
+            dimension_codes = intersect(dimensions_codes_columns_names, string(fieldnames(dataset_dimensions_labels)'));
+            for jj = 1:length(dimension_codes)
+                series_code = regexprep(flat_series.series_code, '[^a-zA-Z0-9]', '_');
+                dimension_label = dataset_dimensions_labels.(dimension_codes{jj});
+                flat_series.labels{jj, 1} = dimension_label;
+                dimension_value_code = regexprep(series_dims_by_dataset_code.(complete_dataset_code).(series_code).(dimension_codes{jj}), '[^a-zA-Z0-9]', '_');
+                flat_series.labels{jj, 2} = dataset_dimensions.dimensions_values_labels.(dimension_codes{jj}).(dimension_value_code);
+            end
+        end
+        % Create final cell array
+        for col = 1:length(ordered_columns_names)
+            col_ = ordered_columns_names{col};
+            for jj = 1:length(flat_series.value)
+                if strcmp(col_, 'original_value') || strcmp(col_, 'value') || strcmp(col_, 'original_period') || strcmp(col_, 'period')
+                    df{series_length+jj+1, col} = flat_series.(col_){jj};
+                elseif any(strcmp(col_, dimensions_labels_columns_names))
+                    if ~any(strcmp(col_, flat_series.labels))
+                        df{series_length+jj+1, col} = NaN;
+                    else
+                        idx = find(strcmp(flat_series.labels, col_));
+                        df{series_length+jj+1, col} = flat_series.labels{idx, 2};
+                    end
+                elseif any(strcmp(col_, dimensions_codes_columns_names)) && ~any(strcmp(col_, string(fieldnames(flat_series)')))
+                    df{series_length+jj+1, col} = NaN;
+                else
+                    df{series_length+jj+1, col} = flat_series.(col_);
+                end
+            end
+        end
+        series_length = series_length + length(flat_series.value);
+    end
+end
+end
\ No newline at end of file
diff --git a/src/fetch_series_page.m b/src/fetch_series_page.m
new file mode 100644
index 0000000000000000000000000000000000000000..66a9b2779e03b8660db5da7d41d35589529ccc39
--- /dev/null
+++ b/src/fetch_series_page.m
@@ -0,0 +1,22 @@
+function response_json = fetch_series_page(series_endpoint_url, offset)
+% Adapt series_endpoint_url and make the API request.
+
+if contains(series_endpoint_url, '?')
+    series_page_url = sprintf('%s%soffset=%i', series_endpoint_url, '&', offset);
+else
+    series_page_url = sprintf('%s%soffset=%i', series_endpoint_url, '?', offset);
+end
+
+options = weboptions('ContentType', 'json');
+try
+    response_json = webread(series_page_url, options);
+catch ME
+    error('Could not fetch data from URL: %s because: %s', series_page_url, ME.identifier);
+end
+
+series_page = response_json.series;
+if ~isempty(series_page)
+    assert(series_page.offset == offset);
+end
+end
diff --git a/src/flatten_dbnomics_series.m b/src/flatten_dbnomics_series.m
new file mode 100644
index 0000000000000000000000000000000000000000..00c1a3084c97b91696a4981b55a5d423f7d8f224
--- /dev/null
+++ b/src/flatten_dbnomics_series.m
@@ -0,0 +1,29 @@
+function series = flatten_dbnomics_series(series)
+% Adapt DBnomics series attributes to ease cell array construction.
+% Rename some struct fields and remove others
+% (the `series` struct is nested, but we want a flat struct to build a cell array).
+
+series = normalize_period(series);
+series = normalize_value(series);
+% Flatten dimensions.
+if isfield(series, 'dimensions')
+    dimensions = series.dimensions;
+    fields_dim = fieldnames(dimensions);
+    series = rmfield(series, {'dimensions', 'indexed_at'});
+    for ii = 1:length(fields_dim)
+        series.(fields_dim{ii}) = dimensions.(fields_dim{ii});
+    end
+end
+
+% Flatten observation attributes.
+if isfield(series, 'observation_attributes')
+    observation_attributes = series.observation_attributes;
+    fields_obs = fieldnames(observation_attributes);
+    series = rmfield(series, 'observation_attributes');
+    for ii = 1:length(fields_obs)
+        series.(fields_obs{ii}) = observation_attributes.(fields_obs{ii});
+    end
+else
+    series.observation_attributes = [];
+end
+end
\ No newline at end of file
diff --git a/src/iter_series_info.m b/src/iter_series_info.m
new file mode 100644
index 0000000000000000000000000000000000000000..63e23c33ec27a5c47ef8cf8fc7ade6bd8f0fd388
--- /dev/null
+++ b/src/iter_series_info.m
@@ -0,0 +1,85 @@
+function [datasets_dimensions, series_dims_by_dataset_code, series_list] = iter_series_info(api_link, max_nb_series)
+% Iterate through the pages of series.docs returned by the API.
+% Returns structs describing the dataset(s) dimensions and the series themselves:
+% - datasets_dimensions holds one entry per dataset returned by the API and does not change between pages;
+% - series_dims_by_dataset_code stores, per dataset and per series, the dimensions of that series;
+% - series_list collects the series docs of all pages.
+% Example of datasets_dimensions:
+% {
+%     "AMECO/ZUTN": {
+%         "code": "ZUTN",
+%         "converted_at": "2019-05-08T02:51:04Z",
+%         "dimensions_codes_order": ["freq", "unit", "geo", ...],
+%         ...
+%     },
+%     "CEPII/CHELEM-TRADE-GTAP": {
+%         "code": "CHELEM-TRADE-GTAP",
+%         "converted_at": "2019-01-29T15:53:30Z",
+%         "dimensions_codes_order": ["exporter", "importer", "secgroup", ...],
+%         ...
+%     }
+% }
+
+default_max_nb_series = 50;
+total_nb_series = 0;
+
+datasets_dimensions = struct();
+series_dims_by_dataset_code = struct();
+series_list = {};
+
+while (true)
+    response_json = fetch_series_page(api_link, total_nb_series);
+    series_page = response_json.series;
+    num_found = series_page.num_found;
+
+    if isnan(max_nb_series) && num_found > default_max_nb_series
+        error("DBnomics Web API found %i series matching your request, but you did not pass any value for the 'max_nb_series' argument, so a default value of %i was used. Please give a higher value (at least max_nb_series=%i), and try again.", ...
+            num_found, default_max_nb_series, num_found);
+    end
+
+    page_nb_series = length(series_page.docs);
+
+    % Truncate the current page if it would exceed max_nb_series
+    if ~isnan(max_nb_series) && total_nb_series + page_nb_series > max_nb_series
+        page_nb_series = max_nb_series - total_nb_series;
+        series_page.docs = series_page.docs(1:page_nb_series);
+    end
+    total_nb_series = total_nb_series + page_nb_series;
+
+    for ii = 1:length(series_page.docs)
+        try
+            series = series_page.docs{ii};
+        catch
+            series = series_page.docs(ii);
+        end
+        series.provider_code = regexprep(series.provider_code, '[^a-zA-Z0-9]', '_');
+        series.dataset_code = regexprep(series.dataset_code, '[^a-zA-Z0-9]', '_');
+        complete_dataset_code = [series.provider_code '_' series.dataset_code];
+        if numel(fieldnames(datasets_dimensions)) == 0
+            assert(isfield(response_json, 'datasets') || isfield(response_json, 'dataset'));
+            if isfield(response_json, 'datasets')
+                datasets_dimensions = response_json.datasets;
+            else
+                datasets_dimensions.(complete_dataset_code) = response_json.dataset;
+            end
+        end
+
+        % Accumulate the series of every page (do not reset series_list between pages)
+        series_list{end+1} = series; %#ok<AGROW>
+        % Store series dimensions information for future use
+        series_code = regexprep(series.series_code, '[^a-zA-Z0-9]', '_');
+        series_dims_by_dataset_code.(complete_dataset_code).(series_code) = series.dimensions;
+    end
+
+    assert(total_nb_series <= num_found);
+    if total_nb_series == num_found || (~isnan(max_nb_series) && total_nb_series >= max_nb_series)
+        break;
+    end
+end
+end
diff --git a/src/normalize_period.m b/src/normalize_period.m
new file mode 100644
index 0000000000000000000000000000000000000000..86cb1b6dcb037b9b9e3653c043986aea10f73357
--- /dev/null
+++ b/src/normalize_period.m
@@ -0,0 +1,12 @@
+function series = normalize_period(series)
+% Keep the original period and convert period_start_day strings to 'yyyy-mm-dd' date strings.
+% Modifies `series`.
+period = series.period;
+period_start_day = series.period_start_day;
+series = rmfield(series, "period_start_day");
+
+series.original_period = period;
+for ii = 1:length(period_start_day)
+    series.period{ii,1} = datestr(datetime(period_start_day{ii}, 'InputFormat', 'yyyy-MM-dd'), 29);
+end
+end
diff --git a/src/normalize_value.m b/src/normalize_value.m
new file mode 100644
index 0000000000000000000000000000000000000000..2bbefbbbeaca4ac9a1c59c482a2d61ca6a9d8ad1
--- /dev/null
+++ b/src/normalize_value.m
@@ -0,0 +1,17 @@
+function series = normalize_value(series)
+% Keep the original value and convert "NA" to NaN.
+% Modifies `series`.
+if iscell(series.value)
+    series.original_value = series.value;
+    value = series.value;
+    for ii = 1:length(value)
+        if strcmp(value{ii}, 'NA')
+            value{ii} = NaN;
+        end
+    end
+    series.value = value;
+else
+    series.original_value = num2cell(series.value);
+    series.value = num2cell(series.value);
+end
+end
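Usage sketch (not part of the diff): a minimal example of how the two entry points are meant to be called, assuming the src/ files above are on the MATLAB path and the machine can reach api.db.nomics.world. The calls mirror the examples given in the fetch_series help text; variable names are illustrative.

    % One dataset, several series selected by an SDMX mask
    df = fetch_series('provider_code', "IMF", 'dataset_code', "CPI", ...
        'series_code', "M.FR+DE.PCPIEC_IX+PCPIA_IX");

    % Same provider, selected by dimensions; in this port 'dimensions' is a JSON string
    df = fetch_series('provider_code', "AMECO", 'dataset_code', "ZUTN", ...
        'dimensions', '{"geo": ["dnk"]}');

    % Directly from an "API link" URL copied from https://db.nomics.world/
    % (the third argument raises the default limit of 50 series)
    df = fetch_series_by_api_link("https://api.db.nomics.world/v22/series?provider_code=AMECO&dataset_code=ZUTN&observations=1", '', 100);

    % df is a cell array: the first row holds the column names, the following rows hold observations
    disp(df(1, :));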