From 540f95c3de3cce9ff13a9e03ff769454c15a2006 Mon Sep 17 00:00:00 2001
From: Qianqian Fang <fangqq@gmail.com>
Date: Mon, 14 Feb 2022 22:35:08 -0500
Subject: [PATCH] add optional preceding whitespace, explain format

---
 loadbj.m                |  8 +++-
 loadjson.m              | 95 +++++++++++++++++++++++++++++++----------
 savejson.m              | 10 ++---
 test/run_jsonlab_test.m |  6 +--
 4 files changed, 87 insertions(+), 32 deletions(-)

diff --git a/loadbj.m b/loadbj.m
index a94a515..589b787 100644
--- a/loadbj.m
+++ b/loadbj.m
@@ -12,7 +12,7 @@ function [data, mmap] = loadbj(fname,varargin)
 % including uint16(u), uint32(m), uint64(M) and half-precision float (h).
 % Starting from BJD Draft-2 (JSONLab 3.0 beta or later), all integer and
 % floating-point numbers are parsed in Little-Endian as opposed to
-% Big-Endian form as in BJD Draft-1/UBJSON Draft-12 (JSONLab 2.1 or older)
+% Big-Endian form as in BJD Draft-1/UBJSON Draft-12 (JSONLab 2.0 or older)
 %
 % authors:Qianqian Fang (q.fang <at> neu.edu)
 % initially created on 2013/08/01
@@ -71,6 +71,12 @@ function [data, mmap] = loadbj(fname,varargin)
 % output:
 %      dat: a cell array, where {...} blocks are converted into cell arrays,
 %           and [...] are converted to arrays
+%      mmap: (optional) a cell array in the form of
+%           {{jsonpath1,[start,length]}, {jsonpath2,[start,length]}, ...}
+%           where jsonpath_i is a string in the form of JSONPath, and
+%           start is an integer referring to the offset from the beginning
+%           of the stream, and length is the JSON object string length.
+%           For more details, please see the help section of loadjson.m
 %
 % examples:
 %      obj=struct('string','value','array',[1 2 3]);
diff --git a/loadjson.m b/loadjson.m
index 60d34d8..1ee4c12 100644
--- a/loadjson.m
+++ b/loadjson.m
@@ -80,11 +80,60 @@ function [data, mmap] = loadjson(fname,varargin)
 % output:
 %      dat: a cell array, where {...} blocks are converted into cell arrays,
 %           and [...] are converted to arrays
-%      mmap: (optional) a cell array in the form of 
-%           {{jsonpath1,[start,length]}, {jsonpath2,[start,length]}, ...}
-%           where jsonpath_i is a string in the form of JSONPath [1], and
-%           start is an integer referring to the offset from the begining
-%           of the stream, and length is the JSON object string length.
+%      mmap: (optional) a cell array as memory-mapping table in the form of
+%             {{jsonpath1,[start,length,<whitespace>]},
+%              {jsonpath2,[start,length,<whitespace>]}, ...}
+%           where jsonpath_i is a string in the JSONPath [1,2] format, and
+%           "start" is an integer referring to the offset from the beginning
+%           of the stream, and "length" is the JSON object string length.
+%           An optional 3rd integer "whitespace" may appear to record the
+%           preceding whitespace length in case expansion of the data
+%           record is needed when using the mmap.
+%
+%           The memory-mapping table (mmap) is useful for quickly reading or
+%           writing specific data records inside a large JSON file without
+%           needing to load/parse/overwrite the entire file.
+%
+%           The JSONPath keys used in mmap are largely compatible with the
+%           upstream specification defined in [1], with a slight extension
+%           to handle concatenated JSON files.
+%
+%           In the mmap jsonpath key, a '$' denotes the root object, a '.'
+%           denotes a child of the preceding element; '.key' points to the
+%           value segment of the child named "key" of the preceding
+%           object; '.[i]' denotes the (i+1)th member of the preceding
+%           element, which must be an array. For example, a key
+%
+%           $.obj1.obj2.[0].obj3
+%
+%           defines the memory-map of the "value" section in the below
+%           hierarchy:
+%             {
+%                "obj1":{
+%                    "obj2":[
+%                       {"obj3":value},
+%                       ...
+%                    ],
+%                    ...
+%                 }
+%             }
+%           Please note that "value" can be any valid JSON value, including
+%           an array, an object, a string or numerical value.
+%
+%           To handle concatenated JSON objects (including ndjson,
+%           http://ndjson.org/), such as
+%
+%             {"root1": {"obj1": ...}}
+%             ["root2", value1, value2, {"obj2": ...}]
+%             {"root3": ...}
+%
+%           we use '$' or '$0' for the first root-object, and '$1' refers
+%           to the 2nd root object (["root2",...]) and '$2' refers to the
+%           3rd root object, and so on. Please note that this syntax is an
+%           extension of the JSONPath documentation [1,2].
+%
+%           [1] https://goessner.net/articles/JsonPath/
+%           [2] http://jsonpath.herokuapp.com/
 %
 % examples:
 %      dat=loadjson('{"obj":{"string":"value","array":[1,2,3]}}')
@@ -151,7 +200,7 @@ function [data, mmap] = loadjson(fname,varargin)
     opt.parsestringarray=jsonopt('ParseStringArray',0,opt);
     opt.usemap=jsonopt('UseMap',0,opt);
     opt.arraydepth_=1;
-    mmaponly=jsonopt('MmapOnly',0,opt);
+    opt.mmaponly=jsonopt('MmapOnly',0,opt);
 
     if(jsonopt('ShowProgress',0,opt)==1)
         opt.progressbar_=waitbar(0,'loading ...');
@@ -163,7 +212,7 @@ function [data, mmap] = loadjson(fname,varargin)
         maxobjid=inf;
     end
     opt.jsonpath_='$';
-    if(nargout>1 || mmaponly)
+    if(nargout>1 || opt.mmaponly)
         mmap={};
     end
     jsoncount=1;
@@ -171,19 +220,19 @@ function [data, mmap] = loadjson(fname,varargin)
         [cc,pos,w1]=next_char(inputstr, pos);
         switch(cc)
             case '{'
-                if(nargout>1 || mmaponly)
-                    mmap{end+1}={opt.jsonpath_,pos-w1};
+                if(nargout>1 || opt.mmaponly)
+                    mmap{end+1}={opt.jsonpath_,[pos, 0, w1]};
                     [data{jsoncount},pos,index_esc,newmmap] = parse_object(inputstr, pos, esc, index_esc,opt);
-                    mmap{end}{2}=[mmap{end}{2},pos-mmap{end}{2}];
+                    mmap{end}{2}(2)=pos-mmap{end}{2}(1);
                     mmap=[mmap(:);newmmap(:)];
                 else
                     [data{jsoncount},pos,index_esc] = parse_object(inputstr, pos, esc, index_esc,opt);
                 end
             case '['
-                if(nargout>1 || mmaponly)
-                    mmap{end+1}={opt.jsonpath_,pos-w1};
+                if(nargout>1 || opt.mmaponly)
+                    mmap{end+1}={opt.jsonpath_,[pos,0,w1]};
                     [data{jsoncount},pos,index_esc,newmmap] = parse_array(inputstr, pos, esc, index_esc,opt);
-                    mmap{end}{2}=[mmap{end}{2},pos-mmap{end}{2}];
+                    mmap{end}{2}(2)=pos-mmap{end}{2}(1);
                     mmap=[mmap(:);newmmap(:)];
                 else
                     [data{jsoncount},pos,index_esc] = parse_array(inputstr, pos, esc, index_esc,opt);
@@ -206,10 +255,11 @@ function [data, mmap] = loadjson(fname,varargin)
     if(jsoncount==1 && iscell(data))
         data=data{1};
     end
-    if(nargout>1 || mmaponly)
+    if(nargout>1 || opt.mmaponly)
         mmap=mmap';
         mmap=filterjsonmmap(mmap, jsonopt('MMapExclude',{},opt), 0);
         mmap=filterjsonmmap(mmap, jsonopt('MMapInclude',{},opt), 1);
+        mmap=cellfun(@(x) {x{1},x{2}(1:(2+int8(length(x{2})>=3 && (x{2}(3)>0))))}, mmap, 'UniformOutput', false);
     end
     if(jsonopt('JDataDecode',1,varargin{:})==1)
         try
@@ -220,7 +270,7 @@ function [data, mmap] = loadjson(fname,varargin)
                 ME.identifier, ME.message, savejson('',ME.stack));
         end
     end
-    if(mmaponly)
+    if(opt.mmaponly)
         data=mmap;
     end
     if(isfield(opt,'progressbar_'))
@@ -307,13 +357,14 @@ function [object, pos,index_esc, mmap] = parse_array(inputstr, pos, esc, index_e
         catch
         end
         if(isempty(endpos) || pos~=endpos)
+            w2=0;
             while 1
                 varargin{1}.arraydepth_=arraydepth+1;
                 if(nargout>3)
                     varargin{1}.jsonpath_=[origpath '.' sprintf('[%d]',length(object))];
-                    mmap{end+1}={varargin{1}.jsonpath_, pos};
+                    mmap{end+1}={varargin{1}.jsonpath_, [pos, 0, w2]};
                     [val, pos, index_esc, newmmap] = parse_value(inputstr, pos, esc, index_esc,varargin{:});
-                    mmap{end}{2}=[mmap{end}{2}, pos-mmap{end}{2}];
+                    mmap{end}{2}(2)=pos-mmap{end}{2}(1);
                     mmap=[mmap(:);newmmap(:)];
                 else
                     [val, pos,index_esc] = parse_value(inputstr, pos, esc, index_esc,varargin{:});
@@ -323,7 +374,7 @@ function [object, pos,index_esc, mmap] = parse_array(inputstr, pos, esc, index_e
                 if cc == ']'
                     break;
                 end
-                pos=parse_char(inputstr, pos, ',');
+                [pos, w1, w2]=parse_char(inputstr, pos, ',');
             end
         end
     end
@@ -528,14 +579,12 @@ function [object, pos, index_esc, mmap] = parse_object(inputstr, pos, esc, index
             if isempty(str)
                 pos=error_pos('Name of value at position %d cannot be empty',inputstr,pos);
             end
-            pos=parse_char(inputstr, pos, ':');
+            [pos, w1, w2]=parse_char(inputstr, pos, ':');
             if(nargout>3)
                 varargin{1}.jsonpath_=[origpath,'.',str];
-                mmap{end+1}={varargin{1}.jsonpath_,pos};
-            end
-            if(nargout>3)
+                mmap{end+1}={varargin{1}.jsonpath_,[pos,0,w2]};
                 [val, pos,index_esc, newmmap] = parse_value(inputstr, pos, esc, index_esc, varargin{:});
-                mmap{end}{2}=[mmap{end}{2}, pos-mmap{end}{2}];
+                mmap{end}{2}(2)=pos-mmap{end}{2}(1);
                 mmap=[mmap(:);newmmap(:)];
             else
                 [val, pos,index_esc] = parse_value(inputstr, pos, esc, index_esc, varargin{:});
diff --git a/savejson.m b/savejson.m
index 4ab2cc1..2b6a0d5 100644
--- a/savejson.m
+++ b/savejson.m
@@ -70,7 +70,7 @@ function json=savejson(rootname,obj,varargin)
 %                         wrapped inside a function call as 'foo(...);'
 %           UnpackHex [1|0]: conver the 0x[hex code] output by loadjson 
 %                         back to the string form
-%           SaveBinary [0|1]: 1 - save the JSON file in binary mode; 0 - text mode.
+%           SaveBinary [1|0]: 1 - save the JSON file in binary mode; 0 - text mode.
 %           Compact [0|1]: 1- out compact JSON format (remove all newlines and tabs)
 %           Compression  'zlib', 'gzip', 'lzma', 'lzip', 'lz4' or 'lz4hc': specify array 
 %                         compression method; currently only supports 6 methods. The
@@ -254,11 +254,11 @@ if(~isempty(filename))
     if(jsonopt('Append',0,opt))
         mode='a';
     end
-    if(jsonopt('SaveBinary',0,opt)==1)
-        if(~isempty(encoding))
-            fid = fopen(filename, [mode 'b'],endian,encoding);
-        else
+    if(jsonopt('SaveBinary',1,opt)==1)
+        if(isempty(encoding))
             fid = fopen(filename, [mode 'b'],endian);
+        else
+            fid = fopen(filename, [mode 'b'],endian,encoding);
         end
         fwrite(fid,json);
     else
diff --git a/test/run_jsonlab_test.m b/test/run_jsonlab_test.m
index 5907691..11a9a97 100644
--- a/test/run_jsonlab_test.m
+++ b/test/run_jsonlab_test.m
@@ -302,7 +302,7 @@ if(ismember('jmap',tests))
     test_jsonlab('mmap of an object',@savejson,loadjson('{"a":1,"b":[2,3]}','mmaponly',1),...
         '[["$",[1,17]],["$.a",[6,1]],["$.b",[12,5]]]','compact',1);
     test_jsonlab('mmap of object with white-space',@savejson,loadjson('{"a":1 , "b"  :  [2,3]}','mmaponly',1),...
-        '[["$",[1,23]],["$.a",[6,1]],["$.b",[18,5]]]','compact',1);
+        '[["$",[1,23]],["$.a",[6,1]],["$.b",[18,5,2]]]','compact',1);
     test_jsonlab('mmapinclude option',@savejson,loadjson('[[1,2,3],{"a":[4,5]}]','mmaponly',1,'mmapinclude','.a'),...
         '[["$.[1].a",[15,5]]]','compact',1);
     test_jsonlab('mmapexclude option',@savejson,loadjson('[[1,2,3],{"a":[4,5]}]','mmaponly',1,'mmapexclude',{'[0]','[1]','[2]'}),...
@@ -328,6 +328,6 @@ if(ismember('bmap',tests))
         '[["$.[1].a",[15,8]]]','compact',1);
     test_jsonlab('mmapexclude option',@savejson,loadbj(savebj({[1,2,3],struct('a',[4,5])}),'mmaponly',1,'mmapexclude',{'[0]','[1]','[2]'}),...
         '[["$",[1,24]]]','compact',1);
-    test_jsonlab('json with indentation',@savejson,loadbj(savebj({[1,2,3],struct('a',[4,5])}),'mmaponly',1,'mmapinclude','.a'),...
-        '[["$.[1].a",[15,8]]]','compact',1);
+    test_jsonlab('test multiple root objects with N padding',@savejson,loadbj([savebj({[1,2,3],struct('a',[4,5])}) 'NNN' savebj(struct('b',[4,5]))],'mmaponly',1,'mmapinclude','.b'),...
+        '[["$1.b",[32,8]]]','compact',1);
 end
\ No newline at end of file
-- 
GitLab