From 47c60a60643e34c879600c667dd600f936f870e5 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Thu, 2 Jun 2016 16:51:05 -0700 Subject: [PATCH 01/16] ENH: Adding lines to read_json #9180 --- pandas/io/json.py | 9 ++++++++- pandas/io/tests/json/test_pandas.py | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index fd97e51208f7e..684636de87b18 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -4,6 +4,7 @@ import copy from collections import defaultdict import numpy as np +import StringIO import pandas.json as _json from pandas.tslib import iNaT @@ -105,7 +106,7 @@ def _format_axes(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, - numpy=False, precise_float=False, date_unit=None): + numpy=False, precise_float=False, date_unit=None, lines=False): """ Convert a JSON string to pandas object @@ -178,6 +179,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, is to try and detect the correct precision, but if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, milliseconds, microseconds or nanoseconds respectively. + lines : boolean, default False + Read the file as a json object per line. 
Returns ------- @@ -204,6 +207,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, else: json = filepath_or_buffer + if lines: + lines = list(StringIO.StringIO(json)) + json = '[' + ','.join(lines) + ']' + obj = None if typ == 'frame': obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 9f8aedc2e399e..3e68f10b7cb54 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -948,6 +948,11 @@ def test_tz_range_is_utc(self): df = DataFrame({'DT': dti}) self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) + def test_read_jsonl(self): + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + if __name__ == '__main__': import nose From a8dd0ef1f0f60e38303789a60e96b6d4e96d3306 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Sat, 9 Jul 2016 20:35:19 -0500 Subject: [PATCH 02/16] Updated feature to new version number. --- doc/source/whatsnew/v0.19.0.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f65f7d57d5d08..bfb93c589b881 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -254,6 +254,7 @@ Other enhancements .. _whatsnew_0190.api: + API changes ~~~~~~~~~~~ @@ -271,7 +272,7 @@ API changes - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. 
(:issue:`13299`) - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) - +- The ``pd.read_json`` has gained support for reading json lines with ``lines`` option (:issue:`9180`) .. _whatsnew_0190.api.tolist: From f71d0116a78dd5706bd62d9339b2b51fdd0573c0 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Thu, 2 Jun 2016 18:10:52 -0700 Subject: [PATCH 03/16] Updating comments from #13351 --- pandas/io/json.py | 9 ++++++--- pandas/io/tests/json/test_pandas.py | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 684636de87b18..15ad8180d3706 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -4,11 +4,10 @@ import copy from collections import defaultdict import numpy as np -import StringIO import pandas.json as _json from pandas.tslib import iNaT -from pandas.compat import long, u +from pandas.compat import StringIO, long, u from pandas import compat, isnull from pandas import Series, DataFrame, to_datetime from pandas.io.common import get_filepath_or_buffer @@ -182,6 +181,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, lines : boolean, default False Read the file as a json object per line. + .. versionadded:: 0.18.2 + Returns ------- result : Series or DataFrame @@ -208,7 +209,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, json = filepath_or_buffer if lines: - lines = list(StringIO.StringIO(json)) + # If given a json lines file, we break the string into lines, add + # commas and put it in a json list to make a valid json object. 
+ lines = list(StringIO(json)) json = '[' + ','.join(lines) + ']' obj = None diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 3e68f10b7cb54..88023e062fb6f 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -949,6 +949,7 @@ def test_tz_range_is_utc(self): self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) def test_read_jsonl(self): + # GH9180 result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) From fc865c43dd915ec3ac28ffb69daf97b61f02a7f6 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Tue, 7 Jun 2016 16:05:23 -0500 Subject: [PATCH 04/16] Add encoding to read_json per #13356. --- pandas/io/json.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 15ad8180d3706..835490ea223b8 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -10,7 +10,7 @@ from pandas.compat import StringIO, long, u from pandas import compat, isnull from pandas import Series, DataFrame, to_datetime -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer, _get_handle from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing @@ -105,7 +105,8 @@ def _format_axes(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, - numpy=False, precise_float=False, date_unit=None, lines=False): + numpy=False, precise_float=False, date_unit=None, encoding=None, + lines=False): """ Convert a JSON string to pandas object @@ -183,12 +184,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.18.2 + encoding : the encoding to use to decode py3 bytes, default is 'utf-8' + + .. 
versionadded:: 0.18.2 + Returns ------- result : Series or DataFrame """ - filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf) + filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) @@ -199,7 +204,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, exists = False if exists: - with open(filepath_or_buffer, 'r') as fh: + with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh: json = fh.read() else: json = filepath_or_buffer @@ -212,7 +217,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. lines = list(StringIO(json)) - json = '[' + ','.join(lines) + ']' + json = u'[' + u','.join(lines) + u']' obj = None if typ == 'frame': From e8f10ea6439152b297bec5e54376612d08ae6d50 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Wed, 8 Jun 2016 09:33:38 -0500 Subject: [PATCH 05/16] God bless ci/lint.py --- pandas/io/json.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 835490ea223b8..02ec1778fbd67 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -193,7 +193,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, result : Series or DataFrame """ - filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) + filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, + encoding=encoding) if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) From 3c796a98ac31721166e4c5390450b6a3e8fb1771 Mon Sep 17 00:00:00 2001 From: "Andy R. 
Terrel" Date: Sat, 9 Jul 2016 20:39:10 -0500 Subject: [PATCH 06/16] fix docstring for read_json --- pandas/io/json.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 02ec1778fbd67..12cda828186d8 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -184,7 +184,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.18.2 - encoding : the encoding to use to decode py3 bytes, default is 'utf-8' + encoding : str, default is 'utf-8' + The encoding to use to decode py3 bytes. .. versionadded:: 0.18.2 From 6861a714e3076e836c7575f3cf9adea7e7376505 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Sat, 9 Jul 2016 21:35:29 -0500 Subject: [PATCH 07/16] Adding lines to to_json --- pandas/core/generic.py | 12 ++++++++++-- pandas/io/json.py | 25 ++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c1676fbdd7f4..7ef224775dbe6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1016,7 +1016,7 @@ def __setstate__(self, state): def to_json(self, path_or_buf=None, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None): + default_handler=None, lines=False): """ Convert the object to a JSON string. @@ -1064,6 +1064,13 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', Handler to call if object cannot otherwise be converted to a suitable format for JSON. Should receive a single argument which is the object to convert and return a serialisable object. + lines : boolean, defalut False + If 'orient' is 'records' write out json to jsaon lines format. Will + throw ValueError if incorrect 'orient' since others are not list + like. + + .. 
versionadded:: 0.19.0 + Returns ------- @@ -1076,7 +1083,8 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', date_format=date_format, double_precision=double_precision, force_ascii=force_ascii, date_unit=date_unit, - default_handler=default_handler) + default_handler=default_handler, + lines=lines) def to_hdf(self, path_or_buf, key, **kwargs): """Activate the HDFStore. diff --git a/pandas/io/json.py b/pandas/io/json.py index 12cda828186d8..ef4e1a3ad05c2 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -22,7 +22,10 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None): + default_handler=None, lines=False): + + if lines and orient != 'records': + raise ValueError("'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( @@ -37,6 +40,22 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', else: raise NotImplementedError("'obj' should be a Series or a DataFrame") + if lines and s[0] == '[' and s[-1] == ']': # Determine we have a JSON + s = s[1:-1] # list to turn to lines + num_open_brackets_seen = 0; + commas_to_replace = [] + for idx, char in enumerate(s): # iter through to find all + if char == ',': # commas that should be \n + if num_open_brackets_seen == 0: + commas_to_replace.append(idx) + elif char == '{': + num_open_brackets_seen += 1 + elif char == '}': + num_open_brackets_seen -= 1 + s_arr = np.array(list(s)) # Turn to an array to set + s_arr[commas_to_replace] = '\n' # all commas at once. + s = ''.join(s_arr) + if isinstance(path_or_buf, compat.string_types): with open(path_or_buf, 'w') as fh: fh.write(s) @@ -182,12 +201,12 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, lines : boolean, default False Read the file as a json object per line. - .. versionadded:: 0.18.2 + .. 
versionadded:: 0.19.0 encoding : str, default is 'utf-8' The encoding to use to decode py3 bytes. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Returns ------- From c76dafebcafaed4ff1737aed2212632e8b54c3ee Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Sat, 9 Jul 2016 22:23:06 -0500 Subject: [PATCH 08/16] Update json lines docs --- doc/source/io.rst | 20 ++++++++++++++++++++ doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/core/generic.py | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index da0444a8b8df9..301b24f90f728 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1466,6 +1466,7 @@ with optional parameters: - ``force_ascii`` : force encoded string to be ASCII, default True. - ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. - ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object. +- ``lines`` : If ``records`` orient, then will write each record per line as json. Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters. @@ -1656,6 +1657,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` None. By default the timestamp precision will be detected, if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to seconds, milliseconds, microseconds or nanoseconds respectively. +- ``lines`` : reads file as one json object per line. +- ``encoding`` : The encoding to use to decode py3 bytes. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. 
@@ -1845,6 +1848,23 @@ into a flat table. json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) +Line delimited json +''''''''''''''''''' + +.. versionadded:: 0.19.0 + +pandas is able to read and write line-delimited jsaon files that are common in data preocessing pipelines +using Hadoop or Spark. + +.. ipython:: python + + from pandas.io.json import read_json + jsonl = '''{"a":1,"b":2} + {"a":3,"b":4}''' + df = read_json(jsonl, lines=True) + df + df.to_json(orient='records', lines=True) + HTML ---- diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index bfb93c589b881..fbd1ffcb1f585 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -272,7 +272,7 @@ API changes - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) -- The ``pd.read_json`` has gained support for reading json lines with ``lines`` option (:issue:`9180`) +- The ``pd.read_json`` and ``pd.to_json`` has gained support for reading and writing json lines with ``lines`` option (:issue:`9180`) .. _whatsnew_0190.api.tolist: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7ef224775dbe6..cf5e99bd52993 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1065,7 +1065,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', suitable format for JSON. Should receive a single argument which is the object to convert and return a serialisable object. lines : boolean, defalut False - If 'orient' is 'records' write out json to jsaon lines format. Will + If 'orient' is 'records' write out line delimited json format. 
Will throw ValueError if incorrect 'orient' since others are not list like. From b20798aa72f363d4ee2b2a52d1a50e418067114c Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Sat, 9 Jul 2016 22:29:45 -0500 Subject: [PATCH 09/16] Add to_json with lines test --- pandas/io/tests/json/test_pandas.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 88023e062fb6f..29bc7691587a5 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -954,6 +954,13 @@ def test_read_jsonl(self): expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) + def test_to_jsonl(self): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":1,"b":2}\n{"a":1,"b":2}' + self.assertEqual(result, expected) + if __name__ == '__main__': import nose From f547b0d46092aff8e678fc729f3b6cb8b5ade785 Mon Sep 17 00:00:00 2001 From: "Andy R. 
Terrel" Date: Sat, 9 Jul 2016 22:33:47 -0500 Subject: [PATCH 10/16] Pleasing flake8 --- pandas/io/json.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index ef4e1a3ad05c2..463a54b37b642 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -25,7 +25,8 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', default_handler=None, lines=False): if lines and orient != 'records': - raise ValueError("'lines' keyword only valid when 'orient' is records") + raise ValueError( + "'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( @@ -42,7 +43,7 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', if lines and s[0] == '[' and s[-1] == ']': # Determine we have a JSON s = s[1:-1] # list to turn to lines - num_open_brackets_seen = 0; + num_open_brackets_seen = 0 commas_to_replace = [] for idx, char in enumerate(s): # iter through to find all if char == ',': # commas that should be \n From ae19f0448ca0e5e3a3645a46de9eb3c30a4b1716 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Mon, 11 Jul 2016 07:23:13 -0500 Subject: [PATCH 11/16] Few doc fixes from @jorisvandenbossche --- doc/source/io.rst | 6 +++--- doc/source/whatsnew/v0.19.0.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 301b24f90f728..b5dc27e5532b2 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1853,15 +1853,15 @@ Line delimited json .. versionadded:: 0.19.0 -pandas is able to read and write line-delimited jsaon files that are common in data preocessing pipelines +pandas is able to read and write line-delimited json files that are common in data processing pipelines using Hadoop or Spark. .. 
ipython:: python - from pandas.io.json import read_json + import pandas as pd jsonl = '''{"a":1,"b":2} {"a":3,"b":4}''' - df = read_json(jsonl, lines=True) + df = pd.read_json(jsonl, lines=True) df df.to_json(orient='records', lines=True) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index fbd1ffcb1f585..e45471ffc57af 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -272,7 +272,7 @@ API changes - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) -- The ``pd.read_json`` and ``pd.to_json`` has gained support for reading and writing json lines with ``lines`` option (:issue:`9180`) +- The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option (:issue:`9180`) .. _whatsnew_0190.api.tolist: From ac7b68772999c00b9bffcbd36d955b90ca7ffdbe Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Mon, 18 Jul 2016 22:43:39 -0500 Subject: [PATCH 12/16] Fix issue with whitespace on either side of jsonl content --- doc/source/io.rst | 6 ++++-- pandas/io/json.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index b5dc27e5532b2..6fd46a0031b2a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1859,8 +1859,10 @@ using Hadoop or Spark. .. 
ipython:: python import pandas as pd - jsonl = '''{"a":1,"b":2} - {"a":3,"b":4}''' + jsonl = ''' + {"a":1,"b":2} + {"a":3,"b":4} + ''' df = pd.read_json(jsonl, lines=True) df df.to_json(orient='records', lines=True) diff --git a/pandas/io/json.py b/pandas/io/json.py index 463a54b37b642..5c8847bd97fdb 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -238,7 +238,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if lines: # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. - lines = list(StringIO(json)) + lines = list(StringIO(json.strip())) json = u'[' + u','.join(lines) + u']' obj = None From f7c3bbf250c7a0b1a46c42e6802bda986db6b712 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Mon, 18 Jul 2016 22:52:27 -0500 Subject: [PATCH 13/16] Test json encoding --- pandas/io/tests/json/test_pandas.py | 39 +++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 29bc7691587a5..fedb7e2ac329a 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -961,6 +961,45 @@ def test_to_jsonl(self): expected = '{"a":1,"b":2}\n{"a":1,"b":2}' self.assertEqual(result, expected) + def test_latin_encoding(self): + if compat.PY2: + self.assertRaisesRegexp( + TypeError, '\[unicode\] is not implemented as a table column') + return + + values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'a', b'b', b'c'], + [b'EE, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], + [b'', b'a', b'b', b'c'], + [b'\xf8\xfc', b'a', b'b', b'c'], + [b'A\xf8\xfc', b'', b'a', b'b', b'c'], + [np.nan, b'', b'b', b'c'], + [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] + + def _try_decode(x, encoding='latin-1'): + try: + return x.decode(encoding) + except AttributeError: + return x + + # not sure how to remove latin-1 from code in python 2 
and 3 + values = [[_try_decode(x) for x in y] for y in values] + + examples = [] + for dtype in ['category', object]: + for val in values: + examples.append(pandas.Series(val, dtype=dtype)) + + def roundtrip(s, encoding='latin-1'): + with ensure_clean('test.json') as path: + s.to_json(path, encoding=encoding) + retr = read_json(path, encoding=encoding) + assert_series_equal(s, retr, check_categorical=False) + + for s in examples: + roundtrip(s) + if __name__ == '__main__': import nose From 37252c69ef54777f8b8934e5c4a4999e0484d24d Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Tue, 19 Jul 2016 06:47:10 -0500 Subject: [PATCH 14/16] Fixing some minor doc issues --- doc/source/io.rst | 3 ++- doc/source/whatsnew/v0.19.0.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 6fd46a0031b2a..58a3d03a9b73a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1848,6 +1848,8 @@ into a flat table. json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) +.. _io.jsonl: + Line delimited json ''''''''''''''''''' @@ -1858,7 +1860,6 @@ using Hadoop or Spark. .. ipython:: python - import pandas as pd jsonl = ''' {"a":1,"b":2} {"a":3,"b":4} diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e45471ffc57af..f549d7361ea5f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -272,7 +272,7 @@ API changes - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. 
(:issue:`13299`) - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) -- The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option (:issue:`9180`) +- The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json <io.jsonl>` (:issue:`9180`) .. _whatsnew_0190.api.tolist: From e6353187d2be47c9965bd6dafd2d9e68c31de6d0 Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Tue, 19 Jul 2016 06:47:30 -0500 Subject: [PATCH 15/16] Split line conversion to separate function to make code clearer --- pandas/io/json.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 5c8847bd97fdb..5d937856ae06d 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -17,7 +17,30 @@ loads = _json.loads dumps = _json.dumps + # interface to/from +def _convert_to_line_delimits(s): + """Helper function that converts json lists to line delimited json.""" + + # Determine we have a JSON list to turn to lines otherwise just return the + # json object, only lists can + if not s[0] == '[' and s[-1] == ']': + return s + s = s[1:-1] + num_open_brackets_seen = 0 + commas_to_replace = [] + for idx, char in enumerate(s): # iter through to find all + if char == ',': # commas that should be \n + if num_open_brackets_seen == 0: + commas_to_replace.append(idx) + elif char == '{': + num_open_brackets_seen += 1 + elif char == '}': + num_open_brackets_seen -= 1 + s_arr = np.array(list(s)) # Turn to an array to set + s_arr[commas_to_replace] = '\n' # all commas at once. 
+ s = ''.join(s_arr) + return s def to_json(path_or_buf, obj, orient=None, date_format='epoch', @@ -41,21 +64,8 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', else: raise NotImplementedError("'obj' should be a Series or a DataFrame") - if lines and s[0] == '[' and s[-1] == ']': # Determine we have a JSON - s = s[1:-1] # list to turn to lines - num_open_brackets_seen = 0 - commas_to_replace = [] - for idx, char in enumerate(s): # iter through to find all - if char == ',': # commas that should be \n - if num_open_brackets_seen == 0: - commas_to_replace.append(idx) - elif char == '{': - num_open_brackets_seen += 1 - elif char == '}': - num_open_brackets_seen -= 1 - s_arr = np.array(list(s)) # Turn to an array to set - s_arr[commas_to_replace] = '\n' # all commas at once. - s = ''.join(s_arr) + if lines: + s = _convert_to_line_delimits(s) if isinstance(path_or_buf, compat.string_types): with open(path_or_buf, 'w') as fh: From 32a2f8d6fa0f245d29ffe08d723e6a78481e2a3f Mon Sep 17 00:00:00 2001 From: "Andy R. Terrel" Date: Wed, 20 Jul 2016 13:27:33 -0500 Subject: [PATCH 16/16] Fix test failure with pandas namespace --- pandas/io/tests/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index fedb7e2ac329a..6516ced7b5fb7 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -989,7 +989,7 @@ def _try_decode(x, encoding='latin-1'): examples = [] for dtype in ['category', object]: for val in values: - examples.append(pandas.Series(val, dtype=dtype)) + examples.append(Series(val, dtype=dtype)) def roundtrip(s, encoding='latin-1'): with ensure_clean('test.json') as path: