pruned venvs
This commit is contained in:
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1,90 +0,0 @@
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_raises_regex
|
||||
|
||||
|
||||
def test_compression_roundtrip(compression):
    """Round-trip a frame through ``to_json``/``read_json`` for every
    compression codec supplied by the ``compression`` fixture, and verify
    the on-disk payload really was compressed."""
    frame = pd.DataFrame([[0.123456, 0.234567, 0.567567],
                          [12.32112, 123123.2, 321321.2]],
                         index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with tm.ensure_clean() as path:
        frame.to_json(path, compression=compression)
        assert_frame_equal(frame,
                           pd.read_json(path, compression=compression))

        # explicitly ensure file was compressed.
        with tm.decompress_file(path, compression) as fh:
            decompressed = fh.read().decode('utf8')
        assert_frame_equal(frame, pd.read_json(decompressed))
|
||||
|
||||
|
||||
def test_read_zipped_json(datapath):
    """A zipped fixture file must parse identically to its plain twin."""
    plain = datapath("io", "json", "data", "tsframe_v012.json")
    expected = pd.read_json(plain)

    zipped = datapath("io", "json", "data", "tsframe_v012.json.zip")
    result = pd.read_json(zipped, compression='zip')

    assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
def test_with_s3_url(compression):
    """Round-trip a compressed JSON frame through a mocked S3 bucket.

    Skips unless boto3 / s3fs / moto are importable.
    """
    boto3 = pytest.importorskip('boto3')
    pytest.importorskip('s3fs')
    moto = pytest.importorskip('moto')

    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with moto.mock_s3():
        conn = boto3.resource("s3", region_name="us-east-1")
        bucket = conn.create_bucket(Bucket="pandas-test")

        # Write the compressed file locally, then upload the raw bytes.
        with tm.ensure_clean() as path:
            df.to_json(path, compression=compression)
            with open(path, 'rb') as f:
                bucket.put_object(Key='test-1', Body=f)

        roundtripped_df = pd.read_json('s3://pandas-test/test-1',
                                       compression=compression)
        assert_frame_equal(df, roundtripped_df)
|
||||
|
||||
|
||||
def test_lines_with_compression(compression):
    """``lines=True`` output must round-trip under every compression."""
    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True,
                   compression=compression)
        roundtripped_df = pd.read_json(path, lines=True,
                                       compression=compression)
        assert_frame_equal(df, roundtripped_df)
|
||||
|
||||
|
||||
def test_chunksize_with_compression(compression):
    """Chunked reading (``chunksize=1``) must compose back to the frame
    that was written, for every compression codec."""
    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True,
                   compression=compression)

        reader = pd.read_json(path, lines=True, chunksize=1,
                              compression=compression)
        roundtripped_df = pd.concat(reader)
        assert_frame_equal(df, roundtripped_df)
|
||||
|
||||
|
||||
def test_write_unsupported_compression_type():
    """Writing with an unknown compression name raises ValueError."""
    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        assert_raises_regex(ValueError, msg, df.to_json,
                            path, compression="unsupported")
|
||||
|
||||
|
||||
def test_read_unsupported_compression_type():
    """Reading with an unknown compression name raises ValueError."""
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        assert_raises_regex(ValueError, msg, pd.read_json,
                            path, compression="unsupported")
|
||||
-575
@@ -1,575 +0,0 @@
|
||||
"""Tests for Table Schema integration."""
|
||||
import json
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
PeriodDtype, CategoricalDtype, DatetimeTZDtype)
|
||||
from pandas.io.json.table_schema import (
|
||||
as_json_table_type,
|
||||
build_table_schema,
|
||||
convert_pandas_type_to_json_field,
|
||||
convert_json_field_to_pandas_type,
|
||||
set_default_names)
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class TestBuildSchema(object):
    """Tests for ``build_table_schema`` on frames, series and MultiIndexes."""

    def setup_method(self, method):
        # One column per Table Schema scalar kind: integer, string,
        # datetime, duration.
        self.df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             'D': pd.timedelta_range('1H', periods=4, freq='T'),
             },
            index=pd.Index(range(4), name='idx'))

    def test_build_table_schema(self):
        result = build_table_schema(self.df, version=False)
        expected = {
            'fields': [{'name': 'idx', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['idx'],
        }
        assert result == expected
        # With version=True (the default) the schema is stamped.
        result = build_table_schema(self.df)
        assert "pandas_version" in result

    def test_series(self):
        s = pd.Series([1, 2, 3], name='foo')
        result = build_table_schema(s, version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'foo', 'type': 'integer'}],
                    'primaryKey': ['index']}
        assert result == expected
        result = build_table_schema(s)
        assert 'pandas_version' in result

    def test_series_unnamed(self):
        # A nameless Series serializes under the default name 'values'.
        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'values', 'type': 'integer'}],
                    'primaryKey': ['index']}
        assert result == expected

    def test_multiindex(self):
        df = self.df.copy()
        idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
        df.index = idx

        result = build_table_schema(df, version=False)
        expected = {
            'fields': [{'name': 'level_0', 'type': 'string'},
                       {'name': 'level_1', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['level_0', 'level_1'],
        }
        assert result == expected

        # A partially named MultiIndex keeps the given name and
        # falls back to level_N for the unnamed level.
        df.index.names = ['idx0', None]
        expected['fields'][0]['name'] = 'idx0'
        expected['primaryKey'] = ['idx0', 'level_1']
        result = build_table_schema(df, version=False)
        assert result == expected
|
||||
|
||||
|
||||
class TestTableSchemaType(object):
    """Mapping of pandas data and dtypes onto Table Schema type names
    via ``as_json_table_type``."""

    # ----
    # data
    # ----
    @pytest.mark.parametrize('int_type', [
        np.int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_data(self, int_type):
        int_data = [1, 2, 3]
        assert as_json_table_type(np.array(
            int_data, dtype=int_type)) == 'integer'

    @pytest.mark.parametrize('float_type', [
        np.float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_data(self, float_type):
        float_data = [1., 2., 3.]
        assert as_json_table_type(np.array(
            float_data, dtype=float_type)) == 'number'

    @pytest.mark.parametrize('bool_type', [bool, np.bool])
    def test_as_json_table_type_bool_data(self, bool_type):
        bool_data = [True, False]
        assert as_json_table_type(np.array(
            bool_data, dtype=bool_type)) == 'boolean'

    @pytest.mark.parametrize('date_data', [
        pd.to_datetime(['2016']),
        pd.to_datetime(['2016'], utc=True),
        pd.Series(pd.to_datetime(['2016'])),
        pd.Series(pd.to_datetime(['2016'], utc=True)),
        pd.period_range('2016', freq='A', periods=3)
    ])
    def test_as_json_table_type_date_data(self, date_data):
        assert as_json_table_type(date_data) == 'datetime'

    @pytest.mark.parametrize('str_data', [
        pd.Series(['a', 'b']), pd.Index(['a', 'b'])])
    def test_as_json_table_type_string_data(self, str_data):
        assert as_json_table_type(str_data) == 'string'

    @pytest.mark.parametrize('cat_data', [
        pd.Categorical(['a']),
        pd.Categorical([1]),
        pd.Series(pd.Categorical([1])),
        pd.CategoricalIndex([1]),
        pd.Categorical([1])])
    def test_as_json_table_type_categorical_data(self, cat_data):
        assert as_json_table_type(cat_data) == 'any'

    # ------
    # dtypes
    # ------
    @pytest.mark.parametrize('int_dtype', [
        np.int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_dtypes(self, int_dtype):
        assert as_json_table_type(int_dtype) == 'integer'

    @pytest.mark.parametrize('float_dtype', [
        np.float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_dtypes(self, float_dtype):
        assert as_json_table_type(float_dtype) == 'number'

    @pytest.mark.parametrize('bool_dtype', [bool, np.bool])
    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
        assert as_json_table_type(bool_dtype) == 'boolean'

    @pytest.mark.parametrize('date_dtype', [
        np.datetime64, np.dtype("<M8[ns]"), PeriodDtype(),
        DatetimeTZDtype('ns', 'US/Central')])
    def test_as_json_table_type_date_dtypes(self, date_dtype):
        # TODO: datedate.date? datetime.time?
        assert as_json_table_type(date_dtype) == 'datetime'

    @pytest.mark.parametrize('td_dtype', [
        np.timedelta64, np.dtype("<m8[ns]")])
    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
        assert as_json_table_type(td_dtype) == 'duration'

    @pytest.mark.parametrize('str_dtype', [object])  # TODO
    def test_as_json_table_type_string_dtypes(self, str_dtype):
        assert as_json_table_type(str_dtype) == 'string'

    def test_as_json_table_type_categorical_dtypes(self):
        # TODO: I think before is_categorical_dtype(Categorical)
        # returned True, but now it's False. Figure out why or
        # if it matters
        assert as_json_table_type(pd.Categorical(['a'])) == 'any'
        assert as_json_table_type(CategoricalDtype()) == 'any'
|
||||
|
||||
|
||||
class TestTableOrient(object):
    """Serialization with ``orient='table'`` and the field converters."""

    def setup_method(self, method):
        # One column per supported Table Schema field flavour.
        self.df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             'D': pd.timedelta_range('1H', periods=4, freq='T'),
             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                           ordered=True)),
             'G': [1., 2., 3, 4.],
             'H': pd.date_range('2016-01-01', freq='d', periods=4,
                                tz='US/Central'),
             },
            index=pd.Index(range(4), name='idx'))

    def test_build_series(self):
        s = pd.Series([1, 2], name='a')
        s.index.name = 'id'
        result = s.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result['schema']
        result['schema'].pop('pandas_version')

        fields = [{'name': 'id', 'type': 'integer'},
                  {'name': 'a', 'type': 'integer'}]

        schema = {
            'fields': fields,
            'primaryKey': ['id'],
        }

        expected = OrderedDict([
            ('schema', schema),
            ('data', [OrderedDict([('id', 0), ('a', 1)]),
                      OrderedDict([('id', 1), ('a', 2)])])])
        assert result == expected

    def test_to_json(self):
        df = self.df.copy()
        df.index.name = 'idx'
        result = df.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result['schema']
        result['schema'].pop('pandas_version')

        fields = [
            {'name': 'idx', 'type': 'integer'},
            {'name': 'A', 'type': 'integer'},
            {'name': 'B', 'type': 'string'},
            {'name': 'C', 'type': 'datetime'},
            {'name': 'D', 'type': 'duration'},
            {'constraints': {'enum': ['a', 'b', 'c']},
             'name': 'E',
             'ordered': False,
             'type': 'any'},
            {'constraints': {'enum': ['a', 'b', 'c']},
             'name': 'F',
             'ordered': True,
             'type': 'any'},
            {'name': 'G', 'type': 'number'},
            {'name': 'H', 'type': 'datetime', 'tz': 'US/Central'}
        ]

        schema = {
            'fields': fields,
            'primaryKey': ['idx'],
        }
        data = [
            OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
                         ('C', '2016-01-01T00:00:00.000Z'),
                         ('D', 'P0DT1H0M0S'),
                         ('E', 'a'), ('F', 'a'), ('G', 1.),
                         ('H', '2016-01-01T06:00:00.000Z')
                         ]),
            OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
                         ('C', '2016-01-02T00:00:00.000Z'),
                         ('D', 'P0DT1H1M0S'),
                         ('E', 'b'), ('F', 'b'), ('G', 2.),
                         ('H', '2016-01-02T06:00:00.000Z')
                         ]),
            OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
                         ('C', '2016-01-03T00:00:00.000Z'),
                         ('D', 'P0DT1H2M0S'),
                         ('E', 'c'), ('F', 'c'), ('G', 3.),
                         ('H', '2016-01-03T06:00:00.000Z')
                         ]),
            OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
                         ('C', '2016-01-04T00:00:00.000Z'),
                         ('D', 'P0DT1H3M0S'),
                         ('E', 'c'), ('F', 'c'), ('G', 4.),
                         ('H', '2016-01-04T06:00:00.000Z')
                         ]),
        ]
        expected = OrderedDict([('schema', schema), ('data', data)])
        assert result == expected

    def test_to_json_float_index(self):
        data = pd.Series(1, index=[1., 2.])
        result = data.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')

        expected = (
            OrderedDict([('schema', {
                'fields': [{'name': 'index', 'type': 'number'},
                           {'name': 'values', 'type': 'integer'}],
                'primaryKey': ['index']
            }),
                ('data', [OrderedDict([('index', 1.0), ('values', 1)]),
                          OrderedDict([('index', 2.0), ('values', 1)])])])
        )
        assert result == expected

    def test_to_json_period_index(self):
        idx = pd.period_range('2016', freq='Q-JAN', periods=2)
        data = pd.Series(1, idx)
        result = data.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')

        fields = [{'freq': 'Q-JAN', 'name': 'index', 'type': 'datetime'},
                  {'name': 'values', 'type': 'integer'}]

        schema = {'fields': fields, 'primaryKey': ['index']}
        data = [OrderedDict([('index', '2015-11-01T00:00:00.000Z'),
                             ('values', 1)]),
                OrderedDict([('index', '2016-02-01T00:00:00.000Z'),
                             ('values', 1)])]
        expected = OrderedDict([('schema', schema), ('data', data)])
        assert result == expected

    def test_to_json_categorical_index(self):
        data = pd.Series(1, pd.CategoricalIndex(['a', 'b']))
        result = data.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')

        expected = (
            OrderedDict([('schema',
                          {'fields': [{'name': 'index', 'type': 'any',
                                       'constraints': {'enum': ['a', 'b']},
                                       'ordered': False},
                                      {'name': 'values', 'type': 'integer'}],
                           'primaryKey': ['index']}),
                         ('data', [
                             OrderedDict([('index', 'a'),
                                          ('values', 1)]),
                             OrderedDict([('index', 'b'), ('values', 1)])])])
        )
        assert result == expected

    def test_date_format_raises(self):
        # orient='table' mandates ISO dates; epoch must raise.
        with pytest.raises(ValueError):
            self.df.to_json(orient='table', date_format='epoch')

        # others work
        self.df.to_json(orient='table', date_format='iso')
        self.df.to_json(orient='table')

    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_int(self, kind):
        data = [1, 2, 3]
        result = convert_pandas_type_to_json_field(kind(data, name='name'))
        expected = {"name": "name", "type": "integer"}
        assert result == expected

    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_float(self, kind):
        data = [1., 2., 3.]
        result = convert_pandas_type_to_json_field(kind(data, name='name'))
        expected = {"name": "name", "type": "number"}
        assert result == expected

    @pytest.mark.parametrize('dt_args,extra_exp', [
        ({}, {}), ({'utc': True}, {'tz': 'UTC'})])
    @pytest.mark.parametrize('wrapper', [None, pd.Series])
    def test_convert_pandas_type_to_json_field_datetime(self, dt_args,
                                                        extra_exp, wrapper):
        data = [1., 2., 3.]
        data = pd.to_datetime(data, **dt_args)
        if wrapper is pd.Series:
            data = pd.Series(data, name='values')
        result = convert_pandas_type_to_json_field(data)
        expected = {"name": "values", "type": 'datetime'}
        expected.update(extra_exp)
        assert result == expected

    def test_convert_pandas_type_to_json_period_range(self):
        arr = pd.period_range('2016', freq='A-DEC', periods=4)
        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "values", "type": 'datetime', "freq": "A-DEC"}
        assert result == expected

    @pytest.mark.parametrize('kind', [pd.Categorical, pd.CategoricalIndex])
    @pytest.mark.parametrize('ordered', [True, False])
    def test_convert_pandas_type_to_json_field_categorical(self, kind,
                                                           ordered):
        data = ['a', 'b', 'c']
        if kind is pd.Categorical:
            arr = pd.Series(kind(data, ordered=ordered), name='cats')
        elif kind is pd.CategoricalIndex:
            arr = kind(data, ordered=ordered, name='cats')

        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "cats", "type": "any",
                    "constraints": {"enum": data},
                    "ordered": ordered}
        assert result == expected

    @pytest.mark.parametrize("inp,exp", [
        ({'type': 'integer'}, 'int64'),
        ({'type': 'number'}, 'float64'),
        ({'type': 'boolean'}, 'bool'),
        ({'type': 'duration'}, 'timedelta64'),
        ({'type': 'datetime'}, 'datetime64[ns]'),
        ({'type': 'datetime', 'tz': 'US/Hawaii'}, 'datetime64[ns, US/Hawaii]'),
        ({'type': 'any'}, 'object'),
        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
          'ordered': False}, CategoricalDtype(categories=['a', 'b', 'c'],
                                              ordered=False)),
        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
          'ordered': True}, CategoricalDtype(categories=['a', 'b', 'c'],
                                             ordered=True)),
        ({'type': 'string'}, 'object')])
    def test_convert_json_field_to_pandas_type(self, inp, exp):
        field = {'name': 'foo'}
        field.update(inp)
        assert convert_json_field_to_pandas_type(field) == exp

    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
    def test_convert_json_field_to_pandas_type_raises(self, inp):
        field = {'type': inp}
        with tm.assert_raises_regex(ValueError, "Unsupported or invalid field "
                                                "type: {}".format(inp)):
            convert_json_field_to_pandas_type(field)

    def test_categorical(self):
        s = pd.Series(pd.Categorical(['a', 'b', 'a']))
        s.index.name = 'idx'
        result = s.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')

        fields = [{'name': 'idx', 'type': 'integer'},
                  {'constraints': {'enum': ['a', 'b']},
                   'name': 'values',
                   'ordered': False,
                   'type': 'any'}]

        expected = OrderedDict([
            ('schema', {'fields': fields,
                        'primaryKey': ['idx']}),
            ('data', [OrderedDict([('idx', 0), ('values', 'a')]),
                      OrderedDict([('idx', 1), ('values', 'b')]),
                      OrderedDict([('idx', 2), ('values', 'a')])])])
        assert result == expected

    @pytest.mark.parametrize('idx,nm,prop', [
        (pd.Index([1]), 'index', 'name'),
        (pd.Index([1], name='myname'), 'myname', 'name'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')]),
         ['level_0', 'level_1'], 'names'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
                                    names=['n1', 'n2']),
         ['n1', 'n2'], 'names'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
                                    names=['n1', None]),
         ['n1', 'level_1'], 'names')
    ])
    def test_set_names_unset(self, idx, nm, prop):
        data = pd.Series(1, idx)
        result = set_default_names(data)
        assert getattr(result.index, prop) == nm

    @pytest.mark.parametrize("idx", [
        pd.Index([], name='index'),
        pd.MultiIndex.from_arrays([['foo'], ['bar']],
                                  names=('level_0', 'level_1')),
        pd.MultiIndex.from_arrays([['foo'], ['bar']],
                                  names=('foo', 'level_1'))
    ])
    def test_warns_non_roundtrippable_names(self, idx):
        # GH 19130
        df = pd.DataFrame([[]], index=idx)
        df.index.name = 'index'
        with tm.assert_produces_warning():
            set_default_names(df)

    def test_timestamp_in_columns(self):
        df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'),
                                             pd.Timedelta(10, unit='s')])
        result = df.to_json(orient="table")
        js = json.loads(result)
        # Non-string column labels are serialized as epoch numbers.
        assert js['schema']['fields'][1]['name'] == 1451606400000
        assert js['schema']['fields'][2]['name'] == 10000

    @pytest.mark.parametrize('case', [
        pd.Series([1], index=pd.Index([1], name='a'), name='a'),
        pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
        pd.DataFrame({"A": [1]}, index=pd.MultiIndex.from_arrays([
            ['a'], [1]], names=["A", "a"]))
    ])
    def test_overlapping_names(self, case):
        with tm.assert_raises_regex(ValueError, 'Overlapping'):
            case.to_json(orient='table')

    def test_mi_falsey_name(self):
        # GH 16203
        df = pd.DataFrame(np.random.randn(4, 4),
                          index=pd.MultiIndex.from_product([('A', 'B'),
                                                            ('a', 'b')]))
        result = [x['name'] for x in build_table_schema(df)['fields']]
        assert result == ['level_0', 'level_1', 0, 1, 2, 3]
|
||||
|
||||
|
||||
class TestTableOrientReader(object):
    """Round-trip tests for ``read_json(..., orient='table')``."""

    @pytest.mark.parametrize("index_nm", [
        None, "idx", pytest.param("index", marks=pytest.mark.xfail),
        'level_0'])
    @pytest.mark.parametrize("vals", [
        {'ints': [1, 2, 3, 4]},
        {'objects': ['a', 'b', 'c', 'd']},
        {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)},
        {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))},
        {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                                  ordered=True))},
        pytest.param({'floats': [1., 2., 3., 4.]}, marks=pytest.mark.xfail),
        {'floats': [1.1, 2.2, 3.3, 4.4]},
        {'bools': [True, False, False, True]}])
    def test_read_json_table_orient(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize("index_nm", [
        None, "idx", "index"])
    @pytest.mark.parametrize("vals", [
        {'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
        {'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
                                    tz='US/Central')}])
    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
        # timedeltas / tz-aware datetimes are not yet readable back.
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '):
            pd.read_json(out, orient="table")

    def test_comprehensive(self):
        df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                           ordered=True)),
             'G': [1.1, 2.2, 3.3, 4.4],
             # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
             #                    tz='US/Central'),
             'I': [True, False, False, True],
             },
            index=pd.Index(range(4), name='idx'))

        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize("index_names", [
        [None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
        ['index', 'foo']])
    def test_multiindex(self, index_names):
        # GH 18912
        df = pd.DataFrame(
            [["Arr", "alpha", [1, 2, 3, 4]],
             ["Bee", "Beta", [10, 20, 30, 40]]],
            index=[["A", "B"], ["Null", "Eins"]],
            columns=["Aussprache", "Griechisch", "Args"]
        )
        df.index.names = index_names
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize("strict_check", [
        pytest.param(True, marks=pytest.mark.xfail), False])
    def test_empty_frame_roundtrip(self, strict_check):
        # GH 21287
        df = pd.DataFrame([], columns=['a', 'b', 'c'])
        expected = df.copy()
        out = df.to_json(orient='table')
        result = pd.read_json(out, orient='table')
        # TODO: When DF coercion issue (#21345) is resolved tighten type checks
        tm.assert_frame_equal(expected, result,
                              check_dtype=strict_check,
                              check_index_type=strict_check)
|
||||
@@ -1,442 +0,0 @@
|
||||
import pytest
|
||||
import numpy as np
|
||||
import json
|
||||
|
||||
import pandas.util.testing as tm
|
||||
from pandas import compat, Index, DataFrame
|
||||
|
||||
from pandas.io.json import json_normalize
|
||||
from pandas.io.json.normalize import nested_to_record
|
||||
|
||||
|
||||
@pytest.fixture
def deep_nested():
    """Two countries, each with states, each state with cities — a
    three-level nesting used to exercise record_path/meta handling."""
    return [{'country': 'USA',
             'states': [{'name': 'California',
                         'cities': [{'name': 'San Francisco',
                                     'pop': 12345},
                                    {'name': 'Los Angeles',
                                     'pop': 12346}]
                         },
                        {'name': 'Ohio',
                         'cities': [{'name': 'Columbus',
                                     'pop': 1234},
                                    {'name': 'Cleveland',
                                     'pop': 1236}]}
                        ]
             },
            {'country': 'Germany',
             'states': [{'name': 'Bayern',
                         'cities': [{'name': 'Munich', 'pop': 12347}]
                         },
                        {'name': 'Nordrhein-Westfalen',
                         'cities': [{'name': 'Duesseldorf', 'pop': 1238},
                                    {'name': 'Koeln', 'pop': 1239}]}
                        ]
             }
            ]
|
||||
|
||||
|
||||
@pytest.fixture
def state_data():
    """Two US states with a list of counties plus scalar metadata."""
    return [
        {'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000},
                      {'name': 'Palm Beach', 'population': 60000}],
         'info': {'governor': 'Rick Scott'},
         'shortname': 'FL',
         'state': 'Florida'},
        {'counties': [{'name': 'Summit', 'population': 1234},
                      {'name': 'Cuyahoga', 'population': 1337}],
         'info': {'governor': 'John Kasich'},
         'shortname': 'OH',
         'state': 'Ohio'}]
|
||||
|
||||
|
||||
@pytest.fixture
def author_missing_data():
    """Records where the first entry has a null 'info' field."""
    return [
        {'info': None},
        {'info':
            {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
         'author_name':
            {'first': 'Jane', 'last_name': 'Doe'}
         }]
|
||||
|
||||
|
||||
class TestJSONNormalize(object):
|
||||
|
||||
def test_simple_records(self):
|
||||
recs = [{'a': 1, 'b': 2, 'c': 3},
|
||||
{'a': 4, 'b': 5, 'c': 6},
|
||||
{'a': 7, 'b': 8, 'c': 9},
|
||||
{'a': 10, 'b': 11, 'c': 12}]
|
||||
|
||||
result = json_normalize(recs)
|
||||
expected = DataFrame(recs)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_simple_normalize(self, state_data):
|
||||
result = json_normalize(state_data[0], 'counties')
|
||||
expected = DataFrame(state_data[0]['counties'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(state_data, 'counties')
|
||||
|
||||
expected = []
|
||||
for rec in state_data:
|
||||
expected.extend(rec['counties'])
|
||||
expected = DataFrame(expected)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(state_data, 'counties', meta='state')
|
||||
expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_empty_array(self):
|
||||
result = json_normalize([])
|
||||
expected = DataFrame()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_simple_normalize_with_separator(self, deep_nested):
|
||||
# GH 14883
|
||||
result = json_normalize({'A': {'A': 1, 'B': 2}})
|
||||
expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
|
||||
expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
|
||||
expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize(deep_nested, ['states', 'cities'],
|
||||
meta=['country', ['states', 'name']],
|
||||
sep='_')
|
||||
expected = Index(['name', 'pop',
|
||||
'country', 'states_name']).sort_values()
|
||||
assert result.columns.sort_values().equals(expected)
|
||||
|
||||
def test_value_array_record_prefix(self):
|
||||
# GH 21536
|
||||
result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
|
||||
expected = DataFrame([[1], [2]], columns=['Prefix.0'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_more_deeply_nested(self, deep_nested):
|
||||
|
||||
result = json_normalize(deep_nested, ['states', 'cities'],
|
||||
meta=['country', ['states', 'name']])
|
||||
# meta_prefix={'states': 'state_'})
|
||||
|
||||
ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
|
||||
'states.name': ['California', 'California', 'Ohio', 'Ohio',
|
||||
'Bayern', 'Nordrhein-Westfalen',
|
||||
'Nordrhein-Westfalen'],
|
||||
'name': ['San Francisco', 'Los Angeles', 'Columbus',
|
||||
'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
|
||||
'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}
|
||||
|
||||
expected = DataFrame(ex_data, columns=result.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_shallow_nested(self):
|
||||
data = [{'state': 'Florida',
|
||||
'shortname': 'FL',
|
||||
'info': {
|
||||
'governor': 'Rick Scott'
|
||||
},
|
||||
'counties': [{'name': 'Dade', 'population': 12345},
|
||||
{'name': 'Broward', 'population': 40000},
|
||||
{'name': 'Palm Beach', 'population': 60000}]},
|
||||
{'state': 'Ohio',
|
||||
'shortname': 'OH',
|
||||
'info': {
|
||||
'governor': 'John Kasich'
|
||||
},
|
||||
'counties': [{'name': 'Summit', 'population': 1234},
|
||||
{'name': 'Cuyahoga', 'population': 1337}]}]
|
||||
|
||||
result = json_normalize(data, 'counties',
|
||||
['state', 'shortname',
|
||||
['info', 'governor']])
|
||||
ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
|
||||
'Cuyahoga'],
|
||||
'state': ['Florida'] * 3 + ['Ohio'] * 2,
|
||||
'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
|
||||
'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
|
||||
'population': [12345, 40000, 60000, 1234, 1337]}
|
||||
expected = DataFrame(ex_data, columns=result.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_meta_name_conflict(self):
    """Meta keys that collide with record keys must raise, unless a
    meta_prefix disambiguates them."""
    data = [{'foo': 'hello',
             'bar': 'there',
             'data': [{'foo': 'something', 'bar': 'else'},
                      {'foo': 'something2', 'bar': 'else2'}]}]

    # Without a prefix the meta names clash with the record columns.
    with pytest.raises(ValueError):
        json_normalize(data, 'data', meta=['foo', 'bar'])

    # A prefix keeps both the record and the meta columns around.
    prefixed = json_normalize(data, 'data', meta=['foo', 'bar'],
                              meta_prefix='meta')
    for expected_col in ('metafoo', 'metabar', 'foo', 'bar'):
        assert expected_col in prefixed
|
||||
|
||||
def test_meta_parameter_not_modified(self):
|
||||
# GH 18610
|
||||
data = [{'foo': 'hello',
|
||||
'bar': 'there',
|
||||
'data': [{'foo': 'something', 'bar': 'else'},
|
||||
{'foo': 'something2', 'bar': 'else2'}]}]
|
||||
|
||||
COLUMNS = ['foo', 'bar']
|
||||
result = json_normalize(data, 'data', meta=COLUMNS,
|
||||
meta_prefix='meta')
|
||||
|
||||
assert COLUMNS == ['foo', 'bar']
|
||||
for val in ['metafoo', 'metabar', 'foo', 'bar']:
|
||||
assert val in result
|
||||
|
||||
def test_record_prefix(self, state_data):
|
||||
result = json_normalize(state_data[0], 'counties')
|
||||
expected = DataFrame(state_data[0]['counties'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(state_data, 'counties',
|
||||
meta='state',
|
||||
record_prefix='county_')
|
||||
|
||||
expected = []
|
||||
for rec in state_data:
|
||||
expected.extend(rec['counties'])
|
||||
expected = DataFrame(expected)
|
||||
expected = expected.rename(columns=lambda x: 'county_' + x)
|
||||
expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_non_ascii_key(self):
    """Keys containing non-ASCII characters survive json_normalize."""
    raw = (b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
           b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')
    # On py3 operate on decoded text; on py2 the byte string is str
    # already, matching the original py2 literal byte-for-byte.
    if compat.PY3:
        testjson = raw.decode('utf8')
    else:
        testjson = raw

    testdata = {
        u'sub.A': [1, 3],
        u'sub.B': [2, 4],
        b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1],
    }
    expected = DataFrame(testdata)

    result = json_normalize(json.loads(testjson))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_missing_field(self, author_missing_data):
    # GH20030: records missing nested fields come back as NaN columns
    # rather than raising.
    result = json_normalize(author_missing_data)

    ex_data = [
        # first record is missing everything -> all-NaN row
        {'info': np.nan,
         'author_name.first': np.nan,
         'author_name.last_name': np.nan,
         'info.created_at': np.nan,
         'info.last_updated': np.nan},
        # second record is fully populated
        {'info': None,
         'author_name.first': 'Jane',
         'author_name.last_name': 'Doe',
         'info.created_at': '11/08/1993',
         'info.last_updated': '26/05/2012'},
    ]
    expected = DataFrame(ex_data)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestNestedToRecord(object):
    """Unit tests for the nested_to_record flattening helper."""

    def test_flat_stays_flat(self):
        # Already-flat records pass through untouched.
        records = [{'flat1': 1, 'flat2': 2},
                   {'flat1': 3, 'flat2': 4}]
        assert nested_to_record(records) == records

    def test_one_level_deep_flattens(self):
        # A single nested dict is flattened into dotted keys.
        data = {'flat1': 1, 'dict1': {'c': 1, 'd': 2}}
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1}
        assert nested_to_record(data) == expected

    def test_nested_flattens(self):
        # Deeper nesting produces fully dotted key paths.
        data = {'flat1': 1,
                'dict1': {'c': 1, 'd': 2},
                'nested': {'e': {'c': 1, 'd': 2},
                           'd': 2}}
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1,
                    'nested.d': 2,
                    'nested.e.c': 1,
                    'nested.e.d': 2}
        assert nested_to_record(data) == expected

    def test_json_normalize_errors(self):
        # GH14583: missing meta keys are tolerated with errors='ignore'
        # but raise a KeyError with errors='raise'.
        def stock(symbol, name):
            # Small factory for the repeated stock records.
            return {'symbol': symbol, 'name': name, 'price': '0'}

        trades = [
            {'general': {'tradeid': 100,
                         'trade_version': 1,
                         'stocks': [stock('AAPL', 'Apple'),
                                    stock('GOOG', 'Google')]}},
            # The second trade deliberately lacks 'trade_version'.
            {'general': {'tradeid': 100,
                         'stocks': [stock('AAPL', 'Apple'),
                                    stock('GOOG', 'Google')]}},
        ]

        ignored = json_normalize(data=trades,
                                 record_path=[['general', 'stocks']],
                                 meta=[['general', 'tradeid'],
                                       ['general', 'trade_version']],
                                 errors='ignore')
        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
                    'name': {0: 'Apple', 1: 'Google',
                             2: 'Apple', 3: 'Google'},
                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
        assert ignored.fillna('').to_dict() == expected

        with pytest.raises(KeyError):
            json_normalize(data=trades,
                           record_path=[['general', 'stocks']],
                           meta=[['general', 'tradeid'],
                                 ['general', 'trade_version']],
                           errors='raise')

    def test_donot_drop_nonevalues(self):
        # GH21356: None values must be kept in the output, not dropped.
        data = [
            {'info': None,
             'author_name': {'first': 'Smith',
                             'last_name': 'Appleseed'}},
            {'info': {'created_at': '11/08/1993',
                      'last_updated': '26/05/2012'},
             'author_name': {'first': 'Jane', 'last_name': 'Doe'}},
        ]
        expected = [
            {'info': None,
             'author_name.first': 'Smith',
             'author_name.last_name': 'Appleseed'},
            {'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'},
        ]
        assert nested_to_record(data) == expected

    def test_nonetype_top_level_bottom_level(self):
        # GH21158: a null leaf alongside nested dicts must not trigger
        # a double pop of the same key (and hence a KeyError).
        data = {
            'id': None,
            'location': {
                'country': {
                    'state': {
                        'id': None,
                        'town.info': {
                            'id': None,
                            'region': None,
                            'x': 49.151580810546875,
                            'y': -33.148521423339844,
                            'z': 27.572303771972656,
                        },
                    },
                },
            },
        }
        expected = {
            'id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656,
        }
        assert nested_to_record(data) == expected

    def test_nonetype_multiple_levels(self):
        # GH21158: null values at several nesting depths at once.
        data = {
            'id': None,
            'location': {
                'id': None,
                'country': {
                    'id': None,
                    'state': {
                        'id': None,
                        'town.info': {
                            'region': None,
                            'x': 49.151580810546875,
                            'y': -33.148521423339844,
                            'z': 27.572303771972656,
                        },
                    },
                },
            },
        }
        expected = {
            'id': None,
            'location.id': None,
            'location.country.id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656,
        }
        assert nested_to_record(data) == expected
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,169 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, read_json
|
||||
from pandas.compat import StringIO
|
||||
from pandas.io.json.json import JsonReader
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (assert_frame_equal, assert_series_equal,
|
||||
ensure_clean)
|
||||
|
||||
|
||||
@pytest.fixture
def lines_json_df():
    """A three-row frame serialized as JSON Lines (one record per line)."""
    frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    return frame.to_json(lines=True, orient="records")
|
||||
|
||||
|
||||
def test_read_jsonl():
    # GH9180: line-delimited JSON parses even when keys appear in
    # different orders across lines.
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    expected = DataFrame({'a': [1, 1], 'b': [2, 2]})
    assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK
    payload = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])

    # simulate a file handle
    result = read_json(StringIO(payload), lines=True)
    assert_frame_equal(result, expected)

    # simulate a plain string
    result = read_json(payload, lines=True)
    assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_to_jsonl():
|
||||
# GH9180
|
||||
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
|
||||
result = df.to_json(orient="records", lines=True)
|
||||
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
|
||||
assert result == expected
|
||||
|
||||
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
|
||||
result = df.to_json(orient="records", lines=True)
|
||||
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
|
||||
assert result == expected
|
||||
assert_frame_equal(read_json(result, lines=True), df)
|
||||
|
||||
# GH15096: escaped characters in columns and data
|
||||
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
|
||||
columns=["a\\", 'b'])
|
||||
result = df.to_json(orient="records", lines=True)
|
||||
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
|
||||
'{"a\\\\":"foo\\"","b":"bar"}')
|
||||
assert result == expected
|
||||
assert_frame_equal(read_json(result, lines=True), df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
    """Chunked reading must concatenate to the same frame as a one-shot
    read (GH17048: memory usage when lines=True)."""
    whole = read_json(StringIO(lines_json_df), lines=True)

    pieces = read_json(StringIO(lines_json_df), lines=True,
                       chunksize=chunksize)
    reassembled = pd.concat(pieces)

    assert_frame_equal(reassembled, whole)
|
||||
|
||||
|
||||
def test_readjson_chunksize_requires_lines(lines_json_df):
    # chunksize is only meaningful for line-delimited input, so
    # combining it with lines=False must raise.
    msg = "chunksize can only be passed if lines=True"
    with tm.assert_raises_regex(ValueError, msg):
        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
|
||||
|
||||
|
||||
def test_readjson_chunks_series():
    # Chunked line-JSON reading also works for Series via typ='Series'.
    ser = pd.Series({'A': 1, 'B': 2})

    whole = pd.read_json(
        StringIO(ser.to_json(lines=True, orient="records")),
        lines=True, typ='Series')

    chunks = pd.read_json(
        StringIO(ser.to_json(lines=True, orient="records")),
        lines=True, typ='Series', chunksize=1)
    reassembled = pd.concat(chunks)

    assert_series_equal(reassembled, whole)
|
||||
|
||||
|
||||
def test_readjson_each_chunk(lines_json_df):
|
||||
# Other tests check that the final result of read_json(chunksize=True)
|
||||
# is correct. This checks the intermediate chunks.
|
||||
chunks = list(
|
||||
pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
|
||||
)
|
||||
assert chunks[0].shape == (2, 2)
|
||||
assert chunks[1].shape == (1, 2)
|
||||
|
||||
|
||||
def test_readjson_chunks_from_file():
    # Chunked and unchunked reads of the same on-disk file agree.
    with ensure_clean('test.json') as path:
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        frame.to_json(path, lines=True, orient="records")
        from_chunks = pd.concat(pd.read_json(path, lines=True, chunksize=1))
        one_shot = pd.read_json(path, lines=True)
        assert_frame_equal(one_shot, from_chunks)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    # Whether or not chunking is used, the reader must close its stream
    # after read() completes.
    with ensure_clean('test.json') as path:
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        frame.to_json(path, lines=True, orient="records")
        reader = JsonReader(
            path, orient=None, typ="frame", dtype=True, convert_axes=True,
            convert_dates=True, keep_default_dates=True, numpy=False,
            precise_float=False, date_unit=None, encoding=None,
            lines=True, chunksize=chunksize, compression=None)
        reader.read()
        assert reader.open_stream.closed, (
            "didn't close stream with chunksize = {chunksize}"
            .format(chunksize=chunksize))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    # Non-positive and non-integer chunk sizes are rejected up front.
    msg = r"'chunksize' must be an integer >=1"
    with tm.assert_raises_regex(ValueError, msg):
        pd.read_json(StringIO(lines_json_df), lines=True,
                     chunksize=chunksize)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    # Runs of blank lines between records are ignored, with and
    # without chunking.
    j = """

    {"A":1,"B":4}



    {"A":2,"B":5}








    {"A":3,"B":6}
    """
    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    test = pd.read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        test = pd.concat(test)
    tm.assert_frame_equal(
        orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize))
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user