pruned venvs
This commit is contained in:
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1,120 +0,0 @@
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
def test_compression_roundtrip(compression):
|
||||
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
|
||||
[12.32112, 123123.2, 321321.2]],
|
||||
index=['A', 'B'], columns=['X', 'Y', 'Z'])
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
df.to_json(path, compression=compression)
|
||||
assert_frame_equal(df, pd.read_json(path,
|
||||
compression=compression))
|
||||
|
||||
# explicitly ensure file was compressed.
|
||||
with tm.decompress_file(path, compression) as fh:
|
||||
result = fh.read().decode('utf8')
|
||||
assert_frame_equal(df, pd.read_json(result))
|
||||
|
||||
|
||||
def test_read_zipped_json(datapath):
|
||||
uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
|
||||
uncompressed_df = pd.read_json(uncompressed_path)
|
||||
|
||||
compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
|
||||
compressed_df = pd.read_json(compressed_path, compression='zip')
|
||||
|
||||
assert_frame_equal(uncompressed_df, compressed_df)
|
||||
|
||||
|
||||
@td.skip_if_not_us_locale
|
||||
def test_with_s3_url(compression, s3_resource):
|
||||
# Bucket "pandas-test" created in tests/io/conftest.py
|
||||
|
||||
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
df.to_json(path, compression=compression)
|
||||
with open(path, 'rb') as f:
|
||||
s3_resource.Bucket("pandas-test").put_object(Key='test-1', Body=f)
|
||||
|
||||
roundtripped_df = pd.read_json('s3://pandas-test/test-1',
|
||||
compression=compression)
|
||||
assert_frame_equal(df, roundtripped_df)
|
||||
|
||||
|
||||
def test_lines_with_compression(compression):
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
|
||||
df.to_json(path, orient='records', lines=True,
|
||||
compression=compression)
|
||||
roundtripped_df = pd.read_json(path, lines=True,
|
||||
compression=compression)
|
||||
assert_frame_equal(df, roundtripped_df)
|
||||
|
||||
|
||||
def test_chunksize_with_compression(compression):
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
|
||||
df.to_json(path, orient='records', lines=True,
|
||||
compression=compression)
|
||||
|
||||
res = pd.read_json(path, lines=True, chunksize=1,
|
||||
compression=compression)
|
||||
roundtripped_df = pd.concat(res)
|
||||
assert_frame_equal(df, roundtripped_df)
|
||||
|
||||
|
||||
def test_write_unsupported_compression_type():
|
||||
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
|
||||
with tm.ensure_clean() as path:
|
||||
msg = "Unrecognized compression type: unsupported"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.to_json(path, compression="unsupported")
|
||||
|
||||
|
||||
def test_read_unsupported_compression_type():
|
||||
with tm.ensure_clean() as path:
|
||||
msg = "Unrecognized compression type: unsupported"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
pd.read_json(path, compression="unsupported")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("to_infer", [True, False])
|
||||
@pytest.mark.parametrize("read_infer", [True, False])
|
||||
def test_to_json_compression(compression_only,
|
||||
read_infer, to_infer):
|
||||
# see gh-15008
|
||||
compression = compression_only
|
||||
|
||||
if compression == "zip":
|
||||
pytest.skip("{compression} is not supported "
|
||||
"for to_csv".format(compression=compression))
|
||||
|
||||
# We'll complete file extension subsequently.
|
||||
filename = "test."
|
||||
|
||||
if compression == "gzip":
|
||||
filename += "gz"
|
||||
else:
|
||||
# xz --> .xz
|
||||
# bz2 --> .bz2
|
||||
filename += compression
|
||||
|
||||
df = pd.DataFrame({"A": [1]})
|
||||
|
||||
to_compression = "infer" if to_infer else compression
|
||||
read_compression = "infer" if read_infer else compression
|
||||
|
||||
with tm.ensure_clean(filename) as path:
|
||||
df.to_json(path, compression=to_compression)
|
||||
result = pd.read_json(path, compression=read_compression)
|
||||
tm.assert_frame_equal(result, df)
|
||||
-580
@@ -1,580 +0,0 @@
|
||||
"""Tests for Table Schema integration."""
|
||||
from collections import OrderedDict
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
CategoricalDtype, DatetimeTZDtype, PeriodDtype)
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.json.table_schema import (
|
||||
as_json_table_type, build_table_schema, convert_json_field_to_pandas_type,
|
||||
convert_pandas_type_to_json_field, set_default_names)
|
||||
|
||||
|
||||
class TestBuildSchema(object):
|
||||
|
||||
def setup_method(self, method):
|
||||
self.df = DataFrame(
|
||||
{'A': [1, 2, 3, 4],
|
||||
'B': ['a', 'b', 'c', 'c'],
|
||||
'C': pd.date_range('2016-01-01', freq='d', periods=4),
|
||||
'D': pd.timedelta_range('1H', periods=4, freq='T'),
|
||||
},
|
||||
index=pd.Index(range(4), name='idx'))
|
||||
|
||||
def test_build_table_schema(self):
|
||||
result = build_table_schema(self.df, version=False)
|
||||
expected = {
|
||||
'fields': [{'name': 'idx', 'type': 'integer'},
|
||||
{'name': 'A', 'type': 'integer'},
|
||||
{'name': 'B', 'type': 'string'},
|
||||
{'name': 'C', 'type': 'datetime'},
|
||||
{'name': 'D', 'type': 'duration'},
|
||||
],
|
||||
'primaryKey': ['idx']
|
||||
}
|
||||
assert result == expected
|
||||
result = build_table_schema(self.df)
|
||||
assert "pandas_version" in result
|
||||
|
||||
def test_series(self):
|
||||
s = pd.Series([1, 2, 3], name='foo')
|
||||
result = build_table_schema(s, version=False)
|
||||
expected = {'fields': [{'name': 'index', 'type': 'integer'},
|
||||
{'name': 'foo', 'type': 'integer'}],
|
||||
'primaryKey': ['index']}
|
||||
assert result == expected
|
||||
result = build_table_schema(s)
|
||||
assert 'pandas_version' in result
|
||||
|
||||
def test_series_unnamed(self):
|
||||
result = build_table_schema(pd.Series([1, 2, 3]), version=False)
|
||||
expected = {'fields': [{'name': 'index', 'type': 'integer'},
|
||||
{'name': 'values', 'type': 'integer'}],
|
||||
'primaryKey': ['index']}
|
||||
assert result == expected
|
||||
|
||||
def test_multiindex(self):
|
||||
df = self.df.copy()
|
||||
idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
|
||||
df.index = idx
|
||||
|
||||
result = build_table_schema(df, version=False)
|
||||
expected = {
|
||||
'fields': [{'name': 'level_0', 'type': 'string'},
|
||||
{'name': 'level_1', 'type': 'integer'},
|
||||
{'name': 'A', 'type': 'integer'},
|
||||
{'name': 'B', 'type': 'string'},
|
||||
{'name': 'C', 'type': 'datetime'},
|
||||
{'name': 'D', 'type': 'duration'},
|
||||
],
|
||||
'primaryKey': ['level_0', 'level_1']
|
||||
}
|
||||
assert result == expected
|
||||
|
||||
df.index.names = ['idx0', None]
|
||||
expected['fields'][0]['name'] = 'idx0'
|
||||
expected['primaryKey'] = ['idx0', 'level_1']
|
||||
result = build_table_schema(df, version=False)
|
||||
assert result == expected
|
||||
|
||||
|
||||
class TestTableSchemaType(object):
|
||||
|
||||
@pytest.mark.parametrize('int_type', [
|
||||
np.int, np.int16, np.int32, np.int64])
|
||||
def test_as_json_table_type_int_data(self, int_type):
|
||||
int_data = [1, 2, 3]
|
||||
assert as_json_table_type(np.array(
|
||||
int_data, dtype=int_type)) == 'integer'
|
||||
|
||||
@pytest.mark.parametrize('float_type', [
|
||||
np.float, np.float16, np.float32, np.float64])
|
||||
def test_as_json_table_type_float_data(self, float_type):
|
||||
float_data = [1., 2., 3.]
|
||||
assert as_json_table_type(np.array(
|
||||
float_data, dtype=float_type)) == 'number'
|
||||
|
||||
@pytest.mark.parametrize('bool_type', [bool, np.bool])
|
||||
def test_as_json_table_type_bool_data(self, bool_type):
|
||||
bool_data = [True, False]
|
||||
assert as_json_table_type(np.array(
|
||||
bool_data, dtype=bool_type)) == 'boolean'
|
||||
|
||||
@pytest.mark.parametrize('date_data', [
|
||||
pd.to_datetime(['2016']),
|
||||
pd.to_datetime(['2016'], utc=True),
|
||||
pd.Series(pd.to_datetime(['2016'])),
|
||||
pd.Series(pd.to_datetime(['2016'], utc=True)),
|
||||
pd.period_range('2016', freq='A', periods=3)
|
||||
])
|
||||
def test_as_json_table_type_date_data(self, date_data):
|
||||
assert as_json_table_type(date_data) == 'datetime'
|
||||
|
||||
@pytest.mark.parametrize('str_data', [
|
||||
pd.Series(['a', 'b']), pd.Index(['a', 'b'])])
|
||||
def test_as_json_table_type_string_data(self, str_data):
|
||||
assert as_json_table_type(str_data) == 'string'
|
||||
|
||||
@pytest.mark.parametrize('cat_data', [
|
||||
pd.Categorical(['a']),
|
||||
pd.Categorical([1]),
|
||||
pd.Series(pd.Categorical([1])),
|
||||
pd.CategoricalIndex([1]),
|
||||
pd.Categorical([1])])
|
||||
def test_as_json_table_type_categorical_data(self, cat_data):
|
||||
assert as_json_table_type(cat_data) == 'any'
|
||||
|
||||
# ------
|
||||
# dtypes
|
||||
# ------
|
||||
@pytest.mark.parametrize('int_dtype', [
|
||||
np.int, np.int16, np.int32, np.int64])
|
||||
def test_as_json_table_type_int_dtypes(self, int_dtype):
|
||||
assert as_json_table_type(int_dtype) == 'integer'
|
||||
|
||||
@pytest.mark.parametrize('float_dtype', [
|
||||
np.float, np.float16, np.float32, np.float64])
|
||||
def test_as_json_table_type_float_dtypes(self, float_dtype):
|
||||
assert as_json_table_type(float_dtype) == 'number'
|
||||
|
||||
@pytest.mark.parametrize('bool_dtype', [bool, np.bool])
|
||||
def test_as_json_table_type_bool_dtypes(self, bool_dtype):
|
||||
assert as_json_table_type(bool_dtype) == 'boolean'
|
||||
|
||||
@pytest.mark.parametrize('date_dtype', [
|
||||
np.datetime64, np.dtype("<M8[ns]"), PeriodDtype('D'),
|
||||
DatetimeTZDtype('ns', 'US/Central')])
|
||||
def test_as_json_table_type_date_dtypes(self, date_dtype):
|
||||
# TODO: datedate.date? datetime.time?
|
||||
assert as_json_table_type(date_dtype) == 'datetime'
|
||||
|
||||
@pytest.mark.parametrize('td_dtype', [
|
||||
np.timedelta64, np.dtype("<m8[ns]")])
|
||||
def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
|
||||
assert as_json_table_type(td_dtype) == 'duration'
|
||||
|
||||
@pytest.mark.parametrize('str_dtype', [object]) # TODO
|
||||
def test_as_json_table_type_string_dtypes(self, str_dtype):
|
||||
assert as_json_table_type(str_dtype) == 'string'
|
||||
|
||||
def test_as_json_table_type_categorical_dtypes(self):
|
||||
# TODO: I think before is_categorical_dtype(Categorical)
|
||||
# returned True, but now it's False. Figure out why or
|
||||
# if it matters
|
||||
assert as_json_table_type(pd.Categorical(['a'])) == 'any'
|
||||
assert as_json_table_type(CategoricalDtype()) == 'any'
|
||||
|
||||
|
||||
class TestTableOrient(object):
|
||||
|
||||
def setup_method(self, method):
|
||||
self.df = DataFrame(
|
||||
{'A': [1, 2, 3, 4],
|
||||
'B': ['a', 'b', 'c', 'c'],
|
||||
'C': pd.date_range('2016-01-01', freq='d', periods=4),
|
||||
'D': pd.timedelta_range('1H', periods=4, freq='T'),
|
||||
'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
|
||||
'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
|
||||
ordered=True)),
|
||||
'G': [1., 2., 3, 4.],
|
||||
'H': pd.date_range('2016-01-01', freq='d', periods=4,
|
||||
tz='US/Central'),
|
||||
},
|
||||
index=pd.Index(range(4), name='idx'))
|
||||
|
||||
def test_build_series(self):
|
||||
s = pd.Series([1, 2], name='a')
|
||||
s.index.name = 'id'
|
||||
result = s.to_json(orient='table', date_format='iso')
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
|
||||
assert "pandas_version" in result['schema']
|
||||
result['schema'].pop('pandas_version')
|
||||
|
||||
fields = [{'name': 'id', 'type': 'integer'},
|
||||
{'name': 'a', 'type': 'integer'}]
|
||||
|
||||
schema = {
|
||||
'fields': fields,
|
||||
'primaryKey': ['id'],
|
||||
}
|
||||
|
||||
expected = OrderedDict([
|
||||
('schema', schema),
|
||||
('data', [OrderedDict([('id', 0), ('a', 1)]),
|
||||
OrderedDict([('id', 1), ('a', 2)])])])
|
||||
assert result == expected
|
||||
|
||||
def test_to_json(self):
|
||||
df = self.df.copy()
|
||||
df.index.name = 'idx'
|
||||
result = df.to_json(orient='table', date_format='iso')
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
|
||||
assert "pandas_version" in result['schema']
|
||||
result['schema'].pop('pandas_version')
|
||||
|
||||
fields = [
|
||||
{'name': 'idx', 'type': 'integer'},
|
||||
{'name': 'A', 'type': 'integer'},
|
||||
{'name': 'B', 'type': 'string'},
|
||||
{'name': 'C', 'type': 'datetime'},
|
||||
{'name': 'D', 'type': 'duration'},
|
||||
{'constraints': {'enum': ['a', 'b', 'c']},
|
||||
'name': 'E',
|
||||
'ordered': False,
|
||||
'type': 'any'},
|
||||
{'constraints': {'enum': ['a', 'b', 'c']},
|
||||
'name': 'F',
|
||||
'ordered': True,
|
||||
'type': 'any'},
|
||||
{'name': 'G', 'type': 'number'},
|
||||
{'name': 'H', 'type': 'datetime', 'tz': 'US/Central'}
|
||||
]
|
||||
|
||||
schema = {
|
||||
'fields': fields,
|
||||
'primaryKey': ['idx'],
|
||||
}
|
||||
data = [
|
||||
OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
|
||||
('C', '2016-01-01T00:00:00.000Z'),
|
||||
('D', 'P0DT1H0M0S'),
|
||||
('E', 'a'), ('F', 'a'), ('G', 1.),
|
||||
('H', '2016-01-01T06:00:00.000Z')
|
||||
]),
|
||||
OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
|
||||
('C', '2016-01-02T00:00:00.000Z'),
|
||||
('D', 'P0DT1H1M0S'),
|
||||
('E', 'b'), ('F', 'b'), ('G', 2.),
|
||||
('H', '2016-01-02T06:00:00.000Z')
|
||||
]),
|
||||
OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
|
||||
('C', '2016-01-03T00:00:00.000Z'),
|
||||
('D', 'P0DT1H2M0S'),
|
||||
('E', 'c'), ('F', 'c'), ('G', 3.),
|
||||
('H', '2016-01-03T06:00:00.000Z')
|
||||
]),
|
||||
OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
|
||||
('C', '2016-01-04T00:00:00.000Z'),
|
||||
('D', 'P0DT1H3M0S'),
|
||||
('E', 'c'), ('F', 'c'), ('G', 4.),
|
||||
('H', '2016-01-04T06:00:00.000Z')
|
||||
]),
|
||||
]
|
||||
expected = OrderedDict([('schema', schema), ('data', data)])
|
||||
assert result == expected
|
||||
|
||||
def test_to_json_float_index(self):
|
||||
data = pd.Series(1, index=[1., 2.])
|
||||
result = data.to_json(orient='table', date_format='iso')
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
result['schema'].pop('pandas_version')
|
||||
|
||||
expected = (
|
||||
OrderedDict([('schema', {
|
||||
'fields': [{'name': 'index', 'type': 'number'},
|
||||
{'name': 'values', 'type': 'integer'}],
|
||||
'primaryKey': ['index']
|
||||
}),
|
||||
('data', [OrderedDict([('index', 1.0), ('values', 1)]),
|
||||
OrderedDict([('index', 2.0), ('values', 1)])])])
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
def test_to_json_period_index(self):
|
||||
idx = pd.period_range('2016', freq='Q-JAN', periods=2)
|
||||
data = pd.Series(1, idx)
|
||||
result = data.to_json(orient='table', date_format='iso')
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
result['schema'].pop('pandas_version')
|
||||
|
||||
fields = [{'freq': 'Q-JAN', 'name': 'index', 'type': 'datetime'},
|
||||
{'name': 'values', 'type': 'integer'}]
|
||||
|
||||
schema = {'fields': fields, 'primaryKey': ['index']}
|
||||
data = [OrderedDict([('index', '2015-11-01T00:00:00.000Z'),
|
||||
('values', 1)]),
|
||||
OrderedDict([('index', '2016-02-01T00:00:00.000Z'),
|
||||
('values', 1)])]
|
||||
expected = OrderedDict([('schema', schema), ('data', data)])
|
||||
assert result == expected
|
||||
|
||||
def test_to_json_categorical_index(self):
|
||||
data = pd.Series(1, pd.CategoricalIndex(['a', 'b']))
|
||||
result = data.to_json(orient='table', date_format='iso')
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
result['schema'].pop('pandas_version')
|
||||
|
||||
expected = (
|
||||
OrderedDict([('schema',
|
||||
{'fields': [{'name': 'index', 'type': 'any',
|
||||
'constraints': {'enum': ['a', 'b']},
|
||||
'ordered': False},
|
||||
{'name': 'values', 'type': 'integer'}],
|
||||
'primaryKey': ['index']}),
|
||||
('data', [
|
||||
OrderedDict([('index', 'a'),
|
||||
('values', 1)]),
|
||||
OrderedDict([('index', 'b'), ('values', 1)])])])
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
def test_date_format_raises(self):
|
||||
with pytest.raises(ValueError):
|
||||
self.df.to_json(orient='table', date_format='epoch')
|
||||
|
||||
# others work
|
||||
self.df.to_json(orient='table', date_format='iso')
|
||||
self.df.to_json(orient='table')
|
||||
|
||||
@pytest.mark.parametrize('kind', [pd.Series, pd.Index])
|
||||
def test_convert_pandas_type_to_json_field_int(self, kind):
|
||||
data = [1, 2, 3]
|
||||
result = convert_pandas_type_to_json_field(kind(data, name='name'))
|
||||
expected = {"name": "name", "type": "integer"}
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize('kind', [pd.Series, pd.Index])
|
||||
def test_convert_pandas_type_to_json_field_float(self, kind):
|
||||
data = [1., 2., 3.]
|
||||
result = convert_pandas_type_to_json_field(kind(data, name='name'))
|
||||
expected = {"name": "name", "type": "number"}
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize('dt_args,extra_exp', [
|
||||
({}, {}), ({'utc': True}, {'tz': 'UTC'})])
|
||||
@pytest.mark.parametrize('wrapper', [None, pd.Series])
|
||||
def test_convert_pandas_type_to_json_field_datetime(self, dt_args,
|
||||
extra_exp, wrapper):
|
||||
data = [1., 2., 3.]
|
||||
data = pd.to_datetime(data, **dt_args)
|
||||
if wrapper is pd.Series:
|
||||
data = pd.Series(data, name='values')
|
||||
result = convert_pandas_type_to_json_field(data)
|
||||
expected = {"name": "values", "type": 'datetime'}
|
||||
expected.update(extra_exp)
|
||||
assert result == expected
|
||||
|
||||
def test_convert_pandas_type_to_json_period_range(self):
|
||||
arr = pd.period_range('2016', freq='A-DEC', periods=4)
|
||||
result = convert_pandas_type_to_json_field(arr)
|
||||
expected = {"name": "values", "type": 'datetime', "freq": "A-DEC"}
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize('kind', [pd.Categorical, pd.CategoricalIndex])
|
||||
@pytest.mark.parametrize('ordered', [True, False])
|
||||
def test_convert_pandas_type_to_json_field_categorical(self, kind,
|
||||
ordered):
|
||||
data = ['a', 'b', 'c']
|
||||
if kind is pd.Categorical:
|
||||
arr = pd.Series(kind(data, ordered=ordered), name='cats')
|
||||
elif kind is pd.CategoricalIndex:
|
||||
arr = kind(data, ordered=ordered, name='cats')
|
||||
|
||||
result = convert_pandas_type_to_json_field(arr)
|
||||
expected = {"name": "cats", "type": "any",
|
||||
"constraints": {"enum": data},
|
||||
"ordered": ordered}
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize("inp,exp", [
|
||||
({'type': 'integer'}, 'int64'),
|
||||
({'type': 'number'}, 'float64'),
|
||||
({'type': 'boolean'}, 'bool'),
|
||||
({'type': 'duration'}, 'timedelta64'),
|
||||
({'type': 'datetime'}, 'datetime64[ns]'),
|
||||
({'type': 'datetime', 'tz': 'US/Hawaii'}, 'datetime64[ns, US/Hawaii]'),
|
||||
({'type': 'any'}, 'object'),
|
||||
({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
|
||||
'ordered': False}, CategoricalDtype(categories=['a', 'b', 'c'],
|
||||
ordered=False)),
|
||||
({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
|
||||
'ordered': True}, CategoricalDtype(categories=['a', 'b', 'c'],
|
||||
ordered=True)),
|
||||
({'type': 'string'}, 'object')])
|
||||
def test_convert_json_field_to_pandas_type(self, inp, exp):
|
||||
field = {'name': 'foo'}
|
||||
field.update(inp)
|
||||
assert convert_json_field_to_pandas_type(field) == exp
|
||||
|
||||
@pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
|
||||
def test_convert_json_field_to_pandas_type_raises(self, inp):
|
||||
field = {'type': inp}
|
||||
with pytest.raises(ValueError, match=("Unsupported or invalid field "
|
||||
"type: {}".format(inp))):
|
||||
convert_json_field_to_pandas_type(field)
|
||||
|
||||
def test_categorical(self):
|
||||
s = pd.Series(pd.Categorical(['a', 'b', 'a']))
|
||||
s.index.name = 'idx'
|
||||
result = s.to_json(orient='table', date_format='iso')
|
||||
result = json.loads(result, object_pairs_hook=OrderedDict)
|
||||
result['schema'].pop('pandas_version')
|
||||
|
||||
fields = [{'name': 'idx', 'type': 'integer'},
|
||||
{'constraints': {'enum': ['a', 'b']},
|
||||
'name': 'values',
|
||||
'ordered': False,
|
||||
'type': 'any'}]
|
||||
|
||||
expected = OrderedDict([
|
||||
('schema', {'fields': fields,
|
||||
'primaryKey': ['idx']}),
|
||||
('data', [OrderedDict([('idx', 0), ('values', 'a')]),
|
||||
OrderedDict([('idx', 1), ('values', 'b')]),
|
||||
OrderedDict([('idx', 2), ('values', 'a')])])])
|
||||
assert result == expected
|
||||
|
||||
@pytest.mark.parametrize('idx,nm,prop', [
|
||||
(pd.Index([1]), 'index', 'name'),
|
||||
(pd.Index([1], name='myname'), 'myname', 'name'),
|
||||
(pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')]),
|
||||
['level_0', 'level_1'], 'names'),
|
||||
(pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
|
||||
names=['n1', 'n2']),
|
||||
['n1', 'n2'], 'names'),
|
||||
(pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
|
||||
names=['n1', None]),
|
||||
['n1', 'level_1'], 'names')
|
||||
])
|
||||
def test_set_names_unset(self, idx, nm, prop):
|
||||
data = pd.Series(1, idx)
|
||||
result = set_default_names(data)
|
||||
assert getattr(result.index, prop) == nm
|
||||
|
||||
@pytest.mark.parametrize("idx", [
|
||||
pd.Index([], name='index'),
|
||||
pd.MultiIndex.from_arrays([['foo'], ['bar']],
|
||||
names=('level_0', 'level_1')),
|
||||
pd.MultiIndex.from_arrays([['foo'], ['bar']],
|
||||
names=('foo', 'level_1'))
|
||||
])
|
||||
def test_warns_non_roundtrippable_names(self, idx):
|
||||
# GH 19130
|
||||
df = pd.DataFrame([[]], index=idx)
|
||||
df.index.name = 'index'
|
||||
with tm.assert_produces_warning():
|
||||
set_default_names(df)
|
||||
|
||||
def test_timestamp_in_columns(self):
|
||||
df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'),
|
||||
pd.Timedelta(10, unit='s')])
|
||||
result = df.to_json(orient="table")
|
||||
js = json.loads(result)
|
||||
assert js['schema']['fields'][1]['name'] == 1451606400000
|
||||
assert js['schema']['fields'][2]['name'] == 10000
|
||||
|
||||
@pytest.mark.parametrize('case', [
|
||||
pd.Series([1], index=pd.Index([1], name='a'), name='a'),
|
||||
pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
|
||||
pd.DataFrame({"A": [1]}, index=pd.MultiIndex.from_arrays([
|
||||
['a'], [1]], names=["A", "a"]))
|
||||
])
|
||||
def test_overlapping_names(self, case):
|
||||
with pytest.raises(ValueError, match='Overlapping'):
|
||||
case.to_json(orient='table')
|
||||
|
||||
def test_mi_falsey_name(self):
|
||||
# GH 16203
|
||||
df = pd.DataFrame(np.random.randn(4, 4),
|
||||
index=pd.MultiIndex.from_product([('A', 'B'),
|
||||
('a', 'b')]))
|
||||
result = [x['name'] for x in build_table_schema(df)['fields']]
|
||||
assert result == ['level_0', 'level_1', 0, 1, 2, 3]
|
||||
|
||||
|
||||
class TestTableOrientReader(object):
|
||||
|
||||
@pytest.mark.parametrize("index_nm", [
|
||||
None,
|
||||
"idx",
|
||||
pytest.param("index",
|
||||
marks=pytest.mark.xfail),
|
||||
'level_0'])
|
||||
@pytest.mark.parametrize("vals", [
|
||||
{'ints': [1, 2, 3, 4]},
|
||||
{'objects': ['a', 'b', 'c', 'd']},
|
||||
{'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)},
|
||||
{'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))},
|
||||
{'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
|
||||
ordered=True))},
|
||||
pytest.param({'floats': [1., 2., 3., 4.]},
|
||||
marks=pytest.mark.xfail),
|
||||
{'floats': [1.1, 2.2, 3.3, 4.4]},
|
||||
{'bools': [True, False, False, True]}])
|
||||
def test_read_json_table_orient(self, index_nm, vals, recwarn):
|
||||
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
|
||||
out = df.to_json(orient="table")
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
@pytest.mark.parametrize("index_nm", [
|
||||
None, "idx", "index"])
|
||||
@pytest.mark.parametrize("vals", [
|
||||
{'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
|
||||
{'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
|
||||
tz='US/Central')}])
|
||||
def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
|
||||
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
|
||||
out = df.to_json(orient="table")
|
||||
with pytest.raises(NotImplementedError, match='can not yet read '):
|
||||
pd.read_json(out, orient="table")
|
||||
|
||||
def test_comprehensive(self):
|
||||
df = DataFrame(
|
||||
{'A': [1, 2, 3, 4],
|
||||
'B': ['a', 'b', 'c', 'c'],
|
||||
'C': pd.date_range('2016-01-01', freq='d', periods=4),
|
||||
# 'D': pd.timedelta_range('1H', periods=4, freq='T'),
|
||||
'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
|
||||
'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
|
||||
ordered=True)),
|
||||
'G': [1.1, 2.2, 3.3, 4.4],
|
||||
# 'H': pd.date_range('2016-01-01', freq='d', periods=4,
|
||||
# tz='US/Central'),
|
||||
'I': [True, False, False, True],
|
||||
},
|
||||
index=pd.Index(range(4), name='idx'))
|
||||
|
||||
out = df.to_json(orient="table")
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
@pytest.mark.parametrize("index_names", [
|
||||
[None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
|
||||
['index', 'foo']])
|
||||
def test_multiindex(self, index_names):
|
||||
# GH 18912
|
||||
df = pd.DataFrame(
|
||||
[["Arr", "alpha", [1, 2, 3, 4]],
|
||||
["Bee", "Beta", [10, 20, 30, 40]]],
|
||||
index=[["A", "B"], ["Null", "Eins"]],
|
||||
columns=["Aussprache", "Griechisch", "Args"]
|
||||
)
|
||||
df.index.names = index_names
|
||||
out = df.to_json(orient="table")
|
||||
result = pd.read_json(out, orient="table")
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
@pytest.mark.parametrize("strict_check", [
|
||||
pytest.param(True, marks=pytest.mark.xfail),
|
||||
False
|
||||
])
|
||||
def test_empty_frame_roundtrip(self, strict_check):
|
||||
# GH 21287
|
||||
df = pd.DataFrame([], columns=['a', 'b', 'c'])
|
||||
expected = df.copy()
|
||||
out = df.to_json(orient='table')
|
||||
result = pd.read_json(out, orient='table')
|
||||
# TODO: When DF coercion issue (#21345) is resolved tighten type checks
|
||||
tm.assert_frame_equal(expected, result,
|
||||
check_dtype=strict_check,
|
||||
check_index_type=strict_check)
|
||||
@@ -1,462 +0,0 @@
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Index, compat
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.json import json_normalize
|
||||
from pandas.io.json.normalize import nested_to_record
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def deep_nested():
|
||||
# deeply nested data
|
||||
return [{'country': 'USA',
|
||||
'states': [{'name': 'California',
|
||||
'cities': [{'name': 'San Francisco',
|
||||
'pop': 12345},
|
||||
{'name': 'Los Angeles',
|
||||
'pop': 12346}]
|
||||
},
|
||||
{'name': 'Ohio',
|
||||
'cities': [{'name': 'Columbus',
|
||||
'pop': 1234},
|
||||
{'name': 'Cleveland',
|
||||
'pop': 1236}]}
|
||||
]
|
||||
},
|
||||
{'country': 'Germany',
|
||||
'states': [{'name': 'Bayern',
|
||||
'cities': [{'name': 'Munich', 'pop': 12347}]
|
||||
},
|
||||
{'name': 'Nordrhein-Westfalen',
|
||||
'cities': [{'name': 'Duesseldorf', 'pop': 1238},
|
||||
{'name': 'Koeln', 'pop': 1239}]}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def state_data():
|
||||
return [
|
||||
{'counties': [{'name': 'Dade', 'population': 12345},
|
||||
{'name': 'Broward', 'population': 40000},
|
||||
{'name': 'Palm Beach', 'population': 60000}],
|
||||
'info': {'governor': 'Rick Scott'},
|
||||
'shortname': 'FL',
|
||||
'state': 'Florida'},
|
||||
{'counties': [{'name': 'Summit', 'population': 1234},
|
||||
{'name': 'Cuyahoga', 'population': 1337}],
|
||||
'info': {'governor': 'John Kasich'},
|
||||
'shortname': 'OH',
|
||||
'state': 'Ohio'}]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def author_missing_data():
|
||||
return [
|
||||
{'info': None},
|
||||
{'info':
|
||||
{'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
|
||||
'author_name':
|
||||
{'first': 'Jane', 'last_name': 'Doe'}
|
||||
}]
|
||||
|
||||
|
||||
class TestJSONNormalize(object):
|
||||
|
||||
def test_simple_records(self):
|
||||
recs = [{'a': 1, 'b': 2, 'c': 3},
|
||||
{'a': 4, 'b': 5, 'c': 6},
|
||||
{'a': 7, 'b': 8, 'c': 9},
|
||||
{'a': 10, 'b': 11, 'c': 12}]
|
||||
|
||||
result = json_normalize(recs)
|
||||
expected = DataFrame(recs)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_simple_normalize(self, state_data):
|
||||
result = json_normalize(state_data[0], 'counties')
|
||||
expected = DataFrame(state_data[0]['counties'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(state_data, 'counties')
|
||||
|
||||
expected = []
|
||||
for rec in state_data:
|
||||
expected.extend(rec['counties'])
|
||||
expected = DataFrame(expected)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(state_data, 'counties', meta='state')
|
||||
expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_empty_array(self):
|
||||
result = json_normalize([])
|
||||
expected = DataFrame()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_simple_normalize_with_separator(self, deep_nested):
|
||||
# GH 14883
|
||||
result = json_normalize({'A': {'A': 1, 'B': 2}})
|
||||
expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
|
||||
expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
|
||||
expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize(deep_nested, ['states', 'cities'],
|
||||
meta=['country', ['states', 'name']],
|
||||
sep='_')
|
||||
expected = Index(['name', 'pop',
|
||||
'country', 'states_name']).sort_values()
|
||||
assert result.columns.sort_values().equals(expected)
|
||||
|
||||
def test_value_array_record_prefix(self):
|
||||
# GH 21536
|
||||
result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
|
||||
expected = DataFrame([[1], [2]], columns=['Prefix.0'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_nested_object_record_path(self):
|
||||
# GH 22706
|
||||
data = {'state': 'Florida',
|
||||
'info': {
|
||||
'governor': 'Rick Scott',
|
||||
'counties': [{'name': 'Dade', 'population': 12345},
|
||||
{'name': 'Broward', 'population': 40000},
|
||||
{'name': 'Palm Beach', 'population': 60000}]}}
|
||||
result = json_normalize(data, record_path=["info", "counties"])
|
||||
expected = DataFrame([['Dade', 12345],
|
||||
['Broward', 40000],
|
||||
['Palm Beach', 60000]],
|
||||
columns=['name', 'population'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_more_deeply_nested(self, deep_nested):
    # Flatten two levels of records and attach country/state metadata.
    result = json_normalize(deep_nested, ['states', 'cities'],
                            meta=['country', ['states', 'name']])

    ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
               'states.name': ['California', 'California', 'Ohio', 'Ohio',
                               'Bayern', 'Nordrhein-Westfalen',
                               'Nordrhein-Westfalen'],
               'name': ['San Francisco', 'Los Angeles', 'Columbus',
                        'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
               'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}
    # Reuse the result's column order so only values are compared.
    tm.assert_frame_equal(result, DataFrame(ex_data, columns=result.columns))
def test_shallow_nested(self):
|
||||
data = [{'state': 'Florida',
|
||||
'shortname': 'FL',
|
||||
'info': {
|
||||
'governor': 'Rick Scott'
|
||||
},
|
||||
'counties': [{'name': 'Dade', 'population': 12345},
|
||||
{'name': 'Broward', 'population': 40000},
|
||||
{'name': 'Palm Beach', 'population': 60000}]},
|
||||
{'state': 'Ohio',
|
||||
'shortname': 'OH',
|
||||
'info': {
|
||||
'governor': 'John Kasich'
|
||||
},
|
||||
'counties': [{'name': 'Summit', 'population': 1234},
|
||||
{'name': 'Cuyahoga', 'population': 1337}]}]
|
||||
|
||||
result = json_normalize(data, 'counties',
|
||||
['state', 'shortname',
|
||||
['info', 'governor']])
|
||||
ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
|
||||
'Cuyahoga'],
|
||||
'state': ['Florida'] * 3 + ['Ohio'] * 2,
|
||||
'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
|
||||
'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
|
||||
'population': [12345, 40000, 60000, 1234, 1337]}
|
||||
expected = DataFrame(ex_data, columns=result.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_meta_name_conflict(self):
    data = [{'foo': 'hello',
             'bar': 'there',
             'data': [{'foo': 'something', 'bar': 'else'},
                      {'foo': 'something2', 'bar': 'else2'}]}]

    # Without a prefix the meta keys collide with the record keys.
    msg = (r"Conflicting metadata name (foo|bar),"
           " need distinguishing prefix")
    with pytest.raises(ValueError, match=msg):
        json_normalize(data, 'data', meta=['foo', 'bar'])

    # A meta_prefix disambiguates, so both variants are present.
    result = json_normalize(data, 'data', meta=['foo', 'bar'],
                            meta_prefix='meta')
    assert all(col in result
               for col in ['metafoo', 'metabar', 'foo', 'bar'])
def test_meta_parameter_not_modified(self):
|
||||
# GH 18610
|
||||
data = [{'foo': 'hello',
|
||||
'bar': 'there',
|
||||
'data': [{'foo': 'something', 'bar': 'else'},
|
||||
{'foo': 'something2', 'bar': 'else2'}]}]
|
||||
|
||||
COLUMNS = ['foo', 'bar']
|
||||
result = json_normalize(data, 'data', meta=COLUMNS,
|
||||
meta_prefix='meta')
|
||||
|
||||
assert COLUMNS == ['foo', 'bar']
|
||||
for val in ['metafoo', 'metabar', 'foo', 'bar']:
|
||||
assert val in result
|
||||
|
||||
def test_record_prefix(self, state_data):
    # Without a prefix, record columns keep their original names.
    result = json_normalize(state_data[0], 'counties')
    tm.assert_frame_equal(result, DataFrame(state_data[0]['counties']))

    # With record_prefix, every record column gets renamed.
    result = json_normalize(state_data, 'counties',
                            meta='state',
                            record_prefix='county_')

    all_counties = []
    for rec in state_data:
        all_counties.extend(rec['counties'])
    expected = DataFrame(all_counties)
    expected = expected.rename(columns=lambda c: 'county_' + c)
    expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])

    tm.assert_frame_equal(result, expected)
def test_non_ascii_key(self):
    # Non-ascii keys (umlaut U, slashed o) must survive normalization.
    #
    # BUG FIX: the former Python 2 branch guarded by ``compat.PY3`` was
    # dead code on every supported interpreter, and ``compat.PY3`` itself
    # no longer exists in current pandas (so the attribute lookup would
    # raise).  The branch has been removed; only the PY3 path remains.
    testjson = (
        b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
        b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
    ).decode('utf8')

    testdata = {
        u'sub.A': [1, 3],
        u'sub.B': [2, 4],
        b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
    }
    expected = DataFrame(testdata)

    result = json_normalize(json.loads(testjson))
    tm.assert_frame_equal(result, expected)
def test_missing_field(self, author_missing_data):
    # GH 20030: records missing nested fields should produce NaN cells.
    result = json_normalize(author_missing_data)
    ex_data = [
        {'info': np.nan,
         'author_name.first': np.nan,
         'author_name.last_name': np.nan,
         'info.created_at': np.nan,
         'info.last_updated': np.nan},
        {'info': None,
         'author_name.first': 'Jane',
         'author_name.last_name': 'Doe',
         'info.created_at': '11/08/1993',
         'info.last_updated': '26/05/2012'}
    ]
    tm.assert_frame_equal(result, DataFrame(ex_data))
class TestNestedToRecord(object):
|
||||
|
||||
def test_flat_stays_flat(self):
|
||||
recs = [dict(flat1=1, flat2=2),
|
||||
dict(flat1=3, flat2=4),
|
||||
]
|
||||
|
||||
result = nested_to_record(recs)
|
||||
expected = recs
|
||||
assert result == expected
|
||||
|
||||
def test_one_level_deep_flattens(self):
|
||||
data = dict(flat1=1,
|
||||
dict1=dict(c=1, d=2))
|
||||
|
||||
result = nested_to_record(data)
|
||||
expected = {'dict1.c': 1,
|
||||
'dict1.d': 2,
|
||||
'flat1': 1}
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_nested_flattens(self):
|
||||
data = dict(flat1=1,
|
||||
dict1=dict(c=1, d=2),
|
||||
nested=dict(e=dict(c=1, d=2),
|
||||
d=2))
|
||||
|
||||
result = nested_to_record(data)
|
||||
expected = {'dict1.c': 1,
|
||||
'dict1.d': 2,
|
||||
'flat1': 1,
|
||||
'nested.d': 2,
|
||||
'nested.e.c': 1,
|
||||
'nested.e.d': 2}
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_json_normalize_errors(self):
|
||||
# GH14583: If meta keys are not always present
|
||||
# a new option to set errors='ignore' has been implemented
|
||||
i = {
|
||||
"Trades": [{
|
||||
"general": {
|
||||
"tradeid": 100,
|
||||
"trade_version": 1,
|
||||
"stocks": [{
|
||||
|
||||
"symbol": "AAPL",
|
||||
"name": "Apple",
|
||||
"price": "0"
|
||||
}, {
|
||||
"symbol": "GOOG",
|
||||
"name": "Google",
|
||||
"price": "0"
|
||||
}
|
||||
]
|
||||
}
|
||||
}, {
|
||||
"general": {
|
||||
"tradeid": 100,
|
||||
"stocks": [{
|
||||
"symbol": "AAPL",
|
||||
"name": "Apple",
|
||||
"price": "0"
|
||||
}, {
|
||||
"symbol": "GOOG",
|
||||
"name": "Google",
|
||||
"price": "0"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
j = json_normalize(data=i['Trades'],
|
||||
record_path=[['general', 'stocks']],
|
||||
meta=[['general', 'tradeid'],
|
||||
['general', 'trade_version']],
|
||||
errors='ignore')
|
||||
expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
|
||||
'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
|
||||
'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
|
||||
'price': {0: '0', 1: '0', 2: '0', 3: '0'},
|
||||
'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
|
||||
|
||||
assert j.fillna('').to_dict() == expected
|
||||
|
||||
msg = ("Try running with errors='ignore' as key 'trade_version'"
|
||||
" is not always present")
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
json_normalize(
|
||||
data=i['Trades'],
|
||||
record_path=[['general', 'stocks']],
|
||||
meta=[['general', 'tradeid'],
|
||||
['general', 'trade_version']],
|
||||
errors='raise')
|
||||
|
||||
def test_donot_drop_nonevalues(self):
|
||||
# GH21356
|
||||
data = [
|
||||
{'info': None,
|
||||
'author_name':
|
||||
{'first': 'Smith', 'last_name': 'Appleseed'}
|
||||
},
|
||||
{'info':
|
||||
{'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
|
||||
'author_name':
|
||||
{'first': 'Jane', 'last_name': 'Doe'}
|
||||
}
|
||||
]
|
||||
result = nested_to_record(data)
|
||||
expected = [
|
||||
{'info': None,
|
||||
'author_name.first': 'Smith',
|
||||
'author_name.last_name': 'Appleseed'},
|
||||
{'author_name.first': 'Jane',
|
||||
'author_name.last_name': 'Doe',
|
||||
'info.created_at': '11/08/1993',
|
||||
'info.last_updated': '26/05/2012'}]
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_nonetype_top_level_bottom_level(self):
|
||||
# GH21158: If inner level json has a key with a null value
|
||||
# make sure it doesnt do a new_d.pop twice and except
|
||||
data = {
|
||||
"id": None,
|
||||
"location": {
|
||||
"country": {
|
||||
"state": {
|
||||
"id": None,
|
||||
"town.info": {
|
||||
"id": None,
|
||||
"region": None,
|
||||
"x": 49.151580810546875,
|
||||
"y": -33.148521423339844,
|
||||
"z": 27.572303771972656}}}
|
||||
}
|
||||
}
|
||||
result = nested_to_record(data)
|
||||
expected = {
|
||||
'id': None,
|
||||
'location.country.state.id': None,
|
||||
'location.country.state.town.info.id': None,
|
||||
'location.country.state.town.info.region': None,
|
||||
'location.country.state.town.info.x': 49.151580810546875,
|
||||
'location.country.state.town.info.y': -33.148521423339844,
|
||||
'location.country.state.town.info.z': 27.572303771972656}
|
||||
assert result == expected
|
||||
|
||||
def test_nonetype_multiple_levels(self):
|
||||
# GH21158: If inner level json has a key with a null value
|
||||
# make sure it doesnt do a new_d.pop twice and except
|
||||
data = {
|
||||
"id": None,
|
||||
"location": {
|
||||
"id": None,
|
||||
"country": {
|
||||
"id": None,
|
||||
"state": {
|
||||
"id": None,
|
||||
"town.info": {
|
||||
"region": None,
|
||||
"x": 49.151580810546875,
|
||||
"y": -33.148521423339844,
|
||||
"z": 27.572303771972656}}}
|
||||
}
|
||||
}
|
||||
result = nested_to_record(data)
|
||||
expected = {
|
||||
'id': None,
|
||||
'location.id': None,
|
||||
'location.country.id': None,
|
||||
'location.country.state.id': None,
|
||||
'location.country.state.town.info.region': None,
|
||||
'location.country.state.town.info.x': 49.151580810546875,
|
||||
'location.country.state.town.info.y': -33.148521423339844,
|
||||
'location.country.state.town.info.z': 27.572303771972656}
|
||||
assert result == expected
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,172 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, read_json
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_frame_equal, assert_series_equal, ensure_clean)
|
||||
|
||||
from pandas.io.json.json import JsonReader
|
||||
|
||||
|
||||
@pytest.fixture
def lines_json_df():
    # Three-row frame serialized as line-delimited JSON records.
    frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    return frame.to_json(lines=True, orient="records")
def test_read_jsonl():
    # GH 9180: line-delimited JSON; key order may vary per line.
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    assert_frame_equal(result,
                       DataFrame([[1, 2], [1, 2]], columns=['a', 'b']))
def test_read_jsonl_unicode_chars():
    # GH 15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])

    # simulate file handle
    payload = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(StringIO(payload), lines=True)
    assert_frame_equal(result, expected)

    # simulate string
    payload = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(payload, lines=True)
    assert_frame_equal(result, expected)
def test_to_jsonl():
|
||||
# GH9180
|
||||
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
|
||||
result = df.to_json(orient="records", lines=True)
|
||||
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
|
||||
assert result == expected
|
||||
|
||||
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
|
||||
result = df.to_json(orient="records", lines=True)
|
||||
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
|
||||
assert result == expected
|
||||
assert_frame_equal(read_json(result, lines=True), df)
|
||||
|
||||
# GH15096: escaped characters in columns and data
|
||||
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
|
||||
columns=["a\\", 'b'])
|
||||
result = df.to_json(orient="records", lines=True)
|
||||
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
|
||||
'{"a\\\\":"foo\\"","b":"bar"}')
|
||||
assert result == expected
|
||||
assert_frame_equal(read_json(result, lines=True), df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
    # GH 17048: reading in chunks must give the same frame as a
    # one-shot read (and keeps memory usage bounded when lines=True).
    unchunked = read_json(StringIO(lines_json_df), lines=True)
    reader = read_json(StringIO(lines_json_df), lines=True,
                       chunksize=chunksize)
    assert_frame_equal(pd.concat(reader), unchunked)
def test_readjson_chunksize_requires_lines(lines_json_df):
    # chunksize only makes sense for line-delimited input.
    msg = "chunksize can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
def test_readjson_chunks_series():
    # Chunked line-JSON reading also works for Series (typ='Series').
    ser = pd.Series({'A': 1, 'B': 2})

    strio = StringIO(ser.to_json(lines=True, orient="records"))
    unchunked = pd.read_json(strio, lines=True, typ='Series')

    strio = StringIO(ser.to_json(lines=True, orient="records"))
    chunked = pd.concat(pd.read_json(
        strio, lines=True, typ='Series', chunksize=1
    ))

    assert_series_equal(chunked, unchunked)
def test_readjson_each_chunk(lines_json_df):
|
||||
# Other tests check that the final result of read_json(chunksize=True)
|
||||
# is correct. This checks the intermediate chunks.
|
||||
chunks = list(
|
||||
pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
|
||||
)
|
||||
assert chunks[0].shape == (2, 2)
|
||||
assert chunks[1].shape == (1, 2)
|
||||
|
||||
|
||||
def test_readjson_chunks_from_file():
    # Chunked and unchunked reads of the same on-disk file must agree.
    with ensure_clean('test.json') as path:
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        frame.to_json(path, lines=True, orient="records")
        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
        unchunked = pd.read_json(path, lines=True)
        assert_frame_equal(unchunked, chunked)
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    # After read() consumes the input, the reader must have closed its
    # underlying stream, in both chunked and unchunked mode.
    with ensure_clean('test.json') as path:
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        frame.to_json(path, lines=True, orient="records")
        reader = JsonReader(
            path, orient=None, typ="frame", dtype=True, convert_axes=True,
            convert_dates=True, keep_default_dates=True, numpy=False,
            precise_float=False, date_unit=None, encoding=None,
            lines=True, chunksize=chunksize, compression=None)
        reader.read()
        # BUG FIX: the original assert message used a backslash line
        # continuation *inside* the string literal, which embedded the
        # next line's source indentation into the message text.  Build
        # the message without a continuation instead.
        msg = "didn't close stream with chunksize = {chunksize}".format(
            chunksize=chunksize)
        assert reader.open_stream.closed, msg
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    # Non-positive and non-integer chunk sizes are rejected up front.
    msg = r"'chunksize' must be an integer >=1"
    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=True,
                     chunksize=chunksize)
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    # Runs of blank lines between records must be ignored in both the
    # chunked and unchunked code paths.
    j = """

{"A":1,"B":4}



{"A":2,"B":5}







{"A":3,"B":6}
"""
    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    test = pd.read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        test = pd.concat(test)
    tm.assert_frame_equal(
        orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize))
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user