pruned venvs

This commit is contained in:
d3m1g0d
2019-03-12 21:56:25 +01:00
parent 8ee094481c
commit 33f0511081
4095 changed files with 0 additions and 748399 deletions
@@ -1,90 +0,0 @@
import pytest
import pandas as pd
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_raises_regex
def test_compression_roundtrip(compression):
    # Round-trip a small frame through to_json/read_json for every
    # supported compression scheme and check nothing is lost.
    expected = pd.DataFrame(
        [[0.123456, 0.234567, 0.567567],
         [12.32112, 123123.2, 321321.2]],
        index=['A', 'B'], columns=['X', 'Y', 'Z'])
    with tm.ensure_clean() as path:
        expected.to_json(path, compression=compression)
        roundtripped = pd.read_json(path, compression=compression)
        assert_frame_equal(expected, roundtripped)
        # Explicitly ensure the file on disk really was compressed:
        # decompress it by hand and parse the raw JSON text.
        with tm.decompress_file(path, compression) as fh:
            raw_json = fh.read().decode('utf8')
            assert_frame_equal(expected, pd.read_json(raw_json))
def test_read_zipped_json(datapath):
    # Reading the zipped fixture with compression='zip' must yield the
    # same frame as reading its uncompressed twin.
    plain = pd.read_json(
        datapath("io", "json", "data", "tsframe_v012.json"))
    zipped = pd.read_json(
        datapath("io", "json", "data", "tsframe_v012.json.zip"),
        compression='zip')
    assert_frame_equal(plain, zipped)
def test_with_s3_url(compression):
    """Round-trip a compressed JSON file through a mocked S3 bucket."""
    # Skip unless the optional S3 stack (boto3 + s3fs + moto) is installed.
    boto3 = pytest.importorskip('boto3')
    pytest.importorskip('s3fs')
    moto = pytest.importorskip('moto')
    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with moto.mock_s3():
        conn = boto3.resource("s3", region_name="us-east-1")
        bucket = conn.create_bucket(Bucket="pandas-test")
        # Write the compressed JSON locally, then upload the raw bytes.
        with tm.ensure_clean() as path:
            df.to_json(path, compression=compression)
            with open(path, 'rb') as f:
                bucket.put_object(Key='test-1', Body=f)
        # read_json must transparently fetch from s3:// and decompress.
        roundtripped_df = pd.read_json('s3://pandas-test/test-1',
                                       compression=compression)
        assert_frame_equal(df, roundtripped_df)
def test_lines_with_compression(compression):
    # lines=True (JSON Lines) output must round-trip under every
    # supported compression scheme.
    original = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with tm.ensure_clean() as path:
        original.to_json(path, orient='records', lines=True,
                         compression=compression)
        result = pd.read_json(path, lines=True, compression=compression)
        assert_frame_equal(original, result)
def test_chunksize_with_compression(compression):
    # Chunked (iterator) reading must cooperate with compression:
    # concatenating the chunks rebuilds the original frame.
    original = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
    with tm.ensure_clean() as path:
        original.to_json(path, orient='records', lines=True,
                         compression=compression)
        reader = pd.read_json(path, lines=True, chunksize=1,
                              compression=compression)
        assert_frame_equal(original, pd.concat(reader))
def test_write_unsupported_compression_type():
    """``to_json`` must reject an unknown ``compression`` argument."""
    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        # pytest.raises(match=...) replaces the deprecated
        # tm.assert_raises_regex helper (removed from pandas.util.testing).
        with pytest.raises(ValueError, match=msg):
            df.to_json(path, compression="unsupported")
def test_read_unsupported_compression_type():
    """``read_json`` must reject an unknown ``compression`` argument."""
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        # pytest.raises(match=...) replaces the deprecated
        # tm.assert_raises_regex helper (removed from pandas.util.testing).
        with pytest.raises(ValueError, match=msg):
            pd.read_json(path, compression="unsupported")
@@ -1,575 +0,0 @@
"""Tests for Table Schema integration."""
import json
from collections import OrderedDict
import numpy as np
import pandas as pd
import pytest
from pandas import DataFrame
from pandas.core.dtypes.dtypes import (
PeriodDtype, CategoricalDtype, DatetimeTZDtype)
from pandas.io.json.table_schema import (
as_json_table_type,
build_table_schema,
convert_pandas_type_to_json_field,
convert_json_field_to_pandas_type,
set_default_names)
import pandas.util.testing as tm
class TestBuildSchema(object):
    """Tests for ``build_table_schema`` on frames, Series and MultiIndex."""
    def setup_method(self, method):
        # One column per basic Table Schema field type: integer, string,
        # datetime and timedelta (serialized as "duration").
        self.df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             'D': pd.timedelta_range('1H', periods=4, freq='T'),
             },
            index=pd.Index(range(4), name='idx'))
    def test_build_table_schema(self):
        result = build_table_schema(self.df, version=False)
        expected = {
            'fields': [{'name': 'idx', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['idx']
        }
        assert result == expected
        # With the default version=True a 'pandas_version' key is added.
        result = build_table_schema(self.df)
        assert "pandas_version" in result
    def test_series(self):
        s = pd.Series([1, 2, 3], name='foo')
        result = build_table_schema(s, version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'foo', 'type': 'integer'}],
                    'primaryKey': ['index']}
        assert result == expected
        result = build_table_schema(s)
        assert 'pandas_version' in result
    def test_series_unnamed(self):
        # An unnamed Series gets the default field name 'values'.
        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'values', 'type': 'integer'}],
                    'primaryKey': ['index']}
        assert result == expected
    def test_multiindex(self):
        # Unnamed MultiIndex levels become 'level_0', 'level_1', ...;
        # named levels keep their names.
        df = self.df.copy()
        idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
        df.index = idx
        result = build_table_schema(df, version=False)
        expected = {
            'fields': [{'name': 'level_0', 'type': 'string'},
                       {'name': 'level_1', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['level_0', 'level_1']
        }
        assert result == expected
        df.index.names = ['idx0', None]
        expected['fields'][0]['name'] = 'idx0'
        expected['primaryKey'] = ['idx0', 'level_1']
        result = build_table_schema(df, version=False)
        assert result == expected
class TestTableSchemaType(object):
    """Tests mapping pandas/numpy data and dtypes to Table Schema types.

    The removed numpy aliases ``np.int`` / ``np.float`` / ``np.bool``
    (deprecated in NumPy 1.20, removed in 1.24) are replaced here by the
    objects they aliased (the builtins) plus ``np.bool_`` for the numpy
    boolean scalar type, so the parametrizations import cleanly on
    modern NumPy while exercising the same dtypes.
    """
    # ----
    # data
    # ----
    @pytest.mark.parametrize('int_type', [
        int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_data(self, int_type):
        int_data = [1, 2, 3]
        assert as_json_table_type(np.array(
            int_data, dtype=int_type)) == 'integer'
    @pytest.mark.parametrize('float_type', [
        float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_data(self, float_type):
        float_data = [1., 2., 3.]
        assert as_json_table_type(np.array(
            float_data, dtype=float_type)) == 'number'
    @pytest.mark.parametrize('bool_type', [bool, np.bool_])
    def test_as_json_table_type_bool_data(self, bool_type):
        bool_data = [True, False]
        assert as_json_table_type(np.array(
            bool_data, dtype=bool_type)) == 'boolean'
    @pytest.mark.parametrize('date_data', [
        pd.to_datetime(['2016']),
        pd.to_datetime(['2016'], utc=True),
        pd.Series(pd.to_datetime(['2016'])),
        pd.Series(pd.to_datetime(['2016'], utc=True)),
        pd.period_range('2016', freq='A', periods=3)
    ])
    def test_as_json_table_type_date_data(self, date_data):
        assert as_json_table_type(date_data) == 'datetime'
    @pytest.mark.parametrize('str_data', [
        pd.Series(['a', 'b']), pd.Index(['a', 'b'])])
    def test_as_json_table_type_string_data(self, str_data):
        assert as_json_table_type(str_data) == 'string'
    @pytest.mark.parametrize('cat_data', [
        pd.Categorical(['a']),
        pd.Categorical([1]),
        pd.Series(pd.Categorical([1])),
        pd.CategoricalIndex([1]),
        pd.Categorical([1])])
    def test_as_json_table_type_categorical_data(self, cat_data):
        # Categoricals map to the catch-all 'any' type.
        assert as_json_table_type(cat_data) == 'any'
    # ------
    # dtypes
    # ------
    @pytest.mark.parametrize('int_dtype', [
        int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_dtypes(self, int_dtype):
        assert as_json_table_type(int_dtype) == 'integer'
    @pytest.mark.parametrize('float_dtype', [
        float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_dtypes(self, float_dtype):
        assert as_json_table_type(float_dtype) == 'number'
    @pytest.mark.parametrize('bool_dtype', [bool, np.bool_])
    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
        assert as_json_table_type(bool_dtype) == 'boolean'
    @pytest.mark.parametrize('date_dtype', [
        np.datetime64, np.dtype("<M8[ns]"), PeriodDtype(),
        DatetimeTZDtype('ns', 'US/Central')])
    def test_as_json_table_type_date_dtypes(self, date_dtype):
        # TODO: datedate.date? datetime.time?
        assert as_json_table_type(date_dtype) == 'datetime'
    @pytest.mark.parametrize('td_dtype', [
        np.timedelta64, np.dtype("<m8[ns]")])
    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
        assert as_json_table_type(td_dtype) == 'duration'
    @pytest.mark.parametrize('str_dtype', [object])  # TODO
    def test_as_json_table_type_string_dtypes(self, str_dtype):
        assert as_json_table_type(str_dtype) == 'string'
    def test_as_json_table_type_categorical_dtypes(self):
        # TODO: I think before is_categorical_dtype(Categorical)
        # returned True, but now it's False. Figure out why or
        # if it matters
        assert as_json_table_type(pd.Categorical(['a'])) == 'any'
        assert as_json_table_type(CategoricalDtype()) == 'any'
class TestTableOrient(object):
    """Round-trip tests for ``orient='table'`` JSON output.

    The ``expected`` literals below pin the exact Table Schema payload
    (fields, primaryKey, ISO-8601 timestamps) that ``to_json`` must emit.
    """
    def setup_method(self, method):
        # One column per supported field type, including ordered/unordered
        # categoricals and a tz-aware datetime column.
        self.df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             'D': pd.timedelta_range('1H', periods=4, freq='T'),
             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                           ordered=True)),
             'G': [1., 2., 3, 4.],
             'H': pd.date_range('2016-01-01', freq='d', periods=4,
                                tz='US/Central'),
             },
            index=pd.Index(range(4), name='idx'))
    def test_build_series(self):
        s = pd.Series([1, 2], name='a')
        s.index.name = 'id'
        result = s.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        assert "pandas_version" in result['schema']
        result['schema'].pop('pandas_version')
        fields = [{'name': 'id', 'type': 'integer'},
                  {'name': 'a', 'type': 'integer'}]
        schema = {
            'fields': fields,
            'primaryKey': ['id'],
        }
        expected = OrderedDict([
            ('schema', schema),
            ('data', [OrderedDict([('id', 0), ('a', 1)]),
                      OrderedDict([('id', 1), ('a', 2)])])])
        assert result == expected
    def test_to_json(self):
        df = self.df.copy()
        df.index.name = 'idx'
        result = df.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        assert "pandas_version" in result['schema']
        result['schema'].pop('pandas_version')
        fields = [
            {'name': 'idx', 'type': 'integer'},
            {'name': 'A', 'type': 'integer'},
            {'name': 'B', 'type': 'string'},
            {'name': 'C', 'type': 'datetime'},
            {'name': 'D', 'type': 'duration'},
            {'constraints': {'enum': ['a', 'b', 'c']},
             'name': 'E',
             'ordered': False,
             'type': 'any'},
            {'constraints': {'enum': ['a', 'b', 'c']},
             'name': 'F',
             'ordered': True,
             'type': 'any'},
            {'name': 'G', 'type': 'number'},
            {'name': 'H', 'type': 'datetime', 'tz': 'US/Central'}
        ]
        schema = {
            'fields': fields,
            'primaryKey': ['idx'],
        }
        # NOTE: the tz-aware column H is serialized in UTC, hence the
        # 06:00:00Z values for US/Central midnights.
        data = [
            OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
                         ('C', '2016-01-01T00:00:00.000Z'),
                         ('D', 'P0DT1H0M0S'),
                         ('E', 'a'), ('F', 'a'), ('G', 1.),
                         ('H', '2016-01-01T06:00:00.000Z')
                         ]),
            OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
                         ('C', '2016-01-02T00:00:00.000Z'),
                         ('D', 'P0DT1H1M0S'),
                         ('E', 'b'), ('F', 'b'), ('G', 2.),
                         ('H', '2016-01-02T06:00:00.000Z')
                         ]),
            OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
                         ('C', '2016-01-03T00:00:00.000Z'),
                         ('D', 'P0DT1H2M0S'),
                         ('E', 'c'), ('F', 'c'), ('G', 3.),
                         ('H', '2016-01-03T06:00:00.000Z')
                         ]),
            OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
                         ('C', '2016-01-04T00:00:00.000Z'),
                         ('D', 'P0DT1H3M0S'),
                         ('E', 'c'), ('F', 'c'), ('G', 4.),
                         ('H', '2016-01-04T06:00:00.000Z')
                         ]),
        ]
        expected = OrderedDict([('schema', schema), ('data', data)])
        assert result == expected
    def test_to_json_float_index(self):
        data = pd.Series(1, index=[1., 2.])
        result = data.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')
        expected = (
            OrderedDict([('schema', {
                'fields': [{'name': 'index', 'type': 'number'},
                           {'name': 'values', 'type': 'integer'}],
                'primaryKey': ['index']
            }),
                ('data', [OrderedDict([('index', 1.0), ('values', 1)]),
                          OrderedDict([('index', 2.0), ('values', 1)])])])
        )
        assert result == expected
    def test_to_json_period_index(self):
        # Periods serialize as their start timestamps, with the freq
        # recorded on the field.
        idx = pd.period_range('2016', freq='Q-JAN', periods=2)
        data = pd.Series(1, idx)
        result = data.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')
        fields = [{'freq': 'Q-JAN', 'name': 'index', 'type': 'datetime'},
                  {'name': 'values', 'type': 'integer'}]
        schema = {'fields': fields, 'primaryKey': ['index']}
        data = [OrderedDict([('index', '2015-11-01T00:00:00.000Z'),
                             ('values', 1)]),
                OrderedDict([('index', '2016-02-01T00:00:00.000Z'),
                             ('values', 1)])]
        expected = OrderedDict([('schema', schema), ('data', data)])
        assert result == expected
    def test_to_json_categorical_index(self):
        data = pd.Series(1, pd.CategoricalIndex(['a', 'b']))
        result = data.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')
        expected = (
            OrderedDict([('schema',
                          {'fields': [{'name': 'index', 'type': 'any',
                                       'constraints': {'enum': ['a', 'b']},
                                       'ordered': False},
                                      {'name': 'values', 'type': 'integer'}],
                           'primaryKey': ['index']}),
                         ('data', [
                             OrderedDict([('index', 'a'),
                                          ('values', 1)]),
                             OrderedDict([('index', 'b'), ('values', 1)])])])
        )
        assert result == expected
    def test_date_format_raises(self):
        # orient='table' requires ISO dates; 'epoch' must be rejected.
        with pytest.raises(ValueError):
            self.df.to_json(orient='table', date_format='epoch')
        # others work
        self.df.to_json(orient='table', date_format='iso')
        self.df.to_json(orient='table')
    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_int(self, kind):
        data = [1, 2, 3]
        result = convert_pandas_type_to_json_field(kind(data, name='name'))
        expected = {"name": "name", "type": "integer"}
        assert result == expected
    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_float(self, kind):
        data = [1., 2., 3.]
        result = convert_pandas_type_to_json_field(kind(data, name='name'))
        expected = {"name": "name", "type": "number"}
        assert result == expected
    @pytest.mark.parametrize('dt_args,extra_exp', [
        ({}, {}), ({'utc': True}, {'tz': 'UTC'})])
    @pytest.mark.parametrize('wrapper', [None, pd.Series])
    def test_convert_pandas_type_to_json_field_datetime(self, dt_args,
                                                        extra_exp, wrapper):
        data = [1., 2., 3.]
        data = pd.to_datetime(data, **dt_args)
        if wrapper is pd.Series:
            data = pd.Series(data, name='values')
        result = convert_pandas_type_to_json_field(data)
        expected = {"name": "values", "type": 'datetime'}
        expected.update(extra_exp)
        assert result == expected
    def test_convert_pandas_type_to_json_period_range(self):
        arr = pd.period_range('2016', freq='A-DEC', periods=4)
        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "values", "type": 'datetime', "freq": "A-DEC"}
        assert result == expected
    @pytest.mark.parametrize('kind', [pd.Categorical, pd.CategoricalIndex])
    @pytest.mark.parametrize('ordered', [True, False])
    def test_convert_pandas_type_to_json_field_categorical(self, kind,
                                                           ordered):
        data = ['a', 'b', 'c']
        if kind is pd.Categorical:
            arr = pd.Series(kind(data, ordered=ordered), name='cats')
        elif kind is pd.CategoricalIndex:
            arr = kind(data, ordered=ordered, name='cats')
        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "cats", "type": "any",
                    "constraints": {"enum": data},
                    "ordered": ordered}
        assert result == expected
    @pytest.mark.parametrize("inp,exp", [
        ({'type': 'integer'}, 'int64'),
        ({'type': 'number'}, 'float64'),
        ({'type': 'boolean'}, 'bool'),
        ({'type': 'duration'}, 'timedelta64'),
        ({'type': 'datetime'}, 'datetime64[ns]'),
        ({'type': 'datetime', 'tz': 'US/Hawaii'}, 'datetime64[ns, US/Hawaii]'),
        ({'type': 'any'}, 'object'),
        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
          'ordered': False}, CategoricalDtype(categories=['a', 'b', 'c'],
                                              ordered=False)),
        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
          'ordered': True}, CategoricalDtype(categories=['a', 'b', 'c'],
                                             ordered=True)),
        ({'type': 'string'}, 'object')])
    def test_convert_json_field_to_pandas_type(self, inp, exp):
        field = {'name': 'foo'}
        field.update(inp)
        assert convert_json_field_to_pandas_type(field) == exp
    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
    def test_convert_json_field_to_pandas_type_raises(self, inp):
        field = {'type': inp}
        with tm.assert_raises_regex(ValueError, "Unsupported or invalid field "
                                                "type: {}".format(inp)):
            convert_json_field_to_pandas_type(field)
    def test_categorical(self):
        s = pd.Series(pd.Categorical(['a', 'b', 'a']))
        s.index.name = 'idx'
        result = s.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')
        fields = [{'name': 'idx', 'type': 'integer'},
                  {'constraints': {'enum': ['a', 'b']},
                   'name': 'values',
                   'ordered': False,
                   'type': 'any'}]
        expected = OrderedDict([
            ('schema', {'fields': fields,
                        'primaryKey': ['idx']}),
            ('data', [OrderedDict([('idx', 0), ('values', 'a')]),
                      OrderedDict([('idx', 1), ('values', 'b')]),
                      OrderedDict([('idx', 2), ('values', 'a')])])])
        assert result == expected
    @pytest.mark.parametrize('idx,nm,prop', [
        (pd.Index([1]), 'index', 'name'),
        (pd.Index([1], name='myname'), 'myname', 'name'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')]),
         ['level_0', 'level_1'], 'names'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
                                    names=['n1', 'n2']),
         ['n1', 'n2'], 'names'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
                                    names=['n1', None]),
         ['n1', 'level_1'], 'names')
    ])
    def test_set_names_unset(self, idx, nm, prop):
        # set_default_names fills in default index names where missing.
        data = pd.Series(1, idx)
        result = set_default_names(data)
        assert getattr(result.index, prop) == nm
    @pytest.mark.parametrize("idx", [
        pd.Index([], name='index'),
        pd.MultiIndex.from_arrays([['foo'], ['bar']],
                                  names=('level_0', 'level_1')),
        pd.MultiIndex.from_arrays([['foo'], ['bar']],
                                  names=('foo', 'level_1'))
    ])
    def test_warns_non_roundtrippable_names(self, idx):
        # GH 19130
        df = pd.DataFrame([[]], index=idx)
        df.index.name = 'index'
        with tm.assert_produces_warning():
            set_default_names(df)
    def test_timestamp_in_columns(self):
        df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'),
                                             pd.Timedelta(10, unit='s')])
        result = df.to_json(orient="table")
        js = json.loads(result)
        # Timestamp/Timedelta column labels serialize as epoch-ms / ms.
        assert js['schema']['fields'][1]['name'] == 1451606400000
        assert js['schema']['fields'][2]['name'] == 10000
    @pytest.mark.parametrize('case', [
        pd.Series([1], index=pd.Index([1], name='a'), name='a'),
        pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
        pd.DataFrame({"A": [1]}, index=pd.MultiIndex.from_arrays([
            ['a'], [1]], names=["A", "a"]))
    ])
    def test_overlapping_names(self, case):
        # Index and column names sharing a name cannot round-trip.
        with tm.assert_raises_regex(ValueError, 'Overlapping'):
            case.to_json(orient='table')
    def test_mi_falsey_name(self):
        # GH 16203
        df = pd.DataFrame(np.random.randn(4, 4),
                          index=pd.MultiIndex.from_product([('A', 'B'),
                                                            ('a', 'b')]))
        result = [x['name'] for x in build_table_schema(df)['fields']]
        assert result == ['level_0', 'level_1', 0, 1, 2, 3]
class TestTableOrientReader(object):
    """Tests for reading ``orient='table'`` output back with read_json."""
    @pytest.mark.parametrize("index_nm", [
        None, "idx", pytest.param("index", marks=pytest.mark.xfail),
        'level_0'])
    @pytest.mark.parametrize("vals", [
        {'ints': [1, 2, 3, 4]},
        {'objects': ['a', 'b', 'c', 'd']},
        {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)},
        {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))},
        {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                                  ordered=True))},
        pytest.param({'floats': [1., 2., 3., 4.]}, marks=pytest.mark.xfail),
        {'floats': [1.1, 2.2, 3.3, 4.4]},
        {'bools': [True, False, False, True]}])
    def test_read_json_table_orient(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)
    @pytest.mark.parametrize("index_nm", [
        None, "idx", "index"])
    @pytest.mark.parametrize("vals", [
        {'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
        {'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
                                    tz='US/Central')}])
    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
        # timedelta / tz-aware columns cannot be read back yet.
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '):
            pd.read_json(out, orient="table")
    def test_comprehensive(self):
        # Columns D and H are commented out: timedeltas and tz-aware
        # datetimes are not yet readable (see the raises test above).
        df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                           ordered=True)),
             'G': [1.1, 2.2, 3.3, 4.4],
             # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
             #                    tz='US/Central'),
             'I': [True, False, False, True],
             },
            index=pd.Index(range(4), name='idx'))
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)
    @pytest.mark.parametrize("index_names", [
        [None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
        ['index', 'foo']])
    def test_multiindex(self, index_names):
        # GH 18912
        df = pd.DataFrame(
            [["Arr", "alpha", [1, 2, 3, 4]],
             ["Bee", "Beta", [10, 20, 30, 40]]],
            index=[["A", "B"], ["Null", "Eins"]],
            columns=["Aussprache", "Griechisch", "Args"]
        )
        df.index.names = index_names
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)
    @pytest.mark.parametrize("strict_check", [
        pytest.param(True, marks=pytest.mark.xfail), False])
    def test_empty_frame_roundtrip(self, strict_check):
        # GH 21287
        df = pd.DataFrame([], columns=['a', 'b', 'c'])
        expected = df.copy()
        out = df.to_json(orient='table')
        result = pd.read_json(out, orient='table')
        # TODO: When DF coercion issue (#21345) is resolved tighten type checks
        tm.assert_frame_equal(expected, result,
                              check_dtype=strict_check,
                              check_index_type=strict_check)
@@ -1,442 +0,0 @@
import pytest
import numpy as np
import json
import pandas.util.testing as tm
from pandas import compat, Index, DataFrame
from pandas.io.json import json_normalize
from pandas.io.json.normalize import nested_to_record
@pytest.fixture
def deep_nested():
    """Two countries -> states -> cities: deeply nested record data."""
    # deeply nested data
    return [{'country': 'USA',
             'states': [{'name': 'California',
                         'cities': [{'name': 'San Francisco',
                                     'pop': 12345},
                                    {'name': 'Los Angeles',
                                     'pop': 12346}]
                         },
                        {'name': 'Ohio',
                         'cities': [{'name': 'Columbus',
                                     'pop': 1234},
                                    {'name': 'Cleveland',
                                     'pop': 1236}]}
                        ]
             },
            {'country': 'Germany',
             'states': [{'name': 'Bayern',
                         'cities': [{'name': 'Munich', 'pop': 12347}]
                         },
                        {'name': 'Nordrhein-Westfalen',
                         'cities': [{'name': 'Duesseldorf', 'pop': 1238},
                                    {'name': 'Koeln', 'pop': 1239}]}
                        ]
             }
            ]
@pytest.fixture
def state_data():
    """Two US states with nested county records and state-level metadata."""
    return [
        {'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000},
                      {'name': 'Palm Beach', 'population': 60000}],
         'info': {'governor': 'Rick Scott'},
         'shortname': 'FL',
         'state': 'Florida'},
        {'counties': [{'name': 'Summit', 'population': 1234},
                      {'name': 'Cuyahoga', 'population': 1337}],
         'info': {'governor': 'John Kasich'},
         'shortname': 'OH',
         'state': 'Ohio'}]
@pytest.fixture
def author_missing_data():
    """Records where the first entry has a null nested field (GH20030)."""
    return [
        {'info': None},
        {'info':
         {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
         'author_name':
         {'first': 'Jane', 'last_name': 'Doe'}
         }]
class TestJSONNormalize(object):
    """Tests for ``json_normalize`` (record_path / meta / prefixes / sep)."""
    def test_simple_records(self):
        recs = [{'a': 1, 'b': 2, 'c': 3},
                {'a': 4, 'b': 5, 'c': 6},
                {'a': 7, 'b': 8, 'c': 9},
                {'a': 10, 'b': 11, 'c': 12}]
        result = json_normalize(recs)
        expected = DataFrame(recs)
        tm.assert_frame_equal(result, expected)
    def test_simple_normalize(self, state_data):
        result = json_normalize(state_data[0], 'counties')
        expected = DataFrame(state_data[0]['counties'])
        tm.assert_frame_equal(result, expected)
        result = json_normalize(state_data, 'counties')
        expected = []
        for rec in state_data:
            expected.extend(rec['counties'])
        expected = DataFrame(expected)
        tm.assert_frame_equal(result, expected)
        # meta='state' repeats the state name across its county rows.
        result = json_normalize(state_data, 'counties', meta='state')
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
        tm.assert_frame_equal(result, expected)
    def test_empty_array(self):
        result = json_normalize([])
        expected = DataFrame()
        tm.assert_frame_equal(result, expected)
    def test_simple_normalize_with_separator(self, deep_nested):
        # GH 14883
        result = json_normalize({'A': {'A': 1, 'B': 2}})
        expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)
        result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
        expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)
        # sep may be any string, including non-ascii.
        result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
        expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)
        result = json_normalize(deep_nested, ['states', 'cities'],
                                meta=['country', ['states', 'name']],
                                sep='_')
        expected = Index(['name', 'pop',
                          'country', 'states_name']).sort_values()
        assert result.columns.sort_values().equals(expected)
    def test_value_array_record_prefix(self):
        # GH 21536
        result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
        expected = DataFrame([[1], [2]], columns=['Prefix.0'])
        tm.assert_frame_equal(result, expected)
    def test_more_deeply_nested(self, deep_nested):
        result = json_normalize(deep_nested, ['states', 'cities'],
                                meta=['country', ['states', 'name']])
        # meta_prefix={'states': 'state_'})
        ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
                   'states.name': ['California', 'California', 'Ohio', 'Ohio',
                                   'Bayern', 'Nordrhein-Westfalen',
                                   'Nordrhein-Westfalen'],
                   'name': ['San Francisco', 'Los Angeles', 'Columbus',
                            'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
                   'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}
        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)
    def test_shallow_nested(self):
        data = [{'state': 'Florida',
                 'shortname': 'FL',
                 'info': {
                     'governor': 'Rick Scott'
                 },
                 'counties': [{'name': 'Dade', 'population': 12345},
                              {'name': 'Broward', 'population': 40000},
                              {'name': 'Palm Beach', 'population': 60000}]},
                {'state': 'Ohio',
                 'shortname': 'OH',
                 'info': {
                     'governor': 'John Kasich'
                 },
                 'counties': [{'name': 'Summit', 'population': 1234},
                              {'name': 'Cuyahoga', 'population': 1337}]}]
        result = json_normalize(data, 'counties',
                                ['state', 'shortname',
                                 ['info', 'governor']])
        ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
                            'Cuyahoga'],
                   'state': ['Florida'] * 3 + ['Ohio'] * 2,
                   'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
                   'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
                   'population': [12345, 40000, 60000, 1234, 1337]}
        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)
    def test_meta_name_conflict(self):
        # A meta key colliding with a record key raises unless a
        # meta_prefix disambiguates the columns.
        data = [{'foo': 'hello',
                 'bar': 'there',
                 'data': [{'foo': 'something', 'bar': 'else'},
                          {'foo': 'something2', 'bar': 'else2'}]}]
        with pytest.raises(ValueError):
            json_normalize(data, 'data', meta=['foo', 'bar'])
        result = json_normalize(data, 'data', meta=['foo', 'bar'],
                                meta_prefix='meta')
        for val in ['metafoo', 'metabar', 'foo', 'bar']:
            assert val in result
    def test_meta_parameter_not_modified(self):
        # GH 18610
        data = [{'foo': 'hello',
                 'bar': 'there',
                 'data': [{'foo': 'something', 'bar': 'else'},
                          {'foo': 'something2', 'bar': 'else2'}]}]
        COLUMNS = ['foo', 'bar']
        result = json_normalize(data, 'data', meta=COLUMNS,
                                meta_prefix='meta')
        # The caller's meta list must not be mutated in place.
        assert COLUMNS == ['foo', 'bar']
        for val in ['metafoo', 'metabar', 'foo', 'bar']:
            assert val in result
    def test_record_prefix(self, state_data):
        result = json_normalize(state_data[0], 'counties')
        expected = DataFrame(state_data[0]['counties'])
        tm.assert_frame_equal(result, expected)
        result = json_normalize(state_data, 'counties',
                                meta='state',
                                record_prefix='county_')
        expected = []
        for rec in state_data:
            expected.extend(rec['counties'])
        expected = DataFrame(expected)
        expected = expected.rename(columns=lambda x: 'county_' + x)
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
        tm.assert_frame_equal(result, expected)
    def test_non_ascii_key(self):
        # Byte literals decode to the same unicode key on both Pythons.
        if compat.PY3:
            testjson = (
                b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
                b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
            ).decode('utf8')
        else:
            testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
                        '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')
        testdata = {
            u'sub.A': [1, 3],
            u'sub.B': [2, 4],
            b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
        }
        expected = DataFrame(testdata)
        result = json_normalize(json.loads(testjson))
        tm.assert_frame_equal(result, expected)
    def test_missing_field(self, author_missing_data):
        # GH20030:
        result = json_normalize(author_missing_data)
        ex_data = [
            {'info': np.nan,
             'author_name.first': np.nan,
             'author_name.last_name': np.nan,
             'info.created_at': np.nan,
             'info.last_updated': np.nan},
            {'info': None,
             'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}
        ]
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)
class TestNestedToRecord(object):
    """Tests for the ``nested_to_record`` flattening helper."""
    def test_flat_stays_flat(self):
        recs = [dict(flat1=1, flat2=2),
                dict(flat1=3, flat2=4),
                ]
        result = nested_to_record(recs)
        expected = recs
        assert result == expected
    def test_one_level_deep_flattens(self):
        data = dict(flat1=1,
                    dict1=dict(c=1, d=2))
        result = nested_to_record(data)
        # Nested keys are joined with '.' into flat keys.
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1}
        assert result == expected
    def test_nested_flattens(self):
        data = dict(flat1=1,
                    dict1=dict(c=1, d=2),
                    nested=dict(e=dict(c=1, d=2),
                                d=2))
        result = nested_to_record(data)
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1,
                    'nested.d': 2,
                    'nested.e.c': 1,
                    'nested.e.d': 2}
        assert result == expected
    def test_json_normalize_errors(self):
        # GH14583: If meta keys are not always present
        # a new option to set errors='ignore' has been implemented
        i = {
            "Trades": [{
                "general": {
                    "tradeid": 100,
                    "trade_version": 1,
                    "stocks": [{
                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }
                    ]
                }
            }, {
                "general": {
                    "tradeid": 100,
                    "stocks": [{
                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }
                    ]
                }
            }
            ]
        }
        # errors='ignore' leaves NaN where 'trade_version' is missing.
        j = json_normalize(data=i['Trades'],
                           record_path=[['general', 'stocks']],
                           meta=[['general', 'tradeid'],
                                 ['general', 'trade_version']],
                           errors='ignore')
        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
        assert j.fillna('').to_dict() == expected
        # errors='raise' (the default) fails on the missing meta key.
        pytest.raises(KeyError,
                      json_normalize, data=i['Trades'],
                      record_path=[['general', 'stocks']],
                      meta=[['general', 'tradeid'],
                            ['general', 'trade_version']],
                      errors='raise'
                      )
    def test_donot_drop_nonevalues(self):
        # GH21356
        data = [
            {'info': None,
             'author_name':
             {'first': 'Smith', 'last_name': 'Appleseed'}
             },
            {'info':
                {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
             'author_name':
             {'first': 'Jane', 'last_name': 'Doe'}
             }
        ]
        result = nested_to_record(data)
        expected = [
            {'info': None,
             'author_name.first': 'Smith',
             'author_name.last_name': 'Appleseed'},
            {'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}]
        assert result == expected
    def test_nonetype_top_level_bottom_level(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it doesnt do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "country": {
                    "state": {
                        "id": None,
                        "town.info": {
                            "id": None,
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656}}}
            }
        }
        result = nested_to_record(data)
        expected = {
            'id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656}
        assert result == expected
    def test_nonetype_multiple_levels(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it doesnt do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "id": None,
                "country": {
                    "id": None,
                    "state": {
                        "id": None,
                        "town.info": {
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656}}}
            }
        }
        result = nested_to_record(data)
        expected = {
            'id': None,
            'location.id': None,
            'location.country.id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656}
        assert result == expected
File diff suppressed because it is too large Load Diff
@@ -1,169 +0,0 @@
# -*- coding: utf-8 -*-
import pytest
import pandas as pd
from pandas import DataFrame, read_json
from pandas.compat import StringIO
from pandas.io.json.json import JsonReader
import pandas.util.testing as tm
from pandas.util.testing import (assert_frame_equal, assert_series_equal,
ensure_clean)
@pytest.fixture
def lines_json_df():
    """Return a small two-column DataFrame serialized as line-delimited JSON."""
    frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    return frame.to_json(lines=True, orient="records")
def test_read_jsonl():
    # GH9180: line-delimited JSON parses record-by-record, keeping the
    # column order of the first record even when later keys are shuffled
    expected = DataFrame({'a': [1, 1], 'b': [2, 2]})
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    assert_frame_equal(result, expected)
def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK
    payload = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    # exercise both input forms: a simulated file handle and a plain string
    for source in (StringIO(payload), payload):
        assert_frame_equal(read_json(source, lines=True), expected)
def test_to_jsonl():
# GH9180
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
assert result == expected
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
assert result == expected
assert_frame_equal(read_json(result, lines=True), df)
# GH15096: escaped characters in columns and data
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
columns=["a\\", 'b'])
result = df.to_json(orient="records", lines=True)
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
'{"a\\\\":"foo\\"","b":"bar"}')
assert result == expected
assert_frame_equal(read_json(result, lines=True), df)
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
    # GH17048: memory usage when lines=True.
    # The concatenation of all chunks produced by read_json(chunksize=...)
    # must equal a single unchunked read of the same input.
    baseline = read_json(StringIO(lines_json_df), lines=True)
    pieces = read_json(StringIO(lines_json_df), lines=True,
                       chunksize=chunksize)
    assert_frame_equal(pd.concat(pieces), baseline)
def test_readjson_chunksize_requires_lines(lines_json_df):
    """Passing chunksize while lines=False must raise ValueError.

    chunksize only makes sense for line-delimited JSON, where the input
    can be split into record-sized pieces.
    """
    msg = "chunksize can only be passed if lines=True"
    # pytest.raises(match=...) is the idiomatic replacement for the
    # deprecated tm.assert_raises_regex helper; both use re.search on
    # the exception message
    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
def test_readjson_chunks_series():
    # Reading line-format JSON into a Series must honour chunksize:
    # concatenating the chunks reproduces the unchunked read.
    ser = pd.Series({'A': 1, 'B': 2})
    unchunked = pd.read_json(
        StringIO(ser.to_json(lines=True, orient="records")),
        lines=True, typ='Series')
    reader = pd.read_json(
        StringIO(ser.to_json(lines=True, orient="records")),
        lines=True, typ='Series', chunksize=1)
    assert_series_equal(pd.concat(reader), unchunked)
def test_readjson_each_chunk(lines_json_df):
# Other tests check that the final result of read_json(chunksize=True)
# is correct. This checks the intermediate chunks.
chunks = list(
pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
)
assert chunks[0].shape == (2, 2)
assert chunks[1].shape == (1, 2)
def test_readjson_chunks_from_file():
    # Chunked and unchunked reads of the same on-disk file must agree.
    with ensure_clean('test.json') as path:
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        frame.to_json(path, lines=True, orient="records")
        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
        assert_frame_equal(pd.read_json(path, lines=True), chunked)
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    # JsonReader must close the file handle it opened once read()
    # finishes, both for a one-shot read (chunksize=None) and a
    # chunked read (chunksize=1).
    with ensure_clean('test.json') as path:
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        # construct JsonReader directly (instead of going through
        # read_json) so the test can inspect the reader's underlying
        # open_stream attribute after the read completes
        reader = JsonReader(
            path, orient=None, typ="frame", dtype=True, convert_axes=True,
            convert_dates=True, keep_default_dates=True, numpy=False,
            precise_float=False, date_unit=None, encoding=None,
            lines=True, chunksize=chunksize, compression=None)
        reader.read()
        assert reader.open_stream.closed, "didn't close stream with \
            chunksize = {chunksize}".format(chunksize=chunksize)
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    """Non-positive, fractional, or non-numeric chunksize must raise."""
    msg = r"'chunksize' must be an integer >=1"
    # pytest.raises(match=...) is the idiomatic replacement for the
    # deprecated tm.assert_raises_regex helper; both use re.search on
    # the exception message
    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=True,
                     chunksize=chunksize)
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    """Blank lines between records must be skipped, chunked or not.

    The empty lines inside the payload are the point of this test (see
    its name): the line reader has to ignore them both in a one-shot
    read and when iterating with every chunksize.
    """
    # NOTE(review): the blank lines below were restored — the extracted
    # source had lost them, which made the test vacuous relative to its
    # stated purpose; confirm against the original file if possible.
    j = """

    {"A":1,"B":4}

    {"A":2,"B":5}

    {"A":3,"B":6}
    """
    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    test = pd.read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        test = pd.concat(test)
    tm.assert_frame_equal(
        orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize))
File diff suppressed because it is too large.