Static code analysis and corrections

This commit is contained in:
Kristjan Komlosi
2019-07-17 16:06:09 +02:00
parent 674692c2fc
commit 21bfae9fbc
10086 changed files with 2102103 additions and 51 deletions
@@ -0,0 +1,20 @@
"""
Data IO api
"""
# flake8: noqa
from pandas.io.clipboards import read_clipboard
from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
from pandas.io.feather_format import read_feather
from pandas.io.gbq import read_gbq
from pandas.io.html import read_html
from pandas.io.json import read_json
from pandas.io.packers import read_msgpack, to_msgpack
from pandas.io.parquet import read_parquet
from pandas.io.parsers import read_csv, read_fwf, read_table
from pandas.io.pickle import read_pickle, to_pickle
from pandas.io.pytables import HDFStore, read_hdf
from pandas.io.sas import read_sas
from pandas.io.sql import read_sql, read_sql_query, read_sql_table
from pandas.io.stata import read_stata
@@ -0,0 +1,125 @@
"""
Pyperclip
A cross-platform clipboard module for Python. (only handles plain text for now)
By Al Sweigart al@inventwithpython.com
BSD License
Usage:
import pyperclip
pyperclip.copy('The text to be copied to the clipboard.')
spam = pyperclip.paste()
if not pyperclip.copy:
print("Copy functionality unavailable!")
On Windows, no additional modules are needed.
On Mac, the module uses pbcopy and pbpaste, which should come with the os.
On Linux, install xclip or xsel via package manager. For example, in Debian:
sudo apt-get install xclip
Otherwise on Linux, you will need the gtk, qtpy or PyQt modules installed.
qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2
gtk and PyQt4 modules are not available for Python 3,
and this module does not work with PyGObject yet.
"""
__version__ = '1.5.27'
import platform
import os
import subprocess
from .clipboards import (init_osx_clipboard,
init_gtk_clipboard, init_qt_clipboard,
init_xclip_clipboard, init_xsel_clipboard,
init_klipper_clipboard, init_no_clipboard)
from .windows import init_windows_clipboard
# `import qtpy` sys.exit()s if DISPLAY is not in the environment.
# Thus, we need to detect the presence of $DISPLAY manually
# and not load qtpy if it is absent.
HAS_DISPLAY = os.getenv("DISPLAY", False)
CHECK_CMD = "where" if platform.system() == "Windows" else "which"
def _executable_exists(name):
return subprocess.call([CHECK_CMD, name],
stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
def determine_clipboard():
# Determine the OS/platform and set
# the copy() and paste() functions accordingly.
if 'cygwin' in platform.system().lower():
# FIXME: pyperclip currently does not support Cygwin,
# see https://github.com/asweigart/pyperclip/issues/55
pass
elif os.name == 'nt' or platform.system() == 'Windows':
return init_windows_clipboard()
if os.name == 'mac' or platform.system() == 'Darwin':
return init_osx_clipboard()
if HAS_DISPLAY:
# Determine which command/module is installed, if any.
try:
# Check if gtk is installed
import gtk # noqa
except ImportError:
pass
else:
return init_gtk_clipboard()
try:
# qtpy is a small abstraction layer that lets you write
# applications using a single api call to either PyQt or PySide
# https://pypi.org/project/QtPy
import qtpy # noqa
except ImportError:
# If qtpy isn't installed, fall back on importing PyQt5, or PyQt5
try:
import PyQt5 # noqa
except ImportError:
try:
import PyQt4 # noqa
except ImportError:
pass # fail fast for all non-ImportError exceptions.
else:
return init_qt_clipboard()
else:
return init_qt_clipboard()
pass
else:
return init_qt_clipboard()
if _executable_exists("xclip"):
return init_xclip_clipboard()
if _executable_exists("xsel"):
return init_xsel_clipboard()
if _executable_exists("klipper") and _executable_exists("qdbus"):
return init_klipper_clipboard()
return init_no_clipboard()
def set_clipboard(clipboard):
global copy, paste
clipboard_types = {'osx': init_osx_clipboard,
'gtk': init_gtk_clipboard,
'qt': init_qt_clipboard,
'xclip': init_xclip_clipboard,
'xsel': init_xsel_clipboard,
'klipper': init_klipper_clipboard,
'windows': init_windows_clipboard,
'no': init_no_clipboard}
copy, paste = clipboard_types[clipboard]()
copy, paste = determine_clipboard()
__all__ = ["copy", "paste"]
# pandas aliases
clipboard_get = paste
clipboard_set = copy
@@ -0,0 +1,145 @@
import subprocess
from pandas.compat import PY2, text_type
from .exceptions import PyperclipException
EXCEPT_MSG = """
Pyperclip could not find a copy/paste mechanism for your system.
For more information, please visit https://pyperclip.readthedocs.org """
def init_osx_clipboard():
def copy_osx(text):
p = subprocess.Popen(['pbcopy', 'w'],
stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=text.encode('utf-8'))
def paste_osx():
p = subprocess.Popen(['pbpaste', 'r'],
stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
return stdout.decode('utf-8')
return copy_osx, paste_osx
def init_gtk_clipboard():
import gtk
def copy_gtk(text):
global cb
cb = gtk.Clipboard()
cb.set_text(text)
cb.store()
def paste_gtk():
clipboardContents = gtk.Clipboard().wait_for_text()
# for python 2, returns None if the clipboard is blank.
if clipboardContents is None:
return ''
else:
return clipboardContents
return copy_gtk, paste_gtk
def init_qt_clipboard():
# $DISPLAY should exist
# Try to import from qtpy, but if that fails try PyQt5 then PyQt4
try:
from qtpy.QtWidgets import QApplication
except ImportError:
try:
from PyQt5.QtWidgets import QApplication
except ImportError:
from PyQt4.QtGui import QApplication
app = QApplication.instance()
if app is None:
app = QApplication([])
def copy_qt(text):
cb = app.clipboard()
cb.setText(text)
def paste_qt():
cb = app.clipboard()
return text_type(cb.text())
return copy_qt, paste_qt
def init_xclip_clipboard():
def copy_xclip(text):
p = subprocess.Popen(['xclip', '-selection', 'c'],
stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=text.encode('utf-8'))
def paste_xclip():
p = subprocess.Popen(['xclip', '-selection', 'c', '-o'],
stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
return stdout.decode('utf-8')
return copy_xclip, paste_xclip
def init_xsel_clipboard():
def copy_xsel(text):
p = subprocess.Popen(['xsel', '-b', '-i'],
stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=text.encode('utf-8'))
def paste_xsel():
p = subprocess.Popen(['xsel', '-b', '-o'],
stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
return stdout.decode('utf-8')
return copy_xsel, paste_xsel
def init_klipper_clipboard():
def copy_klipper(text):
p = subprocess.Popen(
['qdbus', 'org.kde.klipper', '/klipper', 'setClipboardContents',
text.encode('utf-8')],
stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=None)
def paste_klipper():
p = subprocess.Popen(
['qdbus', 'org.kde.klipper', '/klipper', 'getClipboardContents'],
stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
# Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
# TODO: https://github.com/asweigart/pyperclip/issues/43
clipboardContents = stdout.decode('utf-8')
# even if blank, Klipper will append a newline at the end
assert len(clipboardContents) > 0
# make sure that newline is there
assert clipboardContents.endswith('\n')
if clipboardContents.endswith('\n'):
clipboardContents = clipboardContents[:-1]
return clipboardContents
return copy_klipper, paste_klipper
def init_no_clipboard():
class ClipboardUnavailable(object):
def __call__(self, *args, **kwargs):
raise PyperclipException(EXCEPT_MSG)
if PY2:
def __nonzero__(self):
return False
else:
def __bool__(self):
return False
return ClipboardUnavailable(), ClipboardUnavailable()
@@ -0,0 +1,12 @@
import ctypes
class PyperclipException(RuntimeError):
pass
class PyperclipWindowsException(PyperclipException):
def __init__(self, message):
message += " ({err})".format(err=ctypes.WinError())
super(PyperclipWindowsException, self).__init__(message)
@@ -0,0 +1,154 @@
"""
This module implements clipboard handling on Windows using ctypes.
"""
import contextlib
import ctypes
from ctypes import c_size_t, c_wchar, c_wchar_p, get_errno, sizeof
import time
from .exceptions import PyperclipWindowsException
class CheckedCall(object):
def __init__(self, f):
super(CheckedCall, self).__setattr__("f", f)
def __call__(self, *args):
ret = self.f(*args)
if not ret and get_errno():
raise PyperclipWindowsException("Error calling " + self.f.__name__)
return ret
def __setattr__(self, key, value):
setattr(self.f, key, value)
def init_windows_clipboard():
from ctypes.wintypes import (HGLOBAL, LPVOID, DWORD, LPCSTR, INT, HWND,
HINSTANCE, HMENU, BOOL, UINT, HANDLE)
windll = ctypes.windll
safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
safeCreateWindowExA.argtypes = [DWORD, LPCSTR, LPCSTR, DWORD, INT, INT,
INT, INT, HWND, HMENU, HINSTANCE, LPVOID]
safeCreateWindowExA.restype = HWND
safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
safeDestroyWindow.argtypes = [HWND]
safeDestroyWindow.restype = BOOL
OpenClipboard = windll.user32.OpenClipboard
OpenClipboard.argtypes = [HWND]
OpenClipboard.restype = BOOL
safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
safeCloseClipboard.argtypes = []
safeCloseClipboard.restype = BOOL
safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
safeEmptyClipboard.argtypes = []
safeEmptyClipboard.restype = BOOL
safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
safeGetClipboardData.argtypes = [UINT]
safeGetClipboardData.restype = HANDLE
safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
safeSetClipboardData.argtypes = [UINT, HANDLE]
safeSetClipboardData.restype = HANDLE
safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
safeGlobalAlloc.argtypes = [UINT, c_size_t]
safeGlobalAlloc.restype = HGLOBAL
safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
safeGlobalLock.argtypes = [HGLOBAL]
safeGlobalLock.restype = LPVOID
safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
safeGlobalUnlock.argtypes = [HGLOBAL]
safeGlobalUnlock.restype = BOOL
GMEM_MOVEABLE = 0x0002
CF_UNICODETEXT = 13
@contextlib.contextmanager
def window():
"""
Context that provides a valid Windows hwnd.
"""
# we really just need the hwnd, so setting "STATIC"
# as predefined lpClass is just fine.
hwnd = safeCreateWindowExA(0, b"STATIC", None, 0, 0, 0, 0, 0,
None, None, None, None)
try:
yield hwnd
finally:
safeDestroyWindow(hwnd)
@contextlib.contextmanager
def clipboard(hwnd):
"""
Context manager that opens the clipboard and prevents
other applications from modifying the clipboard content.
"""
# We may not get the clipboard handle immediately because
# some other application is accessing it (?)
# We try for at least 500ms to get the clipboard.
t = time.time() + 0.5
success = False
while time.time() < t:
success = OpenClipboard(hwnd)
if success:
break
time.sleep(0.01)
if not success:
raise PyperclipWindowsException("Error calling OpenClipboard")
try:
yield
finally:
safeCloseClipboard()
def copy_windows(text):
# This function is heavily based on
# http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard
with window() as hwnd:
# http://msdn.com/ms649048
# If an application calls OpenClipboard with hwnd set to NULL,
# EmptyClipboard sets the clipboard owner to NULL;
# this causes SetClipboardData to fail.
# => We need a valid hwnd to copy something.
with clipboard(hwnd):
safeEmptyClipboard()
if text:
# http://msdn.com/ms649051
# If the hMem parameter identifies a memory object,
# the object must have been allocated using the
# function with the GMEM_MOVEABLE flag.
count = len(text) + 1
handle = safeGlobalAlloc(GMEM_MOVEABLE,
count * sizeof(c_wchar))
locked_handle = safeGlobalLock(handle)
ctypes.memmove(c_wchar_p(locked_handle),
c_wchar_p(text), count * sizeof(c_wchar))
safeGlobalUnlock(handle)
safeSetClipboardData(CF_UNICODETEXT, handle)
def paste_windows():
with clipboard(None):
handle = safeGetClipboardData(CF_UNICODETEXT)
if not handle:
# GetClipboardData may return NULL with errno == NO_ERROR
# if the clipboard is empty.
# (Also, it may return a handle to an empty buffer,
# but technically that's not empty)
return ""
return c_wchar_p(handle).value
return copy_windows, paste_windows
@@ -0,0 +1,145 @@
""" io on the clipboard """
import warnings
import pandas.compat as compat
from pandas.compat import PY2, PY3, StringIO
from pandas.core.dtypes.generic import ABCDataFrame
from pandas import get_option, option_context
def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover
r"""
Read text from clipboard and pass to read_csv. See read_csv for the
full argument list
Parameters
----------
sep : str, default '\s+'
A string or regex delimiter. The default of '\s+' denotes
one or more whitespace characters.
Returns
-------
parsed : DataFrame
"""
encoding = kwargs.pop('encoding', 'utf-8')
# only utf-8 is valid for passed value because that's what clipboard
# supports
if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
raise NotImplementedError(
'reading from clipboard only supports utf-8 encoding')
from pandas.io.clipboard import clipboard_get
from pandas.io.parsers import read_csv
text = clipboard_get()
# try to decode (if needed on PY3)
# Strange. linux py33 doesn't complain, win py33 does
if PY3:
try:
text = compat.bytes_to_str(
text, encoding=(kwargs.get('encoding') or
get_option('display.encoding'))
)
except AttributeError:
pass
# Excel copies into clipboard with \t separation
# inspect no more then the 10 first lines, if they
# all contain an equal number (>0) of tabs, infer
# that this came from excel and set 'sep' accordingly
lines = text[:10000].split('\n')[:-1][:10]
# Need to remove leading white space, since read_csv
# accepts:
# a b
# 0 1 2
# 1 3 4
counts = {x.lstrip().count('\t') for x in lines}
if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
sep = '\t'
# Edge case where sep is specified to be None, return to default
if sep is None and kwargs.get('delim_whitespace') is None:
sep = r'\s+'
# Regex separator currently only works with python engine.
# Default to python if separator is multi-character (regex)
if len(sep) > 1 and kwargs.get('engine') is None:
kwargs['engine'] = 'python'
elif len(sep) > 1 and kwargs.get('engine') == 'c':
warnings.warn('read_clipboard with regex separator does not work'
' properly with c engine')
# In PY2, the c table reader first encodes text with UTF-8 but Python
# table reader uses the format of the passed string. For consistency,
# encode strings for python engine so that output from python and c
# engines produce consistent results
if kwargs.get('engine') == 'python' and PY2:
text = text.encode('utf-8')
return read_csv(StringIO(text), sep=sep, **kwargs)
def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover
"""
Attempt to write text representation of object to the system clipboard
The clipboard can be then pasted into Excel for example.
Parameters
----------
obj : the object to write to the clipboard
excel : boolean, defaults to True
if True, use the provided separator, writing in a csv
format for allowing easy pasting into excel.
if False, write a string representation of the object
to the clipboard
sep : optional, defaults to tab
other keywords are passed to to_csv
Notes
-----
Requirements for your platform
- Linux: xclip, or xsel (with gtk or PyQt4 modules)
- Windows:
- OS X:
"""
encoding = kwargs.pop('encoding', 'utf-8')
# testing if an invalid encoding is passed to clipboard
if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
raise ValueError('clipboard only supports utf-8 encoding')
from pandas.io.clipboard import clipboard_set
if excel is None:
excel = True
if excel:
try:
if sep is None:
sep = '\t'
buf = StringIO()
# clipboard_set (pyperclip) expects unicode
obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)
text = buf.getvalue()
if PY2:
text = text.decode('utf-8')
clipboard_set(text)
return
except TypeError:
warnings.warn('to_clipboard in excel mode requires a single '
'character separator.')
elif sep is not None:
warnings.warn('to_clipboard with excel=False ignores the sep argument')
if isinstance(obj, ABCDataFrame):
# str(df) has various unhelpful defaults, like truncation
with option_context('display.max_colwidth', 999999):
objstr = obj.to_string(**kwargs)
else:
objstr = str(obj)
clipboard_set(objstr)
@@ -0,0 +1,617 @@
"""Common IO api utilities"""
import codecs
from contextlib import closing, contextmanager
import csv
import mmap
import os
import zipfile
import pandas.compat as compat
from pandas.compat import BytesIO, StringIO, string_types, text_type
from pandas.errors import ( # noqa
AbstractMethodError, DtypeWarning, EmptyDataError, ParserError,
ParserWarning)
from pandas.core.dtypes.common import is_file_like, is_number
from pandas.io.formats.printing import pprint_thing
# gh-12665: Alias for now and remove later.
CParserError = ParserError
# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
_NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A',
'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan',
'-nan', ''}
if compat.PY3:
from urllib.request import urlopen, pathname2url
_urlopen = urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import (uses_relative, uses_netloc, uses_params,
urlencode, urljoin)
from urllib.error import URLError
from http.client import HTTPException # noqa
else:
from urllib2 import urlopen as _urlopen
from urllib import urlencode, pathname2url # noqa
from urlparse import urlparse as parse_url
from urlparse import uses_relative, uses_netloc, uses_params, urljoin
from urllib2 import URLError # noqa
from httplib import HTTPException # noqa
from contextlib import contextmanager, closing # noqa
from functools import wraps # noqa
# @wraps(_urlopen)
@contextmanager
def urlopen(*args, **kwargs):
with closing(_urlopen(*args, **kwargs)) as f:
yield f
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')
class BaseIterator(object):
"""Subclass this and provide a "__next__()" method to obtain an iterator.
Useful only when the object being iterated is non-reusable (e.g. OK for a
parser, not for an in-memory table, yes for its iterator)."""
def __iter__(self):
return self
def __next__(self):
raise AbstractMethodError(self)
if not compat.PY3:
BaseIterator.next = lambda self: self.__next__()
def _is_url(url):
"""Check to see if a URL has a valid protocol.
Parameters
----------
url : str or unicode
Returns
-------
isurl : bool
If `url` has a valid protocol return True otherwise False.
"""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def _expand_user(filepath_or_buffer):
"""Return the argument with an initial component of ~ or ~user
replaced by that user's home directory.
Parameters
----------
filepath_or_buffer : object to be converted if possible
Returns
-------
expanded_filepath_or_buffer : an expanded filepath or the
input if not expandable
"""
if isinstance(filepath_or_buffer, string_types):
return os.path.expanduser(filepath_or_buffer)
return filepath_or_buffer
def _validate_header_arg(header):
if isinstance(header, bool):
raise TypeError("Passing a bool to header is invalid. "
"Use header=None for no header or "
"header=int or list-like of ints to specify "
"the row(s) making up the column names")
def _stringify_path(filepath_or_buffer):
"""Attempt to convert a path-like object to a string.
Parameters
----------
filepath_or_buffer : object to be converted
Returns
-------
str_filepath_or_buffer : maybe a string version of the object
Notes
-----
Objects supporting the fspath protocol (python 3.6+) are coerced
according to its __fspath__ method.
For backwards compatibility with older pythons, pathlib.Path and
py.path objects are specially coerced.
Any other object is passed through unchanged, which includes bytes,
strings, buffers, or anything else that's not even path-like.
"""
try:
import pathlib
_PATHLIB_INSTALLED = True
except ImportError:
_PATHLIB_INSTALLED = False
try:
from py.path import local as LocalPath
_PY_PATH_INSTALLED = True
except ImportError:
_PY_PATH_INSTALLED = False
if hasattr(filepath_or_buffer, '__fspath__'):
return filepath_or_buffer.__fspath__()
if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path):
return text_type(filepath_or_buffer)
if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath):
return filepath_or_buffer.strpath
return _expand_user(filepath_or_buffer)
def is_s3_url(url):
"""Check for an s3, s3n, or s3a url"""
try:
return parse_url(url).scheme in ['s3', 's3n', 's3a']
except Exception:
return False
def is_gcs_url(url):
"""Check for a gcs url"""
try:
return parse_url(url).scheme in ['gcs', 'gs']
except Exception:
return False
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=None, mode=None):
"""
If the filepath_or_buffer is a url, translate and return the buffer.
Otherwise passthrough.
Parameters
----------
filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
or buffer
encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
mode : str, optional
Returns
-------
tuple of ({a filepath_ or buffer or S3File instance},
encoding, str,
compression, str,
should_close, bool)
"""
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if _is_url(filepath_or_buffer):
req = _urlopen(filepath_or_buffer)
content_encoding = req.headers.get('Content-Encoding', None)
if content_encoding == 'gzip':
# Override compression based on Content-Encoding header
compression = 'gzip'
reader = BytesIO(req.read())
req.close()
return reader, encoding, compression, True
if is_s3_url(filepath_or_buffer):
from pandas.io import s3
return s3.get_filepath_or_buffer(filepath_or_buffer,
encoding=encoding,
compression=compression,
mode=mode)
if is_gcs_url(filepath_or_buffer):
from pandas.io import gcs
return gcs.get_filepath_or_buffer(filepath_or_buffer,
encoding=encoding,
compression=compression,
mode=mode)
if isinstance(filepath_or_buffer, (compat.string_types,
compat.binary_type,
mmap.mmap)):
return _expand_user(filepath_or_buffer), None, compression, False
if not is_file_like(filepath_or_buffer):
msg = "Invalid file path or buffer object type: {_type}"
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
return filepath_or_buffer, None, compression, False
def file_path_to_url(path):
"""
converts an absolute native path to a FILE URL.
Parameters
----------
path : a path in native format
Returns
-------
a valid FILE URL
"""
return urljoin('file:', pathname2url(path))
_compression_to_extension = {
'gzip': '.gz',
'bz2': '.bz2',
'zip': '.zip',
'xz': '.xz',
}
def _infer_compression(filepath_or_buffer, compression):
"""
Get the compression method for filepath_or_buffer. If compression='infer',
the inferred compression method is returned. Otherwise, the input
compression method is returned unchanged, unless it's invalid, in which
case an error is raised.
Parameters
----------
filepath_or_buffer :
a path (str) or buffer
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
If 'infer' and `filepath_or_buffer` is path-like, then detect
compression from the following extensions: '.gz', '.bz2', '.zip',
or '.xz' (otherwise no compression).
Returns
-------
string or None :
compression method
Raises
------
ValueError on invalid compression specified
"""
# No compression has been explicitly specified
if compression is None:
return None
# Infer compression
if compression == 'infer':
# Convert all path types (e.g. pathlib.Path) to strings
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, compat.string_types):
# Cannot infer compression of a buffer, assume no compression
return None
# Infer compression from the filename/URL extension
for compression, extension in _compression_to_extension.items():
if filepath_or_buffer.endswith(extension):
return compression
return None
# Compression has been specified. Check that it's valid
if compression in _compression_to_extension:
return compression
msg = 'Unrecognized compression type: {}'.format(compression)
valid = ['infer', None] + sorted(_compression_to_extension)
msg += '\nValid compression types are {}'.format(valid)
raise ValueError(msg)
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
memory_map=False, is_text=True):
"""
Get file handle for given path/buffer and mode.
Parameters
----------
path_or_buf :
a path (str) or buffer
mode : str
mode to open path_or_buf with
encoding : str or None
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
If 'infer' and `filepath_or_buffer` is path-like, then detect
compression from the following extensions: '.gz', '.bz2', '.zip',
or '.xz' (otherwise no compression).
memory_map : boolean, default False
See parsers._parser_params for more information.
is_text : boolean, default True
whether file/buffer is in text format (csv, json, etc.), or in binary
mode (pickle, etc.)
Returns
-------
f : file-like
A file-like object
handles : list of file-like objects
A list of file-like object that were opened in this function.
"""
try:
from s3fs import S3File
need_text_wrapping = (BytesIO, S3File)
except ImportError:
need_text_wrapping = (BytesIO,)
handles = list()
f = path_or_buf
# Convert pathlib.Path/py.path.local or string
path_or_buf = _stringify_path(path_or_buf)
is_path = isinstance(path_or_buf, compat.string_types)
if is_path:
compression = _infer_compression(path_or_buf, compression)
if compression:
if compat.PY2 and not is_path and encoding:
msg = 'compression with encoding is not yet supported in Python 2'
raise ValueError(msg)
# GZ Compression
if compression == 'gzip':
import gzip
if is_path:
f = gzip.open(path_or_buf, mode)
else:
f = gzip.GzipFile(fileobj=path_or_buf)
# BZ Compression
elif compression == 'bz2':
import bz2
if is_path:
f = bz2.BZ2File(path_or_buf, mode)
elif compat.PY2:
# Python 2's bz2 module can't take file objects, so have to
# run through decompress manually
f = StringIO(bz2.decompress(path_or_buf.read()))
path_or_buf.close()
else:
f = bz2.BZ2File(path_or_buf)
# ZIP Compression
elif compression == 'zip':
zf = BytesZipFile(path_or_buf, mode)
# Ensure the container is closed as well.
handles.append(zf)
if zf.mode == 'w':
f = zf
elif zf.mode == 'r':
zip_names = zf.namelist()
if len(zip_names) == 1:
f = zf.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path_or_buf))
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP: {}'
.format(zip_names))
# XZ Compression
elif compression == 'xz':
lzma = compat.import_lzma()
f = lzma.LZMAFile(path_or_buf, mode)
# Unrecognized Compression
else:
msg = 'Unrecognized compression type: {}'.format(compression)
raise ValueError(msg)
handles.append(f)
elif is_path:
if compat.PY2:
# Python 2
mode = "wb" if mode == "w" else mode
f = open(path_or_buf, mode)
elif encoding:
# Python 3 and encoding
f = open(path_or_buf, mode, encoding=encoding, newline="")
elif is_text:
# Python 3 and no explicit encoding
f = open(path_or_buf, mode, errors='replace', newline="")
else:
# Python 3 and binary mode
f = open(path_or_buf, mode)
handles.append(f)
# in Python 3, convert BytesIO or fileobjects passed with an encoding
if (compat.PY3 and is_text and
(compression or isinstance(f, need_text_wrapping))):
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
handles.append(f)
if memory_map and hasattr(f, 'fileno'):
try:
g = MMapWrapper(f)
f.close()
f = g
except Exception:
# we catch any errors that may have occurred
# because that is consistent with the lower-level
# functionality of the C engine (pd.read_csv), so
# leave the file handler as is then
pass
return f, handles
class BytesZipFile(zipfile.ZipFile, BytesIO):
"""
Wrapper for standard library class ZipFile and allow the returned file-like
handle to accept byte strings via `write` method.
BytesIO provides attributes of file-like object and ZipFile.writestr writes
bytes strings into a member of the archive.
"""
# GH 17778
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
if mode in ['wb', 'rb']:
mode = mode.replace('b', '')
super(BytesZipFile, self).__init__(file, mode, compression, **kwargs)
def write(self, data):
super(BytesZipFile, self).writestr(self.filename, data)
@property
def closed(self):
return self.fp is None
class MMapWrapper(BaseIterator):
"""
Wrapper for the Python's mmap class so that it can be properly read in
by Python's csv.reader class.
Parameters
----------
f : file object
File object to be mapped onto memory. Must support the 'fileno'
method or have an equivalent attribute
"""
def __init__(self, f):
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
def __getattr__(self, name):
return getattr(self.mmap, name)
def __iter__(self):
return self
def __next__(self):
newline = self.mmap.readline()
# readline returns bytes, not str, in Python 3,
# but Python's CSV reader expects str, so convert
# the output to str before continuing
if compat.PY3:
newline = compat.bytes_to_str(newline)
# mmap doesn't raise if reading past the allocated
# data but instead returns an empty string, so raise
# if that is returned
if newline == '':
raise StopIteration
return newline
if not compat.PY3:
MMapWrapper.next = lambda self: self.__next__()
class UTF8Recoder(BaseIterator):
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def read(self, bytes=-1):
return self.reader.read(bytes).encode("utf-8")
def readline(self):
return self.reader.readline().encode("utf-8")
def next(self):
return next(self.reader).encode("utf-8")
if compat.PY3: # pragma: no cover
def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
# ignore encoding
return csv.reader(f, dialect=dialect, **kwds)
def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.writer(f, dialect=dialect, **kwds)
else:
class UnicodeReader(BaseIterator):
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
On Python 3, this is replaced (below) by csv.reader, which handles
unicode.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def __next__(self):
row = next(self.reader)
return [compat.text_type(s, "utf-8") for s in row]
class UnicodeWriter(object):
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
self.quoting = kwds.get("quoting", None)
def writerow(self, row):
def _check_as_is(x):
return (self.quoting == csv.QUOTE_NONNUMERIC and
is_number(x)) or isinstance(x, str)
row = [x if _check_as_is(x)
else pprint_thing(x).encode("utf-8") for x in row]
self.writer.writerow([s for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and re-encode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
def _check_as_is(x):
return (self.quoting == csv.QUOTE_NONNUMERIC and
is_number(x)) or isinstance(x, str)
for i, row in enumerate(rows):
rows[i] = [x if _check_as_is(x)
else pprint_thing(x).encode("utf-8") for x in row]
self.writer.writerows([[s for s in row] for row in rows])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and re-encode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
@@ -0,0 +1,64 @@
"""This module is designed for community supported date conversion functions"""
import numpy as np
from pandas._libs.tslibs import parsing
from pandas.compat import map, range
def parse_date_time(date_col, time_col):
date_col = _maybe_cast(date_col)
time_col = _maybe_cast(time_col)
return parsing.try_parse_date_and_time(date_col, time_col)
def parse_date_fields(year_col, month_col, day_col):
year_col = _maybe_cast(year_col)
month_col = _maybe_cast(month_col)
day_col = _maybe_cast(day_col)
return parsing.try_parse_year_month_day(year_col, month_col, day_col)
def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col,
second_col):
year_col = _maybe_cast(year_col)
month_col = _maybe_cast(month_col)
day_col = _maybe_cast(day_col)
hour_col = _maybe_cast(hour_col)
minute_col = _maybe_cast(minute_col)
second_col = _maybe_cast(second_col)
return parsing.try_parse_datetime_components(year_col, month_col, day_col,
hour_col, minute_col,
second_col)
def generic_parser(parse_func, *cols):
N = _check_columns(cols)
results = np.empty(N, dtype=object)
for i in range(N):
args = [c[i] for c in cols]
results[i] = parse_func(*args)
return results
def _maybe_cast(arr):
if not arr.dtype.type == np.object_:
arr = np.array(arr, dtype=object)
return arr
def _check_columns(cols):
if not len(cols):
raise AssertionError("There must be at least 1 column")
head, tail = cols[0], cols[1:]
N = len(head)
for i, n in enumerate(map(len, tail)):
if n != N:
raise AssertionError('All columns must have the same length: {0}; '
'column {1} has length {2}'.format(N, i, n))
return N
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,127 @@
""" feather-format compat """
from distutils.version import LooseVersion
from pandas.compat import range
from pandas.util._decorators import deprecate_kwarg
from pandas import DataFrame, Int64Index, RangeIndex
from pandas.io.common import _stringify_path
def _try_import():
# since pandas is a dependency of pyarrow
# we need to import on first use
try:
import pyarrow
from pyarrow import feather
except ImportError:
# give a nice error message
raise ImportError("pyarrow is not installed\n\n"
"you can install via conda\n"
"conda install pyarrow -c conda-forge\n"
"or via pip\n"
"pip install -U pyarrow\n")
if LooseVersion(pyarrow.__version__) < LooseVersion('0.9.0'):
raise ImportError("pyarrow >= 0.9.0 required for feather support\n\n"
"you can install via conda\n"
"conda install pyarrow -c conda-forge"
"or via pip\n"
"pip install -U pyarrow\n")
return feather, pyarrow
def to_feather(df, path):
"""
Write a DataFrame to the feather-format
Parameters
----------
df : DataFrame
path : string file path, or file-like object
"""
path = _stringify_path(path)
if not isinstance(df, DataFrame):
raise ValueError("feather only support IO with DataFrames")
feather = _try_import()[0]
valid_types = {'string', 'unicode'}
# validate index
# --------------
# validate that we have only a default index
# raise on anything else as we don't serialize the index
if not isinstance(df.index, Int64Index):
raise ValueError("feather does not support serializing {} "
"for the index; you can .reset_index()"
"to make the index into column(s)".format(
type(df.index)))
if not df.index.equals(RangeIndex.from_range(range(len(df)))):
raise ValueError("feather does not support serializing a "
"non-default index for the index; you "
"can .reset_index() to make the index "
"into column(s)")
if df.index.name is not None:
raise ValueError("feather does not serialize index meta-data on a "
"default index")
# validate columns
# ----------------
# must have value column names (strings only)
if df.columns.inferred_type not in valid_types:
raise ValueError("feather must have string column names")
feather.write_feather(df, path)
@deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads')
def read_feather(path, columns=None, use_threads=True):
"""
Load a feather-format object from the file path
.. versionadded 0.20.0
Parameters
----------
path : string file path, or file-like object
columns : sequence, default None
If not provided, all columns are read
.. versionadded 0.24.0
nthreads : int, default 1
Number of CPU threads to use when reading to pandas.DataFrame
.. versionadded 0.21.0
.. deprecated 0.24.0
use_threads : bool, default True
Whether to parallelize reading using multiple threads
.. versionadded 0.24.0
Returns
-------
type of object stored in file
"""
feather, pyarrow = _try_import()
path = _stringify_path(path)
if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
int_use_threads = int(use_threads)
if int_use_threads < 1:
int_use_threads = 1
return feather.read_feather(path, columns=columns,
nthreads=int_use_threads)
return feather.read_feather(path, columns=columns,
use_threads=bool(use_threads))
@@ -0,0 +1,159 @@
"""
Internal module for console introspection
"""
import locale
import sys
from pandas.io.formats.terminal import get_terminal_size
# -----------------------------------------------------------------------------
# Global formatting options
_initial_defencoding = None
def detect_console_encoding():
"""
Try to find the most capable encoding supported by the console.
slightly modified from the way IPython handles the same issue.
"""
global _initial_defencoding
encoding = None
try:
encoding = sys.stdout.encoding or sys.stdin.encoding
except (AttributeError, IOError):
pass
# try again for something better
if not encoding or 'ascii' in encoding.lower():
try:
encoding = locale.getpreferredencoding()
except Exception:
pass
# when all else fails. this will usually be "ascii"
if not encoding or 'ascii' in encoding.lower():
encoding = sys.getdefaultencoding()
# GH3360, save the reported defencoding at import time
# MPL backends may change it. Make available for debugging.
if not _initial_defencoding:
_initial_defencoding = sys.getdefaultencoding()
return encoding
def get_console_size():
"""Return console size as tuple = (width, height).
Returns (None,None) in non-interactive session.
"""
from pandas import get_option
display_width = get_option('display.width')
# deprecated.
display_height = get_option('display.max_rows')
# Consider
# interactive shell terminal, can detect term size
# interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term
# size non-interactive script, should disregard term size
# in addition
# width,height have default values, but setting to 'None' signals
# should use Auto-Detection, But only in interactive shell-terminal.
# Simple. yeah.
if in_interactive_session():
if in_ipython_frontend():
# sane defaults for interactive non-shell terminal
# match default for width,height in config_init
from pandas.core.config import get_default_val
terminal_width = get_default_val('display.width')
terminal_height = get_default_val('display.max_rows')
else:
# pure terminal
terminal_width, terminal_height = get_terminal_size()
else:
terminal_width, terminal_height = None, None
# Note if the User sets width/Height to None (auto-detection)
# and we're in a script (non-inter), this will return (None,None)
# caller needs to deal.
return (display_width or terminal_width, display_height or terminal_height)
# ----------------------------------------------------------------------
# Detect our environment
def in_interactive_session():
""" check if we're running in an interactive shell
returns True if running under python/ipython interactive shell
"""
from pandas import get_option
def check_main():
try:
import __main__ as main
except ModuleNotFoundError:
return get_option('mode.sim_interactive')
return (not hasattr(main, '__file__') or
get_option('mode.sim_interactive'))
try:
return __IPYTHON__ or check_main() # noqa
except NameError:
return check_main()
def in_qtconsole():
"""
check if we're inside an IPython qtconsole
.. deprecated:: 0.14.1
This is no longer needed, or working, in IPython 3 and above.
"""
try:
ip = get_ipython() # noqa
front_end = (
ip.config.get('KernelApp', {}).get('parent_appname', "") or
ip.config.get('IPKernelApp', {}).get('parent_appname', ""))
if 'qtconsole' in front_end.lower():
return True
except NameError:
return False
return False
def in_ipnb():
"""
check if we're inside an IPython Notebook
.. deprecated:: 0.14.1
This is no longer needed, or working, in IPython 3 and above.
"""
try:
ip = get_ipython() # noqa
front_end = (
ip.config.get('KernelApp', {}).get('parent_appname', "") or
ip.config.get('IPKernelApp', {}).get('parent_appname', ""))
if 'notebook' in front_end.lower():
return True
except NameError:
return False
return False
def in_ipython_frontend():
"""
check if we're inside an an IPython zmq frontend
"""
try:
ip = get_ipython() # noqa
return 'zmq' in str(type(ip)).lower()
except NameError:
pass
return False
@@ -0,0 +1,250 @@
"""Utilities for interpreting CSS from Stylers for formatting non-HTML outputs
"""
import re
import warnings
class CSSWarning(UserWarning):
"""This CSS syntax cannot currently be parsed"""
pass
class CSSResolver(object):
"""A callable for parsing and resolving CSS to atomic properties
"""
INITIAL_STYLE = {
}
def __call__(self, declarations_str, inherited=None):
""" the given declarations to atomic properties
Parameters
----------
declarations_str : str
A list of CSS declarations
inherited : dict, optional
Atomic properties indicating the inherited style context in which
declarations_str is to be resolved. ``inherited`` should already
be resolved, i.e. valid output of this method.
Returns
-------
props : dict
Atomic CSS 2.2 properties
Examples
--------
>>> resolve = CSSResolver()
>>> inherited = {'font-family': 'serif', 'font-weight': 'bold'}
>>> out = resolve('''
... border-color: BLUE RED;
... font-size: 1em;
... font-size: 2em;
... font-weight: normal;
... font-weight: inherit;
... ''', inherited)
>>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE
[('border-bottom-color', 'blue'),
('border-left-color', 'red'),
('border-right-color', 'red'),
('border-top-color', 'blue'),
('font-family', 'serif'),
('font-size', '24pt'),
('font-weight', 'bold')]
"""
props = dict(self.atomize(self.parse(declarations_str)))
if inherited is None:
inherited = {}
# 1. resolve inherited, initial
for prop, val in inherited.items():
if prop not in props:
props[prop] = val
for prop, val in list(props.items()):
if val == 'inherit':
val = inherited.get(prop, 'initial')
if val == 'initial':
val = self.INITIAL_STYLE.get(prop)
if val is None:
# we do not define a complete initial stylesheet
del props[prop]
else:
props[prop] = val
# 2. resolve relative font size
if props.get('font-size'):
if 'font-size' in inherited:
em_pt = inherited['font-size']
assert em_pt[-2:] == 'pt'
em_pt = float(em_pt[:-2])
else:
em_pt = None
props['font-size'] = self.size_to_pt(
props['font-size'], em_pt, conversions=self.FONT_SIZE_RATIOS)
font_size = float(props['font-size'][:-2])
else:
font_size = None
# 3. TODO: resolve other font-relative units
for side in self.SIDES:
prop = 'border-{side}-width'.format(side=side)
if prop in props:
props[prop] = self.size_to_pt(
props[prop], em_pt=font_size,
conversions=self.BORDER_WIDTH_RATIOS)
for prop in ['margin-{side}'.format(side=side),
'padding-{side}'.format(side=side)]:
if prop in props:
# TODO: support %
props[prop] = self.size_to_pt(
props[prop], em_pt=font_size,
conversions=self.MARGIN_RATIOS)
return props
UNIT_RATIOS = {
'rem': ('pt', 12),
'ex': ('em', .5),
# 'ch':
'px': ('pt', .75),
'pc': ('pt', 12),
'in': ('pt', 72),
'cm': ('in', 1 / 2.54),
'mm': ('in', 1 / 25.4),
'q': ('mm', .25),
'!!default': ('em', 0),
}
FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
FONT_SIZE_RATIOS.update({
'%': ('em', .01),
'xx-small': ('rem', .5),
'x-small': ('rem', .625),
'small': ('rem', .8),
'medium': ('rem', 1),
'large': ('rem', 1.125),
'x-large': ('rem', 1.5),
'xx-large': ('rem', 2),
'smaller': ('em', 1 / 1.2),
'larger': ('em', 1.2),
'!!default': ('em', 1),
})
MARGIN_RATIOS = UNIT_RATIOS.copy()
MARGIN_RATIOS.update({
'none': ('pt', 0),
})
BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
BORDER_WIDTH_RATIOS.update({
'none': ('pt', 0),
'thick': ('px', 4),
'medium': ('px', 2),
'thin': ('px', 1),
# Default: medium only if solid
})
def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS):
def _error():
warnings.warn('Unhandled size: {val!r}'.format(val=in_val),
CSSWarning)
return self.size_to_pt('1!!default', conversions=conversions)
try:
val, unit = re.match(r'^(\S*?)([a-zA-Z%!].*)', in_val).groups()
except AttributeError:
return _error()
if val == '':
# hack for 'large' etc.
val = 1
else:
try:
val = float(val)
except ValueError:
return _error()
while unit != 'pt':
if unit == 'em':
if em_pt is None:
unit = 'rem'
else:
val *= em_pt
unit = 'pt'
continue
try:
unit, mul = conversions[unit]
except KeyError:
return _error()
val *= mul
val = round(val, 5)
if int(val) == val:
size_fmt = '{fmt:d}pt'.format(fmt=int(val))
else:
size_fmt = '{fmt:f}pt'.format(fmt=val)
return size_fmt
def atomize(self, declarations):
for prop, value in declarations:
attr = 'expand_' + prop.replace('-', '_')
try:
expand = getattr(self, attr)
except AttributeError:
yield prop, value
else:
for prop, value in expand(prop, value):
yield prop, value
SIDE_SHORTHANDS = {
1: [0, 0, 0, 0],
2: [0, 1, 0, 1],
3: [0, 1, 2, 1],
4: [0, 1, 2, 3],
}
SIDES = ('top', 'right', 'bottom', 'left')
def _side_expander(prop_fmt):
def expand(self, prop, value):
tokens = value.split()
try:
mapping = self.SIDE_SHORTHANDS[len(tokens)]
except KeyError:
warnings.warn('Could not expand "{prop}: {val}"'
.format(prop=prop, val=value), CSSWarning)
return
for key, idx in zip(self.SIDES, mapping):
yield prop_fmt.format(key), tokens[idx]
return expand
expand_border_color = _side_expander('border-{:s}-color')
expand_border_style = _side_expander('border-{:s}-style')
expand_border_width = _side_expander('border-{:s}-width')
expand_margin = _side_expander('margin-{:s}')
expand_padding = _side_expander('padding-{:s}')
def parse(self, declarations_str):
"""Generates (prop, value) pairs from declarations
In a future version may generate parsed tokens from tinycss/tinycss2
"""
for decl in declarations_str.split(';'):
if not decl.strip():
continue
prop, sep, val = decl.partition(':')
prop = prop.strip().lower()
# TODO: don't lowercase case sensitive parts of values (strings)
val = val.strip().lower()
if sep:
yield prop, val
else:
warnings.warn('Ill-formatted attribute: expected a colon '
'in {decl!r}'.format(decl=decl), CSSWarning)
@@ -0,0 +1,315 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data into CSV files.
"""
from __future__ import print_function
import csv as csvlib
import os
import warnings
from zipfile import ZipFile
import numpy as np
from pandas._libs import writers as libwriters
from pandas.compat import StringIO, range, zip
from pandas.core.dtypes.generic import (
ABCDatetimeIndex, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex)
from pandas.core.dtypes.missing import notna
from pandas import compat
from pandas.io.common import (
UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer)
class CSVFormatter(object):
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
float_format=None, cols=None, header=True, index=True,
index_label=None, mode='w', nanRep=None, encoding=None,
compression='infer', quoting=None, line_terminator='\n',
chunksize=None, tupleize_cols=False, quotechar='"',
date_format=None, doublequote=True, escapechar=None,
decimal='.'):
self.obj = obj
if path_or_buf is None:
path_or_buf = StringIO()
self.path_or_buf, _, _, _ = get_filepath_or_buffer(
path_or_buf, encoding=encoding, compression=compression, mode=mode
)
self.sep = sep
self.na_rep = na_rep
self.float_format = float_format
self.decimal = decimal
self.header = header
self.index = index
self.index_label = index_label
self.mode = mode
if encoding is None:
encoding = 'ascii' if compat.PY2 else 'utf-8'
self.encoding = encoding
self.compression = _infer_compression(self.path_or_buf, compression)
if quoting is None:
quoting = csvlib.QUOTE_MINIMAL
self.quoting = quoting
if quoting == csvlib.QUOTE_NONE:
# prevents crash in _csv
quotechar = None
self.quotechar = quotechar
self.doublequote = doublequote
self.escapechar = escapechar
self.line_terminator = line_terminator or os.linesep
self.date_format = date_format
self.tupleize_cols = tupleize_cols
self.has_mi_columns = (isinstance(obj.columns, ABCMultiIndex) and
not self.tupleize_cols)
# validate mi options
if self.has_mi_columns:
if cols is not None:
raise TypeError("cannot specify cols with a MultiIndex on the "
"columns")
if cols is not None:
if isinstance(cols, ABCIndexClass):
cols = cols.to_native_types(na_rep=na_rep,
float_format=float_format,
date_format=date_format,
quoting=self.quoting)
else:
cols = list(cols)
self.obj = self.obj.loc[:, cols]
# update columns to include possible multiplicity of dupes
# and make sure sure cols is just a list of labels
cols = self.obj.columns
if isinstance(cols, ABCIndexClass):
cols = cols.to_native_types(na_rep=na_rep,
float_format=float_format,
date_format=date_format,
quoting=self.quoting)
else:
cols = list(cols)
# save it
self.cols = cols
# preallocate data 2d list
self.blocks = self.obj._data.blocks
ncols = sum(b.shape[0] for b in self.blocks)
self.data = [None] * ncols
if chunksize is None:
chunksize = (100000 // (len(self.cols) or 1)) or 1
self.chunksize = int(chunksize)
self.data_index = obj.index
if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and
date_format is not None):
from pandas import Index
self.data_index = Index([x.strftime(date_format) if notna(x) else
'' for x in self.data_index])
self.nlevels = getattr(self.data_index, 'nlevels', 1)
if not index:
self.nlevels = 0
def save(self):
"""
Create the writer & save
"""
# GH21227 internal compression is not used when file-like passed.
if self.compression and hasattr(self.path_or_buf, 'write'):
msg = ("compression has no effect when passing file-like "
"object as input.")
warnings.warn(msg, RuntimeWarning, stacklevel=2)
# when zip compression is called.
is_zip = isinstance(self.path_or_buf, ZipFile) or (
not hasattr(self.path_or_buf, 'write')
and self.compression == 'zip')
if is_zip:
# zipfile doesn't support writing string to archive. uses string
# buffer to receive csv writing and dump into zip compression
# file handle. GH21241, GH21118
f = StringIO()
close = False
elif hasattr(self.path_or_buf, 'write'):
f = self.path_or_buf
close = False
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=self.encoding,
compression=self.compression)
close = True
try:
writer_kwargs = dict(lineterminator=self.line_terminator,
delimiter=self.sep, quoting=self.quoting,
doublequote=self.doublequote,
escapechar=self.escapechar,
quotechar=self.quotechar)
if self.encoding == 'ascii':
self.writer = csvlib.writer(f, **writer_kwargs)
else:
writer_kwargs['encoding'] = self.encoding
self.writer = UnicodeWriter(f, **writer_kwargs)
self._save()
finally:
if is_zip:
# GH17778 handles zip compression separately.
buf = f.getvalue()
if hasattr(self.path_or_buf, 'write'):
self.path_or_buf.write(buf)
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=self.encoding,
compression=self.compression)
f.write(buf)
close = True
if close:
f.close()
for _fh in handles:
_fh.close()
def _save_header(self):
writer = self.writer
obj = self.obj
index_label = self.index_label
cols = self.cols
has_mi_columns = self.has_mi_columns
header = self.header
encoded_labels = []
has_aliases = isinstance(header, (tuple, list, np.ndarray,
ABCIndexClass))
if not (has_aliases or self.header):
return
if has_aliases:
if len(header) != len(cols):
raise ValueError(('Writing {ncols} cols but got {nalias} '
'aliases'.format(ncols=len(cols),
nalias=len(header))))
else:
write_cols = header
else:
write_cols = cols
if self.index:
# should write something for index label
if index_label is not False:
if index_label is None:
if isinstance(obj.index, ABCMultiIndex):
index_label = []
for i, name in enumerate(obj.index.names):
if name is None:
name = ''
index_label.append(name)
else:
index_label = obj.index.name
if index_label is None:
index_label = ['']
else:
index_label = [index_label]
elif not isinstance(index_label,
(list, tuple, np.ndarray, ABCIndexClass)):
# given a string for a DF with Index
index_label = [index_label]
encoded_labels = list(index_label)
else:
encoded_labels = []
if not has_mi_columns or has_aliases:
encoded_labels += list(write_cols)
writer.writerow(encoded_labels)
else:
# write out the mi
columns = obj.columns
# write out the names for each level, then ALL of the values for
# each level
for i in range(columns.nlevels):
# we need at least 1 index column to write our col names
col_line = []
if self.index:
# name is the first column
col_line.append(columns.names[i])
if isinstance(index_label, list) and len(index_label) > 1:
col_line.extend([''] * (len(index_label) - 1))
col_line.extend(columns._get_level_values(i))
writer.writerow(col_line)
# Write out the index line if it's not empty.
# Otherwise, we will print out an extraneous
# blank line between the mi and the data rows.
if encoded_labels and set(encoded_labels) != {''}:
encoded_labels.extend([''] * len(columns))
writer.writerow(encoded_labels)
def _save(self):
self._save_header()
nrows = len(self.data_index)
# write in chunksize bites
chunksize = self.chunksize
chunks = int(nrows / chunksize) + 1
for i in range(chunks):
start_i = i * chunksize
end_i = min((i + 1) * chunksize, nrows)
if start_i >= end_i:
break
self._save_chunk(start_i, end_i)
def _save_chunk(self, start_i, end_i):
data_index = self.data_index
# create the data for a chunk
slicer = slice(start_i, end_i)
for i in range(len(self.blocks)):
b = self.blocks[i]
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
quoting=self.quoting)
for col_loc, col in zip(b.mgr_locs, d):
# self.data is a preallocated list
self.data[col_loc] = col
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
quoting=self.quoting)
libwriters.write_csv_rows(self.data, ix, self.nlevels,
self.cols, self.writer)
@@ -0,0 +1,664 @@
"""Utilities for conversion to writer-agnostic Excel representation
"""
import itertools
import re
import warnings
import numpy as np
from pandas.compat import reduce
from pandas.core.dtypes import missing
from pandas.core.dtypes.common import is_float, is_scalar
from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex
from pandas import Index
import pandas.core.common as com
from pandas.io.formats.css import CSSResolver, CSSWarning
from pandas.io.formats.format import get_level_lengths
from pandas.io.formats.printing import pprint_thing
class ExcelCell(object):
__fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend')
__slots__ = __fields__
def __init__(self, row, col, val, style=None, mergestart=None,
mergeend=None):
self.row = row
self.col = col
self.val = val
self.style = style
self.mergestart = mergestart
self.mergeend = mergeend
class CSSToExcelConverter(object):
"""A callable for converting CSS declarations to ExcelWriter styles
Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
focusing on font styling, backgrounds, borders and alignment.
Operates by first computing CSS styles in a fairly generic
way (see :meth:`compute_css`) then determining Excel style
properties from CSS properties (see :meth:`build_xlstyle`).
Parameters
----------
inherited : str, optional
CSS declarations understood to be the containing scope for the
CSS processed by :meth:`__call__`.
"""
# NB: Most of the methods here could be classmethods, as only __init__
# and __call__ make use of instance attributes. We leave them as
# instancemethods so that users can easily experiment with extensions
# without monkey-patching.
def __init__(self, inherited=None):
if inherited is not None:
inherited = self.compute_css(inherited,
self.compute_css.INITIAL_STYLE)
self.inherited = inherited
compute_css = CSSResolver()
def __call__(self, declarations_str):
"""Convert CSS declarations to ExcelWriter style
Parameters
----------
declarations_str : str
List of CSS declarations.
e.g. "font-weight: bold; background: blue"
Returns
-------
xlstyle : dict
A style as interpreted by ExcelWriter when found in
ExcelCell.style.
"""
# TODO: memoize?
properties = self.compute_css(declarations_str, self.inherited)
return self.build_xlstyle(properties)
def build_xlstyle(self, props):
out = {
'alignment': self.build_alignment(props),
'border': self.build_border(props),
'fill': self.build_fill(props),
'font': self.build_font(props),
'number_format': self.build_number_format(props),
}
# TODO: handle cell width and height: needs support in pandas.io.excel
def remove_none(d):
"""Remove key where value is None, through nested dicts"""
for k, v in list(d.items()):
if v is None:
del d[k]
elif isinstance(v, dict):
remove_none(v)
if not v:
del d[k]
remove_none(out)
return out
VERTICAL_MAP = {
'top': 'top',
'text-top': 'top',
'middle': 'center',
'baseline': 'bottom',
'bottom': 'bottom',
'text-bottom': 'bottom',
# OpenXML also has 'justify', 'distributed'
}
def build_alignment(self, props):
# TODO: text-indent, padding-left -> alignment.indent
return {'horizontal': props.get('text-align'),
'vertical': self.VERTICAL_MAP.get(props.get('vertical-align')),
'wrap_text': (None if props.get('white-space') is None else
props['white-space'] not in
('nowrap', 'pre', 'pre-line'))
}
def build_border(self, props):
return {side: {
'style': self._border_style(props.get('border-{side}-style'
.format(side=side)),
props.get('border-{side}-width'
.format(side=side))),
'color': self.color_to_excel(
props.get('border-{side}-color'.format(side=side))),
} for side in ['top', 'right', 'bottom', 'left']}
def _border_style(self, style, width):
# convert styles and widths to openxml, one of:
# 'dashDot'
# 'dashDotDot'
# 'dashed'
# 'dotted'
# 'double'
# 'hair'
# 'medium'
# 'mediumDashDot'
# 'mediumDashDotDot'
# 'mediumDashed'
# 'slantDashDot'
# 'thick'
# 'thin'
if width is None and style is None:
return None
if style == 'none' or style == 'hidden':
return None
if width is None:
width = '2pt'
width = float(width[:-2])
if width < 1e-5:
return None
elif width < 1.3:
width_name = 'thin'
elif width < 2.8:
width_name = 'medium'
else:
width_name = 'thick'
if style in (None, 'groove', 'ridge', 'inset', 'outset'):
# not handled
style = 'solid'
if style == 'double':
return 'double'
if style == 'solid':
return width_name
if style == 'dotted':
if width_name in ('hair', 'thin'):
return 'dotted'
return 'mediumDashDotDot'
if style == 'dashed':
if width_name in ('hair', 'thin'):
return 'dashed'
return 'mediumDashed'
def build_fill(self, props):
# TODO: perhaps allow for special properties
# -excel-pattern-bgcolor and -excel-pattern-type
fill_color = props.get('background-color')
if fill_color not in (None, 'transparent', 'none'):
return {
'fgColor': self.color_to_excel(fill_color),
'patternType': 'solid',
}
BOLD_MAP = {'bold': True, 'bolder': True, '600': True, '700': True,
'800': True, '900': True,
'normal': False, 'lighter': False, '100': False, '200': False,
'300': False, '400': False, '500': False}
ITALIC_MAP = {'normal': False, 'italic': True, 'oblique': True}
def build_font(self, props):
size = props.get('font-size')
if size is not None:
assert size.endswith('pt')
size = float(size[:-2])
font_names_tmp = re.findall(r'''(?x)
(
"(?:[^"]|\\")+"
|
'(?:[^']|\\')+'
|
[^'",]+
)(?=,|\s*$)
''', props.get('font-family', ''))
font_names = []
for name in font_names_tmp:
if name[:1] == '"':
name = name[1:-1].replace('\\"', '"')
elif name[:1] == '\'':
name = name[1:-1].replace('\\\'', '\'')
else:
name = name.strip()
if name:
font_names.append(name)
family = None
for name in font_names:
if name == 'serif':
family = 1 # roman
break
elif name == 'sans-serif':
family = 2 # swiss
break
elif name == 'cursive':
family = 4 # script
break
elif name == 'fantasy':
family = 5 # decorative
break
decoration = props.get('text-decoration')
if decoration is not None:
decoration = decoration.split()
else:
decoration = ()
return {
'name': font_names[0] if font_names else None,
'family': family,
'size': size,
'bold': self.BOLD_MAP.get(props.get('font-weight')),
'italic': self.ITALIC_MAP.get(props.get('font-style')),
'underline': ('single' if
'underline' in decoration
else None),
'strike': ('line-through' in decoration) or None,
'color': self.color_to_excel(props.get('color')),
# shadow if nonzero digit before shadow color
'shadow': (bool(re.search('^[^#(]*[1-9]',
props['text-shadow']))
if 'text-shadow' in props else None),
# 'vertAlign':,
# 'charset': ,
# 'scheme': ,
# 'outline': ,
# 'condense': ,
}
NAMED_COLORS = {
'maroon': '800000',
'brown': 'A52A2A',
'red': 'FF0000',
'pink': 'FFC0CB',
'orange': 'FFA500',
'yellow': 'FFFF00',
'olive': '808000',
'green': '008000',
'purple': '800080',
'fuchsia': 'FF00FF',
'lime': '00FF00',
'teal': '008080',
'aqua': '00FFFF',
'blue': '0000FF',
'navy': '000080',
'black': '000000',
'gray': '808080',
'grey': '808080',
'silver': 'C0C0C0',
'white': 'FFFFFF',
}
def color_to_excel(self, val):
if val is None:
return None
if val.startswith('#') and len(val) == 7:
return val[1:].upper()
if val.startswith('#') and len(val) == 4:
return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper()
try:
return self.NAMED_COLORS[val]
except KeyError:
warnings.warn('Unhandled color format: {val!r}'.format(val=val),
CSSWarning)
def build_number_format(self, props):
return {'format_code': props.get('number-format')}
class ExcelFormatter(object):
"""
Class for formatting a DataFrame to a list of ExcelCells,
Parameters
----------
df : DataFrame or Styler
na_rep: na representation
float_format : string, default None
Format string for floating point numbers
cols : sequence, optional
Columns to write
header : boolean or list of string, default True
Write out column names. If a list of string is given it is
assumed to be aliases for the column names
index : boolean, default True
output row names (index)
index_label : string or sequence, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the DataFrame uses MultiIndex.
merge_cells : boolean, default False
Format MultiIndex and Hierarchical Rows as merged cells.
inf_rep : string, default `'inf'`
representation for np.inf values (which aren't representable in Excel)
A `'-'` sign will be added in front of -inf.
style_converter : callable, optional
This translates Styler styles (CSS) into ExcelWriter styles.
Defaults to ``CSSToExcelConverter()``.
It should have signature css_declarations string -> excel style.
This is only called for body cells.
"""
def __init__(self, df, na_rep='', float_format=None, cols=None,
header=True, index=True, index_label=None, merge_cells=False,
inf_rep='inf', style_converter=None):
self.rowcounter = 0
self.na_rep = na_rep
if hasattr(df, 'render'):
self.styler = df
df = df.data
if style_converter is None:
style_converter = CSSToExcelConverter()
self.style_converter = style_converter
else:
self.styler = None
self.df = df
if cols is not None:
# all missing, raise
if not len(Index(cols) & df.columns):
raise KeyError(
"passes columns are not ALL present dataframe")
# deprecatedin gh-17295
# 1 missing is ok (for now)
if len(Index(cols) & df.columns) != len(cols):
warnings.warn(
"Not all names specified in 'columns' are found; "
"this will raise a KeyError in the future",
FutureWarning)
self.df = df.reindex(columns=cols)
self.columns = self.df.columns
self.float_format = float_format
self.index = index
self.index_label = index_label
self.header = header
self.merge_cells = merge_cells
self.inf_rep = inf_rep
@property
def header_style(self):
return {"font": {"bold": True},
"borders": {"top": "thin",
"right": "thin",
"bottom": "thin",
"left": "thin"},
"alignment": {"horizontal": "center",
"vertical": "top"}}
def _format_value(self, val):
if is_scalar(val) and missing.isna(val):
val = self.na_rep
elif is_float(val):
if missing.isposinf_scalar(val):
val = self.inf_rep
elif missing.isneginf_scalar(val):
val = '-{inf}'.format(inf=self.inf_rep)
elif self.float_format is not None:
val = float(self.float_format % val)
return val
def _format_header_mi(self):
if self.columns.nlevels > 1:
if not self.index:
raise NotImplementedError("Writing to Excel with MultiIndex"
" columns and no index "
"('index'=False) is not yet "
"implemented.")
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if not (has_aliases or self.header):
return
columns = self.columns
level_strs = columns.format(sparsify=self.merge_cells, adjoin=False,
names=False)
level_lengths = get_level_lengths(level_strs)
coloffset = 0
lnum = 0
if self.index and isinstance(self.df.index, ABCMultiIndex):
coloffset = len(self.df.index[0]) - 1
if self.merge_cells:
# Format multi-index as a merged cells.
for lnum in range(len(level_lengths)):
name = columns.names[lnum]
yield ExcelCell(lnum, coloffset, name, self.header_style)
for lnum, (spans, levels, level_codes) in enumerate(zip(
level_lengths, columns.levels, columns.codes)):
values = levels.take(level_codes)
for i in spans:
if spans[i] > 1:
yield ExcelCell(lnum, coloffset + i + 1, values[i],
self.header_style, lnum,
coloffset + i + spans[i])
else:
yield ExcelCell(lnum, coloffset + i + 1, values[i],
self.header_style)
else:
# Format in legacy format with dots to indicate levels.
for i, values in enumerate(zip(*level_strs)):
v = ".".join(map(pprint_thing, values))
yield ExcelCell(lnum, coloffset + i + 1, v, self.header_style)
self.rowcounter = lnum
def _format_header_regular(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
coloffset = 0
if self.index:
coloffset = 1
if isinstance(self.df.index, ABCMultiIndex):
coloffset = len(self.df.index[0])
colnames = self.columns
if has_aliases:
if len(self.header) != len(self.columns):
raise ValueError('Writing {cols} cols but got {alias} '
'aliases'.format(cols=len(self.columns),
alias=len(self.header)))
else:
colnames = self.header
for colindex, colname in enumerate(colnames):
yield ExcelCell(self.rowcounter, colindex + coloffset, colname,
self.header_style)
def _format_header(self):
if isinstance(self.columns, ABCMultiIndex):
gen = self._format_header_mi()
else:
gen = self._format_header_regular()
gen2 = ()
if self.df.index.names:
row = [x if x is not None else ''
for x in self.df.index.names] + [''] * len(self.columns)
if reduce(lambda x, y: x and y, map(lambda x: x != '', row)):
gen2 = (ExcelCell(self.rowcounter, colindex, val,
self.header_style)
for colindex, val in enumerate(row))
self.rowcounter += 1
return itertools.chain(gen, gen2)
def _format_body(self):
if isinstance(self.df.index, ABCMultiIndex):
return self._format_hierarchical_rows()
else:
return self._format_regular_rows()
def _format_regular_rows(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
self.rowcounter += 1
# output index and index_label?
if self.index:
# check aliases
# if list only take first as this is not a MultiIndex
if (self.index_label and
isinstance(self.index_label, (list, tuple, np.ndarray,
Index))):
index_label = self.index_label[0]
# if string good to go
elif self.index_label and isinstance(self.index_label, str):
index_label = self.index_label
else:
index_label = self.df.index.names[0]
if isinstance(self.columns, ABCMultiIndex):
self.rowcounter += 1
if index_label and self.header is not False:
yield ExcelCell(self.rowcounter - 1, 0, index_label,
self.header_style)
# write index_values
index_values = self.df.index
if isinstance(self.df.index, ABCPeriodIndex):
index_values = self.df.index.to_timestamp()
for idx, idxval in enumerate(index_values):
yield ExcelCell(self.rowcounter + idx, 0, idxval,
self.header_style)
coloffset = 1
else:
coloffset = 0
for cell in self._generate_body(coloffset):
yield cell
def _format_hierarchical_rows(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
self.rowcounter += 1
gcolidx = 0
if self.index:
index_labels = self.df.index.names
# check for aliases
if (self.index_label and
isinstance(self.index_label, (list, tuple, np.ndarray,
Index))):
index_labels = self.index_label
# MultiIndex columns require an extra row
# with index names (blank if None) for
# unambigous round-trip, unless not merging,
# in which case the names all go on one row Issue #11328
if isinstance(self.columns, ABCMultiIndex) and self.merge_cells:
self.rowcounter += 1
# if index labels are not empty go ahead and dump
if com._any_not_none(*index_labels) and self.header is not False:
for cidx, name in enumerate(index_labels):
yield ExcelCell(self.rowcounter - 1, cidx, name,
self.header_style)
if self.merge_cells:
# Format hierarchical rows as merged cells.
level_strs = self.df.index.format(sparsify=True, adjoin=False,
names=False)
level_lengths = get_level_lengths(level_strs)
for spans, levels, level_codes in zip(level_lengths,
self.df.index.levels,
self.df.index.codes):
values = levels.take(level_codes,
allow_fill=levels._can_hold_na,
fill_value=True)
for i in spans:
if spans[i] > 1:
yield ExcelCell(self.rowcounter + i, gcolidx,
values[i], self.header_style,
self.rowcounter + i + spans[i] - 1,
gcolidx)
else:
yield ExcelCell(self.rowcounter + i, gcolidx,
values[i], self.header_style)
gcolidx += 1
else:
# Format hierarchical rows with non-merged values.
for indexcolvals in zip(*self.df.index):
for idx, indexcolval in enumerate(indexcolvals):
yield ExcelCell(self.rowcounter + idx, gcolidx,
indexcolval, self.header_style)
gcolidx += 1
for cell in self._generate_body(gcolidx):
yield cell
def _generate_body(self, coloffset):
if self.styler is None:
styles = None
else:
styles = self.styler._compute().ctx
if not styles:
styles = None
xlstyle = None
# Write the body of the frame data series by series.
for colidx in range(len(self.columns)):
series = self.df.iloc[:, colidx]
for i, val in enumerate(series):
if styles is not None:
xlstyle = self.style_converter(';'.join(styles[i, colidx]))
yield ExcelCell(self.rowcounter + i, colidx + coloffset, val,
xlstyle)
def get_formatted_cells(self):
for cell in itertools.chain(self._format_header(),
self._format_body()):
cell.val = self._format_value(cell.val)
yield cell
def write(self, writer, sheet_name='Sheet1', startrow=0,
startcol=0, freeze_panes=None, engine=None):
"""
writer : string or ExcelWriter object
File path or existing ExcelWriter
sheet_name : string, default 'Sheet1'
Name of sheet which will contain DataFrame
startrow :
upper left cell row to dump data frame
startcol :
upper left cell column to dump data frame
freeze_panes : tuple of integer (length 2), default None
Specifies the one-based bottommost row and rightmost column that
is to be frozen
engine : string, default None
write engine to use if writer is a path - you can also set this
via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
and ``io.excel.xlsm.writer``.
"""
from pandas.io.excel import ExcelWriter
from pandas.io.common import _stringify_path
if isinstance(writer, ExcelWriter):
need_save = False
else:
writer = ExcelWriter(_stringify_path(writer), engine=engine)
need_save = True
formatted_cells = self.get_formatted_cells()
writer.write_cells(formatted_cells, sheet_name,
startrow=startrow, startcol=startcol,
freeze_panes=freeze_panes)
if need_save:
writer.save()
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,531 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data in HTML.
"""
from __future__ import print_function
from textwrap import dedent
from pandas.compat import OrderedDict, lzip, map, range, u, unichr, zip
from pandas.core.dtypes.generic import ABCMultiIndex
from pandas import compat
import pandas.core.common as com
from pandas.core.config import get_option
from pandas.io.common import _is_url
from pandas.io.formats.format import TableFormatter, get_level_lengths
from pandas.io.formats.printing import pprint_thing
class HTMLFormatter(TableFormatter):
"""
Internal class for formatting output data in html.
This class is intended for shared functionality between
DataFrame.to_html() and DataFrame._repr_html_().
Any logic in common with other output formatting methods
should ideally be inherited from classes in format.py
and this class responsible for only producing html markup.
"""
indent_delta = 2
def __init__(self, formatter, classes=None, border=None):
self.fmt = formatter
self.classes = classes
self.frame = self.fmt.frame
self.columns = self.fmt.tr_frame.columns
self.elements = []
self.bold_rows = self.fmt.kwds.get('bold_rows', False)
self.escape = self.fmt.kwds.get('escape', True)
self.show_dimensions = self.fmt.show_dimensions
if border is None:
border = get_option('display.html.border')
self.border = border
self.table_id = self.fmt.table_id
self.render_links = self.fmt.render_links
@property
def show_row_idx_names(self):
return self.fmt.show_row_idx_names
@property
def show_col_idx_names(self):
return self.fmt.show_col_idx_names
@property
def row_levels(self):
if self.fmt.index:
# showing (row) index
return self.frame.index.nlevels
elif self.show_col_idx_names:
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# If the row index is not displayed a column of
# blank cells need to be included before the DataFrame values.
return 1
# not showing (row) index
return 0
@property
def is_truncated(self):
return self.fmt.is_truncated
@property
def ncols(self):
return len(self.fmt.tr_frame.columns)
def write(self, s, indent=0):
rs = pprint_thing(s)
self.elements.append(' ' * indent + rs)
def write_th(self, s, indent=0, tags=None):
if self.fmt.col_space is not None and self.fmt.col_space > 0:
tags = (tags or "")
tags += ('style="min-width: {colspace};"'
.format(colspace=self.fmt.col_space))
return self._write_cell(s, kind='th', indent=indent, tags=tags)
def write_td(self, s, indent=0, tags=None):
return self._write_cell(s, kind='td', indent=indent, tags=tags)
def _write_cell(self, s, kind='td', indent=0, tags=None):
if tags is not None:
start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags)
else:
start_tag = '<{kind}>'.format(kind=kind)
if self.escape:
# escape & first to prevent double escaping of &
esc = OrderedDict([('&', r'&amp;'), ('<', r'&lt;'),
('>', r'&gt;')])
else:
esc = {}
rs = pprint_thing(s, escape_chars=esc).strip()
if self.render_links and _is_url(rs):
rs_unescaped = pprint_thing(s, escape_chars={}).strip()
start_tag += '<a href="{url}" target="_blank">'.format(
url=rs_unescaped)
end_a = '</a>'
else:
end_a = ''
self.write(u'{start}{rs}{end_a}</{kind}>'.format(
start=start_tag, rs=rs, end_a=end_a, kind=kind), indent)
def write_tr(self, line, indent=0, indent_delta=0, header=False,
align=None, tags=None, nindex_levels=0):
if tags is None:
tags = {}
if align is None:
self.write('<tr>', indent)
else:
self.write('<tr style="text-align: {align};">'
.format(align=align), indent)
indent += indent_delta
for i, s in enumerate(line):
val_tag = tags.get(i, None)
if header or (self.bold_rows and i < nindex_levels):
self.write_th(s, indent, tags=val_tag)
else:
self.write_td(s, indent, tags=val_tag)
indent -= indent_delta
self.write('</tr>', indent)
def render(self):
self._write_table()
if self.should_show_dimensions:
by = chr(215) if compat.PY3 else unichr(215) # ×
self.write(u('<p>{rows} rows {by} {cols} columns</p>')
.format(rows=len(self.frame),
by=by,
cols=len(self.frame.columns)))
return self.elements
def _write_table(self, indent=0):
_classes = ['dataframe'] # Default class.
use_mathjax = get_option("display.html.use_mathjax")
if not use_mathjax:
_classes.append('tex2jax_ignore')
if self.classes is not None:
if isinstance(self.classes, str):
self.classes = self.classes.split()
if not isinstance(self.classes, (list, tuple)):
raise AssertionError('classes must be list or tuple, not {typ}'
.format(typ=type(self.classes)))
_classes.extend(self.classes)
if self.table_id is None:
id_section = ""
else:
id_section = ' id="{table_id}"'.format(table_id=self.table_id)
self.write('<table border="{border}" class="{cls}"{id_section}>'
.format(border=self.border, cls=' '.join(_classes),
id_section=id_section), indent)
if self.fmt.header or self.show_row_idx_names:
self._write_header(indent + self.indent_delta)
self._write_body(indent + self.indent_delta)
self.write('</table>', indent)
def _write_col_header(self, indent):
truncate_h = self.fmt.truncate_h
if isinstance(self.columns, ABCMultiIndex):
template = 'colspan="{span:d}" halign="left"'
if self.fmt.sparsify:
# GH3547
sentinel = com.sentinel_factory()
else:
sentinel = False
levels = self.columns.format(sparsify=sentinel, adjoin=False,
names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
for lnum, (records, values) in enumerate(zip(level_lengths,
levels)):
if truncate_h:
# modify the header lines
ins_col = self.fmt.tr_col_num
if self.fmt.sparsify:
recs_new = {}
# Increment tags after ... col.
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
elif tag + span > ins_col:
recs_new[tag] = span + 1
if lnum == inner_lvl:
values = (values[:ins_col] + (u('...'),) +
values[ins_col:])
else:
# sparse col headers do not receive a ...
values = (values[:ins_col] +
(values[ins_col - 1], ) +
values[ins_col:])
else:
recs_new[tag] = span
# if ins_col lies between tags, all col headers
# get ...
if tag + span == ins_col:
recs_new[ins_col] = 1
values = (values[:ins_col] + (u('...'),) +
values[ins_col:])
records = recs_new
inner_lvl = len(level_lengths) - 1
if lnum == inner_lvl:
records[ins_col] = 1
else:
recs_new = {}
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
else:
recs_new[tag] = span
recs_new[ins_col] = 1
records = recs_new
values = (values[:ins_col] + [u('...')] +
values[ins_col:])
# see gh-22579
# Column Offset Bug with to_html(index=False) with
# MultiIndex Columns and Index.
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code
# block below for standard columns index.
row = [''] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
# index names.
# TODO: Refactor to use _get_column_name_list from
# DataFrameFormatter class and create a
# _get_formatted_column_labels function for code
# parity with DataFrameFormatter class.
if self.fmt.show_index_names:
name = self.columns.names[lnum]
row.append(pprint_thing(name or ''))
else:
row.append('')
tags = {}
j = len(row)
for i, v in enumerate(values):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
continue
j += 1
row.append(v)
self.write_tr(row, indent, self.indent_delta, tags=tags,
header=True)
else:
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code block
# above for columns MultiIndex.
row = [''] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
# index names.
# TODO: Refactor to use _get_column_name_list from
# DataFrameFormatter class.
if self.fmt.show_index_names:
row.append(self.columns.name or '')
else:
row.append('')
row.extend(self.columns)
align = self.fmt.justify
if truncate_h:
ins_col = self.row_levels + self.fmt.tr_col_num
row.insert(ins_col, '...')
self.write_tr(row, indent, self.indent_delta, header=True,
align=align)
def _write_row_header(self, indent):
truncate_h = self.fmt.truncate_h
row = ([x if x is not None else '' for x in self.frame.index.names]
+ [''] * (self.ncols + (1 if truncate_h else 0)))
self.write_tr(row, indent, self.indent_delta, header=True)
def _write_header(self, indent):
self.write('<thead>', indent)
if self.fmt.header:
self._write_col_header(indent + self.indent_delta)
if self.show_row_idx_names:
self._write_row_header(indent + self.indent_delta)
self.write('</thead>', indent)
def _write_body(self, indent):
self.write('<tbody>', indent)
fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)}
# write values
if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
self._write_hierarchical_rows(
fmt_values, indent + self.indent_delta)
else:
self._write_regular_rows(
fmt_values, indent + self.indent_delta)
self.write('</tbody>', indent)
def _write_regular_rows(self, fmt_values, indent):
truncate_h = self.fmt.truncate_h
truncate_v = self.fmt.truncate_v
nrows = len(self.fmt.tr_frame)
if self.fmt.index:
fmt = self.fmt._get_formatter('__index__')
if fmt is not None:
index_values = self.fmt.tr_frame.index.map(fmt)
else:
index_values = self.fmt.tr_frame.index.format()
row = []
for i in range(nrows):
if truncate_v and i == (self.fmt.tr_row_num):
str_sep_row = ['...'] * len(row)
self.write_tr(str_sep_row, indent, self.indent_delta,
tags=None, nindex_levels=self.row_levels)
row = []
if self.fmt.index:
row.append(index_values[i])
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# Add blank cell before data cells.
elif self.show_col_idx_names:
row.append('')
row.extend(fmt_values[j][i] for j in range(self.ncols))
if truncate_h:
dot_col_ix = self.fmt.tr_col_num + self.row_levels
row.insert(dot_col_ix, '...')
self.write_tr(row, indent, self.indent_delta, tags=None,
nindex_levels=self.row_levels)
def _write_hierarchical_rows(self, fmt_values, indent):
template = 'rowspan="{span}" valign="top"'
truncate_h = self.fmt.truncate_h
truncate_v = self.fmt.truncate_v
frame = self.fmt.tr_frame
nrows = len(frame)
idx_values = frame.index.format(sparsify=False, adjoin=False,
names=False)
idx_values = lzip(*idx_values)
if self.fmt.sparsify:
# GH3547
sentinel = com.sentinel_factory()
levels = frame.index.format(sparsify=sentinel, adjoin=False,
names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
if truncate_v:
# Insert ... row and adjust idx_values and
# level_lengths to take this into account.
ins_row = self.fmt.tr_row_num
inserted = False
for lnum, records in enumerate(level_lengths):
rec_new = {}
for tag, span in list(records.items()):
if tag >= ins_row:
rec_new[tag + 1] = span
elif tag + span > ins_row:
rec_new[tag] = span + 1
# GH 14882 - Make sure insertion done once
if not inserted:
dot_row = list(idx_values[ins_row - 1])
dot_row[-1] = u('...')
idx_values.insert(ins_row, tuple(dot_row))
inserted = True
else:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = u('...')
idx_values[ins_row] = tuple(dot_row)
else:
rec_new[tag] = span
# If ins_row lies between tags, all cols idx cols
# receive ...
if tag + span == ins_row:
rec_new[ins_row] = 1
if lnum == 0:
idx_values.insert(ins_row, tuple(
[u('...')] * len(level_lengths)))
# GH 14882 - Place ... in correct level
elif inserted:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = u('...')
idx_values[ins_row] = tuple(dot_row)
level_lengths[lnum] = rec_new
level_lengths[inner_lvl][ins_row] = 1
for ix_col in range(len(fmt_values)):
fmt_values[ix_col].insert(ins_row, '...')
nrows += 1
for i in range(nrows):
row = []
tags = {}
sparse_offset = 0
j = 0
for records, v in zip(level_lengths, idx_values[i]):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
sparse_offset += 1
continue
j += 1
row.append(v)
row.extend(fmt_values[j][i] for j in range(self.ncols))
if truncate_h:
row.insert(self.row_levels - sparse_offset +
self.fmt.tr_col_num, '...')
self.write_tr(row, indent, self.indent_delta, tags=tags,
nindex_levels=len(levels) - sparse_offset)
else:
row = []
for i in range(len(frame)):
if truncate_v and i == (self.fmt.tr_row_num):
str_sep_row = ['...'] * len(row)
self.write_tr(str_sep_row, indent, self.indent_delta,
tags=None, nindex_levels=self.row_levels)
idx_values = list(zip(*frame.index.format(
sparsify=False, adjoin=False, names=False)))
row = []
row.extend(idx_values[i])
row.extend(fmt_values[j][i] for j in range(self.ncols))
if truncate_h:
row.insert(self.row_levels + self.fmt.tr_col_num, '...')
self.write_tr(row, indent, self.indent_delta, tags=None,
nindex_levels=frame.index.nlevels)
class NotebookFormatter(HTMLFormatter):
"""
Internal class for formatting output data in html for display in Jupyter
Notebooks. This class is intended for functionality specific to
DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
"""
def write_style(self):
# We use the "scoped" attribute here so that the desired
# style properties for the data frame are not then applied
# throughout the entire notebook.
template_first = """\
<style scoped>"""
template_last = """\
</style>"""
template_select = """\
.dataframe %s {
%s: %s;
}"""
element_props = [('tbody tr th:only-of-type',
'vertical-align',
'middle'),
('tbody tr th',
'vertical-align',
'top')]
if isinstance(self.columns, ABCMultiIndex):
element_props.append(('thead tr th',
'text-align',
'left'))
if self.show_row_idx_names:
element_props.append(('thead tr:last-of-type th',
'text-align',
'right'))
else:
element_props.append(('thead th',
'text-align',
'right'))
template_mid = '\n\n'.join(map(lambda t: template_select % t,
element_props))
template = dedent('\n'.join((template_first,
template_mid,
template_last)))
self.write(template)
def render(self):
self.write('<div>')
self.write_style()
super(NotebookFormatter, self).render()
self.write('</div>')
return self.elements
@@ -0,0 +1,246 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data in Latex.
"""
from __future__ import print_function
import numpy as np
from pandas.compat import map, range, u, zip
from pandas.core.dtypes.generic import ABCMultiIndex
from pandas import compat
from pandas.io.formats.format import TableFormatter
class LatexFormatter(TableFormatter):
""" Used to render a DataFrame to a LaTeX tabular/longtable environment
output.
Parameters
----------
formatter : `DataFrameFormatter`
column_format : str, default None
The columns format as specified in `LaTeX table format
<https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g 'rcl' for 3 columns
longtable : boolean, default False
Use a longtable environment instead of tabular.
See Also
--------
HTMLFormatter
"""
def __init__(self, formatter, column_format=None, longtable=False,
multicolumn=False, multicolumn_format=None, multirow=False):
self.fmt = formatter
self.frame = self.fmt.frame
self.bold_rows = self.fmt.kwds.get('bold_rows', False)
self.column_format = column_format
self.longtable = longtable
self.multicolumn = multicolumn
self.multicolumn_format = multicolumn_format
self.multirow = multirow
def write_result(self, buf):
"""
Render a DataFrame to a LaTeX tabular/longtable environment output.
"""
# string representation of the columns
if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}')
.format(name=type(self.frame).__name__,
col=self.frame.columns,
idx=self.frame.index))
strcols = [[info_line]]
else:
strcols = self.fmt._to_str_columns()
def get_col_type(dtype):
if issubclass(dtype.type, np.number):
return 'r'
else:
return 'l'
# reestablish the MultiIndex that has been joined by _to_str_column
if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
out = self.frame.index.format(
adjoin=False, sparsify=self.fmt.sparsify,
names=self.fmt.has_index_names, na_rep=self.fmt.na_rep
)
# index.format will sparsify repeated entries with empty strings
# so pad these with some empty space
def pad_empties(x):
for pad in reversed(x):
if pad:
break
return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]
out = (pad_empties(i) for i in out)
# Add empty spaces for each column level
clevels = self.frame.columns.nlevels
out = [[' ' * len(i[-1])] * clevels + i for i in out]
# Add the column names to the last index column
cnames = self.frame.columns.names
if any(cnames):
new_names = [i if i else '{}' for i in cnames]
out[self.frame.index.nlevels - 1][:clevels] = new_names
# Get rid of old multiindex column and add new ones
strcols = out + strcols[1:]
column_format = self.column_format
if column_format is None:
dtypes = self.frame.dtypes._values
column_format = ''.join(map(get_col_type, dtypes))
if self.fmt.index:
index_format = 'l' * self.frame.index.nlevels
column_format = index_format + column_format
elif not isinstance(column_format,
compat.string_types): # pragma: no cover
raise AssertionError('column_format must be str or unicode, '
'not {typ}'.format(typ=type(column_format)))
if not self.longtable:
buf.write('\\begin{{tabular}}{{{fmt}}}\n'
.format(fmt=column_format))
buf.write('\\toprule\n')
else:
buf.write('\\begin{{longtable}}{{{fmt}}}\n'
.format(fmt=column_format))
buf.write('\\toprule\n')
ilevels = self.frame.index.nlevels
clevels = self.frame.columns.nlevels
nlevels = clevels
if self.fmt.has_index_names and self.fmt.show_index_names:
nlevels += 1
strrows = list(zip(*strcols))
self.clinebuf = []
for i, row in enumerate(strrows):
if i == nlevels and self.fmt.header:
buf.write('\\midrule\n') # End of header
if self.longtable:
buf.write('\\endhead\n')
buf.write('\\midrule\n')
buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next '
'page}}}} \\\\\n'.format(n=len(row)))
buf.write('\\midrule\n')
buf.write('\\endfoot\n\n')
buf.write('\\bottomrule\n')
buf.write('\\endlastfoot\n')
if self.fmt.kwds.get('escape', True):
# escape backslashes first
crow = [(x.replace('\\', '\\textbackslash ')
.replace('_', '\\_')
.replace('%', '\\%').replace('$', '\\$')
.replace('#', '\\#').replace('{', '\\{')
.replace('}', '\\}').replace('~', '\\textasciitilde ')
.replace('^', '\\textasciicircum ')
.replace('&', '\\&')
if (x and x != '{}') else '{}') for x in row]
else:
crow = [x if x else '{}' for x in row]
if self.bold_rows and self.fmt.index:
# bold row labels
crow = ['\\textbf{{{x}}}'.format(x=x)
if j < ilevels and x.strip() not in ['', '{}'] else x
for j, x in enumerate(crow)]
if i < clevels and self.fmt.header and self.multicolumn:
# sum up columns to multicolumns
crow = self._format_multicolumn(crow, ilevels)
if (i >= nlevels and self.fmt.index and self.multirow and
ilevels > 1):
# sum up rows to multirows
crow = self._format_multirow(crow, ilevels, i, strrows)
buf.write(' & '.join(crow))
buf.write(' \\\\\n')
if self.multirow and i < len(strrows) - 1:
self._print_cline(buf, i, len(strcols))
if not self.longtable:
buf.write('\\bottomrule\n')
buf.write('\\end{tabular}\n')
else:
buf.write('\\end{longtable}\n')
def _format_multicolumn(self, row, ilevels):
r"""
Combine columns belonging to a group to a single multicolumn entry
according to self.multicolumn_format
e.g.:
a & & & b & c &
will become
\multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
"""
row2 = list(row[:ilevels])
ncol = 1
coltext = ''
def append_col():
# write multicolumn if needed
if ncol > 1:
row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}'
.format(ncol=ncol, fmt=self.multicolumn_format,
txt=coltext.strip()))
# don't modify where not needed
else:
row2.append(coltext)
for c in row[ilevels:]:
# if next col has text, write the previous
if c.strip():
if coltext:
append_col()
coltext = c
ncol = 1
# if not, add it to the previous multicolumn
else:
ncol += 1
# write last column name
if coltext:
append_col()
return row2
def _format_multirow(self, row, ilevels, i, rows):
r"""
Check following rows, whether row should be a multirow
e.g.: becomes:
a & 0 & \multirow{2}{*}{a} & 0 &
& 1 & & 1 &
b & 0 & \cline{1-2}
b & 0 &
"""
for j in range(ilevels):
if row[j].strip():
nrow = 1
for r in rows[i + 1:]:
if not r[j].strip():
nrow += 1
else:
break
if nrow > 1:
# overwrite non-multirow entry
row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format(
nrow=nrow, row=row[j].strip())
# save when to end the current block with \cline
self.clinebuf.append([i + nrow - 1, j + 1])
return row
def _print_cline(self, buf, i, icol):
"""
Print clines after multirow-blocks are finished
"""
for cl in self.clinebuf:
if cl[0] == i:
buf.write('\\cline{{{cl:d}-{icol:d}}}\n'
.format(cl=cl[1], icol=icol))
# remove entries that have been written to buffer
self.clinebuf = [x for x in self.clinebuf if x[0] != i]
@@ -0,0 +1,435 @@
"""
printing tools
"""
import sys
from pandas.compat import u
from pandas.core.dtypes.inference import is_sequence
from pandas import compat
from pandas.core.config import get_option
def adjoin(space, *lists, **kwargs):
"""
Glues together two sets of strings using the amount of space requested.
The idea is to prettify.
----------
space : int
number of spaces for padding
lists : str
list of str which being joined
strlen : callable
function used to calculate the length of each str. Needed for unicode
handling.
justfunc : callable
function used to justify str. Needed for unicode handling.
"""
strlen = kwargs.pop('strlen', len)
justfunc = kwargs.pop('justfunc', justify)
out_lines = []
newLists = []
lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
# not the last one
lengths.append(max(map(len, lists[-1])))
maxLen = max(map(len, lists))
for i, lst in enumerate(lists):
nl = justfunc(lst, lengths[i], mode='left')
nl.extend([' ' * lengths[i]] * (maxLen - len(lst)))
newLists.append(nl)
toJoin = zip(*newLists)
for lines in toJoin:
out_lines.append(_join_unicode(lines))
return _join_unicode(out_lines, sep='\n')
def justify(texts, max_len, mode='right'):
"""
Perform ljust, center, rjust against string or list-like
"""
if mode == 'left':
return [x.ljust(max_len) for x in texts]
elif mode == 'center':
return [x.center(max_len) for x in texts]
else:
return [x.rjust(max_len) for x in texts]
def _join_unicode(lines, sep=''):
try:
return sep.join(lines)
except UnicodeDecodeError:
sep = compat.text_type(sep)
return sep.join([x.decode('utf-8') if isinstance(x, str) else x
for x in lines])
# Unicode consolidation
# ---------------------
#
# pprinting utility functions for generating Unicode text or
# bytes(3.x)/str(2.x) representations of objects.
# Try to use these as much as possible rather then rolling your own.
#
# When to use
# -----------
#
# 1) If you're writing code internal to pandas (no I/O directly involved),
# use pprint_thing().
#
# It will always return unicode text which can handled by other
# parts of the package without breakage.
#
# 2) if you need to write something out to file, use
# pprint_thing_encoded(encoding).
#
# If no encoding is specified, it defaults to utf-8. Since encoding pure
# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
# working with straight ascii.
def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
"""
internal. pprinter for iterables. you should probably use pprint_thing()
rather then calling this directly.
bounds length of printed sequence, depending on options
"""
if isinstance(seq, set):
fmt = u("{{{body}}}")
else:
fmt = u("[{body}]") if hasattr(seq, '__setitem__') else u("({body})")
if max_seq_items is False:
nitems = len(seq)
else:
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
s = iter(seq)
# handle sets, no slicing
r = [pprint_thing(next(s),
_nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
for i in range(min(nitems, len(seq)))]
body = ", ".join(r)
if nitems < len(seq):
body += ", ..."
elif isinstance(seq, tuple) and len(seq) == 1:
body += ','
return fmt.format(body=body)
def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
"""
internal. pprinter for iterables. you should probably use pprint_thing()
rather then calling this directly.
"""
fmt = u("{{{things}}}")
pairs = []
pfmt = u("{key}: {val}")
if max_seq_items is False:
nitems = len(seq)
else:
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
for k, v in list(seq.items())[:nitems]:
pairs.append(
pfmt.format(
key=pprint_thing(k, _nest_lvl + 1,
max_seq_items=max_seq_items, **kwds),
val=pprint_thing(v, _nest_lvl + 1,
max_seq_items=max_seq_items, **kwds)))
if nitems < len(seq):
return fmt.format(things=", ".join(pairs) + ", ...")
else:
return fmt.format(things=", ".join(pairs))
def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False,
quote_strings=False, max_seq_items=None):
"""
This function is the sanctioned way of converting objects
to a unicode representation.
properly handles nested sequences containing unicode strings
(unicode(object) does not)
Parameters
----------
thing : anything to be formatted
_nest_lvl : internal use only. pprint_thing() is mutually-recursive
with pprint_sequence, this argument is used to keep track of the
current nesting level, and limit it.
escape_chars : list or dict, optional
Characters to escape. If a dict is passed the values are the
replacements
default_escapes : bool, default False
Whether the input escape characters replaces or adds to the defaults
max_seq_items : False, int, default None
Pass thru to other pretty printers to limit sequence printing
Returns
-------
result - unicode object on py2, str on py3. Always Unicode.
"""
def as_escaped_unicode(thing, escape_chars=escape_chars):
# Unicode is fine, else we try to decode using utf-8 and 'replace'
# if that's not it either, we have no way of knowing and the user
# should deal with it himself.
try:
result = compat.text_type(thing) # we should try this first
except UnicodeDecodeError:
# either utf-8 or we replace errors
result = str(thing).decode('utf-8', "replace")
translate = {'\t': r'\t', '\n': r'\n', '\r': r'\r', }
if isinstance(escape_chars, dict):
if default_escapes:
translate.update(escape_chars)
else:
translate = escape_chars
escape_chars = list(escape_chars.keys())
else:
escape_chars = escape_chars or tuple()
for c in escape_chars:
result = result.replace(c, translate[c])
return compat.text_type(result)
if (compat.PY3 and hasattr(thing, '__next__')) or hasattr(thing, 'next'):
return compat.text_type(thing)
elif (isinstance(thing, dict) and
_nest_lvl < get_option("display.pprint_nest_depth")):
result = _pprint_dict(thing, _nest_lvl, quote_strings=True,
max_seq_items=max_seq_items)
elif (is_sequence(thing) and
_nest_lvl < get_option("display.pprint_nest_depth")):
result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars,
quote_strings=quote_strings,
max_seq_items=max_seq_items)
elif isinstance(thing, compat.string_types) and quote_strings:
if compat.PY3:
fmt = u("'{thing}'")
else:
fmt = u("u'{thing}'")
result = fmt.format(thing=as_escaped_unicode(thing))
else:
result = as_escaped_unicode(thing)
return compat.text_type(result) # always unicode
def pprint_thing_encoded(object, encoding='utf-8', errors='replace', **kwds):
value = pprint_thing(object) # get unicode representation of object
return value.encode(encoding, errors, **kwds)
def _enable_data_resource_formatter(enable):
if 'IPython' not in sys.modules:
# definitely not in IPython
return
from IPython import get_ipython
ip = get_ipython()
if ip is None:
# still not in IPython
return
formatters = ip.display_formatter.formatters
mimetype = "application/vnd.dataresource+json"
if enable:
if mimetype not in formatters:
# define tableschema formatter
from IPython.core.formatters import BaseFormatter
class TableSchemaFormatter(BaseFormatter):
print_method = '_repr_data_resource_'
_return_type = (dict,)
# register it:
formatters[mimetype] = TableSchemaFormatter()
# enable it if it's been disabled:
formatters[mimetype].enabled = True
else:
# unregister tableschema mime-type
if mimetype in formatters:
formatters[mimetype].enabled = False
default_pprint = lambda x, max_seq_items=None: \
pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True,
max_seq_items=max_seq_items)
def format_object_summary(obj, formatter, is_justify=True, name=None,
indent_for_name=True):
"""
Return the formatted obj as a unicode string
Parameters
----------
obj : object
must be iterable and support __getitem__
formatter : callable
string formatter for an element
is_justify : boolean
should justify the display
name : name, optional
defaults to the class name of the obj
indent_for_name : bool, default True
Whether subsequent lines should be be indented to
align with the name.
Returns
-------
summary string
"""
from pandas.io.formats.console import get_console_size
from pandas.io.formats.format import _get_adjustment
display_width, _ = get_console_size()
if display_width is None:
display_width = get_option('display.width') or 80
if name is None:
name = obj.__class__.__name__
if indent_for_name:
name_len = len(name)
space1 = "\n%s" % (' ' * (name_len + 1))
space2 = "\n%s" % (' ' * (name_len + 2))
else:
space1 = "\n"
space2 = "\n " # space for the opening '['
n = len(obj)
sep = ','
max_seq_items = get_option('display.max_seq_items') or n
# are we a truncated display
is_truncated = n > max_seq_items
# adj can optionally handle unicode eastern asian width
adj = _get_adjustment()
def _extend_line(s, line, value, display_width, next_line_prefix):
if (adj.len(line.rstrip()) + adj.len(value.rstrip()) >=
display_width):
s += line.rstrip()
line = next_line_prefix
line += value
return s, line
def best_len(values):
if values:
return max(adj.len(x) for x in values)
else:
return 0
close = u', '
if n == 0:
summary = u'[]{}'.format(close)
elif n == 1:
first = formatter(obj[0])
summary = u'[{}]{}'.format(first, close)
elif n == 2:
first = formatter(obj[0])
last = formatter(obj[-1])
summary = u'[{}, {}]{}'.format(first, last, close)
else:
if n > max_seq_items:
n = min(max_seq_items // 2, 10)
head = [formatter(x) for x in obj[:n]]
tail = [formatter(x) for x in obj[-n:]]
else:
head = []
tail = [formatter(x) for x in obj]
# adjust all values to max length if needed
if is_justify:
# however, if we are not truncated and we are only a single
# line, then don't justify
if (is_truncated or
not (len(', '.join(head)) < display_width and
len(', '.join(tail)) < display_width)):
max_len = max(best_len(head), best_len(tail))
head = [x.rjust(max_len) for x in head]
tail = [x.rjust(max_len) for x in tail]
summary = ""
line = space2
for i in range(len(head)):
word = head[i] + sep + ' '
summary, line = _extend_line(summary, line, word,
display_width, space2)
if is_truncated:
# remove trailing space of last line
summary += line.rstrip() + space2 + '...'
line = space2
for i in range(len(tail) - 1):
word = tail[i] + sep + ' '
summary, line = _extend_line(summary, line, word,
display_width, space2)
# last value: no sep added + 1 space of width used for trailing ','
summary, line = _extend_line(summary, line, tail[-1],
display_width - 2, space2)
summary += line
# right now close is either '' or ', '
# Now we want to include the ']', but not the maybe space.
close = ']' + close.rstrip(' ')
summary += close
if len(summary) > (display_width):
summary += space1
else: # one row
summary += ' '
# remove initial space
summary = '[' + summary[len(space2):]
return summary
def format_object_attrs(obj):
"""
Return a list of tuples of the (attr, formatted_value)
for common attrs, including dtype, name, length
Parameters
----------
obj : object
must be iterable
Returns
-------
list
"""
attrs = []
if hasattr(obj, 'dtype'):
attrs.append(('dtype', "'{}'".format(obj.dtype)))
if getattr(obj, 'name', None) is not None:
attrs.append(('name', default_pprint(obj.name)))
max_seq_items = get_option('display.max_seq_items') or len(obj)
if len(obj) > max_seq_items:
attrs.append(('length', len(obj)))
return attrs
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,70 @@
{# Update the template_structure.html document too #}
{%- block before_style -%}{%- endblock before_style -%}
{% block style %}
<style type="text/css" >
{% block table_styles %}
{% for s in table_styles %}
#T_{{uuid}} {{s.selector}} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor -%}
}
{%- endfor -%}
{% endblock table_styles %}
{% block before_cellstyle %}{% endblock before_cellstyle %}
{% block cellstyle %}
{%- for s in cellstyle %}
#T_{{uuid}}{{s.selector}} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor %}
}
{%- endfor -%}
{%- endblock cellstyle %}
</style>
{%- endblock style %}
{%- block before_table %}{% endblock before_table %}
{%- block table %}
<table id="T_{{uuid}}" {% if table_attributes %}{{ table_attributes }}{% endif %}>
{%- block caption %}
{%- if caption -%}
<caption>{{caption}}</caption>
{%- endif -%}
{%- endblock caption %}
{%- block thead %}
<thead>
{%- block before_head_rows %}{% endblock %}
{%- for r in head %}
{%- block head_tr scoped %}
<tr>
{%- for c in r %}
{%- if c.is_visible != False %}
<{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}}</{{ c.type }}>
{%- endif %}
{%- endfor %}
</tr>
{%- endblock head_tr %}
{%- endfor %}
{%- block after_head_rows %}{% endblock %}
</thead>
{%- endblock thead %}
{%- block tbody %}
<tbody>
{% block before_rows %}{% endblock before_rows %}
{% for r in body %}
{% block tr scoped %}
<tr>
{% for c in r %}
{% if c.is_visible != False %}
<{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }}</{{ c.type }}>
{% endif %}
{%- endfor %}
</tr>
{% endblock tr %}
{%- endfor %}
{%- block after_rows %}{%- endblock after_rows %}
</tbody>
{%- endblock tbody %}
</table>
{%- endblock table %}
{%- block after_table %}{% endblock after_table %}
@@ -0,0 +1,144 @@
"""
get_terminal_size() -- return width and height of terminal as a tuple
code from:
http://stackoverflow.com/questions/566746/how-to-get-console- window-width-in-
python
written by
Harco Kuppens (http://stackoverflow.com/users/825214/harco-kuppens)
It is mentioned in the stackoverflow response that this code works
on linux, os x, windows and cygwin (windows).
"""
from __future__ import print_function
import os
import shutil
from pandas.compat import PY3
__all__ = ['get_terminal_size', 'is_terminal']
def get_terminal_size():
"""
Detect terminal size and return tuple = (width, height).
Only to be used when running in a terminal. Note that the IPython notebook,
IPython zmq frontends, or IDLE do not run in a terminal,
"""
import platform
if PY3:
return shutil.get_terminal_size()
current_os = platform.system()
tuple_xy = None
if current_os == 'Windows':
tuple_xy = _get_terminal_size_windows()
if tuple_xy is None:
tuple_xy = _get_terminal_size_tput()
# needed for window's python in cygwin's xterm!
if (current_os == 'Linux' or current_os == 'Darwin' or
current_os.startswith('CYGWIN')):
tuple_xy = _get_terminal_size_linux()
if tuple_xy is None:
tuple_xy = (80, 25) # default value
return tuple_xy
def is_terminal():
"""
Detect if Python is running in a terminal.
Returns True if Python is running in a terminal or False if not.
"""
try:
ip = get_ipython()
except NameError: # assume standard Python interpreter in a terminal
return True
else:
if hasattr(ip, 'kernel'): # IPython as a Jupyter kernel
return False
else: # IPython in a terminal
return True
def _get_terminal_size_windows():
try:
from ctypes import windll, create_string_buffer
# stdin handle is -10
# stdout handle is -11
# stderr handle is -12
h = windll.kernel32.GetStdHandle(-12)
csbi = create_string_buffer(22)
res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi)
except (AttributeError, ValueError):
return None
if res:
import struct
(bufx, bufy, curx, cury, wattr, left, top, right, bottom, maxx,
maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw)
sizex = right - left + 1
sizey = bottom - top + 1
return sizex, sizey
else:
return None
def _get_terminal_size_tput():
# get terminal width
# src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width
# -height-of-a-terminal-window
try:
import subprocess
proc = subprocess.Popen(["tput", "cols"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
output = proc.communicate(input=None)
cols = int(output[0])
proc = subprocess.Popen(["tput", "lines"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
output = proc.communicate(input=None)
rows = int(output[0])
return (cols, rows)
except OSError:
return None
def _get_terminal_size_linux():
def ioctl_GWINSZ(fd):
try:
import fcntl
import termios
import struct
cr = struct.unpack(
'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234'))
except (struct.error, IOError):
return None
return cr
cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
if not cr:
try:
fd = os.open(os.ctermid(), os.O_RDONLY)
cr = ioctl_GWINSZ(fd)
os.close(fd)
except OSError:
pass
if not cr or cr == (0, 0):
try:
from os import environ as env
cr = (env['LINES'], env['COLUMNS'])
except (ValueError, KeyError):
return None
return int(cr[1]), int(cr[0])
if __name__ == "__main__":
sizex, sizey = get_terminal_size()
print('width = {w} height = {h}'.format(w=sizex, h=sizey))
@@ -0,0 +1,162 @@
""" Google BigQuery support """
import warnings
def _try_import():
# since pandas is a dependency of pandas-gbq
# we need to import on first use
try:
import pandas_gbq
except ImportError:
# give a nice error message
raise ImportError("Load data from Google BigQuery\n"
"\n"
"the pandas-gbq package is not installed\n"
"see the docs: https://pandas-gbq.readthedocs.io\n"
"\n"
"you can install via pip or conda:\n"
"pip install pandas-gbq\n"
"conda install pandas-gbq -c conda-forge\n")
return pandas_gbq
def read_gbq(query, project_id=None, index_col=None, col_order=None,
reauth=False, auth_local_webserver=False, dialect=None,
location=None, configuration=None, credentials=None,
private_key=None, verbose=None):
"""
Load data from Google BigQuery.
This function requires the `pandas-gbq package
<https://pandas-gbq.readthedocs.io>`__.
See the `How to authenticate with Google BigQuery
<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
guide for authentication instructions.
Parameters
----------
query : str
SQL-Like Query to return data values.
project_id : str, optional
Google BigQuery Account project ID. Optional when available from
the environment.
index_col : str, optional
Name of result column to use for index in results DataFrame.
col_order : list(str), optional
List of BigQuery column names in the desired order for results
DataFrame.
reauth : boolean, default False
Force Google BigQuery to re-authenticate the user. This is useful
if multiple accounts are used.
auth_local_webserver : boolean, default False
Use the `local webserver flow`_ instead of the `console flow`_
when getting user credentials.
.. _local webserver flow:
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
.. _console flow:
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
*New in version 0.2.0 of pandas-gbq*.
dialect : str, default 'legacy'
Note: The default value is changing to 'standard' in a future verion.
SQL syntax dialect to use. Value can be one of:
``'legacy'``
Use BigQuery's legacy SQL dialect. For more information see
`BigQuery Legacy SQL Reference
<https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
``'standard'``
Use BigQuery's standard SQL, which is
compliant with the SQL 2011 standard. For more information
see `BigQuery Standard SQL Reference
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
.. versionchanged:: 0.24.0
location : str, optional
Location where the query job should run. See the `BigQuery locations
documentation
<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
list of available locations. The location must match that of any
datasets used in the query.
*New in version 0.5.0 of pandas-gbq*.
configuration : dict, optional
Query config parameters for job processing.
For example:
configuration = {'query': {'useQueryCache': False}}
For more information see `BigQuery REST API Reference
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
credentials : google.auth.credentials.Credentials, optional
Credentials for accessing Google APIs. Use this parameter to override
default credentials, such as to use Compute Engine
:class:`google.auth.compute_engine.Credentials` or Service Account
:class:`google.oauth2.service_account.Credentials` directly.
*New in version 0.8.0 of pandas-gbq*.
.. versionadded:: 0.24.0
private_key : str, deprecated
Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
parameter and
:func:`google.oauth2.service_account.Credentials.from_service_account_info`
or
:func:`google.oauth2.service_account.Credentials.from_service_account_file`
instead.
Service account private key in JSON format. Can be file path
or string contents. This is useful for remote server
authentication (eg. Jupyter/IPython notebook on remote host).
verbose : None, deprecated
Deprecated in pandas-gbq version 0.4.0. Use the `logging module to
adjust verbosity instead
<https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
Returns
-------
df: DataFrame
DataFrame representing results of query.
See Also
--------
pandas_gbq.read_gbq : This function in the pandas-gbq library.
pandas.DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
"""
pandas_gbq = _try_import()
if dialect is None:
dialect = "legacy"
warnings.warn(
'The default value for dialect is changing to "standard" in a '
'future version of pandas-gbq. Pass in dialect="legacy" to '
"disable this warning.",
FutureWarning,
stacklevel=2,
)
return pandas_gbq.read_gbq(
query, project_id=project_id, index_col=index_col,
col_order=col_order, reauth=reauth,
auth_local_webserver=auth_local_webserver, dialect=dialect,
location=location, configuration=configuration,
credentials=credentials, verbose=verbose, private_key=private_key)
def to_gbq(dataframe, destination_table, project_id=None, chunksize=None,
reauth=False, if_exists='fail', auth_local_webserver=False,
table_schema=None, location=None, progress_bar=True,
credentials=None, verbose=None, private_key=None):
pandas_gbq = _try_import()
return pandas_gbq.to_gbq(
dataframe, destination_table, project_id=project_id,
chunksize=chunksize, reauth=reauth, if_exists=if_exists,
auth_local_webserver=auth_local_webserver, table_schema=table_schema,
location=location, progress_bar=progress_bar,
credentials=credentials, verbose=verbose, private_key=private_key)
@@ -0,0 +1,16 @@
""" GCS support for remote file interactivity """
try:
import gcsfs
except ImportError:
raise ImportError("The gcsfs library is required to handle GCS files")
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=None, mode=None):
if mode is None:
mode = 'rb'
fs = gcsfs.GCSFileSystem()
filepath_or_buffer = fs.open(filepath_or_buffer, mode)
return filepath_or_buffer, None, compression, True
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,5 @@
from .json import to_json, read_json, loads, dumps # noqa
from .normalize import json_normalize # noqa
from .table_schema import build_table_schema # noqa
del json, normalize, table_schema # noqa
@@ -0,0 +1,951 @@
# pylint: disable-msg=E1101,W0613,W0603
from itertools import islice
import os
import numpy as np
import pandas._libs.json as json
from pandas._libs.tslibs import iNaT
from pandas.compat import StringIO, long, to_str, u
from pandas.errors import AbstractMethodError
from pandas.core.dtypes.common import is_period_dtype
from pandas import DataFrame, MultiIndex, Series, compat, isna, to_datetime
from pandas.core.reshape.concat import concat
from pandas.io.common import (
BaseIterator, _get_handle, _infer_compression, _stringify_path,
get_filepath_or_buffer)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import _validate_integer
from .normalize import _convert_to_line_delimits
from .table_schema import build_table_schema, parse_table_schema
loads = json.loads
dumps = json.dumps
TABLE_SCHEMA_VERSION = '0.20.0'
# interface to/from
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
double_precision=10, force_ascii=True, date_unit='ms',
default_handler=None, lines=False, compression='infer',
index=True):
if not index and orient not in ['split', 'table']:
raise ValueError("'index=False' is only valid when 'orient' is "
"'split' or 'table'")
path_or_buf = _stringify_path(path_or_buf)
if lines and orient != 'records':
raise ValueError(
"'lines' keyword only valid when 'orient' is records")
if orient == 'table' and isinstance(obj, Series):
obj = obj.to_frame(name=obj.name or 'values')
if orient == 'table' and isinstance(obj, DataFrame):
writer = JSONTableWriter
elif isinstance(obj, Series):
writer = SeriesWriter
elif isinstance(obj, DataFrame):
writer = FrameWriter
else:
raise NotImplementedError("'obj' should be a Series or a DataFrame")
s = writer(
obj, orient=orient, date_format=date_format,
double_precision=double_precision, ensure_ascii=force_ascii,
date_unit=date_unit, default_handler=default_handler,
index=index).write()
if lines:
s = _convert_to_line_delimits(s)
if isinstance(path_or_buf, compat.string_types):
fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
try:
fh.write(s)
finally:
fh.close()
elif path_or_buf is None:
return s
else:
path_or_buf.write(s)
class Writer(object):
def __init__(self, obj, orient, date_format, double_precision,
ensure_ascii, date_unit, index, default_handler=None):
self.obj = obj
if orient is None:
orient = self._default_orient
self.orient = orient
self.date_format = date_format
self.double_precision = double_precision
self.ensure_ascii = ensure_ascii
self.date_unit = date_unit
self.default_handler = default_handler
self.index = index
self.is_copy = None
self._format_axes()
def _format_axes(self):
raise AbstractMethodError(self)
def write(self):
return self._write(self.obj, self.orient, self.double_precision,
self.ensure_ascii, self.date_unit,
self.date_format == 'iso', self.default_handler)
def _write(self, obj, orient, double_precision, ensure_ascii,
date_unit, iso_dates, default_handler):
return dumps(
obj,
orient=orient,
double_precision=double_precision,
ensure_ascii=ensure_ascii,
date_unit=date_unit,
iso_dates=iso_dates,
default_handler=default_handler
)
class SeriesWriter(Writer):
_default_orient = 'index'
def _format_axes(self):
if not self.obj.index.is_unique and self.orient == 'index':
raise ValueError("Series index must be unique for orient="
"'{orient}'".format(orient=self.orient))
def _write(self, obj, orient, double_precision, ensure_ascii,
date_unit, iso_dates, default_handler):
if not self.index and orient == 'split':
obj = {"name": obj.name, "data": obj.values}
return super(SeriesWriter, self)._write(obj, orient,
double_precision,
ensure_ascii, date_unit,
iso_dates, default_handler)
class FrameWriter(Writer):
_default_orient = 'columns'
def _format_axes(self):
"""
Try to format axes if they are datelike.
"""
if not self.obj.index.is_unique and self.orient in (
'index', 'columns'):
raise ValueError("DataFrame index must be unique for orient="
"'{orient}'.".format(orient=self.orient))
if not self.obj.columns.is_unique and self.orient in (
'index', 'columns', 'records'):
raise ValueError("DataFrame columns must be unique for orient="
"'{orient}'.".format(orient=self.orient))
def _write(self, obj, orient, double_precision, ensure_ascii,
date_unit, iso_dates, default_handler):
if not self.index and orient == 'split':
obj = obj.to_dict(orient='split')
del obj["index"]
return super(FrameWriter, self)._write(obj, orient,
double_precision,
ensure_ascii, date_unit,
iso_dates, default_handler)
class JSONTableWriter(FrameWriter):
_default_orient = 'records'
def __init__(self, obj, orient, date_format, double_precision,
ensure_ascii, date_unit, index, default_handler=None):
"""
Adds a `schema` attribute with the Table Schema, resets
the index (can't do in caller, because the schema inference needs
to know what the index is, forces orient to records, and forces
date_format to 'iso'.
"""
super(JSONTableWriter, self).__init__(
obj, orient, date_format, double_precision, ensure_ascii,
date_unit, index, default_handler=default_handler)
if date_format != 'iso':
msg = ("Trying to write with `orient='table'` and "
"`date_format='{fmt}'`. Table Schema requires dates "
"to be formatted with `date_format='iso'`"
.format(fmt=date_format))
raise ValueError(msg)
self.schema = build_table_schema(obj, index=self.index)
# NotImplementd on a column MultiIndex
if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
raise NotImplementedError(
"orient='table' is not supported for MultiIndex")
# TODO: Do this timedelta properly in objToJSON.c See GH #15137
if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or
len(obj.columns & obj.index.names)):
msg = "Overlapping names between the index and columns"
raise ValueError(msg)
obj = obj.copy()
timedeltas = obj.select_dtypes(include=['timedelta']).columns
if len(timedeltas):
obj[timedeltas] = obj[timedeltas].applymap(
lambda x: x.isoformat())
# Convert PeriodIndex to datetimes before serialzing
if is_period_dtype(obj.index):
obj.index = obj.index.to_timestamp()
# exclude index from obj if index=False
if not self.index:
self.obj = obj.reset_index(drop=True)
else:
self.obj = obj.reset_index(drop=False)
self.date_format = 'iso'
self.orient = 'records'
self.index = index
def _write(self, obj, orient, double_precision, ensure_ascii,
date_unit, iso_dates, default_handler):
data = super(JSONTableWriter, self)._write(obj, orient,
double_precision,
ensure_ascii, date_unit,
iso_dates,
default_handler)
serialized = '{{"schema": {schema}, "data": {data}}}'.format(
schema=dumps(self.schema), data=data)
return serialized
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
convert_axes=True, convert_dates=True, keep_default_dates=True,
numpy=False, precise_float=False, date_unit=None, encoding=None,
lines=False, chunksize=None, compression='infer'):
"""
Convert a JSON string to pandas object.
Parameters
----------
path_or_buf : a valid JSON string or file-like, default: None
The string could be a URL. Valid URL schemes include http, ftp, s3,
gcs, and file. For file URLs, a host is expected. For instance, a local
file could be ``file://localhost/path/to/table.json``
orient : string,
Indication of expected JSON string format.
Compatible JSON strings can be produced by ``to_json()`` with a
corresponding orient value.
The set of possible orients is:
- ``'split'`` : dict like
``{index -> [index], columns -> [columns], data -> [values]}``
- ``'records'`` : list like
``[{column -> value}, ... , {column -> value}]``
- ``'index'`` : dict like ``{index -> {column -> value}}``
- ``'columns'`` : dict like ``{column -> {index -> value}}``
- ``'values'`` : just the values array
The allowed and default values depend on the value
of the `typ` parameter.
* when ``typ == 'series'``,
- allowed orients are ``{'split','records','index'}``
- default is ``'index'``
- The Series index must be unique for orient ``'index'``.
* when ``typ == 'frame'``,
- allowed orients are ``{'split','records','index',
'columns','values', 'table'}``
- default is ``'columns'``
- The DataFrame index must be unique for orients ``'index'`` and
``'columns'``.
- The DataFrame columns must be unique for orients ``'index'``,
``'columns'``, and ``'records'``.
.. versionadded:: 0.23.0
'table' as an allowed value for the ``orient`` argument
typ : type of object to recover (series or frame), default 'frame'
dtype : boolean or dict, default True
If True, infer dtypes, if a dict of column to dtype, then use those,
if False, then don't infer dtypes at all, applies only to the data.
convert_axes : boolean, default True
Try to convert the axes to the proper dtypes.
convert_dates : boolean, default True
List of columns to parse for dates; If True, then try to parse
datelike columns default is True; a column label is datelike if
* it ends with ``'_at'``,
* it ends with ``'_time'``,
* it begins with ``'timestamp'``,
* it is ``'modified'``, or
* it is ``'date'``
keep_default_dates : boolean, default True
If parsing dates, then parse the default datelike columns
numpy : boolean, default False
Direct decoding to numpy arrays. Supports numeric data only, but
non-numeric column and index labels are supported. Note also that the
JSON ordering MUST be the same for each term if numpy=True.
precise_float : boolean, default False
Set to enable usage of higher precision (strtod) function when
decoding string to double values. Default (False) is to use fast but
less precise builtin functionality
date_unit : string, default None
The timestamp unit to detect if converting dates. The default behaviour
is to try and detect the correct precision, but if this is not desired
then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
milliseconds, microseconds or nanoseconds respectively.
encoding : str, default is 'utf-8'
The encoding to use to decode py3 bytes.
.. versionadded:: 0.19.0
lines : boolean, default False
Read the file as a json object per line.
.. versionadded:: 0.19.0
chunksize : integer, default None
Return JsonReader object for iteration.
See the `line-delimted json docs
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
for more information on ``chunksize``.
This can only be passed if `lines=True`.
If this is None, the file will be read into memory all at once.
.. versionadded:: 0.21.0
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, zip or xz if path_or_buf is a string ending in
'.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
otherwise. If using 'zip', the ZIP file must contain only one data
file to be read in. Set to None for no decompression.
.. versionadded:: 0.21.0
Returns
-------
result : Series or DataFrame, depending on the value of `typ`.
See Also
--------
DataFrame.to_json
Notes
-----
Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
:class:`Index` name of `index` gets written with :func:`to_json`, the
subsequent read operation will incorrectly set the :class:`Index` name to
``None``. This is because `index` is also used by :func:`DataFrame.to_json`
to denote a missing :class:`Index` name, and the subsequent
:func:`read_json` operation cannot distinguish between the two. The same
limitation is encountered with a :class:`MultiIndex` and any names
beginning with ``'level_'``.
Examples
--------
>>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
... index=['row 1', 'row 2'],
... columns=['col 1', 'col 2'])
Encoding/decoding a Dataframe using ``'split'`` formatted JSON:
>>> df.to_json(orient='split')
'{"columns":["col 1","col 2"],
"index":["row 1","row 2"],
"data":[["a","b"],["c","d"]]}'
>>> pd.read_json(_, orient='split')
col 1 col 2
row 1 a b
row 2 c d
Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
>>> df.to_json(orient='index')
'{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
>>> pd.read_json(_, orient='index')
col 1 col 2
row 1 a b
row 2 c d
Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
Note that index labels are not preserved with this encoding.
>>> df.to_json(orient='records')
'[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
>>> pd.read_json(_, orient='records')
col 1 col 2
0 a b
1 c d
Encoding with Table Schema
>>> df.to_json(orient='table')
'{"schema": {"fields": [{"name": "index", "type": "string"},
{"name": "col 1", "type": "string"},
{"name": "col 2", "type": "string"}],
"primaryKey": "index",
"pandas_version": "0.20.0"},
"data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
{"index": "row 2", "col 1": "c", "col 2": "d"}]}'
"""
compression = _infer_compression(path_or_buf, compression)
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
path_or_buf, encoding=encoding, compression=compression,
)
json_reader = JsonReader(
filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
convert_axes=convert_axes, convert_dates=convert_dates,
keep_default_dates=keep_default_dates, numpy=numpy,
precise_float=precise_float, date_unit=date_unit, encoding=encoding,
lines=lines, chunksize=chunksize, compression=compression,
)
if chunksize:
return json_reader
result = json_reader.read()
if should_close:
try:
filepath_or_buffer.close()
except: # noqa: flake8
pass
return result
class JsonReader(BaseIterator):
"""
JsonReader provides an interface for reading in a JSON file.
If initialized with ``lines=True`` and ``chunksize``, can be iterated over
``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
whole document.
"""
def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
convert_dates, keep_default_dates, numpy, precise_float,
date_unit, encoding, lines, chunksize, compression):
self.path_or_buf = filepath_or_buffer
self.orient = orient
self.typ = typ
self.dtype = dtype
self.convert_axes = convert_axes
self.convert_dates = convert_dates
self.keep_default_dates = keep_default_dates
self.numpy = numpy
self.precise_float = precise_float
self.date_unit = date_unit
self.encoding = encoding
self.compression = compression
self.lines = lines
self.chunksize = chunksize
self.nrows_seen = 0
self.should_close = False
if self.chunksize is not None:
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
if not self.lines:
raise ValueError("chunksize can only be passed if lines=True")
data = self._get_data_from_filepath(filepath_or_buffer)
self.data = self._preprocess_data(data)
def _preprocess_data(self, data):
"""
At this point, the data either has a `read` attribute (e.g. a file
object or a StringIO) or is a string that is a JSON document.
If self.chunksize, we prepare the data for the `__next__` method.
Otherwise, we read it into memory for the `read` method.
"""
if hasattr(data, 'read') and not self.chunksize:
data = data.read()
if not hasattr(data, 'read') and self.chunksize:
data = StringIO(data)
return data
def _get_data_from_filepath(self, filepath_or_buffer):
"""
The function read_json accepts three input types:
1. filepath (string-like)
2. file-like object (e.g. open file object, StringIO)
3. JSON string
This method turns (1) into (2) to simplify the rest of the processing.
It returns input types (2) and (3) unchanged.
"""
data = filepath_or_buffer
exists = False
if isinstance(data, compat.string_types):
try:
exists = os.path.exists(filepath_or_buffer)
# gh-5874: if the filepath is too long will raise here
except (TypeError, ValueError):
pass
if exists or self.compression is not None:
data, _ = _get_handle(filepath_or_buffer, 'r',
encoding=self.encoding,
compression=self.compression)
self.should_close = True
self.open_stream = data
return data
def _combine_lines(self, lines):
"""
Combines a list of JSON objects into one JSON object.
"""
lines = filter(None, map(lambda x: x.strip(), lines))
return '[' + ','.join(lines) + ']'
def read(self):
"""
Read the whole JSON input into a pandas object.
"""
if self.lines and self.chunksize:
obj = concat(self)
elif self.lines:
data = to_str(self.data)
obj = self._get_object_parser(
self._combine_lines(data.split('\n'))
)
else:
obj = self._get_object_parser(self.data)
self.close()
return obj
def _get_object_parser(self, json):
"""
Parses a json document into a pandas object.
"""
typ = self.typ
dtype = self.dtype
kwargs = {
"orient": self.orient, "dtype": self.dtype,
"convert_axes": self.convert_axes,
"convert_dates": self.convert_dates,
"keep_default_dates": self.keep_default_dates, "numpy": self.numpy,
"precise_float": self.precise_float, "date_unit": self.date_unit
}
obj = None
if typ == 'frame':
obj = FrameParser(json, **kwargs).parse()
if typ == 'series' or obj is None:
if not isinstance(dtype, bool):
kwargs['dtype'] = dtype
obj = SeriesParser(json, **kwargs).parse()
return obj
def close(self):
"""
If we opened a stream earlier, in _get_data_from_filepath, we should
close it.
If an open stream or file was passed, we leave it open.
"""
if self.should_close:
try:
self.open_stream.close()
except (IOError, AttributeError):
pass
def __next__(self):
lines = list(islice(self.data, self.chunksize))
if lines:
lines_json = self._combine_lines(lines)
obj = self._get_object_parser(lines_json)
# Make sure that the returned objects have the right index.
obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
self.nrows_seen += len(obj)
return obj
self.close()
raise StopIteration
class Parser(object):
_STAMP_UNITS = ('s', 'ms', 'us', 'ns')
_MIN_STAMPS = {
's': long(31536000),
'ms': long(31536000000),
'us': long(31536000000000),
'ns': long(31536000000000000)}
def __init__(self, json, orient, dtype=True, convert_axes=True,
convert_dates=True, keep_default_dates=False, numpy=False,
precise_float=False, date_unit=None):
self.json = json
if orient is None:
orient = self._default_orient
self.orient = orient
self.dtype = dtype
if orient == "split":
numpy = False
if date_unit is not None:
date_unit = date_unit.lower()
if date_unit not in self._STAMP_UNITS:
raise ValueError('date_unit must be one of {units}'
.format(units=self._STAMP_UNITS))
self.min_stamp = self._MIN_STAMPS[date_unit]
else:
self.min_stamp = self._MIN_STAMPS['s']
self.numpy = numpy
self.precise_float = precise_float
self.convert_axes = convert_axes
self.convert_dates = convert_dates
self.date_unit = date_unit
self.keep_default_dates = keep_default_dates
self.obj = None
def check_keys_split(self, decoded):
"""
Checks that dict has only the appropriate keys for orient='split'.
"""
bad_keys = set(decoded.keys()).difference(set(self._split_keys))
if bad_keys:
bad_keys = ", ".join(bad_keys)
raise ValueError(u("JSON data had unexpected key(s): {bad_keys}")
.format(bad_keys=pprint_thing(bad_keys)))
def parse(self):
# try numpy
numpy = self.numpy
if numpy:
self._parse_numpy()
else:
self._parse_no_numpy()
if self.obj is None:
return None
if self.convert_axes:
self._convert_axes()
self._try_convert_types()
return self.obj
def _convert_axes(self):
"""
Try to convert axes.
"""
for axis in self.obj._AXIS_NUMBERS.keys():
new_axis, result = self._try_convert_data(
axis, self.obj._get_axis(axis), use_dtypes=False,
convert_dates=True)
if result:
setattr(self.obj, axis, new_axis)
def _try_convert_types(self):
raise AbstractMethodError(self)
def _try_convert_data(self, name, data, use_dtypes=True,
convert_dates=True):
"""
Try to parse a ndarray like into a column by inferring dtype.
"""
# don't try to coerce, unless a force conversion
if use_dtypes:
if self.dtype is False:
return data, False
elif self.dtype is True:
pass
else:
# dtype to force
dtype = (self.dtype.get(name)
if isinstance(self.dtype, dict) else self.dtype)
if dtype is not None:
try:
dtype = np.dtype(dtype)
return data.astype(dtype), True
except (TypeError, ValueError):
return data, False
if convert_dates:
new_data, result = self._try_convert_to_date(data)
if result:
return new_data, True
result = False
if data.dtype == 'object':
# try float
try:
data = data.astype('float64')
result = True
except (TypeError, ValueError):
pass
if data.dtype.kind == 'f':
if data.dtype != 'float64':
# coerce floats to 64
try:
data = data.astype('float64')
result = True
except (TypeError, ValueError):
pass
# don't coerce 0-len data
if len(data) and (data.dtype == 'float' or data.dtype == 'object'):
# coerce ints if we can
try:
new_data = data.astype('int64')
if (new_data == data).all():
data = new_data
result = True
except (TypeError, ValueError):
pass
# coerce ints to 64
if data.dtype == 'int':
# coerce floats to 64
try:
data = data.astype('int64')
result = True
except (TypeError, ValueError):
pass
return data, result
def _try_convert_to_date(self, data):
"""
Try to parse a ndarray like into a date column.
Try to coerce object in epoch/iso formats and integer/float in epoch
formats. Return a boolean if parsing was successful.
"""
# no conversion on empty
if not len(data):
return data, False
new_data = data
if new_data.dtype == 'object':
try:
new_data = data.astype('int64')
except (TypeError, ValueError, OverflowError):
pass
# ignore numbers that are out of range
if issubclass(new_data.dtype.type, np.number):
in_range = (isna(new_data.values) | (new_data > self.min_stamp) |
(new_data.values == iNaT))
if not in_range.all():
return data, False
date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
for date_unit in date_units:
try:
new_data = to_datetime(new_data, errors='raise',
unit=date_unit)
except ValueError:
continue
except Exception:
break
return new_data, True
return data, False
def _try_convert_dates(self):
raise AbstractMethodError(self)
class SeriesParser(Parser):
_default_orient = 'index'
_split_keys = ('name', 'index', 'data')
def _parse_no_numpy(self):
json = self.json
orient = self.orient
if orient == "split":
decoded = {str(k): v for k, v in compat.iteritems(
loads(json, precise_float=self.precise_float))}
self.check_keys_split(decoded)
self.obj = Series(dtype=None, **decoded)
else:
self.obj = Series(
loads(json, precise_float=self.precise_float), dtype=None)
def _parse_numpy(self):
json = self.json
orient = self.orient
if orient == "split":
decoded = loads(json, dtype=None, numpy=True,
precise_float=self.precise_float)
decoded = {str(k): v for k, v in compat.iteritems(decoded)}
self.check_keys_split(decoded)
self.obj = Series(**decoded)
elif orient == "columns" or orient == "index":
self.obj = Series(*loads(json, dtype=None, numpy=True,
labelled=True,
precise_float=self.precise_float))
else:
self.obj = Series(loads(json, dtype=None, numpy=True,
precise_float=self.precise_float))
def _try_convert_types(self):
if self.obj is None:
return
obj, result = self._try_convert_data(
'data', self.obj, convert_dates=self.convert_dates)
if result:
self.obj = obj
class FrameParser(Parser):
_default_orient = 'columns'
_split_keys = ('columns', 'index', 'data')
def _parse_numpy(self):
json = self.json
orient = self.orient
if orient == "columns":
args = loads(json, dtype=None, numpy=True, labelled=True,
precise_float=self.precise_float)
if len(args):
args = (args[0].T, args[2], args[1])
self.obj = DataFrame(*args)
elif orient == "split":
decoded = loads(json, dtype=None, numpy=True,
precise_float=self.precise_float)
decoded = {str(k): v for k, v in compat.iteritems(decoded)}
self.check_keys_split(decoded)
self.obj = DataFrame(**decoded)
elif orient == "values":
self.obj = DataFrame(loads(json, dtype=None, numpy=True,
precise_float=self.precise_float))
else:
self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
labelled=True,
precise_float=self.precise_float))
def _parse_no_numpy(self):
json = self.json
orient = self.orient
if orient == "columns":
self.obj = DataFrame(
loads(json, precise_float=self.precise_float), dtype=None)
elif orient == "split":
decoded = {str(k): v for k, v in compat.iteritems(
loads(json, precise_float=self.precise_float))}
self.check_keys_split(decoded)
self.obj = DataFrame(dtype=None, **decoded)
elif orient == "index":
self.obj = DataFrame(
loads(json, precise_float=self.precise_float), dtype=None).T
elif orient == 'table':
self.obj = parse_table_schema(json,
precise_float=self.precise_float)
else:
self.obj = DataFrame(
loads(json, precise_float=self.precise_float), dtype=None)
def _process_converter(self, f, filt=None):
"""
Take a conversion function and possibly recreate the frame.
"""
if filt is None:
filt = lambda col, c: True
needs_new_obj = False
new_obj = dict()
for i, (col, c) in enumerate(self.obj.iteritems()):
if filt(col, c):
new_data, result = f(col, c)
if result:
c = new_data
needs_new_obj = True
new_obj[i] = c
if needs_new_obj:
# possibly handle dup columns
new_obj = DataFrame(new_obj, index=self.obj.index)
new_obj.columns = self.obj.columns
self.obj = new_obj
def _try_convert_types(self):
if self.obj is None:
return
if self.convert_dates:
self._try_convert_dates()
self._process_converter(
lambda col, c: self._try_convert_data(col, c, convert_dates=False))
def _try_convert_dates(self):
if self.obj is None:
return
# our columns to parse
convert_dates = self.convert_dates
if convert_dates is True:
convert_dates = []
convert_dates = set(convert_dates)
def is_ok(col):
"""
Return if this col is ok to try for a date parse.
"""
if not isinstance(col, compat.string_types):
return False
col_lower = col.lower()
if (col_lower.endswith('_at') or
col_lower.endswith('_time') or
col_lower == 'modified' or
col_lower == 'date' or
col_lower == 'datetime' or
col_lower.startswith('timestamp')):
return True
return False
self._process_converter(
lambda col, c: self._try_convert_to_date(c),
lambda col, c: ((self.keep_default_dates and is_ok(col)) or
col in convert_dates))
@@ -0,0 +1,286 @@
# ---------------------------------------------------------------------
# JSON normalization routines
from collections import defaultdict
import copy
import numpy as np
from pandas._libs.writers import convert_json_to_lines
from pandas import DataFrame, compat
def _convert_to_line_delimits(s):
"""
Helper function that converts JSON lists to line delimited JSON.
"""
# Determine we have a JSON list to turn to lines otherwise just return the
# json object, only lists can
if not s[0] == '[' and s[-1] == ']':
return s
s = s[1:-1]
return convert_json_to_lines(s)
def nested_to_record(ds, prefix="", sep=".", level=0):
"""
A simplified json_normalize.
Converts a nested dict into a flat dict ("record"), unlike json_normalize,
it does not attempt to extract a subset of the data.
Parameters
----------
ds : dict or list of dicts
prefix: the prefix, optional, default: ""
sep : string, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
.. versionadded:: 0.20.0
level: the number of levels in the jason string, optional, default: 0
Returns
-------
d - dict or list of dicts, matching `ds`
Examples
--------
IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
nested=dict(e=dict(c=1,d=2),d=2)))
Out[52]:
{'dict1.c': 1,
'dict1.d': 2,
'flat1': 1,
'nested.d': 2,
'nested.e.c': 1,
'nested.e.d': 2}
"""
singleton = False
if isinstance(ds, dict):
ds = [ds]
singleton = True
new_ds = []
for d in ds:
new_d = copy.deepcopy(d)
for k, v in d.items():
# each key gets renamed with prefix
if not isinstance(k, compat.string_types):
k = str(k)
if level == 0:
newkey = k
else:
newkey = prefix + sep + k
# only dicts gets recurse-flattend
# only at level>1 do we rename the rest of the keys
if not isinstance(v, dict):
if level != 0: # so we skip copying for top level, common case
v = new_d.pop(k)
new_d[newkey] = v
continue
else:
v = new_d.pop(k)
new_d.update(nested_to_record(v, newkey, sep, level + 1))
new_ds.append(new_d)
if singleton:
return new_ds[0]
return new_ds
def json_normalize(data, record_path=None, meta=None,
meta_prefix=None,
record_prefix=None,
errors='raise',
sep='.'):
"""
Normalize semi-structured JSON data into a flat table.
Parameters
----------
data : dict or list of dicts
Unserialized JSON objects
record_path : string or list of strings, default None
Path in each object to list of records. If not passed, data will be
assumed to be an array of records
meta : list of paths (string or list of strings), default None
Fields to use as metadata for each record in resulting table
meta_prefix : string, default None
record_prefix : string, default None
If True, prefix records with dotted (?) path, e.g. foo.bar.field if
path to records is ['foo', 'bar']
errors : {'raise', 'ignore'}, default 'raise'
* 'ignore' : will ignore KeyError if keys listed in meta are not
always present
* 'raise' : will raise KeyError if keys listed in meta are not
always present
.. versionadded:: 0.20.0
sep : string, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
.. versionadded:: 0.20.0
Returns
-------
frame : DataFrame
Examples
--------
>>> from pandas.io.json import json_normalize
>>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
... {'name': {'given': 'Mose', 'family': 'Regner'}},
... {'id': 2, 'name': 'Faye Raker'}]
>>> json_normalize(data)
id name name.family name.first name.given name.last
0 1.0 NaN NaN Coleen NaN Volk
1 NaN NaN Regner NaN Mose NaN
2 2.0 Faye Raker NaN NaN NaN NaN
>>> data = [{'state': 'Florida',
... 'shortname': 'FL',
... 'info': {
... 'governor': 'Rick Scott'
... },
... 'counties': [{'name': 'Dade', 'population': 12345},
... {'name': 'Broward', 'population': 40000},
... {'name': 'Palm Beach', 'population': 60000}]},
... {'state': 'Ohio',
... 'shortname': 'OH',
... 'info': {
... 'governor': 'John Kasich'
... },
... 'counties': [{'name': 'Summit', 'population': 1234},
... {'name': 'Cuyahoga', 'population': 1337}]}]
>>> result = json_normalize(data, 'counties', ['state', 'shortname',
... ['info', 'governor']])
>>> result
name population info.governor state shortname
0 Dade 12345 Rick Scott Florida FL
1 Broward 40000 Rick Scott Florida FL
2 Palm Beach 60000 Rick Scott Florida FL
3 Summit 1234 John Kasich Ohio OH
4 Cuyahoga 1337 John Kasich Ohio OH
>>> data = {'A': [1, 2]}
>>> json_normalize(data, 'A', record_prefix='Prefix.')
Prefix.0
0 1
1 2
"""
def _pull_field(js, spec):
result = js
if isinstance(spec, list):
for field in spec:
result = result[field]
else:
result = result[spec]
return result
if isinstance(data, list) and not data:
return DataFrame()
# A bit of a hackjob
if isinstance(data, dict):
data = [data]
if record_path is None:
if any([isinstance(x, dict)
for x in compat.itervalues(y)] for y in data):
# naive normalization, this is idempotent for flat records
# and potentially will inflate the data considerably for
# deeply nested structures:
# {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
#
# TODO: handle record value which are lists, at least error
# reasonably
data = nested_to_record(data, sep=sep)
return DataFrame(data)
elif not isinstance(record_path, list):
record_path = [record_path]
if meta is None:
meta = []
elif not isinstance(meta, list):
meta = [meta]
meta = [m if isinstance(m, list) else [m] for m in meta]
# Disastrously inefficient for now
records = []
lengths = []
meta_vals = defaultdict(list)
if not isinstance(sep, compat.string_types):
sep = str(sep)
meta_keys = [sep.join(val) for val in meta]
def _recursive_extract(data, path, seen_meta, level=0):
if isinstance(data, dict):
data = [data]
if len(path) > 1:
for obj in data:
for val, key in zip(meta, meta_keys):
if level + 1 == len(val):
seen_meta[key] = _pull_field(obj, val[-1])
_recursive_extract(obj[path[0]], path[1:],
seen_meta, level=level + 1)
else:
for obj in data:
recs = _pull_field(obj, path[0])
# For repeating the metadata later
lengths.append(len(recs))
for val, key in zip(meta, meta_keys):
if level + 1 > len(val):
meta_val = seen_meta[key]
else:
try:
meta_val = _pull_field(obj, val[level:])
except KeyError as e:
if errors == 'ignore':
meta_val = np.nan
else:
raise KeyError("Try running with "
"errors='ignore' as key "
"{err} is not always present"
.format(err=e))
meta_vals[key].append(meta_val)
records.extend(recs)
_recursive_extract(data, record_path, {}, level=0)
result = DataFrame(records)
if record_prefix is not None:
result = result.rename(
columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))
# Data types, a problem
for k, v in compat.iteritems(meta_vals):
if meta_prefix is not None:
k = meta_prefix + k
if k in result:
raise ValueError('Conflicting metadata name {name}, '
'need distinguishing prefix '.format(name=k))
result[k] = np.array(v).repeat(lengths)
return result
@@ -0,0 +1,325 @@
"""
Table Schema builders
http://specs.frictionlessdata.io/json-table-schema/
"""
import warnings
import pandas._libs.json as json
from pandas.core.dtypes.common import (
is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
is_datetime64tz_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype,
is_string_dtype, is_timedelta64_dtype)
from pandas import DataFrame
from pandas.api.types import CategoricalDtype
import pandas.core.common as com
loads = json.loads
def as_json_table_type(x):
"""
Convert a NumPy / pandas type to its corresponding json_table.
Parameters
----------
x : array or dtype
Returns
-------
t : str
the Table Schema data types
Notes
-----
This table shows the relationship between NumPy / pandas dtypes,
and Table Schema dtypes.
============== =================
Pandas type Table Schema type
============== =================
int64 integer
float64 number
bool boolean
datetime64[ns] datetime
timedelta64[ns] duration
object str
categorical any
=============== =================
"""
if is_integer_dtype(x):
return 'integer'
elif is_bool_dtype(x):
return 'boolean'
elif is_numeric_dtype(x):
return 'number'
elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
is_period_dtype(x)):
return 'datetime'
elif is_timedelta64_dtype(x):
return 'duration'
elif is_categorical_dtype(x):
return 'any'
elif is_string_dtype(x):
return 'string'
else:
return 'any'
def set_default_names(data):
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
if com._all_not_none(*data.index.names):
nms = data.index.names
if len(nms) == 1 and data.index.name == 'index':
warnings.warn("Index name of 'index' is not round-trippable")
elif len(nms) > 1 and any(x.startswith('level_') for x in nms):
warnings.warn("Index names beginning with 'level_' are not "
"round-trippable")
return data
data = data.copy()
if data.index.nlevels > 1:
names = [name if name is not None else 'level_{}'.format(i)
for i, name in enumerate(data.index.names)]
data.index.names = names
else:
data.index.name = data.index.name or 'index'
return data
def convert_pandas_type_to_json_field(arr, dtype=None):
dtype = dtype or arr.dtype
if arr.name is None:
name = 'values'
else:
name = arr.name
field = {'name': name,
'type': as_json_table_type(dtype)}
if is_categorical_dtype(arr):
if hasattr(arr, 'categories'):
cats = arr.categories
ordered = arr.ordered
else:
cats = arr.cat.categories
ordered = arr.cat.ordered
field['constraints'] = {"enum": list(cats)}
field['ordered'] = ordered
elif is_period_dtype(arr):
field['freq'] = arr.freqstr
elif is_datetime64tz_dtype(arr):
if hasattr(arr, 'dt'):
field['tz'] = arr.dt.tz.zone
else:
field['tz'] = arr.tz.zone
return field
def convert_json_field_to_pandas_type(field):
"""
Converts a JSON field descriptor into its corresponding NumPy / pandas type
Parameters
----------
field
A JSON field descriptor
Returns
-------
dtype
Raises
-----
ValueError
If the type of the provided field is unknown or currently unsupported
Examples
--------
>>> convert_json_field_to_pandas_type({'name': 'an_int',
'type': 'integer'})
'int64'
>>> convert_json_field_to_pandas_type({'name': 'a_categorical',
'type': 'any',
'contraints': {'enum': [
'a', 'b', 'c']},
'ordered': True})
'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
>>> convert_json_field_to_pandas_type({'name': 'a_datetime',
'type': 'datetime'})
'datetime64[ns]'
>>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
'type': 'datetime',
'tz': 'US/Central'})
'datetime64[ns, US/Central]'
"""
typ = field['type']
if typ == 'string':
return 'object'
elif typ == 'integer':
return 'int64'
elif typ == 'number':
return 'float64'
elif typ == 'boolean':
return 'bool'
elif typ == 'duration':
return 'timedelta64'
elif typ == 'datetime':
if field.get('tz'):
return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
else:
return 'datetime64[ns]'
elif typ == 'any':
if 'constraints' in field and 'ordered' in field:
return CategoricalDtype(categories=field['constraints']['enum'],
ordered=field['ordered'])
else:
return 'object'
raise ValueError("Unsupported or invalid field type: {}".format(typ))
def build_table_schema(data, index=True, primary_key=None, version=True):
"""
Create a Table schema from ``data``.
Parameters
----------
data : Series, DataFrame
index : bool, default True
Whether to include ``data.index`` in the schema.
primary_key : bool or None, default True
column names to designate as the primary key.
The default `None` will set `'primaryKey'` to the index
level or levels if the index is unique.
version : bool, default True
Whether to include a field `pandas_version` with the version
of pandas that generated the schema.
Returns
-------
schema : dict
Notes
-----
See `_as_json_table_type` for conversion types.
Timedeltas as converted to ISO8601 duration format with
9 decimal places after the seconds field for nanosecond precision.
Categoricals are converted to the `any` dtype, and use the `enum` field
constraint to list the allowed values. The `ordered` attribute is included
in an `ordered` field.
Examples
--------
>>> df = pd.DataFrame(
... {'A': [1, 2, 3],
... 'B': ['a', 'b', 'c'],
... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
... }, index=pd.Index(range(3), name='idx'))
>>> build_table_schema(df)
{'fields': [{'name': 'idx', 'type': 'integer'},
{'name': 'A', 'type': 'integer'},
{'name': 'B', 'type': 'string'},
{'name': 'C', 'type': 'datetime'}],
'pandas_version': '0.20.0',
'primaryKey': ['idx']}
"""
if index is True:
data = set_default_names(data)
schema = {}
fields = []
if index:
if data.index.nlevels > 1:
for level in data.index.levels:
fields.append(convert_pandas_type_to_json_field(level))
else:
fields.append(convert_pandas_type_to_json_field(data.index))
if data.ndim > 1:
for column, s in data.iteritems():
fields.append(convert_pandas_type_to_json_field(s))
else:
fields.append(convert_pandas_type_to_json_field(data))
schema['fields'] = fields
if index and data.index.is_unique and primary_key is None:
if data.index.nlevels == 1:
schema['primaryKey'] = [data.index.name]
else:
schema['primaryKey'] = data.index.names
elif primary_key is not None:
schema['primaryKey'] = primary_key
if version:
schema['pandas_version'] = '0.20.0'
return schema
def parse_table_schema(json, precise_float):
"""
Builds a DataFrame from a given schema
Parameters
----------
json :
A JSON table schema
precise_float : boolean
Flag controlling precision when decoding string to double values, as
dictated by ``read_json``
Returns
-------
df : DataFrame
Raises
------
NotImplementedError
If the JSON table schema contains either timezone or timedelta data
Notes
-----
Because :func:`DataFrame.to_json` uses the string 'index' to denote a
name-less :class:`Index`, this function sets the name of the returned
:class:`DataFrame` to ``None`` when said string is encountered with a
normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
applies to any strings beginning with 'level_'. Therefore, an
:class:`Index` name of 'index' and :class:`MultiIndex` names starting
with 'level_' are not supported.
See Also
--------
build_table_schema : Inverse function.
pandas.read_json
"""
table = loads(json, precise_float=precise_float)
col_order = [field['name'] for field in table['schema']['fields']]
df = DataFrame(table['data'], columns=col_order)[col_order]
dtypes = {field['name']: convert_json_field_to_pandas_type(field)
for field in table['schema']['fields']}
# Cannot directly use as_type with timezone data on object; raise for now
if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
raise NotImplementedError('table="orient" can not yet read timezone '
'data')
# No ISO constructor for Timedelta as of yet, so need to raise
if 'timedelta64' in dtypes.values():
raise NotImplementedError('table="orient" can not yet read '
'ISO-formatted Timedelta data')
df = df.astype(dtypes)
df = df.set_index(table['schema']['primaryKey'])
if len(df.index.names) == 1:
if df.index.name == 'index':
df.index.name = None
else:
df.index.names = [None if x.startswith('level_') else x for x in
df.index.names]
return df
@@ -0,0 +1,50 @@
# coding: utf-8
from collections import namedtuple
from pandas.io.msgpack.exceptions import * # noqa
from pandas.io.msgpack._version import version # noqa
class ExtType(namedtuple('ExtType', 'code data')):
"""ExtType represents ext type in msgpack."""
def __new__(cls, code, data):
if not isinstance(code, int):
raise TypeError("code must be int")
if not isinstance(data, bytes):
raise TypeError("data must be bytes")
if not 0 <= code <= 127:
raise ValueError("code must be 0~127")
return super(ExtType, cls).__new__(cls, code, data)
import os # noqa
from pandas.io.msgpack._packer import Packer # noqa
from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa
def pack(o, stream, **kwargs):
"""
Pack object `o` and write it to `stream`
See :class:`Packer` for options.
"""
packer = Packer(**kwargs)
stream.write(packer.pack(o))
def packb(o, **kwargs):
"""
Pack object `o` and return packed bytes
See :class:`Packer` for options.
"""
return Packer(**kwargs).pack(o)
# alias for compatibility to simplejson/marshal/pickle.
load = unpack
loads = unpackb
dump = pack
dumps = packb
@@ -0,0 +1 @@
version = (0, 4, 6)
@@ -0,0 +1,32 @@
class UnpackException(Exception):
pass
class BufferFull(UnpackException):
pass
class OutOfData(UnpackException):
pass
class UnpackValueError(UnpackException, ValueError):
pass
class ExtraData(ValueError):
def __init__(self, unpacked, extra):
self.unpacked = unpacked
self.extra = extra
def __str__(self):
return "unpack(b) received extra data."
class PackException(Exception):
pass
class PackValueError(PackException, ValueError):
pass
@@ -0,0 +1,830 @@
"""
Msgpack serializer support for reading and writing pandas data structures
to disk
portions of msgpack_numpy package, by Lev Givon were incorporated
into this module (and tests_packers.py)
License
=======
Copyright (c) 2013, Lev Givon.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Lev Givon nor the names of any
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from datetime import date, datetime, timedelta
import os
from textwrap import dedent
import warnings
from dateutil.parser import parse
import numpy as np
import pandas.compat as compat
from pandas.compat import u, u_safe
from pandas.errors import PerformanceWarning
from pandas.util._move import (
BadMove as _BadMove, move_into_mutable_buffer as _move_into_mutable_buffer)
from pandas.core.dtypes.common import (
is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype,
needs_i8_conversion, pandas_dtype)
from pandas import ( # noqa:F401
Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
Index, Int64Index, Interval, IntervalIndex, MultiIndex, NaT, Panel, Period,
PeriodIndex, RangeIndex, Series, TimedeltaIndex, Timestamp)
from pandas.core import internals
from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray
from pandas.core.arrays.sparse import BlockIndex, IntIndex
from pandas.core.generic import NDFrame
from pandas.core.internals import BlockManager, _safe_reshape, make_block
from pandas.core.sparse.api import SparseDataFrame, SparseSeries
from pandas.io.common import _stringify_path, get_filepath_or_buffer
from pandas.io.msgpack import ExtType, Packer as _Packer, Unpacker as _Unpacker
# check which compression libs we have installed
try:
import zlib
def _check_zlib():
pass
except ImportError:
def _check_zlib():
raise ImportError('zlib is not installed')
_check_zlib.__doc__ = dedent(
"""\
Check if zlib is installed.
Raises
------
ImportError
Raised when zlib is not installed.
""",
)
try:
import blosc
def _check_blosc():
pass
except ImportError:
def _check_blosc():
raise ImportError('blosc is not installed')
_check_blosc.__doc__ = dedent(
"""\
Check if blosc is installed.
Raises
------
ImportError
Raised when blosc is not installed.
""",
)
# until we can pass this into our conversion functions,
# this is pretty hacky
compressor = None
def to_msgpack(path_or_buf, *args, **kwargs):
"""
msgpack (serialize) object to input file path
THIS IS AN EXPERIMENTAL LIBRARY and the storage format
may not be stable until a future release.
Parameters
----------
path_or_buf : string File path, buffer-like, or None
if None, return generated string
args : an object or objects to serialize
encoding : encoding for unicode objects
append : boolean whether to append to an existing msgpack
(default is False)
compress : type of compressor (zlib or blosc), default to None (no
compression)
"""
global compressor
compressor = kwargs.pop('compress', None)
if compressor:
compressor = u(compressor)
append = kwargs.pop('append', None)
if append:
mode = 'a+b'
else:
mode = 'wb'
def writer(fh):
for a in args:
fh.write(pack(a, **kwargs))
path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, compat.string_types):
with open(path_or_buf, mode) as fh:
writer(fh)
elif path_or_buf is None:
buf = compat.BytesIO()
writer(buf)
return buf.getvalue()
else:
writer(path_or_buf)
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
"""
Load msgpack pandas object from the specified
file path
THIS IS AN EXPERIMENTAL LIBRARY and the storage format
may not be stable until a future release.
Parameters
----------
path_or_buf : string File path, BytesIO like or string
encoding : Encoding for decoding msgpack str type
iterator : boolean, if True, return an iterator to the unpacker
(default is False)
Returns
-------
obj : same type as object stored in file
"""
path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
if iterator:
return Iterator(path_or_buf)
def read(fh):
unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs))
if len(unpacked_obj) == 1:
return unpacked_obj[0]
if should_close:
try:
path_or_buf.close()
except IOError:
pass
return unpacked_obj
# see if we have an actual file
if isinstance(path_or_buf, compat.string_types):
try:
exists = os.path.exists(path_or_buf)
except (TypeError, ValueError):
exists = False
if exists:
with open(path_or_buf, 'rb') as fh:
return read(fh)
if isinstance(path_or_buf, compat.binary_type):
# treat as a binary-like
fh = None
try:
# We can't distinguish between a path and a buffer of bytes in
# Python 2 so instead assume the first byte of a valid path is
# less than 0x80.
if compat.PY3 or ord(path_or_buf[0]) >= 0x80:
fh = compat.BytesIO(path_or_buf)
return read(fh)
finally:
if fh is not None:
fh.close()
elif hasattr(path_or_buf, 'read') and compat.callable(path_or_buf.read):
# treat as a buffer like
return read(path_or_buf)
raise ValueError('path_or_buf needs to be a string file path or file-like')
dtype_dict = {21: np.dtype('M8[ns]'),
u('datetime64[ns]'): np.dtype('M8[ns]'),
u('datetime64[us]'): np.dtype('M8[us]'),
22: np.dtype('m8[ns]'),
u('timedelta64[ns]'): np.dtype('m8[ns]'),
u('timedelta64[us]'): np.dtype('m8[us]'),
# this is platform int, which we need to remap to np.int64
# for compat on windows platforms
7: np.dtype('int64'),
'category': 'category'
}
def dtype_for(t):
""" return my dtype mapping, whether number or name """
if t in dtype_dict:
return dtype_dict[t]
return np.typeDict.get(t, t)
c2f_dict = {'complex': np.float64,
'complex128': np.float64,
'complex64': np.float32}
# windows (32 bit) compat
if hasattr(np, 'float128'):
c2f_dict['complex256'] = np.float128
def c2f(r, i, ctype_name):
"""
Convert strings to complex number instance with specified numpy type.
"""
ftype = c2f_dict[ctype_name]
return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i))
def convert(values):
""" convert the numpy values to a list """
dtype = values.dtype
if is_categorical_dtype(values):
return values
elif is_object_dtype(dtype):
return values.ravel().tolist()
if needs_i8_conversion(dtype):
values = values.view('i8')
v = values.ravel()
if compressor == 'zlib':
_check_zlib()
# return string arrays like they are
if dtype == np.object_:
return v.tolist()
# convert to a bytes array
v = v.tostring()
return ExtType(0, zlib.compress(v))
elif compressor == 'blosc':
_check_blosc()
# return string arrays like they are
if dtype == np.object_:
return v.tolist()
# convert to a bytes array
v = v.tostring()
return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))
# ndarray (on original dtype)
return ExtType(0, v.tostring())
def unconvert(values, dtype, compress=None):
as_is_ext = isinstance(values, ExtType) and values.code == 0
if as_is_ext:
values = values.data
if is_categorical_dtype(dtype):
return values
elif is_object_dtype(dtype):
return np.array(values, dtype=object)
dtype = pandas_dtype(dtype).base
if not as_is_ext:
values = values.encode('latin1')
if compress:
if compress == u'zlib':
_check_zlib()
decompress = zlib.decompress
elif compress == u'blosc':
_check_blosc()
decompress = blosc.decompress
else:
raise ValueError("compress must be one of 'zlib' or 'blosc'")
try:
return np.frombuffer(
_move_into_mutable_buffer(decompress(values)),
dtype=dtype,
)
except _BadMove as e:
# Pull the decompressed data off of the `_BadMove` exception.
# We don't just store this in the locals because we want to
# minimize the risk of giving users access to a `bytes` object
# whose data is also given to a mutable buffer.
values = e.args[0]
if len(values) > 1:
# The empty string and single characters are memoized in many
# string creating functions in the capi. This case should not
# warn even though we need to make a copy because we are only
# copying at most 1 byte.
warnings.warn(
'copying data after decompressing; this may mean that'
' decompress is caching its result',
PerformanceWarning,
)
# fall through to copying `np.fromstring`
# Copy the bytes into a numpy array.
buf = np.frombuffer(values, dtype=dtype)
buf = buf.copy() # required to not mutate the original data
buf.flags.writeable = True
return buf
def encode(obj):
"""
Data encoder
"""
tobj = type(obj)
if isinstance(obj, Index):
if isinstance(obj, RangeIndex):
return {u'typ': u'range_index',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'start': getattr(obj, '_start', None),
u'stop': getattr(obj, '_stop', None),
u'step': getattr(obj, '_step', None)}
elif isinstance(obj, PeriodIndex):
return {u'typ': u'period_index',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'freq': u_safe(getattr(obj, 'freqstr', None)),
u'dtype': u(obj.dtype.name),
u'data': convert(obj.asi8),
u'compress': compressor}
elif isinstance(obj, DatetimeIndex):
tz = getattr(obj, 'tz', None)
# store tz info and data as UTC
if tz is not None:
tz = u(tz.zone)
obj = obj.tz_convert('UTC')
return {u'typ': u'datetime_index',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'dtype': u(obj.dtype.name),
u'data': convert(obj.asi8),
u'freq': u_safe(getattr(obj, 'freqstr', None)),
u'tz': tz,
u'compress': compressor}
elif isinstance(obj, (IntervalIndex, IntervalArray)):
if isinstance(obj, IntervalIndex):
typ = u'interval_index'
else:
typ = u'interval_array'
return {u'typ': typ,
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'left': getattr(obj, 'left', None),
u'right': getattr(obj, 'right', None),
u'closed': getattr(obj, 'closed', None)}
elif isinstance(obj, MultiIndex):
return {u'typ': u'multi_index',
u'klass': u(obj.__class__.__name__),
u'names': getattr(obj, 'names', None),
u'dtype': u(obj.dtype.name),
u'data': convert(obj.values),
u'compress': compressor}
else:
return {u'typ': u'index',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'dtype': u(obj.dtype.name),
u'data': convert(obj.values),
u'compress': compressor}
elif isinstance(obj, Categorical):
return {u'typ': u'category',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'codes': obj.codes,
u'categories': obj.categories,
u'ordered': obj.ordered,
u'compress': compressor}
elif isinstance(obj, Series):
if isinstance(obj, SparseSeries):
raise NotImplementedError(
'msgpack sparse series is not implemented'
)
# d = {'typ': 'sparse_series',
# 'klass': obj.__class__.__name__,
# 'dtype': obj.dtype.name,
# 'index': obj.index,
# 'sp_index': obj.sp_index,
# 'sp_values': convert(obj.sp_values),
# 'compress': compressor}
# for f in ['name', 'fill_value', 'kind']:
# d[f] = getattr(obj, f, None)
# return d
else:
return {u'typ': u'series',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'index': obj.index,
u'dtype': u(obj.dtype.name),
u'data': convert(obj.values),
u'compress': compressor}
elif issubclass(tobj, NDFrame):
if isinstance(obj, SparseDataFrame):
raise NotImplementedError(
'msgpack sparse frame is not implemented'
)
# d = {'typ': 'sparse_dataframe',
# 'klass': obj.__class__.__name__,
# 'columns': obj.columns}
# for f in ['default_fill_value', 'default_kind']:
# d[f] = getattr(obj, f, None)
# d['data'] = dict([(name, ss)
# for name, ss in compat.iteritems(obj)])
# return d
else:
data = obj._data
if not data.is_consolidated():
data = data.consolidate()
# the block manager
return {u'typ': u'block_manager',
u'klass': u(obj.__class__.__name__),
u'axes': data.axes,
u'blocks': [{u'locs': b.mgr_locs.as_array,
u'values': convert(b.values),
u'shape': b.values.shape,
u'dtype': u(b.dtype.name),
u'klass': u(b.__class__.__name__),
u'compress': compressor} for b in data.blocks]
}
elif isinstance(obj, (datetime, date, np.datetime64, timedelta,
np.timedelta64)) or obj is NaT:
if isinstance(obj, Timestamp):
tz = obj.tzinfo
if tz is not None:
tz = u(tz.zone)
freq = obj.freq
if freq is not None:
freq = u(freq.freqstr)
return {u'typ': u'timestamp',
u'value': obj.value,
u'freq': freq,
u'tz': tz}
if obj is NaT:
return {u'typ': u'nat'}
elif isinstance(obj, np.timedelta64):
return {u'typ': u'timedelta64',
u'data': obj.view('i8')}
elif isinstance(obj, timedelta):
return {u'typ': u'timedelta',
u'data': (obj.days, obj.seconds, obj.microseconds)}
elif isinstance(obj, np.datetime64):
return {u'typ': u'datetime64',
u'data': u(str(obj))}
elif isinstance(obj, datetime):
return {u'typ': u'datetime',
u'data': u(obj.isoformat())}
elif isinstance(obj, date):
return {u'typ': u'date',
u'data': u(obj.isoformat())}
raise Exception(
"cannot encode this datetimelike object: {obj}".format(obj=obj))
elif isinstance(obj, Period):
return {u'typ': u'period',
u'ordinal': obj.ordinal,
u'freq': u_safe(obj.freqstr)}
elif isinstance(obj, Interval):
return {u'typ': u'interval',
u'left': obj.left,
u'right': obj.right,
u'closed': obj.closed}
elif isinstance(obj, BlockIndex):
return {u'typ': u'block_index',
u'klass': u(obj.__class__.__name__),
u'blocs': obj.blocs,
u'blengths': obj.blengths,
u'length': obj.length}
elif isinstance(obj, IntIndex):
return {u'typ': u'int_index',
u'klass': u(obj.__class__.__name__),
u'indices': obj.indices,
u'length': obj.length}
elif isinstance(obj, np.ndarray):
return {u'typ': u'ndarray',
u'shape': obj.shape,
u'ndim': obj.ndim,
u'dtype': u(obj.dtype.name),
u'data': convert(obj),
u'compress': compressor}
elif isinstance(obj, np.number):
if np.iscomplexobj(obj):
return {u'typ': u'np_scalar',
u'sub_typ': u'np_complex',
u'dtype': u(obj.dtype.name),
u'real': u(obj.real.__repr__()),
u'imag': u(obj.imag.__repr__())}
else:
return {u'typ': u'np_scalar',
u'dtype': u(obj.dtype.name),
u'data': u(obj.__repr__())}
elif isinstance(obj, complex):
return {u'typ': u'np_complex',
u'real': u(obj.real.__repr__()),
u'imag': u(obj.imag.__repr__())}
return obj
def decode(obj):
"""
Decoder for deserializing numpy data types.
"""
typ = obj.get(u'typ')
if typ is None:
return obj
elif typ == u'timestamp':
freq = obj[u'freq'] if 'freq' in obj else obj[u'offset']
return Timestamp(obj[u'value'], tz=obj[u'tz'], freq=freq)
elif typ == u'nat':
return NaT
elif typ == u'period':
return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq'])
elif typ == u'index':
dtype = dtype_for(obj[u'dtype'])
data = unconvert(obj[u'data'], dtype,
obj.get(u'compress'))
return Index(data, dtype=dtype, name=obj[u'name'])
elif typ == u'range_index':
return RangeIndex(obj[u'start'],
obj[u'stop'],
obj[u'step'],
name=obj[u'name'])
elif typ == u'multi_index':
dtype = dtype_for(obj[u'dtype'])
data = unconvert(obj[u'data'], dtype,
obj.get(u'compress'))
data = [tuple(x) for x in data]
return MultiIndex.from_tuples(data, names=obj[u'names'])
elif typ == u'period_index':
data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
d = dict(name=obj[u'name'], freq=obj[u'freq'])
freq = d.pop('freq', None)
return PeriodIndex(PeriodArray(data, freq), **d)
elif typ == u'datetime_index':
data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
d = dict(name=obj[u'name'], freq=obj[u'freq'])
result = DatetimeIndex(data, **d)
tz = obj[u'tz']
# reverse tz conversion
if tz is not None:
result = result.tz_localize('UTC').tz_convert(tz)
return result
elif typ in (u'interval_index', 'interval_array'):
return globals()[obj[u'klass']].from_arrays(obj[u'left'],
obj[u'right'],
obj[u'closed'],
name=obj[u'name'])
elif typ == u'category':
from_codes = globals()[obj[u'klass']].from_codes
return from_codes(codes=obj[u'codes'],
categories=obj[u'categories'],
ordered=obj[u'ordered'])
elif typ == u'interval':
return Interval(obj[u'left'], obj[u'right'], obj[u'closed'])
elif typ == u'series':
dtype = dtype_for(obj[u'dtype'])
pd_dtype = pandas_dtype(dtype)
index = obj[u'index']
result = Series(unconvert(obj[u'data'], dtype, obj[u'compress']),
index=index,
dtype=pd_dtype,
name=obj[u'name'])
return result
elif typ == u'block_manager':
axes = obj[u'axes']
def create_block(b):
values = _safe_reshape(unconvert(
b[u'values'], dtype_for(b[u'dtype']),
b[u'compress']), b[u'shape'])
# locs handles duplicate column names, and should be used instead
# of items; see GH 9618
if u'locs' in b:
placement = b[u'locs']
else:
placement = axes[0].get_indexer(b[u'items'])
if is_datetime64tz_dtype(b[u'dtype']):
assert isinstance(values, np.ndarray), type(values)
assert values.dtype == 'M8[ns]', values.dtype
values = DatetimeArray(values, dtype=b[u'dtype'])
return make_block(values=values,
klass=getattr(internals, b[u'klass']),
placement=placement,
dtype=b[u'dtype'])
blocks = [create_block(b) for b in obj[u'blocks']]
return globals()[obj[u'klass']](BlockManager(blocks, axes))
elif typ == u'datetime':
return parse(obj[u'data'])
elif typ == u'datetime64':
return np.datetime64(parse(obj[u'data']))
elif typ == u'date':
return parse(obj[u'data']).date()
elif typ == u'timedelta':
return timedelta(*obj[u'data'])
elif typ == u'timedelta64':
return np.timedelta64(int(obj[u'data']))
# elif typ == 'sparse_series':
# dtype = dtype_for(obj['dtype'])
# return SparseSeries(
# unconvert(obj['sp_values'], dtype, obj['compress']),
# sparse_index=obj['sp_index'], index=obj['index'],
# fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
# elif typ == 'sparse_dataframe':
# return SparseDataFrame(
# obj['data'], columns=obj['columns'],
# default_fill_value=obj['default_fill_value'],
# default_kind=obj['default_kind']
# )
# elif typ == 'sparse_panel':
# return SparsePanel(
# obj['data'], items=obj['items'],
# default_fill_value=obj['default_fill_value'],
# default_kind=obj['default_kind'])
elif typ == u'block_index':
return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'],
obj[u'blengths'])
elif typ == u'int_index':
return globals()[obj[u'klass']](obj[u'length'], obj[u'indices'])
elif typ == u'ndarray':
return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']],
obj.get(u'compress')).reshape(obj[u'shape'])
elif typ == u'np_scalar':
if obj.get(u'sub_typ') == u'np_complex':
return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype'])
else:
dtype = dtype_for(obj[u'dtype'])
try:
return dtype(obj[u'data'])
except (ValueError, TypeError):
return dtype.type(obj[u'data'])
elif typ == u'np_complex':
return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j')
elif isinstance(obj, (dict, list, set)):
return obj
else:
return obj
def pack(o, default=encode,
encoding='utf-8', unicode_errors='strict', use_single_float=False,
autoreset=1, use_bin_type=1):
"""
Pack an object and return the packed bytes.
"""
return Packer(default=default, encoding=encoding,
unicode_errors=unicode_errors,
use_single_float=use_single_float,
autoreset=autoreset,
use_bin_type=use_bin_type).pack(o)
def unpack(packed, object_hook=decode,
list_hook=None, use_list=False, encoding='utf-8',
unicode_errors='strict', object_pairs_hook=None,
max_buffer_size=0, ext_hook=ExtType):
"""
Unpack a packed object, return an iterator
Note: packed lists will be returned as tuples
"""
return Unpacker(packed, object_hook=object_hook,
list_hook=list_hook,
use_list=use_list, encoding=encoding,
unicode_errors=unicode_errors,
object_pairs_hook=object_pairs_hook,
max_buffer_size=max_buffer_size,
ext_hook=ext_hook)
class Packer(_Packer):
def __init__(self, default=encode,
encoding='utf-8',
unicode_errors='strict',
use_single_float=False,
autoreset=1,
use_bin_type=1):
super(Packer, self).__init__(default=default,
encoding=encoding,
unicode_errors=unicode_errors,
use_single_float=use_single_float,
autoreset=autoreset,
use_bin_type=use_bin_type)
class Unpacker(_Unpacker):
def __init__(self, file_like=None, read_size=0, use_list=False,
object_hook=decode,
object_pairs_hook=None, list_hook=None, encoding='utf-8',
unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
super(Unpacker, self).__init__(file_like=file_like,
read_size=read_size,
use_list=use_list,
object_hook=object_hook,
object_pairs_hook=object_pairs_hook,
list_hook=list_hook,
encoding=encoding,
unicode_errors=unicode_errors,
max_buffer_size=max_buffer_size,
ext_hook=ext_hook)
class Iterator(object):
""" manage the unpacking iteration,
close the file on completion """
def __init__(self, path, **kwargs):
self.path = path
self.kwargs = kwargs
def __iter__(self):
needs_closing = True
try:
# see if we have an actual file
if isinstance(self.path, compat.string_types):
try:
path_exists = os.path.exists(self.path)
except TypeError:
path_exists = False
if path_exists:
fh = open(self.path, 'rb')
else:
fh = compat.BytesIO(self.path)
else:
if not hasattr(self.path, 'read'):
fh = compat.BytesIO(self.path)
else:
# a file-like
needs_closing = False
fh = self.path
unpacker = unpack(fh)
for o in unpacker:
yield o
finally:
if needs_closing:
fh.close()
@@ -0,0 +1,282 @@
""" parquet compat """
from distutils.version import LooseVersion
from warnings import catch_warnings
from pandas.compat import string_types
from pandas.errors import AbstractMethodError
from pandas import DataFrame, get_option
from pandas.io.common import get_filepath_or_buffer, is_s3_url
def get_engine(engine):
""" return our implementation """
if engine == 'auto':
engine = get_option('io.parquet.engine')
if engine == 'auto':
# try engines in this order
try:
return PyArrowImpl()
except ImportError:
pass
try:
return FastParquetImpl()
except ImportError:
pass
raise ImportError("Unable to find a usable engine; "
"tried using: 'pyarrow', 'fastparquet'.\n"
"pyarrow or fastparquet is required for parquet "
"support")
if engine not in ['pyarrow', 'fastparquet']:
raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
if engine == 'pyarrow':
return PyArrowImpl()
elif engine == 'fastparquet':
return FastParquetImpl()
class BaseImpl(object):
api = None # module
@staticmethod
def validate_dataframe(df):
if not isinstance(df, DataFrame):
raise ValueError("to_parquet only supports IO with DataFrames")
# must have value column names (strings only)
if df.columns.inferred_type not in {'string', 'unicode'}:
raise ValueError("parquet must have string column names")
# index level names must be strings
valid_names = all(
isinstance(name, string_types)
for name in df.index.names
if name is not None
)
if not valid_names:
raise ValueError("Index level names must be strings")
def write(self, df, path, compression, **kwargs):
raise AbstractMethodError(self)
def read(self, path, columns=None, **kwargs):
raise AbstractMethodError(self)
class PyArrowImpl(BaseImpl):
def __init__(self):
# since pandas is a dependency of pyarrow
# we need to import on first use
try:
import pyarrow
import pyarrow.parquet
except ImportError:
raise ImportError(
"pyarrow is required for parquet support\n\n"
"you can install via conda\n"
"conda install pyarrow -c conda-forge\n"
"\nor via pip\n"
"pip install -U pyarrow\n"
)
if LooseVersion(pyarrow.__version__) < '0.9.0':
raise ImportError(
"pyarrow >= 0.9.0 is required for parquet support\n\n"
"you can install via conda\n"
"conda install pyarrow -c conda-forge\n"
"\nor via pip\n"
"pip install -U pyarrow\n"
)
self.api = pyarrow
def write(self, df, path, compression='snappy',
coerce_timestamps='ms', index=None, partition_cols=None,
**kwargs):
self.validate_dataframe(df)
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
if index is None:
from_pandas_kwargs = {}
else:
from_pandas_kwargs = {'preserve_index': index}
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
if partition_cols is not None:
self.api.parquet.write_to_dataset(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps,
partition_cols=partition_cols, **kwargs)
else:
self.api.parquet.write_table(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)
def read(self, path, columns=None, **kwargs):
path, _, _, should_close = get_filepath_or_buffer(path)
kwargs['use_pandas_metadata'] = True
result = self.api.parquet.read_table(path, columns=columns,
**kwargs).to_pandas()
if should_close:
try:
path.close()
except: # noqa: flake8
pass
return result
class FastParquetImpl(BaseImpl):
def __init__(self):
# since pandas is a dependency of fastparquet
# we need to import on first use
try:
import fastparquet
except ImportError:
raise ImportError(
"fastparquet is required for parquet support\n\n"
"you can install via conda\n"
"conda install fastparquet -c conda-forge\n"
"\nor via pip\n"
"pip install -U fastparquet"
)
if LooseVersion(fastparquet.__version__) < '0.2.1':
raise ImportError(
"fastparquet >= 0.2.1 is required for parquet "
"support\n\n"
"you can install via conda\n"
"conda install fastparquet -c conda-forge\n"
"\nor via pip\n"
"pip install -U fastparquet"
)
self.api = fastparquet
def write(self, df, path, compression='snappy', index=None,
partition_cols=None, **kwargs):
self.validate_dataframe(df)
# thriftpy/protocol/compact.py:339:
# DeprecationWarning: tostring() is deprecated.
# Use tobytes() instead.
if 'partition_on' in kwargs and partition_cols is not None:
raise ValueError("Cannot use both partition_on and "
"partition_cols. Use partition_cols for "
"partitioning data")
elif 'partition_on' in kwargs:
partition_cols = kwargs.pop('partition_on')
if partition_cols is not None:
kwargs['file_scheme'] = 'hive'
if is_s3_url(path):
# path is s3:// so we need to open the s3file in 'wb' mode.
# TODO: Support 'ab'
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
# And pass the opened s3file to the fastparquet internal impl.
kwargs['open_with'] = lambda path, _: path
else:
path, _, _, _ = get_filepath_or_buffer(path)
with catch_warnings(record=True):
self.api.write(path, df, compression=compression,
write_index=index, partition_on=partition_cols,
**kwargs)
def read(self, path, columns=None, **kwargs):
if is_s3_url(path):
# When path is s3:// an S3File is returned.
# We need to retain the original path(str) while also
# pass the S3File().open function to fsatparquet impl.
s3, _, _, should_close = get_filepath_or_buffer(path)
try:
parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
finally:
s3.close()
else:
path, _, _, _ = get_filepath_or_buffer(path)
parquet_file = self.api.ParquetFile(path)
return parquet_file.to_pandas(columns=columns, **kwargs)
def to_parquet(df, path, engine='auto', compression='snappy', index=None,
partition_cols=None, **kwargs):
"""
Write a DataFrame to the parquet format.
Parameters
----------
path : str
File path or Root Directory path. Will be used as Root Directory path
while writing a partitioned dataset.
.. versionchanged:: 0.24.0
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
behavior is to try 'pyarrow', falling back to 'fastparquet' if
'pyarrow' is unavailable.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file. If ``None``, the
engine's default behavior will be used.
.. versionadded 0.24.0
partition_cols : list, optional, default None
Column names by which to partition the dataset
Columns are partitioned in the order they are given
.. versionadded:: 0.24.0
kwargs
Additional keyword arguments passed to the engine
"""
impl = get_engine(engine)
return impl.write(df, path, compression=compression, index=index,
partition_cols=partition_cols, **kwargs)
def read_parquet(path, engine='auto', columns=None, **kwargs):
"""
Load a parquet object from the file path, returning a DataFrame.
.. versionadded 0.21.0
Parameters
----------
path : string
File path
columns : list, default=None
If not None, only these columns will be read from the file.
.. versionadded 0.21.1
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
behavior is to try 'pyarrow', falling back to 'fastparquet' if
'pyarrow' is unavailable.
kwargs are passed to the engine
Returns
-------
DataFrame
"""
impl = get_engine(engine)
return impl.read(path, columns=columns, **kwargs)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,201 @@
""" pickle compat """
import warnings
import numpy as np
from numpy.lib.format import read_array, write_array
from pandas.compat import PY3, BytesIO, cPickle as pkl, pickle_compat as pc
from pandas.io.common import _get_handle, _stringify_path
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
"""
Pickle (serialize) object to file.
Parameters
----------
obj : any object
Any python object.
path : str
File path where the pickled object will be stored.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
A string representing the compression to use in the output file. By
default, infers from the file extension in specified path.
.. versionadded:: 0.20.0
protocol : int
Int which indicates which protocol should be used by the pickler,
default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
values for this parameter depend on the version of Python. For Python
2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
For Python >= 3.4, 4 is a valid value. A negative value for the
protocol parameter is equivalent to setting its value to
HIGHEST_PROTOCOL.
.. [1] https://docs.python.org/3/library/pickle.html
.. versionadded:: 0.21.0
See Also
--------
read_pickle : Load pickled pandas object (or any object) from file.
DataFrame.to_hdf : Write DataFrame to an HDF5 file.
DataFrame.to_sql : Write DataFrame to a SQL database.
DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
Examples
--------
>>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
>>> original_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> pd.to_pickle(original_df, "./dummy.pkl")
>>> unpickled_df = pd.read_pickle("./dummy.pkl")
>>> unpickled_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> import os
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
f, fh = _get_handle(path, 'wb',
compression=compression,
is_text=False)
if protocol < 0:
protocol = pkl.HIGHEST_PROTOCOL
try:
f.write(pkl.dumps(obj, protocol=protocol))
finally:
for _f in fh:
_f.close()
def read_pickle(path, compression='infer'):
"""
Load pickled pandas object (or any object) from file.
.. warning::
Loading pickled data received from untrusted sources can be
unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.
Parameters
----------
path : str
File path where the pickled object will be loaded.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
or '.zip' respectively, and no decompression otherwise.
Set to None for no decompression.
.. versionadded:: 0.20.0
Returns
-------
unpickled : same type as object stored in file
See Also
--------
DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
Series.to_pickle : Pickle (serialize) Series object to file.
read_hdf : Read HDF5 file into a DataFrame.
read_sql : Read SQL query or database table into a DataFrame.
read_parquet : Load a parquet object, returning a DataFrame.
Examples
--------
>>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
>>> original_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> pd.to_pickle(original_df, "./dummy.pkl")
>>> unpickled_df = pd.read_pickle("./dummy.pkl")
>>> unpickled_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> import os
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
def read_wrapper(func):
# wrapper file handle open/close operation
f, fh = _get_handle(path, 'rb',
compression=compression,
is_text=False)
try:
return func(f)
finally:
for _f in fh:
_f.close()
def try_read(path, encoding=None):
# try with cPickle
# try with current pickle, if we have a Type Error then
# try with the compat pickle to handle subclass changes
# pass encoding only if its not None as py2 doesn't handle
# the param
# cpickle
# GH 6899
try:
with warnings.catch_warnings(record=True):
# We want to silence any warnings about, e.g. moved modules.
warnings.simplefilter("ignore", Warning)
return read_wrapper(lambda f: pkl.load(f))
except Exception: # noqa: E722
# reg/patched pickle
# compat not used in pandas/compat/pickle_compat.py::load
# TODO: remove except block OR modify pc.load to use compat
try:
return read_wrapper(
lambda f: pc.load(f, encoding=encoding, compat=False))
# compat pickle
except Exception: # noqa: E722
return read_wrapper(
lambda f: pc.load(f, encoding=encoding, compat=True))
try:
return try_read(path)
except Exception: # noqa: E722
if PY3:
return try_read(path, encoding='latin1')
raise
# compat with sparse pickle / unpickle
def _pickle_array(arr):
arr = arr.view(np.ndarray)
buf = BytesIO()
write_array(buf, arr)
return buf.getvalue()
def _unpickle_array(bytes):
arr = read_array(BytesIO(bytes))
return arr
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,40 @@
""" s3 support for remote file interactivity """
from pandas import compat
try:
import s3fs
from botocore.exceptions import NoCredentialsError
except ImportError:
raise ImportError("The s3fs library is required to handle s3 files")
if compat.PY3:
from urllib.parse import urlparse as parse_url
else:
from urlparse import urlparse as parse_url
def _strip_schema(url):
"""Returns the url without the s3:// part"""
result = parse_url(url)
return result.netloc + result.path
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=None, mode=None):
if mode is None:
mode = 'rb'
fs = s3fs.S3FileSystem(anon=False)
try:
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
except (compat.FileNotFoundError, NoCredentialsError):
# boto3 has troubles when trying to access a public file
# when credentialed...
# An OSError is raised if you have credentials, but they
# aren't valid for that bucket.
# A NoCredentialsError is raised if you don't have creds
# for that bucket.
fs = s3fs.S3FileSystem(anon=True)
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
return filepath_or_buffer, None, compression, True
@@ -0,0 +1 @@
from .sasreader import read_sas # noqa
@@ -0,0 +1,703 @@
"""
Read SAS7BDAT files
Based on code written by Jared Hobbs:
https://bitbucket.org/jaredhobbs/sas7bdat
See also:
https://github.com/BioStatMatt/sas7bdat
Partial documentation of the file format:
https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
Reference for binary data compression:
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
"""
from datetime import datetime
import struct
import numpy as np
from pandas.errors import EmptyDataError
import pandas as pd
from pandas import compat
from pandas.io.common import BaseIterator, get_filepath_or_buffer
from pandas.io.sas._sas import Parser
import pandas.io.sas.sas_constants as const
class _subheader_pointer(object):
pass
class _column(object):
pass
# SAS7BDAT represents a SAS data file in SAS7BDAT format.
class SAS7BDATReader(BaseIterator):
"""
Read SAS files in SAS7BDAT format.
Parameters
----------
path_or_buf : path name or buffer
Name of SAS file or file-like object pointing to SAS file
contents.
index : column identifier, defaults to None
Column to use as index.
convert_dates : boolean, defaults to True
Attempt to convert dates to Pandas datetime values. Note that
some rarely used SAS date formats may be unsupported.
blank_missing : boolean, defaults to True
Convert empty strings to missing values (SAS uses blanks to
indicate missing character variables).
chunksize : int, defaults to None
Return SAS7BDATReader object for iterations, returns chunks
with given number of lines.
encoding : string, defaults to None
String encoding.
convert_text : bool, defaults to True
If False, text variables are left as raw bytes.
convert_header_text : bool, defaults to True
If False, header text, including column names, are left as raw
bytes.
"""
def __init__(self, path_or_buf, index=None, convert_dates=True,
blank_missing=True, chunksize=None, encoding=None,
convert_text=True, convert_header_text=True):
self.index = index
self.convert_dates = convert_dates
self.blank_missing = blank_missing
self.chunksize = chunksize
self.encoding = encoding
self.convert_text = convert_text
self.convert_header_text = convert_header_text
self.default_encoding = "latin-1"
self.compression = ""
self.column_names_strings = []
self.column_names = []
self.column_formats = []
self.columns = []
self._current_page_data_subheader_pointers = []
self._cached_page = None
self._column_data_lengths = []
self._column_data_offsets = []
self._column_types = []
self._current_row_in_file_index = 0
self._current_row_on_page_index = 0
self._current_row_in_file_index = 0
self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
if isinstance(self._path_or_buf, compat.string_types):
self._path_or_buf = open(self._path_or_buf, 'rb')
self.handle = self._path_or_buf
self._get_properties()
self._parse_metadata()
def column_data_lengths(self):
"""Return a numpy int64 array of the column data lengths"""
return np.asarray(self._column_data_lengths, dtype=np.int64)
def column_data_offsets(self):
"""Return a numpy int64 array of the column offsets"""
return np.asarray(self._column_data_offsets, dtype=np.int64)
def column_types(self):
"""Returns a numpy character array of the column types:
s (string) or d (double)"""
return np.asarray(self._column_types, dtype=np.dtype('S1'))
def close(self):
try:
self.handle.close()
except AttributeError:
pass
def _get_properties(self):
# Check magic number
self._path_or_buf.seek(0)
self._cached_page = self._path_or_buf.read(288)
if self._cached_page[0:len(const.magic)] != const.magic:
self.close()
raise ValueError("magic number mismatch (not a SAS file?)")
# Get alignment information
align1, align2 = 0, 0
buf = self._read_bytes(const.align_1_offset, const.align_1_length)
if buf == const.u64_byte_checker_value:
align2 = const.align_2_value
self.U64 = True
self._int_length = 8
self._page_bit_offset = const.page_bit_offset_x64
self._subheader_pointer_length = const.subheader_pointer_length_x64
else:
self.U64 = False
self._page_bit_offset = const.page_bit_offset_x86
self._subheader_pointer_length = const.subheader_pointer_length_x86
self._int_length = 4
buf = self._read_bytes(const.align_2_offset, const.align_2_length)
if buf == const.align_1_checker_value:
align1 = const.align_2_value
total_align = align1 + align2
# Get endianness information
buf = self._read_bytes(const.endianness_offset,
const.endianness_length)
if buf == b'\x01':
self.byte_order = "<"
else:
self.byte_order = ">"
# Get encoding information
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
if buf in const.encoding_names:
self.file_encoding = const.encoding_names[buf]
else:
self.file_encoding = "unknown (code={name!s})".format(name=buf)
# Get platform information
buf = self._read_bytes(const.platform_offset, const.platform_length)
if buf == b'1':
self.platform = "unix"
elif buf == b'2':
self.platform = "windows"
else:
self.platform = "unknown"
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
self.name = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.name = self.name.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
self.file_type = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.file_type = self.file_type.decode(
self.encoding or self.default_encoding)
# Timestamp is epoch 01/01/1960
epoch = datetime(1960, 1, 1)
x = self._read_float(const.date_created_offset + align1,
const.date_created_length)
self.date_created = epoch + pd.to_timedelta(x, unit='s')
x = self._read_float(const.date_modified_offset + align1,
const.date_modified_length)
self.date_modified = epoch + pd.to_timedelta(x, unit='s')
self.header_length = self._read_int(const.header_size_offset + align1,
const.header_size_length)
# Read the rest of the header into cached_page.
buf = self._path_or_buf.read(self.header_length - 288)
self._cached_page += buf
if len(self._cached_page) != self.header_length:
self.close()
raise ValueError("The SAS7BDAT file appears to be truncated.")
self._page_length = self._read_int(const.page_size_offset + align1,
const.page_size_length)
self._page_count = self._read_int(const.page_count_offset + align1,
const.page_count_length)
buf = self._read_bytes(const.sas_release_offset + total_align,
const.sas_release_length)
self.sas_release = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.sas_release = self.sas_release.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.sas_server_type_offset + total_align,
const.sas_server_type_length)
self.server_type = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.server_type = self.server_type.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.os_version_number_offset + total_align,
const.os_version_number_length)
self.os_version = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.os_version = self.os_version.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.os_name_offset + total_align,
const.os_name_length)
buf = buf.rstrip(b'\x00 ')
if len(buf) > 0:
self.os_name = buf.decode(self.encoding or self.default_encoding)
else:
buf = self._read_bytes(const.os_maker_offset + total_align,
const.os_maker_length)
self.os_name = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.os_name = self.os_name.decode(
self.encoding or self.default_encoding)
def __next__(self):
da = self.read(nrows=self.chunksize or 1)
if da is None:
raise StopIteration
return da
# Read a single float of the given width (4 or 8).
def _read_float(self, offset, width):
if width not in (4, 8):
self.close()
raise ValueError("invalid float width")
buf = self._read_bytes(offset, width)
fd = "f" if width == 4 else "d"
return struct.unpack(self.byte_order + fd, buf)[0]
# Read a single signed integer of the given width (1, 2, 4 or 8).
def _read_int(self, offset, width):
if width not in (1, 2, 4, 8):
self.close()
raise ValueError("invalid int width")
buf = self._read_bytes(offset, width)
it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
iv = struct.unpack(self.byte_order + it, buf)[0]
return iv
def _read_bytes(self, offset, length):
if self._cached_page is None:
self._path_or_buf.seek(offset)
buf = self._path_or_buf.read(length)
if len(buf) < length:
self.close()
msg = "Unable to read {:d} bytes from file position {:d}."
raise ValueError(msg.format(length, offset))
return buf
else:
if offset + length > len(self._cached_page):
self.close()
raise ValueError("The cached page is too small.")
return self._cached_page[offset:offset + length]
def _parse_metadata(self):
done = False
while not done:
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
break
if len(self._cached_page) != self._page_length:
self.close()
raise ValueError(
"Failed to read a meta data page from the SAS file.")
done = self._process_page_meta()
def _process_page_meta(self):
self._read_page_header()
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
if self._current_page_type in pt:
self._process_page_metadata()
is_data_page = self._current_page_type & const.page_data_type
is_mix_page = self._current_page_type in const.page_mix_types
return (is_data_page or is_mix_page
or self._current_page_data_subheader_pointers != [])
def _read_page_header(self):
bit_offset = self._page_bit_offset
tx = const.page_type_offset + bit_offset
self._current_page_type = self._read_int(tx, const.page_type_length)
tx = const.block_count_offset + bit_offset
self._current_page_block_count = self._read_int(
tx, const.block_count_length)
tx = const.subheader_count_offset + bit_offset
self._current_page_subheaders_count = (
self._read_int(tx, const.subheader_count_length))
def _process_page_metadata(self):
bit_offset = self._page_bit_offset
for i in range(self._current_page_subheaders_count):
pointer = self._process_subheader_pointers(
const.subheader_pointers_offset + bit_offset, i)
if pointer.length == 0:
continue
if pointer.compression == const.truncated_subheader_id:
continue
subheader_signature = self._read_subheader_signature(
pointer.offset)
subheader_index = (
self._get_subheader_index(subheader_signature,
pointer.compression, pointer.ptype))
self._process_subheader(subheader_index, pointer)
def _get_subheader_index(self, signature, compression, ptype):
index = const.subheader_signature_to_index.get(signature)
if index is None:
f1 = ((compression == const.compressed_subheader_id) or
(compression == 0))
f2 = (ptype == const.compressed_subheader_type)
if (self.compression != "") and f1 and f2:
index = const.SASIndex.data_subheader_index
else:
self.close()
raise ValueError("Unknown subheader signature")
return index
def _process_subheader_pointers(self, offset, subheader_pointer_index):
subheader_pointer_length = self._subheader_pointer_length
total_offset = (offset +
subheader_pointer_length * subheader_pointer_index)
subheader_offset = self._read_int(total_offset, self._int_length)
total_offset += self._int_length
subheader_length = self._read_int(total_offset, self._int_length)
total_offset += self._int_length
subheader_compression = self._read_int(total_offset, 1)
total_offset += 1
subheader_type = self._read_int(total_offset, 1)
x = _subheader_pointer()
x.offset = subheader_offset
x.length = subheader_length
x.compression = subheader_compression
x.ptype = subheader_type
return x
def _read_subheader_signature(self, offset):
subheader_signature = self._read_bytes(offset, self._int_length)
return subheader_signature
def _process_subheader(self, subheader_index, pointer):
offset = pointer.offset
length = pointer.length
if subheader_index == const.SASIndex.row_size_index:
processor = self._process_rowsize_subheader
elif subheader_index == const.SASIndex.column_size_index:
processor = self._process_columnsize_subheader
elif subheader_index == const.SASIndex.column_text_index:
processor = self._process_columntext_subheader
elif subheader_index == const.SASIndex.column_name_index:
processor = self._process_columnname_subheader
elif subheader_index == const.SASIndex.column_attributes_index:
processor = self._process_columnattributes_subheader
elif subheader_index == const.SASIndex.format_and_label_index:
processor = self._process_format_subheader
elif subheader_index == const.SASIndex.column_list_index:
processor = self._process_columnlist_subheader
elif subheader_index == const.SASIndex.subheader_counts_index:
processor = self._process_subheader_counts
elif subheader_index == const.SASIndex.data_subheader_index:
self._current_page_data_subheader_pointers.append(pointer)
return
else:
raise ValueError("unknown subheader index")
processor(offset, length)
def _process_rowsize_subheader(self, offset, length):
int_len = self._int_length
lcs_offset = offset
lcp_offset = offset
if self.U64:
lcs_offset += 682
lcp_offset += 706
else:
lcs_offset += 354
lcp_offset += 378
self.row_length = self._read_int(
offset + const.row_length_offset_multiplier * int_len, int_len)
self.row_count = self._read_int(
offset + const.row_count_offset_multiplier * int_len, int_len)
self.col_count_p1 = self._read_int(
offset + const.col_count_p1_multiplier * int_len, int_len)
self.col_count_p2 = self._read_int(
offset + const.col_count_p2_multiplier * int_len, int_len)
mx = const.row_count_on_mix_page_offset_multiplier * int_len
self._mix_page_row_count = self._read_int(offset + mx, int_len)
self._lcs = self._read_int(lcs_offset, 2)
self._lcp = self._read_int(lcp_offset, 2)
def _process_columnsize_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
self.column_count = self._read_int(offset, int_len)
if (self.col_count_p1 + self.col_count_p2 !=
self.column_count):
print(
"Warning: column count mismatch ({p1} + {p2} != "
"{column_count})\n".format(
p1=self.col_count_p1, p2=self.col_count_p2,
column_count=self.column_count))
# Unknown purpose
def _process_subheader_counts(self, offset, length):
pass
def _process_columntext_subheader(self, offset, length):
offset += self._int_length
text_block_size = self._read_int(offset, const.text_block_size_length)
buf = self._read_bytes(offset, text_block_size)
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
cname = cname_raw
if self.convert_header_text:
cname = cname.decode(self.encoding or self.default_encoding)
self.column_names_strings.append(cname)
if len(self.column_names_strings) == 1:
compression_literal = ""
for cl in const.compression_literals:
if cl in cname_raw:
compression_literal = cl
self.compression = compression_literal
offset -= self._int_length
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
compression_literal = buf.rstrip(b"\x00")
if compression_literal == "":
self._lcs = 0
offset1 = offset + 32
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0:self._lcp]
elif compression_literal == const.rle_compression:
offset1 = offset + 40
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0:self._lcp]
elif self._lcs > 0:
self._lcp = 0
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcs)
self.creator_proc = buf[0:self._lcp]
if self.convert_header_text:
if hasattr(self, "creator_proc"):
self.creator_proc = self.creator_proc.decode(
self.encoding or self.default_encoding)
def _process_columnname_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
column_name_pointers_count = (length - 2 * int_len - 12) // 8
for i in range(column_name_pointers_count):
text_subheader = offset + const.column_name_pointer_length * \
(i + 1) + const.column_name_text_subheader_offset
col_name_offset = offset + const.column_name_pointer_length * \
(i + 1) + const.column_name_offset_offset
col_name_length = offset + const.column_name_pointer_length * \
(i + 1) + const.column_name_length_offset
idx = self._read_int(
text_subheader, const.column_name_text_subheader_length)
col_offset = self._read_int(
col_name_offset, const.column_name_offset_length)
col_len = self._read_int(
col_name_length, const.column_name_length_length)
name_str = self.column_names_strings[idx]
self.column_names.append(name_str[col_offset:col_offset + col_len])
def _process_columnattributes_subheader(self, offset, length):
int_len = self._int_length
column_attributes_vectors_count = (
length - 2 * int_len - 12) // (int_len + 8)
for i in range(column_attributes_vectors_count):
col_data_offset = (offset + int_len +
const.column_data_offset_offset +
i * (int_len + 8))
col_data_len = (offset + 2 * int_len +
const.column_data_length_offset +
i * (int_len + 8))
col_types = (offset + 2 * int_len +
const.column_type_offset + i * (int_len + 8))
x = self._read_int(col_data_offset, int_len)
self._column_data_offsets.append(x)
x = self._read_int(col_data_len, const.column_data_length_length)
self._column_data_lengths.append(x)
x = self._read_int(col_types, const.column_type_length)
self._column_types.append(b'd' if x == 1 else b's')
def _process_columnlist_subheader(self, offset, length):
# unknown purpose
pass
def _process_format_subheader(self, offset, length):
int_len = self._int_length
text_subheader_format = (
offset +
const.column_format_text_subheader_index_offset +
3 * int_len)
col_format_offset = (offset +
const.column_format_offset_offset +
3 * int_len)
col_format_len = (offset +
const.column_format_length_offset +
3 * int_len)
text_subheader_label = (
offset +
const.column_label_text_subheader_index_offset +
3 * int_len)
col_label_offset = (offset +
const.column_label_offset_offset +
3 * int_len)
col_label_len = offset + const.column_label_length_offset + 3 * int_len
x = self._read_int(text_subheader_format,
const.column_format_text_subheader_index_length)
format_idx = min(x, len(self.column_names_strings) - 1)
format_start = self._read_int(
col_format_offset, const.column_format_offset_length)
format_len = self._read_int(
col_format_len, const.column_format_length_length)
label_idx = self._read_int(
text_subheader_label,
const.column_label_text_subheader_index_length)
label_idx = min(label_idx, len(self.column_names_strings) - 1)
label_start = self._read_int(
col_label_offset, const.column_label_offset_length)
label_len = self._read_int(col_label_len,
const.column_label_length_length)
label_names = self.column_names_strings[label_idx]
column_label = label_names[label_start: label_start + label_len]
format_names = self.column_names_strings[format_idx]
column_format = format_names[format_start: format_start + format_len]
current_column_number = len(self.columns)
col = _column()
col.col_id = current_column_number
col.name = self.column_names[current_column_number]
col.label = column_label
col.format = column_format
col.ctype = self._column_types[current_column_number]
col.length = self._column_data_lengths[current_column_number]
self.column_formats.append(column_format)
self.columns.append(col)
def read(self, nrows=None):
if (nrows is None) and (self.chunksize is not None):
nrows = self.chunksize
elif nrows is None:
nrows = self.row_count
if len(self._column_types) == 0:
self.close()
raise EmptyDataError("No columns to parse from file")
if self._current_row_in_file_index >= self.row_count:
return None
m = self.row_count - self._current_row_in_file_index
if nrows > m:
nrows = m
nd = self._column_types.count(b'd')
ns = self._column_types.count(b's')
self._string_chunk = np.empty((ns, nrows), dtype=np.object)
self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
self._current_row_in_chunk_index = 0
p = Parser(self)
p.read(nrows)
rslt = self._chunk_to_dataframe()
if self.index is not None:
rslt = rslt.set_index(self.index)
return rslt
def _read_next_page(self):
self._current_page_data_subheader_pointers = []
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
return True
elif len(self._cached_page) != self._page_length:
self.close()
msg = ("failed to read complete page from file "
"(read {:d} of {:d} bytes)")
raise ValueError(msg.format(len(self._cached_page),
self._page_length))
self._read_page_header()
page_type = self._current_page_type
if page_type == const.page_meta_type:
self._process_page_metadata()
is_data_page = page_type & const.page_data_type
pt = [const.page_meta_type] + const.page_mix_types
if not is_data_page and self._current_page_type not in pt:
return self._read_next_page()
return False
def _chunk_to_dataframe(self):
n = self._current_row_in_chunk_index
m = self._current_row_in_file_index
ix = range(m - n, m)
rslt = pd.DataFrame(index=ix)
js, jb = 0, 0
for j in range(self.column_count):
name = self.column_names[j]
if self._column_types[j] == b'd':
rslt[name] = self._byte_chunk[jb, :].view(
dtype=self.byte_order + 'd')
rslt[name] = np.asarray(rslt[name], dtype=np.float64)
if self.convert_dates:
unit = None
if self.column_formats[j] in const.sas_date_formats:
unit = 'd'
elif self.column_formats[j] in const.sas_datetime_formats:
unit = 's'
if unit:
rslt[name] = pd.to_datetime(rslt[name], unit=unit,
origin="1960-01-01")
jb += 1
elif self._column_types[j] == b's':
rslt[name] = self._string_chunk[js, :]
if self.convert_text and (self.encoding is not None):
rslt[name] = rslt[name].str.decode(
self.encoding or self.default_encoding)
if self.blank_missing:
ii = rslt[name].str.len() == 0
rslt.loc[ii, name] = np.nan
js += 1
else:
self.close()
raise ValueError("unknown column type {type}".format(
type=self._column_types[j]))
return rslt
@@ -0,0 +1,171 @@
magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")
align_1_checker_value = b'3'
align_1_offset = 32
align_1_length = 1
align_1_value = 4
u64_byte_checker_value = b'3'
align_2_offset = 35
align_2_length = 1
align_2_value = 4
endianness_offset = 37
endianness_length = 1
platform_offset = 39
platform_length = 1
encoding_offset = 70
encoding_length = 1
dataset_offset = 92
dataset_length = 64
file_type_offset = 156
file_type_length = 8
date_created_offset = 164
date_created_length = 8
date_modified_offset = 172
date_modified_length = 8
header_size_offset = 196
header_size_length = 4
page_size_offset = 200
page_size_length = 4
page_count_offset = 204
page_count_length = 4
sas_release_offset = 216
sas_release_length = 8
sas_server_type_offset = 224
sas_server_type_length = 16
os_version_number_offset = 240
os_version_number_length = 16
os_maker_offset = 256
os_maker_length = 16
os_name_offset = 272
os_name_length = 16
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24
page_type_offset = 0
page_type_length = 2
block_count_offset = 2
block_count_length = 2
subheader_count_offset = 4
subheader_count_length = 2
page_meta_type = 0
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15
column_name_pointer_length = 8
column_name_text_subheader_offset = 0
column_name_text_subheader_length = 2
column_name_offset_offset = 2
column_name_offset_length = 2
column_name_length_offset = 4
column_name_length_length = 2
column_data_offset_offset = 8
column_data_length_offset = 8
column_data_length_length = 4
column_type_offset = 14
column_type_length = 1
column_format_text_subheader_index_offset = 22
column_format_text_subheader_index_length = 2
column_format_offset_offset = 24
column_format_offset_length = 2
column_format_length_offset = 26
column_format_length_length = 2
column_label_text_subheader_index_offset = 28
column_label_text_subheader_index_length = 2
column_label_offset_offset = 30
column_label_offset_length = 2
column_label_length_offset = 32
column_label_length_length = 2
rle_compression = b'SASYZCRL'
rdc_compression = b'SASYZCR2'
compression_literals = [rle_compression, rdc_compression]
# Incomplete list of encodings, using SAS nomenclature:
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}
class SASIndex(object):
row_size_index = 0
column_size_index = 1
subheader_counts_index = 2
column_text_index = 3
column_name_index = 4
column_attributes_index = 5
format_and_label_index = 6
column_list_index = 7
data_subheader_index = 8
subheader_signature_to_index = {
b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index}
# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN",
"MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS",
"MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR",
"NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV",
"WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD",
"YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ",
"YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC",
"YYQRD", "YYQRP", "YYQRS", "YYQRN",
"YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC",
"MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN",
"YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB",
"MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS",
"MINGUO")
sas_datetime_formats = ("DATETIME", "DTWKDATX",
"B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX",
"E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX",
"DATEAMPM", "DTDATE", "DTMONYY", "DTMONYY", "DTWKDATX",
"DTYEAR", "TOD", "MDYAMPM")
@@ -0,0 +1,464 @@
"""
Read a SAS XPort format file into a Pandas DataFrame.
Based on code from Jack Cushman (github.com/jcushman/xport).
The file format is defined here:
https://support.sas.com/techsup/technote/ts140.pdf
"""
from datetime import datetime
import struct
import warnings
import numpy as np
from pandas.util._decorators import Appender
import pandas as pd
from pandas import compat
from pandas.io.common import BaseIterator, get_filepath_or_buffer
_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
"000000000000000000000000000000 ")
_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!"
"000000000000000001600000000")
_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
"000000000000000000000000000000 ")
_correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
"000000000000000000000000000000 ")
_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label',
'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
'nifl', 'nifd', 'npos', '_']
_base_params_doc = """\
Parameters
----------
filepath_or_buffer : string or file-like object
Path to SAS file or object implementing binary read method."""
_params2_doc = """\
index : identifier of index column
Identifier of column that should be used as index of the DataFrame.
encoding : string
Encoding for text data.
chunksize : int
Read file `chunksize` lines at a time, returns iterator."""
_format_params_doc = """\
format : string
File format, only `xport` is currently supported."""
_iterator_doc = """\
iterator : boolean, default False
Return XportReader object for reading file incrementally."""
_read_sas_doc = """Read a SAS file into a DataFrame.
%(_base_params_doc)s
%(_format_params_doc)s
%(_params2_doc)s
%(_iterator_doc)s
Returns
-------
DataFrame or XportReader
Examples
--------
Read a SAS Xport file:
>>> df = pd.read_sas('filename.XPT')
Read a Xport file in 10,000 line chunks:
>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
>>> do_something(chunk)
""" % {"_base_params_doc": _base_params_doc,
"_format_params_doc": _format_params_doc,
"_params2_doc": _params2_doc,
"_iterator_doc": _iterator_doc}
_xport_reader_doc = """\
Class for reading SAS Xport files.
%(_base_params_doc)s
%(_params2_doc)s
Attributes
----------
member_info : list
Contains information about the file
fields : list
Contains information about the variables in the file
""" % {"_base_params_doc": _base_params_doc,
"_params2_doc": _params2_doc}
_read_method_doc = """\
Read observations from SAS Xport file, returning as data frame.
Parameters
----------
nrows : int
Number of rows to read from data file; if None, read whole
file.
Returns
-------
A DataFrame.
"""
def _parse_date(datestr):
""" Given a date in xport format, return Python date. """
try:
# e.g. "16FEB11:10:07:55"
return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
except ValueError:
return pd.NaT
def _split_line(s, parts):
"""
Parameters
----------
s: string
Fixed-length string to split
parts: list of (name, length) pairs
Used to break up string, name '_' will be filtered from output.
Returns
-------
Dict of name:contents of string at given location.
"""
out = {}
start = 0
for name, length in parts:
out[name] = s[start:start + length].strip()
start += length
del out['_']
return out
def _handle_truncated_float_vec(vec, nbytes):
# This feature is not well documented, but some SAS XPORT files
# have 2-7 byte "truncated" floats. To read these truncated
# floats, pad them with zeros on the right to make 8 byte floats.
#
# References:
# https://github.com/jcushman/xport/pull/3
# The R "foreign" library
if nbytes != 8:
vec1 = np.zeros(len(vec), np.dtype('S8'))
dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
vec2 = vec1.view(dtype=dtype)
vec2['f0'] = vec
return vec2
return vec
def _parse_float_vec(vec):
"""
Parse a vector of float values representing IBM 8 byte floats into
native 8 byte floats.
"""
dtype = np.dtype('>u4,>u4')
vec1 = vec.view(dtype=dtype)
xport1 = vec1['f0']
xport2 = vec1['f1']
# Start by setting first half of ieee number to first half of IBM
# number sans exponent
ieee1 = xport1 & 0x00ffffff
# The fraction bit to the left of the binary point in the ieee
# format was set and the number was shifted 0, 1, 2, or 3
# places. This will tell us how to adjust the ibm exponent to be a
# power of 2 ieee exponent and how to shift the fraction bits to
# restore the correct magnitude.
shift = np.zeros(len(vec), dtype=np.uint8)
shift[np.where(xport1 & 0x00200000)] = 1
shift[np.where(xport1 & 0x00400000)] = 2
shift[np.where(xport1 & 0x00800000)] = 3
# shift the ieee number down the correct number of places then
# set the second half of the ieee number to be the second half
# of the ibm number shifted appropriately, ored with the bits
# from the first half that would have been shifted in if we
# could shift a double. All we are worried about are the low
# order 3 bits of the first half since we're only shifting by
# 1, 2, or 3.
ieee1 >>= shift
ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
# clear the 1 bit to the left of the binary point
ieee1 &= 0xffefffff
# set the exponent of the ieee number to be the actual exponent
# plus the shift count + 1023. Or this into the first half of the
# ieee number. The ibm exponent is excess 64 but is adjusted by 65
# since during conversion to ibm format the exponent is
# incremented by 1 and the fraction bits left 4 positions to the
# right of the radix point. (had to add >> 24 because C treats &
# 0x7f as 0x7f000000 and Python doesn't)
ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +
shift + 1023) << 20) | (xport1 & 0x80000000)
ieee = np.empty((len(ieee1),), dtype='>u4,>u4')
ieee['f0'] = ieee1
ieee['f1'] = ieee2
ieee = ieee.view(dtype='>f8')
ieee = ieee.astype('f8')
return ieee
class XportReader(BaseIterator):
__doc__ = _xport_reader_doc
def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
chunksize=None):
self._encoding = encoding
self._lines_read = 0
self._index = index
self._chunksize = chunksize
if isinstance(filepath_or_buffer, str):
(filepath_or_buffer, encoding,
compression, should_close) = get_filepath_or_buffer(
filepath_or_buffer, encoding=encoding)
if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
else:
# Copy to BytesIO, and ensure no encoding
contents = filepath_or_buffer.read()
try:
contents = contents.encode(self._encoding)
except UnicodeEncodeError:
pass
self.filepath_or_buffer = compat.BytesIO(contents)
self._read_header()
def close(self):
self.filepath_or_buffer.close()
def _get_row(self):
return self.filepath_or_buffer.read(80).decode()
def _read_header(self):
self.filepath_or_buffer.seek(0)
# read file header
line1 = self._get_row()
if line1 != _correct_line1:
self.close()
raise ValueError("Header record is not an XPORT file.")
line2 = self._get_row()
fif = [['prefix', 24], ['version', 8], ['OS', 8],
['_', 24], ['created', 16]]
file_info = _split_line(line2, fif)
if file_info['prefix'] != "SAS SAS SASLIB":
self.close()
raise ValueError("Header record has invalid prefix.")
file_info['created'] = _parse_date(file_info['created'])
self.file_info = file_info
line3 = self._get_row()
file_info['modified'] = _parse_date(line3[:16])
# read member header
header1 = self._get_row()
header2 = self._get_row()
headflag1 = header1.startswith(_correct_header1)
headflag2 = (header2 == _correct_header2)
if not (headflag1 and headflag2):
self.close()
raise ValueError("Member header not found")
# usually 140, could be 135
fieldnamelength = int(header1[-5:-2])
# member info
mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
member_info = _split_line(self._get_row(), mem)
mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
member_info.update(_split_line(self._get_row(), mem))
member_info['modified'] = _parse_date(member_info['modified'])
member_info['created'] = _parse_date(member_info['created'])
self.member_info = member_info
# read field names
types = {1: 'numeric', 2: 'char'}
fieldcount = int(self._get_row()[54:58])
datalength = fieldnamelength * fieldcount
# round up to nearest 80
if datalength % 80:
datalength += 80 - datalength % 80
fielddata = self.filepath_or_buffer.read(datalength)
fields = []
obs_length = 0
while len(fielddata) >= fieldnamelength:
# pull data for one field
field, fielddata = (fielddata[:fieldnamelength],
fielddata[fieldnamelength:])
# rest at end gets ignored, so if field is short, pad out
# to match struct pattern below
field = field.ljust(140)
fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field)
field = dict(zip(_fieldkeys, fieldstruct))
del field['_']
field['ntype'] = types[field['ntype']]
fl = field['field_length']
if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
self.close()
msg = "Floating field width {0} is not between 2 and 8."
raise TypeError(msg.format(fl))
for k, v in field.items():
try:
field[k] = v.strip()
except AttributeError:
pass
obs_length += field['field_length']
fields += [field]
header = self._get_row()
if not header == _correct_obs_header:
self.close()
raise ValueError("Observation header not found.")
self.fields = fields
self.record_length = obs_length
self.record_start = self.filepath_or_buffer.tell()
self.nobs = self._record_count()
self.columns = [x['name'].decode() for x in self.fields]
# Setup the dtype.
dtypel = [('s' + str(i), "S" + str(field['field_length']))
for i, field in enumerate(self.fields)]
dtype = np.dtype(dtypel)
self._dtype = dtype
def __next__(self):
return self.read(nrows=self._chunksize or 1)
def _record_count(self):
"""
Get number of records in file.
This is maybe suboptimal because we have to seek to the end of
the file.
Side effect: returns file position to record_start.
"""
self.filepath_or_buffer.seek(0, 2)
total_records_length = (self.filepath_or_buffer.tell() -
self.record_start)
if total_records_length % 80 != 0:
warnings.warn("xport file may be corrupted")
if self.record_length > 80:
self.filepath_or_buffer.seek(self.record_start)
return total_records_length // self.record_length
self.filepath_or_buffer.seek(-80, 2)
last_card = self.filepath_or_buffer.read(80)
last_card = np.frombuffer(last_card, dtype=np.uint64)
# 8 byte blank
ix = np.flatnonzero(last_card == 2314885530818453536)
if len(ix) == 0:
tail_pad = 0
else:
tail_pad = 8 * len(ix)
self.filepath_or_buffer.seek(self.record_start)
return (total_records_length - tail_pad) // self.record_length
def get_chunk(self, size=None):
"""
Reads lines from Xport file and returns as dataframe
Parameters
----------
size : int, defaults to None
Number of lines to read. If None, reads whole file.
Returns
-------
DataFrame
"""
if size is None:
size = self._chunksize
return self.read(nrows=size)
def _missing_double(self, vec):
v = vec.view(dtype='u1,u1,u2,u4')
miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
(v['f0'] == 0x5f) | (v['f0'] == 0x2e))
miss &= miss1
return miss
@Appender(_read_method_doc)
def read(self, nrows=None):
if nrows is None:
nrows = self.nobs
read_lines = min(nrows, self.nobs - self._lines_read)
read_len = read_lines * self.record_length
if read_len <= 0:
self.close()
raise StopIteration
raw = self.filepath_or_buffer.read(read_len)
data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
df = pd.DataFrame(index=range(read_lines))
for j, x in enumerate(self.columns):
vec = data['s%d' % j]
ntype = self.fields[j]['ntype']
if ntype == "numeric":
vec = _handle_truncated_float_vec(
vec, self.fields[j]['field_length'])
miss = self._missing_double(vec)
v = _parse_float_vec(vec)
v[miss] = np.nan
elif self.fields[j]['ntype'] == 'char':
v = [y.rstrip() for y in vec]
if compat.PY3:
if self._encoding is not None:
v = [y.decode(self._encoding) for y in v]
df[x] = v
if self._index is None:
df.index = range(self._lines_read, self._lines_read + read_lines)
else:
df = df.set_index(self._index)
self._lines_read += read_lines
return df
@@ -0,0 +1,68 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas import compat
from pandas.io.common import _stringify_path
def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
chunksize=None, iterator=False):
"""
Read SAS files stored as either XPORT or SAS7BDAT format files.
Parameters
----------
filepath_or_buffer : string or file-like object
Path to the SAS file.
format : string {'xport', 'sas7bdat'} or None
If None, file format is inferred from file extension. If 'xport' or
'sas7bdat', uses the corresponding format.
index : identifier of index column, defaults to None
Identifier of column that should be used as index of the DataFrame.
encoding : string, default is None
Encoding for text data. If None, text data are stored as raw bytes.
chunksize : int
Read file `chunksize` lines at a time, returns iterator.
iterator : bool, defaults to False
If True, returns an iterator for reading the file incrementally.
Returns
-------
DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
or XportReader
"""
if format is None:
buffer_error_msg = ("If this is a buffer object rather "
"than a string name, you must specify "
"a format string")
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, compat.string_types):
raise ValueError(buffer_error_msg)
fname = filepath_or_buffer.lower()
if fname.endswith(".xpt"):
format = "xport"
elif fname.endswith(".sas7bdat"):
format = "sas7bdat"
else:
raise ValueError("unable to infer format of SAS file")
if format.lower() == 'xport':
from pandas.io.sas.sas_xport import XportReader
reader = XportReader(filepath_or_buffer, index=index,
encoding=encoding,
chunksize=chunksize)
elif format.lower() == 'sas7bdat':
from pandas.io.sas.sas7bdat import SAS7BDATReader
reader = SAS7BDATReader(filepath_or_buffer, index=index,
encoding=encoding,
chunksize=chunksize)
else:
raise ValueError('unknown SAS format')
if iterator or chunksize:
return reader
data = reader.read()
reader.close()
return data
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff