Static code analysis and corrections

This commit is contained in:
Kristjan Komlosi
2019-07-17 16:06:09 +02:00
parent 674692c2fc
commit 21bfae9fbc
10086 changed files with 2102103 additions and 51 deletions
@@ -0,0 +1,32 @@
"""
Data IO api
"""
# flake8: noqa
from pandas.io.parsers import read_csv, read_table, read_fwf
from pandas.io.clipboards import read_clipboard
from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
from pandas.io.pytables import HDFStore, get_store, read_hdf
from pandas.io.json import read_json
from pandas.io.html import read_html
from pandas.io.sql import read_sql, read_sql_table, read_sql_query
from pandas.io.sas import read_sas
from pandas.io.feather_format import read_feather
from pandas.io.parquet import read_parquet
from pandas.io.stata import read_stata
from pandas.io.pickle import read_pickle, to_pickle
from pandas.io.packers import read_msgpack, to_msgpack
from pandas.io.gbq import read_gbq
# deprecation, xref #13790
def Term(*args, **kwargs):
import warnings
warnings.warn("pd.Term is deprecated as it is not "
"applicable to user code. Instead use in-line "
"string expressions in the where clause when "
"searching in HDFStore",
FutureWarning, stacklevel=2)
from pandas.io.pytables import Term
return Term(*args, **kwargs)
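# Illustrative sketch: calling the deprecated pd.Term shim above emits a
# FutureWarning before delegating to pandas.io.pytables.Term. The query
# string here is a hypothetical example and may not build a usable Term
# outside of an HDFStore query.
import warnings
import pandas as pd

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    try:
        pd.Term("index > 5")
    except Exception:
        pass  # Term construction may fail without an HDFStore context
    assert any(issubclass(w.category, FutureWarning) for w in caught)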
@@ -0,0 +1,125 @@
"""
Pyperclip
A cross-platform clipboard module for Python. (only handles plain text for now)
By Al Sweigart al@inventwithpython.com
BSD License
Usage:
import pyperclip
pyperclip.copy('The text to be copied to the clipboard.')
spam = pyperclip.paste()
if not pyperclip.copy:
print("Copy functionality unavailable!")
On Windows, no additional modules are needed.
On Mac, the module uses pbcopy and pbpaste, which should come with the OS.
On Linux, install xclip or xsel via package manager. For example, in Debian:
sudo apt-get install xclip
Otherwise on Linux, you will need the gtk, qtpy or PyQt modules installed.
qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2
gtk and PyQt4 modules are not available for Python 3,
and this module does not work with PyGObject yet.
"""
__version__ = '1.5.27'
import platform
import os
import subprocess
from .clipboards import (init_osx_clipboard,
init_gtk_clipboard, init_qt_clipboard,
init_xclip_clipboard, init_xsel_clipboard,
init_klipper_clipboard, init_no_clipboard)
from .windows import init_windows_clipboard
# `import qtpy` sys.exit()s if DISPLAY is not in the environment.
# Thus, we need to detect the presence of $DISPLAY manually
# and not load qtpy if it is absent.
HAS_DISPLAY = os.getenv("DISPLAY", False)
CHECK_CMD = "where" if platform.system() == "Windows" else "which"
def _executable_exists(name):
return subprocess.call([CHECK_CMD, name],
stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
def determine_clipboard():
# Determine the OS/platform and set
# the copy() and paste() functions accordingly.
if 'cygwin' in platform.system().lower():
# FIXME: pyperclip currently does not support Cygwin,
# see https://github.com/asweigart/pyperclip/issues/55
pass
elif os.name == 'nt' or platform.system() == 'Windows':
return init_windows_clipboard()
if os.name == 'mac' or platform.system() == 'Darwin':
return init_osx_clipboard()
if HAS_DISPLAY:
# Determine which command/module is installed, if any.
try:
# Check if gtk is installed
import gtk # noqa
except ImportError:
pass
else:
return init_gtk_clipboard()
try:
# qtpy is a small abstraction layer that lets you write
# applications using a single api call to either PyQt or PySide
# https://pypi.org/project/QtPy
import qtpy # noqa
except ImportError:
# If qtpy isn't installed, fall back on importing PyQt5, then PyQt4
try:
import PyQt5 # noqa
except ImportError:
try:
import PyQt4 # noqa
except ImportError:
pass # fail fast for all non-ImportError exceptions.
else:
return init_qt_clipboard()
else:
return init_qt_clipboard()
pass
else:
return init_qt_clipboard()
if _executable_exists("xclip"):
return init_xclip_clipboard()
if _executable_exists("xsel"):
return init_xsel_clipboard()
if _executable_exists("klipper") and _executable_exists("qdbus"):
return init_klipper_clipboard()
return init_no_clipboard()
def set_clipboard(clipboard):
global copy, paste
clipboard_types = {'osx': init_osx_clipboard,
'gtk': init_gtk_clipboard,
'qt': init_qt_clipboard,
'xclip': init_xclip_clipboard,
'xsel': init_xsel_clipboard,
'klipper': init_klipper_clipboard,
'windows': init_windows_clipboard,
'no': init_no_clipboard}
copy, paste = clipboard_types[clipboard]()
copy, paste = determine_clipboard()
__all__ = ["copy", "paste"]
# pandas aliases
clipboard_get = paste
clipboard_set = copy
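# Illustrative usage sketch, assuming a working clipboard mechanism is
# available (e.g. xclip/xsel on Linux, pbcopy/pbpaste on macOS):
# determine_clipboard() picks a backend at import time, set_clipboard()
# can force a specific one, and pandas itself only uses the
# clipboard_get/clipboard_set aliases defined above.
from pandas.io.clipboard import clipboard_get, clipboard_set

clipboard_set(u"spam")             # copy text to the system clipboard
assert clipboard_get() == u"spam"  # paste it back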
@@ -0,0 +1,143 @@
import subprocess
from .exceptions import PyperclipException
from pandas.compat import PY2, text_type
EXCEPT_MSG = """
Pyperclip could not find a copy/paste mechanism for your system.
For more information, please visit https://pyperclip.readthedocs.org """
def init_osx_clipboard():
def copy_osx(text):
p = subprocess.Popen(['pbcopy', 'w'],
stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=text.encode('utf-8'))
def paste_osx():
p = subprocess.Popen(['pbpaste', 'r'],
stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
return stdout.decode('utf-8')
return copy_osx, paste_osx
def init_gtk_clipboard():
import gtk
def copy_gtk(text):
global cb
cb = gtk.Clipboard()
cb.set_text(text)
cb.store()
def paste_gtk():
clipboardContents = gtk.Clipboard().wait_for_text()
# for python 2, returns None if the clipboard is blank.
if clipboardContents is None:
return ''
else:
return clipboardContents
return copy_gtk, paste_gtk
def init_qt_clipboard():
# $DISPLAY should exist
# Try to import from qtpy, but if that fails try PyQt5 then PyQt4
try:
from qtpy.QtWidgets import QApplication
except ImportError:
try:
from PyQt5.QtWidgets import QApplication
except ImportError:
from PyQt4.QtGui import QApplication
app = QApplication.instance()
if app is None:
app = QApplication([])
def copy_qt(text):
cb = app.clipboard()
cb.setText(text)
def paste_qt():
cb = app.clipboard()
return text_type(cb.text())
return copy_qt, paste_qt
def init_xclip_clipboard():
def copy_xclip(text):
p = subprocess.Popen(['xclip', '-selection', 'c'],
stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=text.encode('utf-8'))
def paste_xclip():
p = subprocess.Popen(['xclip', '-selection', 'c', '-o'],
stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
return stdout.decode('utf-8')
return copy_xclip, paste_xclip
def init_xsel_clipboard():
def copy_xsel(text):
p = subprocess.Popen(['xsel', '-b', '-i'],
stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=text.encode('utf-8'))
def paste_xsel():
p = subprocess.Popen(['xsel', '-b', '-o'],
stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
return stdout.decode('utf-8')
return copy_xsel, paste_xsel
def init_klipper_clipboard():
def copy_klipper(text):
p = subprocess.Popen(
['qdbus', 'org.kde.klipper', '/klipper', 'setClipboardContents',
text.encode('utf-8')],
stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=None)
def paste_klipper():
p = subprocess.Popen(
['qdbus', 'org.kde.klipper', '/klipper', 'getClipboardContents'],
stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
# Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
# TODO: https://github.com/asweigart/pyperclip/issues/43
clipboardContents = stdout.decode('utf-8')
# even if blank, Klipper will append a newline at the end
assert len(clipboardContents) > 0
# make sure that newline is there
assert clipboardContents.endswith('\n')
if clipboardContents.endswith('\n'):
clipboardContents = clipboardContents[:-1]
return clipboardContents
return copy_klipper, paste_klipper
def init_no_clipboard():
class ClipboardUnavailable(object):
def __call__(self, *args, **kwargs):
raise PyperclipException(EXCEPT_MSG)
if PY2:
def __nonzero__(self):
return False
else:
def __bool__(self):
return False
return ClipboardUnavailable(), ClipboardUnavailable()
@@ -0,0 +1,12 @@
import ctypes
class PyperclipException(RuntimeError):
pass
class PyperclipWindowsException(PyperclipException):
def __init__(self, message):
message += " ({err})".format(err=ctypes.WinError())
super(PyperclipWindowsException, self).__init__(message)
@@ -0,0 +1,153 @@
"""
This module implements clipboard handling on Windows using ctypes.
"""
import time
import contextlib
import ctypes
from ctypes import c_size_t, sizeof, c_wchar_p, get_errno, c_wchar
from .exceptions import PyperclipWindowsException
class CheckedCall(object):
def __init__(self, f):
super(CheckedCall, self).__setattr__("f", f)
def __call__(self, *args):
ret = self.f(*args)
if not ret and get_errno():
raise PyperclipWindowsException("Error calling " + self.f.__name__)
return ret
def __setattr__(self, key, value):
setattr(self.f, key, value)
def init_windows_clipboard():
from ctypes.wintypes import (HGLOBAL, LPVOID, DWORD, LPCSTR, INT, HWND,
HINSTANCE, HMENU, BOOL, UINT, HANDLE)
windll = ctypes.windll
safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
safeCreateWindowExA.argtypes = [DWORD, LPCSTR, LPCSTR, DWORD, INT, INT,
INT, INT, HWND, HMENU, HINSTANCE, LPVOID]
safeCreateWindowExA.restype = HWND
safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
safeDestroyWindow.argtypes = [HWND]
safeDestroyWindow.restype = BOOL
OpenClipboard = windll.user32.OpenClipboard
OpenClipboard.argtypes = [HWND]
OpenClipboard.restype = BOOL
safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
safeCloseClipboard.argtypes = []
safeCloseClipboard.restype = BOOL
safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
safeEmptyClipboard.argtypes = []
safeEmptyClipboard.restype = BOOL
safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
safeGetClipboardData.argtypes = [UINT]
safeGetClipboardData.restype = HANDLE
safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
safeSetClipboardData.argtypes = [UINT, HANDLE]
safeSetClipboardData.restype = HANDLE
safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
safeGlobalAlloc.argtypes = [UINT, c_size_t]
safeGlobalAlloc.restype = HGLOBAL
safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
safeGlobalLock.argtypes = [HGLOBAL]
safeGlobalLock.restype = LPVOID
safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
safeGlobalUnlock.argtypes = [HGLOBAL]
safeGlobalUnlock.restype = BOOL
GMEM_MOVEABLE = 0x0002
CF_UNICODETEXT = 13
@contextlib.contextmanager
def window():
"""
Context that provides a valid Windows hwnd.
"""
# we really just need the hwnd, so setting "STATIC"
# as predefined lpClass is just fine.
hwnd = safeCreateWindowExA(0, b"STATIC", None, 0, 0, 0, 0, 0,
None, None, None, None)
try:
yield hwnd
finally:
safeDestroyWindow(hwnd)
@contextlib.contextmanager
def clipboard(hwnd):
"""
Context manager that opens the clipboard and prevents
other applications from modifying the clipboard content.
"""
# We may not get the clipboard handle immediately because
# some other application is accessing it (?)
# We try for at least 500ms to get the clipboard.
t = time.time() + 0.5
success = False
while time.time() < t:
success = OpenClipboard(hwnd)
if success:
break
time.sleep(0.01)
if not success:
raise PyperclipWindowsException("Error calling OpenClipboard")
try:
yield
finally:
safeCloseClipboard()
def copy_windows(text):
# This function is heavily based on
# http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard
with window() as hwnd:
# http://msdn.com/ms649048
# If an application calls OpenClipboard with hwnd set to NULL,
# EmptyClipboard sets the clipboard owner to NULL;
# this causes SetClipboardData to fail.
# => We need a valid hwnd to copy something.
with clipboard(hwnd):
safeEmptyClipboard()
if text:
# http://msdn.com/ms649051
# If the hMem parameter identifies a memory object,
# the object must have been allocated using the
# function with the GMEM_MOVEABLE flag.
count = len(text) + 1
handle = safeGlobalAlloc(GMEM_MOVEABLE,
count * sizeof(c_wchar))
locked_handle = safeGlobalLock(handle)
ctypes.memmove(c_wchar_p(locked_handle),
c_wchar_p(text), count * sizeof(c_wchar))
safeGlobalUnlock(handle)
safeSetClipboardData(CF_UNICODETEXT, handle)
def paste_windows():
with clipboard(None):
handle = safeGetClipboardData(CF_UNICODETEXT)
if not handle:
# GetClipboardData may return NULL with errno == NO_ERROR
# if the clipboard is empty.
# (Also, it may return a handle to an empty buffer,
# but technically that's not empty)
return ""
return c_wchar_p(handle).value
return copy_windows, paste_windows
@@ -0,0 +1,140 @@
""" io on the clipboard """
from pandas import compat, get_option, option_context, DataFrame
from pandas.compat import StringIO, PY2, PY3
import warnings
def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover
r"""
Read text from clipboard and pass to read_table. See read_table for the
full argument list
Parameters
----------
sep : str, default '\s+'.
A string or regex delimiter. The default of '\s+' denotes
one or more whitespace characters.
Returns
-------
parsed : DataFrame
"""
encoding = kwargs.pop('encoding', 'utf-8')
# only utf-8 is valid for passed value because that's what clipboard
# supports
if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
raise NotImplementedError(
'reading from clipboard only supports utf-8 encoding')
from pandas.io.clipboard import clipboard_get
from pandas.io.parsers import read_table
text = clipboard_get()
# try to decode (if needed on PY3)
# Strange. linux py33 doesn't complain, win py33 does
if PY3:
try:
text = compat.bytes_to_str(
text, encoding=(kwargs.get('encoding') or
get_option('display.encoding'))
)
except Exception:
pass
# Excel copies into clipboard with \t separation
# inspect no more than the first 10 lines; if they
# all contain an equal number (>0) of tabs, infer
# that this came from excel and set 'sep' accordingly
lines = text[:10000].split('\n')[:-1][:10]
# Need to remove leading white space, since read_table
# accepts:
# a b
# 0 1 2
# 1 3 4
counts = {x.lstrip().count('\t') for x in lines}
if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
sep = '\t'
# Edge case where sep is specified to be None, return to default
if sep is None and kwargs.get('delim_whitespace') is None:
sep = r'\s+'
# Regex separator currently only works with python engine.
# Default to python if separator is multi-character (regex)
if len(sep) > 1 and kwargs.get('engine') is None:
kwargs['engine'] = 'python'
elif len(sep) > 1 and kwargs.get('engine') == 'c':
warnings.warn('read_clipboard with regex separator does not work'
' properly with c engine')
# In PY2, the c table reader first encodes text with UTF-8 but the Python
# table reader uses the format of the passed string. For consistency,
# encode strings for python engine so that output from python and c
# engines produce consistent results
if kwargs.get('engine') == 'python' and PY2:
text = text.encode('utf-8')
return read_table(StringIO(text), sep=sep, **kwargs)
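# Hedged example, assuming a working clipboard backend: when every
# inspected line holds the same nonzero number of tabs (as when copying a
# range out of Excel), read_clipboard infers sep='\t' and parses the text
# like read_table.
import pandas as pd
from pandas.io.clipboard import clipboard_set

clipboard_set(u"a\tb\n1\t2\n3\t4\n")
df = pd.read_clipboard()           # tab inference sets sep to '\t'
assert list(df.columns) == ["a", "b"]
assert df.shape == (2, 2)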
def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover
"""
Attempt to write text representation of object to the system clipboard
The clipboard can be then pasted into Excel for example.
Parameters
----------
obj : the object to write to the clipboard
excel : boolean, defaults to True
if True, use the provided separator, writing in a csv
format for allowing easy pasting into excel.
if False, write a string representation of the object
to the clipboard
sep : optional, defaults to tab
other keywords are passed to to_csv
Notes
-----
Requirements for your platform
- Linux: xclip, or xsel (with gtk or PyQt4 modules)
- Windows: none (no additional modules needed)
- OS X: none (uses pbcopy and pbpaste, which ship with the OS)
"""
encoding = kwargs.pop('encoding', 'utf-8')
# testing if an invalid encoding is passed to clipboard
if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
raise ValueError('clipboard only supports utf-8 encoding')
from pandas.io.clipboard import clipboard_set
if excel is None:
excel = True
if excel:
try:
if sep is None:
sep = '\t'
buf = StringIO()
# clipboard_set (pyperclip) expects unicode
obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)
text = buf.getvalue()
if PY2:
text = text.decode('utf-8')
clipboard_set(text)
return
except TypeError:
warnings.warn('to_clipboard in excel mode requires a single '
'character separator.')
elif sep is not None:
warnings.warn('to_clipboard with excel=False ignores the sep argument')
if isinstance(obj, DataFrame):
# str(df) has various unhelpful defaults, like truncation
with option_context('display.max_colwidth', 999999):
objstr = obj.to_string(**kwargs)
else:
objstr = str(obj)
clipboard_set(objstr)
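# Hedged round-trip sketch, assuming a working clipboard backend: with
# excel=True (the default) the frame is written as tab-separated CSV text,
# so it can be pasted into a spreadsheet or read straight back with
# read_clipboard.
import pandas as pd

df = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
df.to_clipboard(index=False)       # excel=True, sep defaults to '\t'
assert pd.read_clipboard().equals(df)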
@@ -0,0 +1,593 @@
"""Common IO api utilities"""
import os
import csv
import codecs
import mmap
from contextlib import contextmanager, closing
import zipfile
from pandas.compat import StringIO, BytesIO, string_types, text_type
from pandas import compat
from pandas.io.formats.printing import pprint_thing
import pandas.core.common as com
from pandas.core.dtypes.common import is_number, is_file_like
# compat
from pandas.errors import (ParserError, DtypeWarning, # noqa
EmptyDataError, ParserWarning)
# gh-12665: Alias for now and remove later.
CParserError = ParserError
# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
_NA_VALUES = set([
'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A',
'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''
])
if compat.PY3:
from urllib.request import urlopen, pathname2url
_urlopen = urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import (uses_relative, uses_netloc, uses_params,
urlencode, urljoin)
from urllib.error import URLError
from http.client import HTTPException # noqa
else:
from urllib2 import urlopen as _urlopen
from urllib import urlencode, pathname2url # noqa
from urlparse import urlparse as parse_url
from urlparse import uses_relative, uses_netloc, uses_params, urljoin
from urllib2 import URLError # noqa
from httplib import HTTPException # noqa
from contextlib import contextmanager, closing # noqa
from functools import wraps # noqa
# @wraps(_urlopen)
@contextmanager
def urlopen(*args, **kwargs):
with closing(_urlopen(*args, **kwargs)) as f:
yield f
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')
class BaseIterator(object):
"""Subclass this and provide a "__next__()" method to obtain an iterator.
Useful only when the object being iterated is non-reusable (e.g. OK for a
parser, not for an in-memory table, yes for its iterator)."""
def __iter__(self):
return self
def __next__(self):
raise com.AbstractMethodError(self)
if not compat.PY3:
BaseIterator.next = lambda self: self.__next__()
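# Minimal sketch of the BaseIterator contract: subclasses only implement
# __next__ and raise StopIteration when exhausted; the compat shim above
# maps .next() onto __next__() for Python 2. _ListIterator is a
# hypothetical example class.
class _ListIterator(BaseIterator):

    def __init__(self, values):
        self._values = list(values)
        self._pos = 0

    def __next__(self):
        if self._pos >= len(self._values):
            raise StopIteration
        value = self._values[self._pos]
        self._pos += 1
        return value


assert list(_ListIterator([1, 2, 3])) == [1, 2, 3]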
def _is_url(url):
"""Check to see if a URL has a valid protocol.
Parameters
----------
url : str or unicode
Returns
-------
isurl : bool
If `url` has a valid protocol return True otherwise False.
"""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def _expand_user(filepath_or_buffer):
"""Return the argument with an initial component of ~ or ~user
replaced by that user's home directory.
Parameters
----------
filepath_or_buffer : object to be converted if possible
Returns
-------
expanded_filepath_or_buffer : an expanded filepath or the
input if not expandable
"""
if isinstance(filepath_or_buffer, string_types):
return os.path.expanduser(filepath_or_buffer)
return filepath_or_buffer
def _validate_header_arg(header):
if isinstance(header, bool):
raise TypeError("Passing a bool to header is invalid. "
"Use header=None for no header or "
"header=int or list-like of ints to specify "
"the row(s) making up the column names")
def _stringify_path(filepath_or_buffer):
"""Attempt to convert a path-like object to a string.
Parameters
----------
filepath_or_buffer : object to be converted
Returns
-------
str_filepath_or_buffer : maybe a string version of the object
Notes
-----
Objects supporting the fspath protocol (python 3.6+) are coerced
according to their __fspath__ method.
For backwards compatibility with older pythons, pathlib.Path and
py.path objects are specially coerced.
Any other object is passed through unchanged, which includes bytes,
strings, buffers, or anything else that's not even path-like.
"""
try:
import pathlib
_PATHLIB_INSTALLED = True
except ImportError:
_PATHLIB_INSTALLED = False
try:
from py.path import local as LocalPath
_PY_PATH_INSTALLED = True
except ImportError:
_PY_PATH_INSTALLED = False
if hasattr(filepath_or_buffer, '__fspath__'):
return filepath_or_buffer.__fspath__()
if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path):
return text_type(filepath_or_buffer)
if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath):
return filepath_or_buffer.strpath
return filepath_or_buffer
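# Hedged example: path-like objects are reduced to plain strings so the
# downstream IO code only has to deal with str, while buffers and other
# non-path objects pass through unchanged (assumes pathlib is available).
import pathlib

assert _stringify_path(pathlib.Path("data.csv")) == "data.csv"
buf = StringIO("a,b\n1,2\n")
assert _stringify_path(buf) is buf    # buffers are returned untouched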
def is_s3_url(url):
"""Check for an s3, s3n, or s3a url"""
try:
return parse_url(url).scheme in ['s3', 's3n', 's3a']
except: # noqa
return False
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=None, mode=None):
"""
If the filepath_or_buffer is a url, translate and return the buffer.
Otherwise passthrough.
Parameters
----------
filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
or buffer
encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
mode : str, optional
Returns
-------
tuple of (filepath or buffer or S3File instance,
encoding (str),
compression (str),
should_close (bool))
"""
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if _is_url(filepath_or_buffer):
req = _urlopen(filepath_or_buffer)
content_encoding = req.headers.get('Content-Encoding', None)
if content_encoding == 'gzip':
# Override compression based on Content-Encoding header
compression = 'gzip'
reader = BytesIO(req.read())
req.close()
return reader, encoding, compression, True
if is_s3_url(filepath_or_buffer):
from pandas.io import s3
return s3.get_filepath_or_buffer(filepath_or_buffer,
encoding=encoding,
compression=compression,
mode=mode)
if isinstance(filepath_or_buffer, (compat.string_types,
compat.binary_type,
mmap.mmap)):
return _expand_user(filepath_or_buffer), None, compression, False
if not is_file_like(filepath_or_buffer):
msg = "Invalid file path or buffer object type: {_type}"
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
return filepath_or_buffer, None, compression, False
def file_path_to_url(path):
"""
converts an absolute native path to a FILE URL.
Parameters
----------
path : a path in native format
Returns
-------
a valid FILE URL
"""
return urljoin('file:', pathname2url(path))
_compression_to_extension = {
'gzip': '.gz',
'bz2': '.bz2',
'zip': '.zip',
'xz': '.xz',
}
def _infer_compression(filepath_or_buffer, compression):
"""
Get the compression method for filepath_or_buffer. If compression='infer',
the inferred compression method is returned. Otherwise, the input
compression method is returned unchanged, unless it's invalid, in which
case an error is raised.
Parameters
----------
filepath_or_buffer :
a path (str) or buffer
compression : str or None
the compression method including None for no compression and 'infer'
Returns
-------
string or None :
compression method
Raises
------
ValueError on invalid compression specified
"""
# No compression has been explicitly specified
if compression is None:
return None
# Infer compression
if compression == 'infer':
# Convert all path types (e.g. pathlib.Path) to strings
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, compat.string_types):
# Cannot infer compression of a buffer, assume no compression
return None
# Infer compression from the filename/URL extension
for compression, extension in _compression_to_extension.items():
if filepath_or_buffer.endswith(extension):
return compression
return None
# Compression has been specified. Check that it's valid
if compression in _compression_to_extension:
return compression
msg = 'Unrecognized compression type: {}'.format(compression)
valid = ['infer', None] + sorted(_compression_to_extension)
msg += '\nValid compression types are {}'.format(valid)
raise ValueError(msg)
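# Hedged example of the inference rules above: with compression='infer'
# only string paths are inspected and the extension decides the method;
# buffers fall back to no compression, and an explicit method passes
# through after validation.
assert _infer_compression("data.csv.gz", "infer") == "gzip"
assert _infer_compression("data.csv", "infer") is None
assert _infer_compression(BytesIO(b"raw bytes"), "infer") is None
assert _infer_compression("data.csv.bz2", "bz2") == "bz2"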
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
memory_map=False, is_text=True):
"""
Get file handle for given path/buffer and mode.
Parameters
----------
path_or_buf :
a path (str) or buffer
mode : str
mode to open path_or_buf with
encoding : str or None
compression : str or None
Supported compression protocols are gzip, bz2, zip, and xz
memory_map : boolean, default False
See parsers._parser_params for more information.
is_text : boolean, default True
whether file/buffer is in text format (csv, json, etc.), or in binary
mode (pickle, etc.)
Returns
-------
f : file-like
A file-like object
handles : list of file-like objects
A list of file-like object that were opened in this function.
"""
try:
from s3fs import S3File
need_text_wrapping = (BytesIO, S3File)
except ImportError:
need_text_wrapping = (BytesIO,)
handles = list()
f = path_or_buf
# Convert pathlib.Path/py.path.local or string
path_or_buf = _stringify_path(path_or_buf)
is_path = isinstance(path_or_buf, compat.string_types)
if compression:
if compat.PY2 and not is_path and encoding:
msg = 'compression with encoding is not yet supported in Python 2'
raise ValueError(msg)
# GZ Compression
if compression == 'gzip':
import gzip
if is_path:
f = gzip.open(path_or_buf, mode)
else:
f = gzip.GzipFile(fileobj=path_or_buf)
# BZ Compression
elif compression == 'bz2':
import bz2
if is_path:
f = bz2.BZ2File(path_or_buf, mode)
elif compat.PY2:
# Python 2's bz2 module can't take file objects, so have to
# run through decompress manually
f = StringIO(bz2.decompress(path_or_buf.read()))
path_or_buf.close()
else:
f = bz2.BZ2File(path_or_buf)
# ZIP Compression
elif compression == 'zip':
zf = BytesZipFile(path_or_buf, mode)
if zf.mode == 'w':
f = zf
elif zf.mode == 'r':
zip_names = zf.namelist()
if len(zip_names) == 1:
f = zf.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path_or_buf))
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP: {}'
.format(zip_names))
# XZ Compression
elif compression == 'xz':
lzma = compat.import_lzma()
f = lzma.LZMAFile(path_or_buf, mode)
# Unrecognized Compression
else:
msg = 'Unrecognized compression type: {}'.format(compression)
raise ValueError(msg)
handles.append(f)
elif is_path:
if compat.PY2:
# Python 2
f = open(path_or_buf, mode)
elif encoding:
# Python 3 and encoding
f = open(path_or_buf, mode, encoding=encoding)
elif is_text:
# Python 3 and no explicit encoding
f = open(path_or_buf, mode, errors='replace')
else:
# Python 3 and binary mode
f = open(path_or_buf, mode)
handles.append(f)
# in Python 3, convert BytesIO or fileobjects passed with an encoding
if compat.PY3 and is_text and\
(compression or isinstance(f, need_text_wrapping)):
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
handles.append(f)
if memory_map and hasattr(f, 'fileno'):
try:
g = MMapWrapper(f)
f.close()
f = g
except Exception:
# we catch any errors that may have occurred
# because that is consistent with the lower-level
# functionality of the C engine (pd.read_csv), so
# leave the file handler as is then
pass
return f, handles
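# Hedged usage sketch: write a small gzip-compressed CSV to a temporary
# file, open it back through _get_handle (combined with _infer_compression
# above), and close every handle the helper opened. The file name is a
# hypothetical example.
import gzip
import tempfile

path = os.path.join(tempfile.mkdtemp(), "example.csv.gz")
with gzip.open(path, "wb") as fh:
    fh.write(b"a,b\n1,2\n")

f, handles = _get_handle(path, "r", encoding="utf-8",
                         compression=_infer_compression(path, "infer"))
try:
    print(f.read())                 # a,b / 1,2
finally:
    for handle in reversed(handles):
        handle.close()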
class BytesZipFile(zipfile.ZipFile, BytesIO):
"""
Wrapper for the standard library class ZipFile that allows the returned
file-like handle to accept byte strings via the `write` method.
BytesIO provides the attributes of a file-like object, and ZipFile.writestr
writes byte strings into a member of the archive.
"""
# GH 17778
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
if mode in ['wb', 'rb']:
mode = mode.replace('b', '')
super(BytesZipFile, self).__init__(file, mode, compression, **kwargs)
def write(self, data):
super(BytesZipFile, self).writestr(self.filename, data)
@property
def closed(self):
return self.fp is None
class MMapWrapper(BaseIterator):
"""
Wrapper for Python's mmap class so that it can be properly read in
by Python's csv.reader class.
Parameters
----------
f : file object
File object to be mapped onto memory. Must support the 'fileno'
method or have an equivalent attribute
"""
def __init__(self, f):
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
def __getattr__(self, name):
return getattr(self.mmap, name)
def __iter__(self):
return self
def __next__(self):
newline = self.mmap.readline()
# readline returns bytes, not str, in Python 3,
# but Python's CSV reader expects str, so convert
# the output to str before continuing
if compat.PY3:
newline = compat.bytes_to_str(newline)
# mmap doesn't raise if reading past the allocated
# data but instead returns an empty string, so raise
# if that is returned
if newline == '':
raise StopIteration
return newline
if not compat.PY3:
MMapWrapper.next = lambda self: self.__next__()
class UTF8Recoder(BaseIterator):
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def read(self, bytes=-1):
return self.reader.read(bytes).encode("utf-8")
def readline(self):
return self.reader.readline().encode("utf-8")
def next(self):
return next(self.reader).encode("utf-8")
if compat.PY3: # pragma: no cover
def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
# ignore encoding
return csv.reader(f, dialect=dialect, **kwds)
def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.writer(f, dialect=dialect, **kwds)
else:
class UnicodeReader(BaseIterator):
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
On Python 3, this is replaced (above) by csv.reader, which handles
unicode.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def __next__(self):
row = next(self.reader)
return [compat.text_type(s, "utf-8") for s in row]
class UnicodeWriter(object):
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
self.quoting = kwds.get("quoting", None)
def writerow(self, row):
def _check_as_is(x):
return (self.quoting == csv.QUOTE_NONNUMERIC and
is_number(x)) or isinstance(x, str)
row = [x if _check_as_is(x)
else pprint_thing(x).encode("utf-8") for x in row]
self.writer.writerow([s for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and re-encode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
def _check_as_is(x):
return (self.quoting == csv.QUOTE_NONNUMERIC and
is_number(x)) or isinstance(x, str)
for i, row in enumerate(rows):
rows[i] = [x if _check_as_is(x)
else pprint_thing(x).encode("utf-8") for x in row]
self.writer.writerows([[s for s in row] for row in rows])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and re-encode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
@@ -0,0 +1,63 @@
"""This module is designed for community supported date conversion functions"""
from pandas.compat import range, map
import numpy as np
from pandas._libs.tslibs import parsing
def parse_date_time(date_col, time_col):
date_col = _maybe_cast(date_col)
time_col = _maybe_cast(time_col)
return parsing.try_parse_date_and_time(date_col, time_col)
def parse_date_fields(year_col, month_col, day_col):
year_col = _maybe_cast(year_col)
month_col = _maybe_cast(month_col)
day_col = _maybe_cast(day_col)
return parsing.try_parse_year_month_day(year_col, month_col, day_col)
def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col,
second_col):
year_col = _maybe_cast(year_col)
month_col = _maybe_cast(month_col)
day_col = _maybe_cast(day_col)
hour_col = _maybe_cast(hour_col)
minute_col = _maybe_cast(minute_col)
second_col = _maybe_cast(second_col)
return parsing.try_parse_datetime_components(year_col, month_col, day_col,
hour_col, minute_col,
second_col)
def generic_parser(parse_func, *cols):
N = _check_columns(cols)
results = np.empty(N, dtype=object)
for i in range(N):
args = [c[i] for c in cols]
results[i] = parse_func(*args)
return results
def _maybe_cast(arr):
if not arr.dtype.type == np.object_:
arr = np.array(arr, dtype=object)
return arr
def _check_columns(cols):
if not len(cols):
raise AssertionError("There must be at least 1 column")
head, tail = cols[0], cols[1:]
N = len(head)
for i, n in enumerate(map(len, tail)):
if n != N:
raise AssertionError('All columns must have the same length: {0}; '
'column {1} has length {2}'.format(N, i, n))
return N
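# Hedged example: the helpers above can be passed as the date_parser
# argument of read_csv; each one casts its columns to object dtype and
# combines the pieces row by row. The arrays here are made-up sample data.
import numpy as np

years = np.array([2018, 2019])
months = np.array([1, 2])
days = np.array([15, 28])

dates = parse_date_fields(years, months, days)
print(dates[0])   # 2018-01-15 00:00:00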
File diff suppressed because it is too large
@@ -0,0 +1,112 @@
""" feather-format compat """
from distutils.version import LooseVersion
from pandas import DataFrame, RangeIndex, Int64Index
from pandas.compat import range
from pandas.io.common import _stringify_path
def _try_import():
# since pandas is a dependency of feather
# we need to import on first use
try:
import feather
except ImportError:
# give a nice error message
raise ImportError("the feather-format library is not installed\n"
"you can install via conda\n"
"conda install feather-format -c conda-forge\n"
"or via pip\n"
"pip install -U feather-format\n")
try:
LooseVersion(feather.__version__) >= LooseVersion('0.3.1')
except AttributeError:
raise ImportError("the feather-format library must be >= "
"version 0.3.1\n"
"you can install via conda\n"
"conda install feather-format -c conda-forge"
"or via pip\n"
"pip install -U feather-format\n")
return feather
def to_feather(df, path):
"""
Write a DataFrame to the feather-format
Parameters
----------
df : DataFrame
path : string file path, or file-like object
"""
path = _stringify_path(path)
if not isinstance(df, DataFrame):
raise ValueError("feather only support IO with DataFrames")
feather = _try_import()
valid_types = {'string', 'unicode'}
# validate index
# --------------
# validate that we have only a default index
# raise on anything else as we don't serialize the index
if not isinstance(df.index, Int64Index):
raise ValueError("feather does not support serializing {} "
"for the index; you can .reset_index()"
"to make the index into column(s)".format(
type(df.index)))
if not df.index.equals(RangeIndex.from_range(range(len(df)))):
raise ValueError("feather does not support serializing a "
"non-default index for the index; you "
"can .reset_index() to make the index "
"into column(s)")
if df.index.name is not None:
raise ValueError("feather does not serialize index meta-data on a "
"default index")
# validate columns
# ----------------
# must have valid column names (strings only)
if df.columns.inferred_type not in valid_types:
raise ValueError("feather must have string column names")
feather.write_dataframe(df, path)
def read_feather(path, nthreads=1):
"""
Load a feather-format object from the file path
.. versionadded:: 0.20.0
Parameters
----------
path : string file path, or file-like object
nthreads : int, default 1
Number of CPU threads to use when reading to pandas.DataFrame
.. versionadded:: 0.21.0
Returns
-------
type of object stored in file
"""
feather = _try_import()
path = _stringify_path(path)
if LooseVersion(feather.__version__) < LooseVersion('0.4.0'):
return feather.read_dataframe(path)
return feather.read_dataframe(path, nthreads=nthreads)
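# Hedged round-trip sketch, assuming the optional feather-format
# dependency is installed; the frame must carry a default RangeIndex, per
# the validation in to_feather above. "example.feather" is a hypothetical
# file name.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
to_feather(df, "example.feather")
assert read_feather("example.feather").equals(df)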
@@ -0,0 +1,84 @@
"""
Internal module for console introspection
"""
import sys
import locale
from pandas.io.formats.terminal import get_terminal_size
# -----------------------------------------------------------------------------
# Global formatting options
_initial_defencoding = None
def detect_console_encoding():
"""
Try to find the most capable encoding supported by the console.
Slightly modified from the way IPython handles the same issue.
"""
global _initial_defencoding
encoding = None
try:
encoding = sys.stdout.encoding or sys.stdin.encoding
except AttributeError:
pass
# try again for something better
if not encoding or 'ascii' in encoding.lower():
try:
encoding = locale.getpreferredencoding()
except Exception:
pass
# when all else fails. this will usually be "ascii"
if not encoding or 'ascii' in encoding.lower():
encoding = sys.getdefaultencoding()
# GH3360, save the reported defencoding at import time
# MPL backends may change it. Make available for debugging.
if not _initial_defencoding:
_initial_defencoding = sys.getdefaultencoding()
return encoding
def get_console_size():
"""Return console size as tuple = (width, height).
Returns (None,None) in non-interactive session.
"""
from pandas import get_option
from pandas.core import common as com
display_width = get_option('display.width')
# deprecated.
display_height = get_option('display.max_rows')
# Consider:
# - interactive shell terminal: can detect term size
# - interactive non-shell terminal (ipnb/ipqtconsole): cannot detect term size
# - non-interactive script: should disregard term size
# In addition, width/height have default values, but setting them to 'None'
# signals that auto-detection should be used, but only in an interactive
# shell terminal.
# Simple. yeah.
if com.in_interactive_session():
if com.in_ipython_frontend():
# sane defaults for interactive non-shell terminal
# match default for width,height in config_init
from pandas.core.config import get_default_val
terminal_width = get_default_val('display.width')
terminal_height = get_default_val('display.max_rows')
else:
# pure terminal
terminal_width, terminal_height = get_terminal_size()
else:
terminal_width, terminal_height = None, None
# Note: if the user sets width/height to None (auto-detection) and we're
# in a non-interactive script, this will return (None, None); the caller
# needs to deal with that.
return (display_width or terminal_width, display_height or terminal_height)
@@ -0,0 +1,250 @@
"""Utilities for interpreting CSS from Stylers for formatting non-HTML outputs
"""
import re
import warnings
class CSSWarning(UserWarning):
"""This CSS syntax cannot currently be parsed"""
pass
class CSSResolver(object):
"""A callable for parsing and resolving CSS to atomic properties
"""
INITIAL_STYLE = {
}
def __call__(self, declarations_str, inherited=None):
""" the given declarations to atomic properties
Parameters
----------
declarations_str : str
A list of CSS declarations
inherited : dict, optional
Atomic properties indicating the inherited style context in which
declarations_str is to be resolved. ``inherited`` should already
be resolved, i.e. valid output of this method.
Returns
-------
props : dict
Atomic CSS 2.2 properties
Examples
--------
>>> resolve = CSSResolver()
>>> inherited = {'font-family': 'serif', 'font-weight': 'bold'}
>>> out = resolve('''
... border-color: BLUE RED;
... font-size: 1em;
... font-size: 2em;
... font-weight: normal;
... font-weight: inherit;
... ''', inherited)
>>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE
[('border-bottom-color', 'blue'),
('border-left-color', 'red'),
('border-right-color', 'red'),
('border-top-color', 'blue'),
('font-family', 'serif'),
('font-size', '24pt'),
('font-weight', 'bold')]
"""
props = dict(self.atomize(self.parse(declarations_str)))
if inherited is None:
inherited = {}
# 1. resolve inherited, initial
for prop, val in inherited.items():
if prop not in props:
props[prop] = val
for prop, val in list(props.items()):
if val == 'inherit':
val = inherited.get(prop, 'initial')
if val == 'initial':
val = self.INITIAL_STYLE.get(prop)
if val is None:
# we do not define a complete initial stylesheet
del props[prop]
else:
props[prop] = val
# 2. resolve relative font size
if props.get('font-size'):
if 'font-size' in inherited:
em_pt = inherited['font-size']
assert em_pt[-2:] == 'pt'
em_pt = float(em_pt[:-2])
else:
em_pt = None
props['font-size'] = self.size_to_pt(
props['font-size'], em_pt, conversions=self.FONT_SIZE_RATIOS)
font_size = float(props['font-size'][:-2])
else:
font_size = None
# 3. TODO: resolve other font-relative units
for side in self.SIDES:
prop = 'border-{side}-width'.format(side=side)
if prop in props:
props[prop] = self.size_to_pt(
props[prop], em_pt=font_size,
conversions=self.BORDER_WIDTH_RATIOS)
for prop in ['margin-{side}'.format(side=side),
'padding-{side}'.format(side=side)]:
if prop in props:
# TODO: support %
props[prop] = self.size_to_pt(
props[prop], em_pt=font_size,
conversions=self.MARGIN_RATIOS)
return props
UNIT_RATIOS = {
'rem': ('pt', 12),
'ex': ('em', .5),
# 'ch':
'px': ('pt', .75),
'pc': ('pt', 12),
'in': ('pt', 72),
'cm': ('in', 1 / 2.54),
'mm': ('in', 1 / 25.4),
'q': ('mm', .25),
'!!default': ('em', 0),
}
FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
FONT_SIZE_RATIOS.update({
'%': ('em', .01),
'xx-small': ('rem', .5),
'x-small': ('rem', .625),
'small': ('rem', .8),
'medium': ('rem', 1),
'large': ('rem', 1.125),
'x-large': ('rem', 1.5),
'xx-large': ('rem', 2),
'smaller': ('em', 1 / 1.2),
'larger': ('em', 1.2),
'!!default': ('em', 1),
})
MARGIN_RATIOS = UNIT_RATIOS.copy()
MARGIN_RATIOS.update({
'none': ('pt', 0),
})
BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
BORDER_WIDTH_RATIOS.update({
'none': ('pt', 0),
'thick': ('px', 4),
'medium': ('px', 2),
'thin': ('px', 1),
# Default: medium only if solid
})
def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS):
def _error():
warnings.warn('Unhandled size: {val!r}'.format(val=in_val),
CSSWarning)
return self.size_to_pt('1!!default', conversions=conversions)
try:
val, unit = re.match(r'^(\S*?)([a-zA-Z%!].*)', in_val).groups()
except AttributeError:
return _error()
if val == '':
# hack for 'large' etc.
val = 1
else:
try:
val = float(val)
except ValueError:
return _error()
while unit != 'pt':
if unit == 'em':
if em_pt is None:
unit = 'rem'
else:
val *= em_pt
unit = 'pt'
continue
try:
unit, mul = conversions[unit]
except KeyError:
return _error()
val *= mul
val = round(val, 5)
if int(val) == val:
size_fmt = '{fmt:d}pt'.format(fmt=int(val))
else:
size_fmt = '{fmt:f}pt'.format(fmt=val)
return size_fmt
def atomize(self, declarations):
for prop, value in declarations:
attr = 'expand_' + prop.replace('-', '_')
try:
expand = getattr(self, attr)
except AttributeError:
yield prop, value
else:
for prop, value in expand(prop, value):
yield prop, value
SIDE_SHORTHANDS = {
1: [0, 0, 0, 0],
2: [0, 1, 0, 1],
3: [0, 1, 2, 1],
4: [0, 1, 2, 3],
}
SIDES = ('top', 'right', 'bottom', 'left')
def _side_expander(prop_fmt):
def expand(self, prop, value):
tokens = value.split()
try:
mapping = self.SIDE_SHORTHANDS[len(tokens)]
except KeyError:
warnings.warn('Could not expand "{prop}: {val}"'
.format(prop=prop, val=value), CSSWarning)
return
for key, idx in zip(self.SIDES, mapping):
yield prop_fmt.format(key), tokens[idx]
return expand
expand_border_color = _side_expander('border-{:s}-color')
expand_border_style = _side_expander('border-{:s}-style')
expand_border_width = _side_expander('border-{:s}-width')
expand_margin = _side_expander('margin-{:s}')
expand_padding = _side_expander('padding-{:s}')
def parse(self, declarations_str):
"""Generates (prop, value) pairs from declarations
In a future version may generate parsed tokens from tinycss/tinycss2
"""
for decl in declarations_str.split(';'):
if not decl.strip():
continue
prop, sep, val = decl.partition(':')
prop = prop.strip().lower()
# TODO: don't lowercase case sensitive parts of values (strings)
val = val.strip().lower()
if sep:
yield prop, val
else:
warnings.warn('Ill-formatted attribute: expected a colon '
'in {decl!r}'.format(decl=decl), CSSWarning)
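# Hedged worked examples: size_to_pt walks the UNIT_RATIOS table until it
# reaches points, and the _side_expander shorthands fan one declaration
# out to all four sides.
resolver = CSSResolver()

assert resolver.size_to_pt("16px") == "12pt"             # 16 * 0.75
assert resolver.size_to_pt("2em", em_pt=10) == "20pt"    # relative to a 10pt context
assert resolver.size_to_pt(
    "medium", conversions=CSSResolver.FONT_SIZE_RATIOS) == "12pt"

assert dict(resolver.expand_margin("margin", "1pt 2pt")) == {
    "margin-top": "1pt", "margin-right": "2pt",
    "margin-bottom": "1pt", "margin-left": "2pt"}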
@@ -0,0 +1,313 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data into CSV files.
"""
from __future__ import print_function
import warnings
import csv as csvlib
from zipfile import ZipFile
import numpy as np
from pandas.core.dtypes.missing import notna
from pandas.core.index import Index, MultiIndex
from pandas import compat
from pandas.compat import (StringIO, range, zip)
from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user,
_stringify_path)
from pandas._libs import writers as libwriters
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import PeriodIndex
class CSVFormatter(object):
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
float_format=None, cols=None, header=True, index=True,
index_label=None, mode='w', nanRep=None, encoding=None,
compression=None, quoting=None, line_terminator='\n',
chunksize=None, tupleize_cols=False, quotechar='"',
date_format=None, doublequote=True, escapechar=None,
decimal='.'):
self.obj = obj
if path_or_buf is None:
path_or_buf = StringIO()
self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
self.sep = sep
self.na_rep = na_rep
self.float_format = float_format
self.decimal = decimal
self.header = header
self.index = index
self.index_label = index_label
self.mode = mode
self.encoding = encoding
self.compression = compression
if quoting is None:
quoting = csvlib.QUOTE_MINIMAL
self.quoting = quoting
if quoting == csvlib.QUOTE_NONE:
# prevents crash in _csv
quotechar = None
self.quotechar = quotechar
self.doublequote = doublequote
self.escapechar = escapechar
self.line_terminator = line_terminator
self.date_format = date_format
self.tupleize_cols = tupleize_cols
self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
not self.tupleize_cols)
# validate mi options
if self.has_mi_columns:
if cols is not None:
raise TypeError("cannot specify cols with a MultiIndex on the "
"columns")
if cols is not None:
if isinstance(cols, Index):
cols = cols.to_native_types(na_rep=na_rep,
float_format=float_format,
date_format=date_format,
quoting=self.quoting)
else:
cols = list(cols)
self.obj = self.obj.loc[:, cols]
# update columns to include possible multiplicity of dupes
# and make sure cols is just a list of labels
cols = self.obj.columns
if isinstance(cols, Index):
cols = cols.to_native_types(na_rep=na_rep,
float_format=float_format,
date_format=date_format,
quoting=self.quoting)
else:
cols = list(cols)
# save it
self.cols = cols
# preallocate data 2d list
self.blocks = self.obj._data.blocks
ncols = sum(b.shape[0] for b in self.blocks)
self.data = [None] * ncols
if chunksize is None:
chunksize = (100000 // (len(self.cols) or 1)) or 1
self.chunksize = int(chunksize)
self.data_index = obj.index
if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
date_format is not None):
self.data_index = Index([x.strftime(date_format) if notna(x) else
'' for x in self.data_index])
self.nlevels = getattr(self.data_index, 'nlevels', 1)
if not index:
self.nlevels = 0
def save(self):
# create the writer & save
if self.encoding is None:
if compat.PY2:
encoding = 'ascii'
else:
encoding = 'utf-8'
else:
encoding = self.encoding
# GH 21227 internal compression is not used when file-like passed.
if self.compression and hasattr(self.path_or_buf, 'write'):
msg = ("compression has no effect when passing file-like "
"object as input.")
warnings.warn(msg, RuntimeWarning, stacklevel=2)
# when zip compression is called.
is_zip = isinstance(self.path_or_buf, ZipFile) or (
not hasattr(self.path_or_buf, 'write')
and self.compression == 'zip')
if is_zip:
# zipfile doesn't support writing string to archive. uses string
# buffer to receive csv writing and dump into zip compression
# file handle. GH 21241, 21118
f = StringIO()
close = False
elif hasattr(self.path_or_buf, 'write'):
f = self.path_or_buf
close = False
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
close = True
try:
writer_kwargs = dict(lineterminator=self.line_terminator,
delimiter=self.sep, quoting=self.quoting,
doublequote=self.doublequote,
escapechar=self.escapechar,
quotechar=self.quotechar)
if encoding == 'ascii':
self.writer = csvlib.writer(f, **writer_kwargs)
else:
writer_kwargs['encoding'] = encoding
self.writer = UnicodeWriter(f, **writer_kwargs)
self._save()
finally:
if is_zip:
# GH 17778 handles zip compression separately.
buf = f.getvalue()
if hasattr(self.path_or_buf, 'write'):
self.path_or_buf.write(buf)
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
f.write(buf)
close = True
if close:
f.close()
for _fh in handles:
_fh.close()
def _save_header(self):
writer = self.writer
obj = self.obj
index_label = self.index_label
cols = self.cols
has_mi_columns = self.has_mi_columns
header = self.header
encoded_labels = []
has_aliases = isinstance(header, (tuple, list, np.ndarray, Index))
if not (has_aliases or self.header):
return
if has_aliases:
if len(header) != len(cols):
raise ValueError(('Writing {ncols} cols but got {nalias} '
'aliases'.format(ncols=len(cols),
nalias=len(header))))
else:
write_cols = header
else:
write_cols = cols
if self.index:
# should write something for index label
if index_label is not False:
if index_label is None:
if isinstance(obj.index, MultiIndex):
index_label = []
for i, name in enumerate(obj.index.names):
if name is None:
name = ''
index_label.append(name)
else:
index_label = obj.index.name
if index_label is None:
index_label = ['']
else:
index_label = [index_label]
elif not isinstance(index_label,
(list, tuple, np.ndarray, Index)):
# given a string for a DF with Index
index_label = [index_label]
encoded_labels = list(index_label)
else:
encoded_labels = []
if not has_mi_columns or has_aliases:
encoded_labels += list(write_cols)
writer.writerow(encoded_labels)
else:
# write out the mi
columns = obj.columns
# write out the names for each level, then ALL of the values for
# each level
for i in range(columns.nlevels):
# we need at least 1 index column to write our col names
col_line = []
if self.index:
# name is the first column
col_line.append(columns.names[i])
if isinstance(index_label, list) and len(index_label) > 1:
col_line.extend([''] * (len(index_label) - 1))
col_line.extend(columns._get_level_values(i))
writer.writerow(col_line)
# Write out the index line if it's not empty.
# Otherwise, we will print out an extraneous
# blank line between the mi and the data rows.
if encoded_labels and set(encoded_labels) != set(['']):
encoded_labels.extend([''] * len(columns))
writer.writerow(encoded_labels)
def _save(self):
self._save_header()
nrows = len(self.data_index)
# write in chunksize bites
chunksize = self.chunksize
chunks = int(nrows / chunksize) + 1
for i in range(chunks):
start_i = i * chunksize
end_i = min((i + 1) * chunksize, nrows)
if start_i >= end_i:
break
self._save_chunk(start_i, end_i)
def _save_chunk(self, start_i, end_i):
data_index = self.data_index
# create the data for a chunk
slicer = slice(start_i, end_i)
for i in range(len(self.blocks)):
b = self.blocks[i]
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
quoting=self.quoting)
for col_loc, col in zip(b.mgr_locs, d):
# self.data is a preallocated list
self.data[col_loc] = col
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
quoting=self.quoting)
libwriters.write_csv_rows(self.data, ix, self.nlevels,
self.cols, self.writer)
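# Hedged sketch: CSVFormatter is the engine behind DataFrame.to_csv; it
# can also be driven directly with a buffer or path, writing the header
# first and then the data in chunksize-sized blocks.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
buf = StringIO()
CSVFormatter(df, path_or_buf=buf, index=False).save()
print(buf.getvalue())   # "a,b\n1,3\n2,4\n"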
@@ -0,0 +1,654 @@
"""Utilities for conversion to writer-agnostic Excel representation
"""
import re
import warnings
import itertools
import numpy as np
from pandas.compat import reduce
from pandas.io.formats.css import CSSResolver, CSSWarning
from pandas.io.formats.printing import pprint_thing
import pandas.core.common as com
from pandas.core.dtypes.common import is_float, is_scalar
from pandas.core.dtypes import missing
from pandas import Index, MultiIndex, PeriodIndex
from pandas.io.formats.format import get_level_lengths
class ExcelCell(object):
__fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend')
__slots__ = __fields__
def __init__(self, row, col, val, style=None, mergestart=None,
mergeend=None):
self.row = row
self.col = col
self.val = val
self.style = style
self.mergestart = mergestart
self.mergeend = mergeend
header_style = {"font": {"bold": True},
"borders": {"top": "thin",
"right": "thin",
"bottom": "thin",
"left": "thin"},
"alignment": {"horizontal": "center",
"vertical": "top"}}
class CSSToExcelConverter(object):
"""A callable for converting CSS declarations to ExcelWriter styles
Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
focusing on font styling, backgrounds, borders and alignment.
Operates by first computing CSS styles in a fairly generic
way (see :meth:`compute_css`) then determining Excel style
properties from CSS properties (see :meth:`build_xlstyle`).
Parameters
----------
inherited : str, optional
CSS declarations understood to be the containing scope for the
CSS processed by :meth:`__call__`.
"""
# NB: Most of the methods here could be classmethods, as only __init__
# and __call__ make use of instance attributes. We leave them as
# instancemethods so that users can easily experiment with extensions
# without monkey-patching.
def __init__(self, inherited=None):
if inherited is not None:
inherited = self.compute_css(inherited,
self.compute_css.INITIAL_STYLE)
self.inherited = inherited
compute_css = CSSResolver()
def __call__(self, declarations_str):
"""Convert CSS declarations to ExcelWriter style
Parameters
----------
declarations_str : str
List of CSS declarations.
e.g. "font-weight: bold; background: blue"
Returns
-------
xlstyle : dict
A style as interpreted by ExcelWriter when found in
ExcelCell.style.
"""
# TODO: memoize?
properties = self.compute_css(declarations_str, self.inherited)
return self.build_xlstyle(properties)
def build_xlstyle(self, props):
out = {
'alignment': self.build_alignment(props),
'border': self.build_border(props),
'fill': self.build_fill(props),
'font': self.build_font(props),
}
# TODO: support number format
# TODO: handle cell width and height: needs support in pandas.io.excel
def remove_none(d):
"""Remove key where value is None, through nested dicts"""
for k, v in list(d.items()):
if v is None:
del d[k]
elif isinstance(v, dict):
remove_none(v)
if not v:
del d[k]
remove_none(out)
return out
VERTICAL_MAP = {
'top': 'top',
'text-top': 'top',
'middle': 'center',
'baseline': 'bottom',
'bottom': 'bottom',
'text-bottom': 'bottom',
# OpenXML also has 'justify', 'distributed'
}
def build_alignment(self, props):
# TODO: text-indent, padding-left -> alignment.indent
return {'horizontal': props.get('text-align'),
'vertical': self.VERTICAL_MAP.get(props.get('vertical-align')),
'wrap_text': (None if props.get('white-space') is None else
props['white-space'] not in
('nowrap', 'pre', 'pre-line'))
}
def build_border(self, props):
return {side: {
'style': self._border_style(props.get('border-{side}-style'
.format(side=side)),
props.get('border-{side}-width'
.format(side=side))),
'color': self.color_to_excel(
props.get('border-{side}-color'.format(side=side))),
} for side in ['top', 'right', 'bottom', 'left']}
def _border_style(self, style, width):
# convert styles and widths to openxml, one of:
# 'dashDot'
# 'dashDotDot'
# 'dashed'
# 'dotted'
# 'double'
# 'hair'
# 'medium'
# 'mediumDashDot'
# 'mediumDashDotDot'
# 'mediumDashed'
# 'slantDashDot'
# 'thick'
# 'thin'
if width is None and style is None:
return None
if style == 'none' or style == 'hidden':
return None
if width is None:
width = '2pt'
width = float(width[:-2])
if width < 1e-5:
return None
elif width < 1.3:
width_name = 'thin'
elif width < 2.8:
width_name = 'medium'
else:
width_name = 'thick'
if style in (None, 'groove', 'ridge', 'inset', 'outset'):
# not handled
style = 'solid'
if style == 'double':
return 'double'
if style == 'solid':
return width_name
if style == 'dotted':
if width_name in ('hair', 'thin'):
return 'dotted'
return 'mediumDashDotDot'
if style == 'dashed':
if width_name in ('hair', 'thin'):
return 'dashed'
return 'mediumDashed'
def build_fill(self, props):
# TODO: perhaps allow for special properties
# -excel-pattern-bgcolor and -excel-pattern-type
fill_color = props.get('background-color')
if fill_color not in (None, 'transparent', 'none'):
return {
'fgColor': self.color_to_excel(fill_color),
'patternType': 'solid',
}
BOLD_MAP = {'bold': True, 'bolder': True, '600': True, '700': True,
'800': True, '900': True,
'normal': False, 'lighter': False, '100': False, '200': False,
'300': False, '400': False, '500': False}
ITALIC_MAP = {'normal': False, 'italic': True, 'oblique': True}
def build_font(self, props):
size = props.get('font-size')
if size is not None:
assert size.endswith('pt')
size = float(size[:-2])
font_names_tmp = re.findall(r'''(?x)
(
"(?:[^"]|\\")+"
|
'(?:[^']|\\')+'
|
[^'",]+
)(?=,|\s*$)
''', props.get('font-family', ''))
font_names = []
for name in font_names_tmp:
if name[:1] == '"':
name = name[1:-1].replace('\\"', '"')
elif name[:1] == '\'':
name = name[1:-1].replace('\\\'', '\'')
else:
name = name.strip()
if name:
font_names.append(name)
family = None
for name in font_names:
if name == 'serif':
family = 1 # roman
break
elif name == 'sans-serif':
family = 2 # swiss
break
elif name == 'cursive':
family = 4 # script
break
elif name == 'fantasy':
family = 5 # decorative
break
decoration = props.get('text-decoration')
if decoration is not None:
decoration = decoration.split()
else:
decoration = ()
return {
'name': font_names[0] if font_names else None,
'family': family,
'size': size,
'bold': self.BOLD_MAP.get(props.get('font-weight')),
'italic': self.ITALIC_MAP.get(props.get('font-style')),
'underline': ('single' if
'underline' in decoration
else None),
'strike': ('line-through' in decoration) or None,
'color': self.color_to_excel(props.get('color')),
# shadow if nonzero digit before shadow color
'shadow': (bool(re.search('^[^#(]*[1-9]',
props['text-shadow']))
if 'text-shadow' in props else None),
# 'vertAlign':,
# 'charset': ,
# 'scheme': ,
# 'outline': ,
# 'condense': ,
}
NAMED_COLORS = {
'maroon': '800000',
'brown': 'A52A2A',
'red': 'FF0000',
'pink': 'FFC0CB',
'orange': 'FFA500',
'yellow': 'FFFF00',
'olive': '808000',
'green': '008000',
'purple': '800080',
'fuchsia': 'FF00FF',
'lime': '00FF00',
'teal': '008080',
'aqua': '00FFFF',
'blue': '0000FF',
'navy': '000080',
'black': '000000',
'gray': '808080',
'grey': '808080',
'silver': 'C0C0C0',
'white': 'FFFFFF',
}
def color_to_excel(self, val):
if val is None:
return None
if val.startswith('#') and len(val) == 7:
return val[1:].upper()
if val.startswith('#') and len(val) == 4:
return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper()
try:
return self.NAMED_COLORS[val]
except KeyError:
warnings.warn('Unhandled color format: {val!r}'.format(val=val),
CSSWarning)
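# Illustrative usage sketch (added; not part of the original module). It shows
# the declarations-string -> ExcelWriter-style mapping documented on
# ``CSSToExcelConverter.__call__``; the CSS values are arbitrary examples and
# the printed dict is approximate.
# >>> converter = CSSToExcelConverter()
# >>> converter("font-weight: bold; text-align: center")
# {'alignment': {'horizontal': 'center'}, 'font': {'bold': True}}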
class ExcelFormatter(object):
"""
    Class for formatting a DataFrame to a list of ExcelCells.
Parameters
----------
df : DataFrame or Styler
    na_rep : string, default ''
        Missing value representation
float_format : string, default None
Format string for floating point numbers
cols : sequence, optional
Columns to write
header : boolean or list of string, default True
Write out column names. If a list of string is given it is
assumed to be aliases for the column names
index : boolean, default True
output row names (index)
index_label : string or sequence, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the DataFrame uses MultiIndex.
merge_cells : boolean, default False
Format MultiIndex and Hierarchical Rows as merged cells.
inf_rep : string, default `'inf'`
representation for np.inf values (which aren't representable in Excel)
A `'-'` sign will be added in front of -inf.
style_converter : callable, optional
This translates Styler styles (CSS) into ExcelWriter styles.
Defaults to ``CSSToExcelConverter()``.
It should have signature css_declarations string -> excel style.
This is only called for body cells.
"""
def __init__(self, df, na_rep='', float_format=None, cols=None,
header=True, index=True, index_label=None, merge_cells=False,
inf_rep='inf', style_converter=None):
self.rowcounter = 0
self.na_rep = na_rep
if hasattr(df, 'render'):
self.styler = df
df = df.data
if style_converter is None:
style_converter = CSSToExcelConverter()
self.style_converter = style_converter
else:
self.styler = None
self.df = df
if cols is not None:
# all missing, raise
if not len(Index(cols) & df.columns):
raise KeyError(
"passes columns are not ALL present dataframe")
# deprecatedin gh-17295
# 1 missing is ok (for now)
if len(Index(cols) & df.columns) != len(cols):
warnings.warn(
"Not all names specified in 'columns' are found; "
"this will raise a KeyError in the future",
FutureWarning)
self.df = df.reindex(columns=cols)
self.columns = self.df.columns
self.float_format = float_format
self.index = index
self.index_label = index_label
self.header = header
self.merge_cells = merge_cells
self.inf_rep = inf_rep
def _format_value(self, val):
if is_scalar(val) and missing.isna(val):
val = self.na_rep
elif is_float(val):
if missing.isposinf_scalar(val):
val = self.inf_rep
elif missing.isneginf_scalar(val):
val = '-{inf}'.format(inf=self.inf_rep)
elif self.float_format is not None:
val = float(self.float_format % val)
return val
def _format_header_mi(self):
if self.columns.nlevels > 1:
if not self.index:
raise NotImplementedError("Writing to Excel with MultiIndex"
" columns and no index "
"('index'=False) is not yet "
"implemented.")
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if not (has_aliases or self.header):
return
columns = self.columns
level_strs = columns.format(sparsify=self.merge_cells, adjoin=False,
names=False)
level_lengths = get_level_lengths(level_strs)
coloffset = 0
lnum = 0
if self.index and isinstance(self.df.index, MultiIndex):
coloffset = len(self.df.index[0]) - 1
if self.merge_cells:
            # Format multi-index as merged cells.
for lnum in range(len(level_lengths)):
name = columns.names[lnum]
yield ExcelCell(lnum, coloffset, name, header_style)
for lnum, (spans, levels, labels) in enumerate(zip(
level_lengths, columns.levels, columns.labels)):
values = levels.take(labels)
for i in spans:
if spans[i] > 1:
yield ExcelCell(lnum, coloffset + i + 1, values[i],
header_style, lnum,
coloffset + i + spans[i])
else:
yield ExcelCell(lnum, coloffset + i + 1, values[i],
header_style)
else:
# Format in legacy format with dots to indicate levels.
for i, values in enumerate(zip(*level_strs)):
v = ".".join(map(pprint_thing, values))
yield ExcelCell(lnum, coloffset + i + 1, v, header_style)
self.rowcounter = lnum
def _format_header_regular(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
coloffset = 0
if self.index:
coloffset = 1
if isinstance(self.df.index, MultiIndex):
coloffset = len(self.df.index[0])
colnames = self.columns
if has_aliases:
if len(self.header) != len(self.columns):
raise ValueError('Writing {cols} cols but got {alias} '
'aliases'.format(cols=len(self.columns),
alias=len(self.header)))
else:
colnames = self.header
for colindex, colname in enumerate(colnames):
yield ExcelCell(self.rowcounter, colindex + coloffset, colname,
header_style)
def _format_header(self):
if isinstance(self.columns, MultiIndex):
gen = self._format_header_mi()
else:
gen = self._format_header_regular()
gen2 = ()
if self.df.index.names:
row = [x if x is not None else ''
for x in self.df.index.names] + [''] * len(self.columns)
if reduce(lambda x, y: x and y, map(lambda x: x != '', row)):
gen2 = (ExcelCell(self.rowcounter, colindex, val, header_style)
for colindex, val in enumerate(row))
self.rowcounter += 1
return itertools.chain(gen, gen2)
def _format_body(self):
if isinstance(self.df.index, MultiIndex):
return self._format_hierarchical_rows()
else:
return self._format_regular_rows()
def _format_regular_rows(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
self.rowcounter += 1
# output index and index_label?
if self.index:
# check aliases
# if list only take first as this is not a MultiIndex
if (self.index_label and
isinstance(self.index_label, (list, tuple, np.ndarray,
Index))):
index_label = self.index_label[0]
# if string good to go
elif self.index_label and isinstance(self.index_label, str):
index_label = self.index_label
else:
index_label = self.df.index.names[0]
if isinstance(self.columns, MultiIndex):
self.rowcounter += 1
if index_label and self.header is not False:
yield ExcelCell(self.rowcounter - 1, 0, index_label,
header_style)
# write index_values
index_values = self.df.index
if isinstance(self.df.index, PeriodIndex):
index_values = self.df.index.to_timestamp()
for idx, idxval in enumerate(index_values):
yield ExcelCell(self.rowcounter + idx, 0, idxval, header_style)
coloffset = 1
else:
coloffset = 0
for cell in self._generate_body(coloffset):
yield cell
def _format_hierarchical_rows(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
self.rowcounter += 1
gcolidx = 0
if self.index:
index_labels = self.df.index.names
# check for aliases
if (self.index_label and
isinstance(self.index_label, (list, tuple, np.ndarray,
Index))):
index_labels = self.index_label
# MultiIndex columns require an extra row
# with index names (blank if None) for
            # unambiguous round-trip, unless not merging,
# in which case the names all go on one row Issue #11328
if isinstance(self.columns, MultiIndex) and self.merge_cells:
self.rowcounter += 1
# if index labels are not empty go ahead and dump
if com._any_not_none(*index_labels) and self.header is not False:
for cidx, name in enumerate(index_labels):
yield ExcelCell(self.rowcounter - 1, cidx, name,
header_style)
if self.merge_cells:
# Format hierarchical rows as merged cells.
level_strs = self.df.index.format(sparsify=True, adjoin=False,
names=False)
level_lengths = get_level_lengths(level_strs)
for spans, levels, labels in zip(level_lengths,
self.df.index.levels,
self.df.index.labels):
values = levels.take(labels,
allow_fill=levels._can_hold_na,
fill_value=True)
for i in spans:
if spans[i] > 1:
yield ExcelCell(self.rowcounter + i, gcolidx,
values[i], header_style,
self.rowcounter + i + spans[i] - 1,
gcolidx)
else:
yield ExcelCell(self.rowcounter + i, gcolidx,
values[i], header_style)
gcolidx += 1
else:
# Format hierarchical rows with non-merged values.
for indexcolvals in zip(*self.df.index):
for idx, indexcolval in enumerate(indexcolvals):
yield ExcelCell(self.rowcounter + idx, gcolidx,
indexcolval, header_style)
gcolidx += 1
for cell in self._generate_body(gcolidx):
yield cell
def _generate_body(self, coloffset):
if self.styler is None:
styles = None
else:
styles = self.styler._compute().ctx
if not styles:
styles = None
xlstyle = None
# Write the body of the frame data series by series.
for colidx in range(len(self.columns)):
series = self.df.iloc[:, colidx]
for i, val in enumerate(series):
if styles is not None:
xlstyle = self.style_converter(';'.join(styles[i, colidx]))
yield ExcelCell(self.rowcounter + i, colidx + coloffset, val,
xlstyle)
def get_formatted_cells(self):
for cell in itertools.chain(self._format_header(),
self._format_body()):
cell.val = self._format_value(cell.val)
yield cell
def write(self, writer, sheet_name='Sheet1', startrow=0,
startcol=0, freeze_panes=None, engine=None):
"""
        Parameters
        ----------
        writer : string or ExcelWriter object
            File path or existing ExcelWriter
        sheet_name : string, default 'Sheet1'
            Name of sheet which will contain DataFrame
        startrow : int, default 0
            upper left cell row to dump data frame
        startcol : int, default 0
            upper left cell column to dump data frame
freeze_panes : tuple of integer (length 2), default None
Specifies the one-based bottommost row and rightmost column that
is to be frozen
engine : string, default None
write engine to use if writer is a path - you can also set this
via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
and ``io.excel.xlsm.writer``.
"""
from pandas.io.excel import ExcelWriter
from pandas.io.common import _stringify_path
if isinstance(writer, ExcelWriter):
need_save = False
else:
writer = ExcelWriter(_stringify_path(writer), engine=engine)
need_save = True
formatted_cells = self.get_formatted_cells()
writer.write_cells(formatted_cells, sheet_name,
startrow=startrow, startcol=startcol,
freeze_panes=freeze_panes)
if need_save:
writer.save()
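# Illustrative usage sketch (added; not part of the original module). The file
# name and frame are hypothetical; ``write`` accepts either a path or an
# existing ExcelWriter, as documented above, and needs an installed engine
# such as openpyxl or xlsxwriter.
# >>> import pandas as pd
# >>> df = pd.DataFrame({"a": [1, 2], "b": [3.5, float("inf")]})
# >>> ExcelFormatter(df, float_format="%.2f", inf_rep="inf").write(
# ...     "example.xlsx", sheet_name="Sheet1")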
File diff suppressed because it is too large
@@ -0,0 +1,506 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data in HTML.
"""
from __future__ import print_function
from distutils.version import LooseVersion
from textwrap import dedent
import pandas.core.common as com
from pandas.core.index import MultiIndex
from pandas import compat
from pandas.compat import (lzip, range, map, zip, u,
OrderedDict, unichr)
from pandas.core.config import get_option
from pandas.io.formats.printing import pprint_thing
from pandas.io.formats.format import (get_level_lengths,
buffer_put_lines)
from pandas.io.formats.format import TableFormatter
class HTMLFormatter(TableFormatter):
indent_delta = 2
def __init__(self, formatter, classes=None, max_rows=None, max_cols=None,
notebook=False, border=None, table_id=None):
self.fmt = formatter
self.classes = classes
self.frame = self.fmt.frame
self.columns = self.fmt.tr_frame.columns
self.elements = []
self.bold_rows = self.fmt.kwds.get('bold_rows', False)
self.escape = self.fmt.kwds.get('escape', True)
self.max_rows = max_rows or len(self.fmt.frame)
self.max_cols = max_cols or len(self.fmt.columns)
self.show_dimensions = self.fmt.show_dimensions
self.is_truncated = (self.max_rows < len(self.fmt.frame) or
self.max_cols < len(self.fmt.columns))
self.notebook = notebook
if border is None:
border = get_option('display.html.border')
self.border = border
self.table_id = table_id
def write(self, s, indent=0):
rs = pprint_thing(s)
self.elements.append(' ' * indent + rs)
def write_th(self, s, indent=0, tags=None):
if self.fmt.col_space is not None and self.fmt.col_space > 0:
tags = (tags or "")
tags += ('style="min-width: {colspace};"'
.format(colspace=self.fmt.col_space))
return self._write_cell(s, kind='th', indent=indent, tags=tags)
def write_td(self, s, indent=0, tags=None):
return self._write_cell(s, kind='td', indent=indent, tags=tags)
def _write_cell(self, s, kind='td', indent=0, tags=None):
if tags is not None:
start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags)
else:
start_tag = '<{kind}>'.format(kind=kind)
if self.escape:
# escape & first to prevent double escaping of &
esc = OrderedDict([('&', r'&amp;'), ('<', r'&lt;'),
('>', r'&gt;')])
else:
esc = {}
rs = pprint_thing(s, escape_chars=esc).strip()
self.write(u'{start}{rs}</{kind}>'
.format(start=start_tag, rs=rs, kind=kind), indent)
def write_tr(self, line, indent=0, indent_delta=4, header=False,
align=None, tags=None, nindex_levels=0):
if tags is None:
tags = {}
if align is None:
self.write('<tr>', indent)
else:
self.write('<tr style="text-align: {align};">'
.format(align=align), indent)
indent += indent_delta
for i, s in enumerate(line):
val_tag = tags.get(i, None)
if header or (self.bold_rows and i < nindex_levels):
self.write_th(s, indent, tags=val_tag)
else:
self.write_td(s, indent, tags=val_tag)
indent -= indent_delta
self.write('</tr>', indent)
def write_style(self):
# We use the "scoped" attribute here so that the desired
# style properties for the data frame are not then applied
# throughout the entire notebook.
template_first = """\
<style scoped>"""
template_last = """\
</style>"""
template_select = """\
.dataframe %s {
%s: %s;
}"""
element_props = [('tbody tr th:only-of-type',
'vertical-align',
'middle'),
('tbody tr th',
'vertical-align',
'top')]
if isinstance(self.columns, MultiIndex):
element_props.append(('thead tr th',
'text-align',
'left'))
if all((self.fmt.has_index_names,
self.fmt.index,
self.fmt.show_index_names)):
element_props.append(('thead tr:last-of-type th',
'text-align',
'right'))
else:
element_props.append(('thead th',
'text-align',
'right'))
template_mid = '\n\n'.join(map(lambda t: template_select % t,
element_props))
template = dedent('\n'.join((template_first,
template_mid,
template_last)))
if self.notebook:
self.write(template)
def write_result(self, buf):
indent = 0
id_section = ""
frame = self.frame
_classes = ['dataframe'] # Default class.
use_mathjax = get_option("display.html.use_mathjax")
if not use_mathjax:
_classes.append('tex2jax_ignore')
if self.classes is not None:
if isinstance(self.classes, str):
self.classes = self.classes.split()
if not isinstance(self.classes, (list, tuple)):
raise AssertionError('classes must be list or tuple, not {typ}'
.format(typ=type(self.classes)))
_classes.extend(self.classes)
if self.notebook:
div_style = ''
try:
import IPython
if IPython.__version__ < LooseVersion('3.0.0'):
div_style = ' style="max-width:1500px;overflow:auto;"'
except (ImportError, AttributeError):
pass
self.write('<div{style}>'.format(style=div_style))
self.write_style()
if self.table_id is not None:
id_section = ' id="{table_id}"'.format(table_id=self.table_id)
self.write('<table border="{border}" class="{cls}"{id_section}>'
.format(border=self.border, cls=' '.join(_classes),
id_section=id_section), indent)
indent += self.indent_delta
indent = self._write_header(indent)
indent = self._write_body(indent)
self.write('</table>', indent)
if self.should_show_dimensions:
by = chr(215) if compat.PY3 else unichr(215) # ×
self.write(u('<p>{rows} rows {by} {cols} columns</p>')
.format(rows=len(frame),
by=by,
cols=len(frame.columns)))
if self.notebook:
self.write('</div>')
buffer_put_lines(buf, self.elements)
def _write_header(self, indent):
truncate_h = self.fmt.truncate_h
row_levels = self.frame.index.nlevels
if not self.fmt.header:
# write nothing
return indent
def _column_header():
if self.fmt.index:
row = [''] * (self.frame.index.nlevels - 1)
else:
row = []
if isinstance(self.columns, MultiIndex):
if self.fmt.has_column_names and self.fmt.index:
row.append(single_column_table(self.columns.names))
else:
row.append('')
style = "text-align: {just};".format(just=self.fmt.justify)
row.extend([single_column_table(c, self.fmt.justify, style)
for c in self.columns])
else:
if self.fmt.index:
row.append(self.columns.name or '')
row.extend(self.columns)
return row
self.write('<thead>', indent)
row = []
indent += self.indent_delta
if isinstance(self.columns, MultiIndex):
template = 'colspan="{span:d}" halign="left"'
if self.fmt.sparsify:
# GH3547
sentinel = com.sentinel_factory()
else:
sentinel = None
levels = self.columns.format(sparsify=sentinel, adjoin=False,
names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
for lnum, (records, values) in enumerate(zip(level_lengths,
levels)):
if truncate_h:
# modify the header lines
ins_col = self.fmt.tr_col_num
if self.fmt.sparsify:
recs_new = {}
# Increment tags after ... col.
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
elif tag + span > ins_col:
recs_new[tag] = span + 1
if lnum == inner_lvl:
values = (values[:ins_col] + (u('...'),) +
values[ins_col:])
else:
# sparse col headers do not receive a ...
values = (values[:ins_col] +
(values[ins_col - 1], ) +
values[ins_col:])
else:
recs_new[tag] = span
# if ins_col lies between tags, all col headers
# get ...
if tag + span == ins_col:
recs_new[ins_col] = 1
values = (values[:ins_col] + (u('...'),) +
values[ins_col:])
records = recs_new
inner_lvl = len(level_lengths) - 1
if lnum == inner_lvl:
records[ins_col] = 1
else:
recs_new = {}
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
else:
recs_new[tag] = span
recs_new[ins_col] = 1
records = recs_new
values = (values[:ins_col] + [u('...')] +
values[ins_col:])
name = self.columns.names[lnum]
row = [''] * (row_levels - 1) + ['' if name is None else
pprint_thing(name)]
if row == [""] and self.fmt.index is False:
row = []
tags = {}
j = len(row)
for i, v in enumerate(values):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
continue
j += 1
row.append(v)
self.write_tr(row, indent, self.indent_delta, tags=tags,
header=True)
else:
col_row = _column_header()
align = self.fmt.justify
if truncate_h:
ins_col = row_levels + self.fmt.tr_col_num
col_row.insert(ins_col, '...')
self.write_tr(col_row, indent, self.indent_delta, header=True,
align=align)
if all((self.fmt.has_index_names,
self.fmt.index,
self.fmt.show_index_names)):
row = ([x if x is not None else ''
for x in self.frame.index.names] +
[''] * min(len(self.columns), self.max_cols))
if truncate_h:
ins_col = row_levels + self.fmt.tr_col_num
row.insert(ins_col, '')
self.write_tr(row, indent, self.indent_delta, header=True)
indent -= self.indent_delta
self.write('</thead>', indent)
return indent
def _write_body(self, indent):
self.write('<tbody>', indent)
indent += self.indent_delta
fmt_values = {}
for i in range(min(len(self.columns), self.max_cols)):
fmt_values[i] = self.fmt._format_col(i)
# write values
if self.fmt.index:
if isinstance(self.frame.index, MultiIndex):
self._write_hierarchical_rows(fmt_values, indent)
else:
self._write_regular_rows(fmt_values, indent)
else:
for i in range(min(len(self.frame), self.max_rows)):
row = [fmt_values[j][i] for j in range(len(self.columns))]
self.write_tr(row, indent, self.indent_delta, tags=None)
indent -= self.indent_delta
self.write('</tbody>', indent)
indent -= self.indent_delta
return indent
def _write_regular_rows(self, fmt_values, indent):
truncate_h = self.fmt.truncate_h
truncate_v = self.fmt.truncate_v
ncols = len(self.fmt.tr_frame.columns)
nrows = len(self.fmt.tr_frame)
fmt = self.fmt._get_formatter('__index__')
if fmt is not None:
index_values = self.fmt.tr_frame.index.map(fmt)
else:
index_values = self.fmt.tr_frame.index.format()
row = []
for i in range(nrows):
if truncate_v and i == (self.fmt.tr_row_num):
str_sep_row = ['...' for ele in row]
self.write_tr(str_sep_row, indent, self.indent_delta,
tags=None, nindex_levels=1)
row = []
row.append(index_values[i])
row.extend(fmt_values[j][i] for j in range(ncols))
if truncate_h:
dot_col_ix = self.fmt.tr_col_num + 1
row.insert(dot_col_ix, '...')
self.write_tr(row, indent, self.indent_delta, tags=None,
nindex_levels=1)
def _write_hierarchical_rows(self, fmt_values, indent):
template = 'rowspan="{span}" valign="top"'
truncate_h = self.fmt.truncate_h
truncate_v = self.fmt.truncate_v
frame = self.fmt.tr_frame
ncols = len(frame.columns)
nrows = len(frame)
row_levels = self.frame.index.nlevels
idx_values = frame.index.format(sparsify=False, adjoin=False,
names=False)
idx_values = lzip(*idx_values)
if self.fmt.sparsify:
# GH3547
sentinel = com.sentinel_factory()
levels = frame.index.format(sparsify=sentinel, adjoin=False,
names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
if truncate_v:
# Insert ... row and adjust idx_values and
# level_lengths to take this into account.
ins_row = self.fmt.tr_row_num
inserted = False
for lnum, records in enumerate(level_lengths):
rec_new = {}
for tag, span in list(records.items()):
if tag >= ins_row:
rec_new[tag + 1] = span
elif tag + span > ins_row:
rec_new[tag] = span + 1
# GH 14882 - Make sure insertion done once
if not inserted:
dot_row = list(idx_values[ins_row - 1])
dot_row[-1] = u('...')
idx_values.insert(ins_row, tuple(dot_row))
inserted = True
else:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = u('...')
idx_values[ins_row] = tuple(dot_row)
else:
rec_new[tag] = span
# If ins_row lies between tags, all cols idx cols
# receive ...
if tag + span == ins_row:
rec_new[ins_row] = 1
if lnum == 0:
idx_values.insert(ins_row, tuple(
[u('...')] * len(level_lengths)))
# GH 14882 - Place ... in correct level
elif inserted:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = u('...')
idx_values[ins_row] = tuple(dot_row)
level_lengths[lnum] = rec_new
level_lengths[inner_lvl][ins_row] = 1
for ix_col in range(len(fmt_values)):
fmt_values[ix_col].insert(ins_row, '...')
nrows += 1
for i in range(nrows):
row = []
tags = {}
sparse_offset = 0
j = 0
for records, v in zip(level_lengths, idx_values[i]):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
sparse_offset += 1
continue
j += 1
row.append(v)
row.extend(fmt_values[j][i] for j in range(ncols))
if truncate_h:
row.insert(row_levels - sparse_offset +
self.fmt.tr_col_num, '...')
self.write_tr(row, indent, self.indent_delta, tags=tags,
nindex_levels=len(levels) - sparse_offset)
else:
for i in range(len(frame)):
idx_values = list(zip(*frame.index.format(
sparsify=False, adjoin=False, names=False)))
row = []
row.extend(idx_values[i])
row.extend(fmt_values[j][i] for j in range(ncols))
if truncate_h:
row.insert(row_levels + self.fmt.tr_col_num, '...')
self.write_tr(row, indent, self.indent_delta, tags=None,
nindex_levels=frame.index.nlevels)
def single_column_table(column, align=None, style=None):
table = '<table'
if align is not None:
table += (' align="{align}"'.format(align=align))
if style is not None:
table += (' style="{style}"'.format(style=style))
table += '><tbody>'
for i in column:
table += ('<tr><td>{i!s}</td></tr>'.format(i=i))
table += '</tbody></table>'
return table
def single_row_table(row): # pragma: no cover
table = '<table><tbody><tr>'
for i in row:
table += ('<td>{i!s}</td>'.format(i=i))
table += '</tr></tbody></table>'
return table
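# Illustrative usage sketch (added; not part of the original module). The
# HTMLFormatter options above (classes, border, table_id) are normally reached
# through ``DataFrame.to_html``; the class name and id below are hypothetical.
# >>> import pandas as pd
# >>> df = pd.DataFrame({"a": [1, 2]})
# >>> html = df.to_html(classes="my-table", border=0, table_id="tbl1")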
@@ -0,0 +1,243 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data in Latex.
"""
from __future__ import print_function
from pandas.core.index import MultiIndex
from pandas import compat
from pandas.compat import range, map, zip, u
from pandas.io.formats.format import TableFormatter
import numpy as np
class LatexFormatter(TableFormatter):
""" Used to render a DataFrame to a LaTeX tabular/longtable environment
output.
Parameters
----------
formatter : `DataFrameFormatter`
column_format : str, default None
The columns format as specified in `LaTeX table format
<https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g 'rcl' for 3 columns
longtable : boolean, default False
Use a longtable environment instead of tabular.
See Also
--------
HTMLFormatter
"""
def __init__(self, formatter, column_format=None, longtable=False,
multicolumn=False, multicolumn_format=None, multirow=False):
self.fmt = formatter
self.frame = self.fmt.frame
self.bold_rows = self.fmt.kwds.get('bold_rows', False)
self.column_format = column_format
self.longtable = longtable
self.multicolumn = multicolumn
self.multicolumn_format = multicolumn_format
self.multirow = multirow
def write_result(self, buf):
"""
Render a DataFrame to a LaTeX tabular/longtable environment output.
"""
# string representation of the columns
if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}')
.format(name=type(self.frame).__name__,
col=self.frame.columns,
idx=self.frame.index))
strcols = [[info_line]]
else:
strcols = self.fmt._to_str_columns()
def get_col_type(dtype):
if issubclass(dtype.type, np.number):
return 'r'
else:
return 'l'
# reestablish the MultiIndex that has been joined by _to_str_column
if self.fmt.index and isinstance(self.frame.index, MultiIndex):
out = self.frame.index.format(
adjoin=False, sparsify=self.fmt.sparsify,
names=self.fmt.has_index_names, na_rep=self.fmt.na_rep
)
# index.format will sparsify repeated entries with empty strings
# so pad these with some empty space
def pad_empties(x):
for pad in reversed(x):
if pad:
break
return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]
out = (pad_empties(i) for i in out)
# Add empty spaces for each column level
clevels = self.frame.columns.nlevels
out = [[' ' * len(i[-1])] * clevels + i for i in out]
# Add the column names to the last index column
cnames = self.frame.columns.names
if any(cnames):
new_names = [i if i else '{}' for i in cnames]
out[self.frame.index.nlevels - 1][:clevels] = new_names
# Get rid of old multiindex column and add new ones
strcols = out + strcols[1:]
column_format = self.column_format
if column_format is None:
dtypes = self.frame.dtypes._values
column_format = ''.join(map(get_col_type, dtypes))
if self.fmt.index:
index_format = 'l' * self.frame.index.nlevels
column_format = index_format + column_format
elif not isinstance(column_format,
compat.string_types): # pragma: no cover
raise AssertionError('column_format must be str or unicode, '
'not {typ}'.format(typ=type(column_format)))
if not self.longtable:
buf.write('\\begin{{tabular}}{{{fmt}}}\n'
.format(fmt=column_format))
buf.write('\\toprule\n')
else:
buf.write('\\begin{{longtable}}{{{fmt}}}\n'
.format(fmt=column_format))
buf.write('\\toprule\n')
ilevels = self.frame.index.nlevels
clevels = self.frame.columns.nlevels
nlevels = clevels
if self.fmt.has_index_names and self.fmt.show_index_names:
nlevels += 1
strrows = list(zip(*strcols))
self.clinebuf = []
for i, row in enumerate(strrows):
if i == nlevels and self.fmt.header:
buf.write('\\midrule\n') # End of header
if self.longtable:
buf.write('\\endhead\n')
buf.write('\\midrule\n')
buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next '
'page}}}} \\\\\n'.format(n=len(row)))
buf.write('\\midrule\n')
buf.write('\\endfoot\n\n')
buf.write('\\bottomrule\n')
buf.write('\\endlastfoot\n')
if self.fmt.kwds.get('escape', True):
# escape backslashes first
crow = [(x.replace('\\', '\\textbackslash ')
.replace('_', '\\_')
.replace('%', '\\%').replace('$', '\\$')
.replace('#', '\\#').replace('{', '\\{')
.replace('}', '\\}').replace('~', '\\textasciitilde ')
.replace('^', '\\textasciicircum ')
.replace('&', '\\&')
if (x and x != '{}') else '{}') for x in row]
else:
crow = [x if x else '{}' for x in row]
if self.bold_rows and self.fmt.index:
# bold row labels
crow = ['\\textbf{{{x}}}'.format(x=x)
if j < ilevels and x.strip() not in ['', '{}'] else x
for j, x in enumerate(crow)]
if i < clevels and self.fmt.header and self.multicolumn:
# sum up columns to multicolumns
crow = self._format_multicolumn(crow, ilevels)
if (i >= nlevels and self.fmt.index and self.multirow and
ilevels > 1):
# sum up rows to multirows
crow = self._format_multirow(crow, ilevels, i, strrows)
buf.write(' & '.join(crow))
buf.write(' \\\\\n')
if self.multirow and i < len(strrows) - 1:
self._print_cline(buf, i, len(strcols))
if not self.longtable:
buf.write('\\bottomrule\n')
buf.write('\\end{tabular}\n')
else:
buf.write('\\end{longtable}\n')
def _format_multicolumn(self, row, ilevels):
r"""
        Combine columns belonging to a group into a single multicolumn entry
according to self.multicolumn_format
e.g.:
a & & & b & c &
will become
\multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
"""
row2 = list(row[:ilevels])
ncol = 1
coltext = ''
def append_col():
# write multicolumn if needed
if ncol > 1:
row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}'
.format(ncol=ncol, fmt=self.multicolumn_format,
txt=coltext.strip()))
# don't modify where not needed
else:
row2.append(coltext)
for c in row[ilevels:]:
# if next col has text, write the previous
if c.strip():
if coltext:
append_col()
coltext = c
ncol = 1
# if not, add it to the previous multicolumn
else:
ncol += 1
# write last column name
if coltext:
append_col()
return row2
def _format_multirow(self, row, ilevels, i, rows):
r"""
        Check following rows to decide whether this row should be a multirow
        e.g.:      becomes:
        a & 0 &    \multirow{2}{*}{a} & 0 &
          & 1 &    & 1 &
        b & 0 &    \cline{1-2}
                   b & 0 &
"""
for j in range(ilevels):
if row[j].strip():
nrow = 1
for r in rows[i + 1:]:
if not r[j].strip():
nrow += 1
else:
break
if nrow > 1:
# overwrite non-multirow entry
row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format(
nrow=nrow, row=row[j].strip())
# save when to end the current block with \cline
self.clinebuf.append([i + nrow - 1, j + 1])
return row
def _print_cline(self, buf, i, icol):
"""
Print clines after multirow-blocks are finished
"""
for cl in self.clinebuf:
if cl[0] == i:
buf.write('\\cline{{{cl:d}-{icol:d}}}\n'
.format(cl=cl[1], icol=icol))
# remove entries that have been written to buffer
self.clinebuf = [x for x in self.clinebuf if x[0] != i]
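# Illustrative usage sketch (added; not part of the original module).
# LatexFormatter is normally driven through ``DataFrame.to_latex``; the frame
# below is hypothetical and only meant to exercise the multirow/multicolumn
# branches above.
# >>> import pandas as pd
# >>> idx = pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)])
# >>> df = pd.DataFrame({"x": [1, 2, 3]}, index=idx)
# >>> tex = df.to_latex(multirow=True, multicolumn=True, multicolumn_format="c")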
@@ -0,0 +1,263 @@
"""
printing tools
"""
import sys
from pandas.core.dtypes.inference import is_sequence
from pandas import compat
from pandas.compat import u
from pandas.core.config import get_option
def adjoin(space, *lists, **kwargs):
"""
Glues together two sets of strings using the amount of space requested.
The idea is to prettify.
    Parameters
    ----------
space : int
number of spaces for padding
lists : str
        list of str which are being joined
strlen : callable
function used to calculate the length of each str. Needed for unicode
handling.
justfunc : callable
function used to justify str. Needed for unicode handling.
"""
strlen = kwargs.pop('strlen', len)
justfunc = kwargs.pop('justfunc', justify)
out_lines = []
newLists = []
lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
# not the last one
lengths.append(max(map(len, lists[-1])))
maxLen = max(map(len, lists))
for i, lst in enumerate(lists):
nl = justfunc(lst, lengths[i], mode='left')
nl.extend([' ' * lengths[i]] * (maxLen - len(lst)))
newLists.append(nl)
toJoin = zip(*newLists)
for lines in toJoin:
out_lines.append(_join_unicode(lines))
return _join_unicode(out_lines, sep='\n')
def justify(texts, max_len, mode='right'):
"""
Perform ljust, center, rjust against string or list-like
"""
if mode == 'left':
return [x.ljust(max_len) for x in texts]
elif mode == 'center':
return [x.center(max_len) for x in texts]
else:
return [x.rjust(max_len) for x in texts]
def _join_unicode(lines, sep=''):
try:
return sep.join(lines)
except UnicodeDecodeError:
sep = compat.text_type(sep)
return sep.join([x.decode('utf-8') if isinstance(x, str) else x
for x in lines])
# Unicode consolidation
# ---------------------
#
# pprinting utility functions for generating Unicode text or
# bytes(3.x)/str(2.x) representations of objects.
# Try to use these as much as possible rather than rolling your own.
#
# When to use
# -----------
#
# 1) If you're writing code internal to pandas (no I/O directly involved),
# use pprint_thing().
#
#    It will always return unicode text which can be handled by other
# parts of the package without breakage.
#
# 2) if you need to write something out to file, use
# pprint_thing_encoded(encoding).
#
# If no encoding is specified, it defaults to utf-8. Since encoding pure
# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
# working with straight ascii.
def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
"""
internal. pprinter for iterables. you should probably use pprint_thing()
    rather than calling this directly.
bounds length of printed sequence, depending on options
"""
if isinstance(seq, set):
fmt = u("{{{body}}}")
else:
fmt = u("[{body}]") if hasattr(seq, '__setitem__') else u("({body})")
if max_seq_items is False:
nitems = len(seq)
else:
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
s = iter(seq)
r = []
for i in range(min(nitems, len(seq))): # handle sets, no slicing
r.append(pprint_thing(
next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds))
body = ", ".join(r)
if nitems < len(seq):
body += ", ..."
elif isinstance(seq, tuple) and len(seq) == 1:
body += ','
return fmt.format(body=body)
def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
"""
internal. pprinter for iterables. you should probably use pprint_thing()
    rather than calling this directly.
"""
fmt = u("{{{things}}}")
pairs = []
pfmt = u("{key}: {val}")
if max_seq_items is False:
nitems = len(seq)
else:
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
for k, v in list(seq.items())[:nitems]:
pairs.append(
pfmt.format(
key=pprint_thing(k, _nest_lvl + 1,
max_seq_items=max_seq_items, **kwds),
val=pprint_thing(v, _nest_lvl + 1,
max_seq_items=max_seq_items, **kwds)))
if nitems < len(seq):
return fmt.format(things=", ".join(pairs) + ", ...")
else:
return fmt.format(things=", ".join(pairs))
def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False,
quote_strings=False, max_seq_items=None):
"""
This function is the sanctioned way of converting objects
to a unicode representation.
properly handles nested sequences containing unicode strings
(unicode(object) does not)
Parameters
----------
thing : anything to be formatted
_nest_lvl : internal use only. pprint_thing() is mutually-recursive
with pprint_sequence, this argument is used to keep track of the
current nesting level, and limit it.
escape_chars : list or dict, optional
Characters to escape. If a dict is passed the values are the
replacements
default_escapes : bool, default False
Whether the input escape characters replaces or adds to the defaults
max_seq_items : False, int, default None
        Pass through to other pretty printers to limit sequence printing
Returns
-------
result - unicode object on py2, str on py3. Always Unicode.
"""
def as_escaped_unicode(thing, escape_chars=escape_chars):
# Unicode is fine, else we try to decode using utf-8 and 'replace'
# if that's not it either, we have no way of knowing and the user
# should deal with it himself.
try:
result = compat.text_type(thing) # we should try this first
except UnicodeDecodeError:
# either utf-8 or we replace errors
result = str(thing).decode('utf-8', "replace")
translate = {'\t': r'\t', '\n': r'\n', '\r': r'\r', }
if isinstance(escape_chars, dict):
if default_escapes:
translate.update(escape_chars)
else:
translate = escape_chars
escape_chars = list(escape_chars.keys())
else:
escape_chars = escape_chars or tuple()
for c in escape_chars:
result = result.replace(c, translate[c])
return compat.text_type(result)
if (compat.PY3 and hasattr(thing, '__next__')) or hasattr(thing, 'next'):
return compat.text_type(thing)
elif (isinstance(thing, dict) and
_nest_lvl < get_option("display.pprint_nest_depth")):
result = _pprint_dict(thing, _nest_lvl, quote_strings=True,
max_seq_items=max_seq_items)
elif (is_sequence(thing) and
_nest_lvl < get_option("display.pprint_nest_depth")):
result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars,
quote_strings=quote_strings,
max_seq_items=max_seq_items)
elif isinstance(thing, compat.string_types) and quote_strings:
if compat.PY3:
fmt = u("'{thing}'")
else:
fmt = u("u'{thing}'")
result = fmt.format(thing=as_escaped_unicode(thing))
else:
result = as_escaped_unicode(thing)
return compat.text_type(result) # always unicode
def pprint_thing_encoded(object, encoding='utf-8', errors='replace', **kwds):
value = pprint_thing(object) # get unicode representation of object
return value.encode(encoding, errors, **kwds)
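# Illustrative sketch (added; not part of the original module) of the escaping
# and truncation behaviour documented above; outputs are approximate.
# >>> pprint_thing("a\tb", escape_chars=("\t",))
# 'a\\tb'
# >>> pprint_thing(list(range(10)), max_seq_items=3)
# '[0, 1, 2, ...]'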
def _enable_data_resource_formatter(enable):
if 'IPython' not in sys.modules:
# definitely not in IPython
return
from IPython import get_ipython
ip = get_ipython()
if ip is None:
# still not in IPython
return
formatters = ip.display_formatter.formatters
mimetype = "application/vnd.dataresource+json"
if enable:
if mimetype not in formatters:
# define tableschema formatter
from IPython.core.formatters import BaseFormatter
class TableSchemaFormatter(BaseFormatter):
print_method = '_repr_data_resource_'
_return_type = (dict,)
# register it:
formatters[mimetype] = TableSchemaFormatter()
# enable it if it's been disabled:
formatters[mimetype].enabled = True
else:
# unregister tableschema mime-type
if mimetype in formatters:
formatters[mimetype].enabled = False
File diff suppressed because it is too large
@@ -0,0 +1,70 @@
{# Update the template_structure.html document too #}
{%- block before_style -%}{%- endblock before_style -%}
{% block style %}
<style type="text/css" >
{% block table_styles %}
{% for s in table_styles %}
#T_{{uuid}} {{s.selector}} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor -%}
}
{%- endfor -%}
{% endblock table_styles %}
{% block before_cellstyle %}{% endblock before_cellstyle %}
{% block cellstyle %}
{%- for s in cellstyle %}
#T_{{uuid}}{{s.selector}} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor %}
}
{%- endfor -%}
{%- endblock cellstyle %}
</style>
{%- endblock style %}
{%- block before_table %}{% endblock before_table %}
{%- block table %}
<table id="T_{{uuid}}" {% if table_attributes %}{{ table_attributes }}{% endif %}>
{%- block caption %}
{%- if caption -%}
<caption>{{caption}}</caption>
{%- endif -%}
{%- endblock caption %}
{%- block thead %}
<thead>
{%- block before_head_rows %}{% endblock %}
{%- for r in head %}
{%- block head_tr scoped %}
<tr>
{%- for c in r %}
{%- if c.is_visible != False %}
<{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}}</{{ c.type }}>
{%- endif %}
{%- endfor %}
</tr>
{%- endblock head_tr %}
{%- endfor %}
{%- block after_head_rows %}{% endblock %}
</thead>
{%- endblock thead %}
{%- block tbody %}
<tbody>
{%- block before_rows %}{%- endblock before_rows %}
{%- for r in body %}
{%- block tr scoped %}
<tr>
{%- for c in r %}
{%- if c.is_visible != False %}
<{{ c.type }} id="T_{{ uuid }}{{ c.id }}" class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }}</{{ c.type }}>
{%- endif %}
{%- endfor %}
</tr>
{%- endblock tr %}
{%- endfor %}
{%- block after_rows %}{%- endblock after_rows %}
</tbody>
{%- endblock tbody %}
</table>
{%- endblock table %}
{%- block after_table %}{% endblock after_table %}
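{# Added illustrative note (not part of the original template): this template
   is rendered by the pandas Styler; a hedged Python sketch, with a
   hypothetical caption and attribute string:
   df.style.set_caption("Demo").set_table_attributes('class="demo"').render()
#}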
@@ -0,0 +1,145 @@
"""
get_terminal_size() -- return width and height of terminal as a tuple
code from:
http://stackoverflow.com/questions/566746/how-to-get-console-window-width-in-python
written by
Harco Kuppens (http://stackoverflow.com/users/825214/harco-kuppens)
It is mentioned in the stackoverflow response that this code works
on linux, os x, windows and cygwin (windows).
"""
from __future__ import print_function
import os
import shutil
from pandas.compat import PY3
__all__ = ['get_terminal_size', 'is_terminal']
def get_terminal_size():
"""
Detect terminal size and return tuple = (width, height).
Only to be used when running in a terminal. Note that the IPython notebook,
    IPython zmq frontends, or IDLE do not run in a terminal.
"""
import platform
if PY3:
return shutil.get_terminal_size()
current_os = platform.system()
tuple_xy = None
if current_os == 'Windows':
tuple_xy = _get_terminal_size_windows()
if tuple_xy is None:
tuple_xy = _get_terminal_size_tput()
        # needed for Windows' Python running inside Cygwin's xterm!
if current_os == 'Linux' or \
current_os == 'Darwin' or \
current_os.startswith('CYGWIN'):
tuple_xy = _get_terminal_size_linux()
if tuple_xy is None:
tuple_xy = (80, 25) # default value
return tuple_xy
def is_terminal():
"""
Detect if Python is running in a terminal.
Returns True if Python is running in a terminal or False if not.
"""
try:
ip = get_ipython()
except NameError: # assume standard Python interpreter in a terminal
return True
else:
if hasattr(ip, 'kernel'): # IPython as a Jupyter kernel
return False
else: # IPython in a terminal
return True
def _get_terminal_size_windows():
res = None
try:
from ctypes import windll, create_string_buffer
# stdin handle is -10
# stdout handle is -11
# stderr handle is -12
h = windll.kernel32.GetStdHandle(-12)
csbi = create_string_buffer(22)
res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi)
    except Exception:
return None
if res:
import struct
(bufx, bufy, curx, cury, wattr, left, top, right, bottom, maxx,
maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw)
sizex = right - left + 1
sizey = bottom - top + 1
return sizex, sizey
else:
return None
def _get_terminal_size_tput():
# get terminal width
# src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width
# -height-of-a-terminal-window
try:
import subprocess
proc = subprocess.Popen(["tput", "cols"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
output = proc.communicate(input=None)
cols = int(output[0])
proc = subprocess.Popen(["tput", "lines"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
output = proc.communicate(input=None)
rows = int(output[0])
return (cols, rows)
    except Exception:
return None
def _get_terminal_size_linux():
def ioctl_GWINSZ(fd):
try:
import fcntl
import termios
import struct
cr = struct.unpack(
'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234'))
        except Exception:
return None
return cr
cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
if not cr:
try:
fd = os.open(os.ctermid(), os.O_RDONLY)
cr = ioctl_GWINSZ(fd)
os.close(fd)
        except Exception:
pass
if not cr or cr == (0, 0):
try:
from os import environ as env
cr = (env['LINES'], env['COLUMNS'])
        except Exception:
return None
return int(cr[1]), int(cr[0])
if __name__ == "__main__":
sizex, sizey = get_terminal_size()
print('width = {w} height = {h}'.format(w=sizex, h=sizey))
@@ -0,0 +1,119 @@
""" Google BigQuery support """
def _try_import():
# since pandas is a dependency of pandas-gbq
# we need to import on first use
try:
import pandas_gbq
except ImportError:
# give a nice error message
raise ImportError("Load data from Google BigQuery\n"
"\n"
"the pandas-gbq package is not installed\n"
"see the docs: https://pandas-gbq.readthedocs.io\n"
"\n"
"you can install via pip or conda:\n"
"pip install pandas-gbq\n"
"conda install pandas-gbq -c conda-forge\n")
return pandas_gbq
def read_gbq(query, project_id=None, index_col=None, col_order=None,
reauth=False, verbose=None, private_key=None, dialect='legacy',
**kwargs):
"""
Load data from Google BigQuery.
This function requires the `pandas-gbq package
<https://pandas-gbq.readthedocs.io>`__.
Authentication to the Google BigQuery service is via OAuth 2.0.
- If "private_key" is not provided:
By default "application default credentials" are used.
If default application credentials are not found or are restrictive,
user account credentials are used. In this case, you will be asked to
grant permissions for product name 'pandas GBQ'.
- If "private_key" is provided:
Service account credentials will be used to authenticate.
Parameters
----------
query : str
SQL-Like Query to return data values.
project_id : str
Google BigQuery Account project ID.
index_col : str, optional
Name of result column to use for index in results DataFrame.
col_order : list(str), optional
List of BigQuery column names in the desired order for results
DataFrame.
reauth : boolean, default False
Force Google BigQuery to reauthenticate the user. This is useful
if multiple accounts are used.
private_key : str, optional
Service account private key in JSON format. Can be file path
or string contents. This is useful for remote server
authentication (eg. Jupyter/IPython notebook on remote host).
dialect : str, default 'legacy'
SQL syntax dialect to use. Value can be one of:
``'legacy'``
Use BigQuery's legacy SQL dialect. For more information see
`BigQuery Legacy SQL Reference
<https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
``'standard'``
Use BigQuery's standard SQL, which is
compliant with the SQL 2011 standard. For more information
see `BigQuery Standard SQL Reference
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
verbose : boolean, deprecated
*Deprecated in Pandas-GBQ 0.4.0.* Use the `logging module
to adjust verbosity instead
<https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
kwargs : dict
Arbitrary keyword arguments.
configuration (dict): query config parameters for job processing.
For example:
configuration = {'query': {'useQueryCache': False}}
For more information see `BigQuery SQL Reference
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__
Returns
-------
df: DataFrame
DataFrame representing results of query.
See Also
--------
pandas_gbq.read_gbq : This function in the pandas-gbq library.
pandas.DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
"""
pandas_gbq = _try_import()
return pandas_gbq.read_gbq(
query, project_id=project_id,
index_col=index_col, col_order=col_order,
reauth=reauth, verbose=verbose,
private_key=private_key,
dialect=dialect,
**kwargs)
def to_gbq(dataframe, destination_table, project_id, chunksize=None,
verbose=None, reauth=False, if_exists='fail', private_key=None,
auth_local_webserver=False, table_schema=None):
pandas_gbq = _try_import()
return pandas_gbq.to_gbq(
dataframe, destination_table, project_id, chunksize=chunksize,
verbose=verbose, reauth=reauth, if_exists=if_exists,
private_key=private_key, auth_local_webserver=auth_local_webserver,
table_schema=table_schema)
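# Illustrative usage sketch (added; not part of the original module). The
# project id, query and configuration below are hypothetical placeholders;
# ``configuration`` is forwarded to pandas-gbq via **kwargs as documented in
# ``read_gbq``.
# >>> df = read_gbq("SELECT 1 AS x",
# ...               project_id="my-project",
# ...               dialect="standard",
# ...               configuration={"query": {"useQueryCache": False}})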
@@ -0,0 +1,987 @@
""":mod:`pandas.io.html` is a module containing functionality for dealing with
HTML IO.
"""
import os
import re
import numbers
import collections
from distutils.version import LooseVersion
import numpy as np
from pandas.core.dtypes.common import is_list_like
from pandas.errors import EmptyDataError
from pandas.io.common import _is_url, urlopen, _validate_header_arg
from pandas.io.parsers import TextParser
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
raise_with_traceback, binary_type)
from pandas import Series
import pandas.core.common as com
from pandas.io.formats.printing import pprint_thing
_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False
def _importers():
# import things we need
    # but do so on a first-use basis
global _IMPORTS
if _IMPORTS:
return
global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
try:
import bs4 # noqa
_HAS_BS4 = True
except ImportError:
pass
try:
import lxml # noqa
_HAS_LXML = True
except ImportError:
pass
try:
import html5lib # noqa
_HAS_HTML5LIB = True
except ImportError:
pass
_IMPORTS = True
#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}')
char_types = string_types + (binary_type,)
def _remove_whitespace(s, regex=_RE_WHITESPACE):
"""Replace extra whitespace inside of a string with a single space.
Parameters
----------
s : str or unicode
The string from which to remove extra whitespace.
regex : regex
The regular expression to use to remove extra whitespace.
Returns
-------
subd : str or unicode
`s` with all extra whitespace replaced with a single space.
"""
return regex.sub(' ', s.strip())
def _get_skiprows(skiprows):
"""Get an iterator given an integer, slice or container.
Parameters
----------
skiprows : int, slice, container
The iterator to use to skip rows; can also be a slice.
Raises
------
TypeError
* If `skiprows` is not a slice, integer, or Container
Returns
-------
it : iterable
A proper iterator to use to skip rows of a DataFrame.
"""
if isinstance(skiprows, slice):
return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1)
elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
return skiprows
elif skiprows is None:
return 0
raise TypeError('%r is not a valid type for skipping rows' %
type(skiprows).__name__)
def _read(obj):
"""Try to read from a url, file or string.
Parameters
----------
obj : str, unicode, or file-like
Returns
-------
raw_text : str
"""
if _is_url(obj):
with urlopen(obj) as url:
text = url.read()
elif hasattr(obj, 'read'):
text = obj.read()
elif isinstance(obj, char_types):
text = obj
try:
if os.path.isfile(text):
with open(text, 'rb') as f:
return f.read()
except (TypeError, ValueError):
pass
else:
raise TypeError("Cannot read object of type %r" % type(obj).__name__)
return text
class _HtmlFrameParser(object):
"""Base class for parsers that parse HTML into DataFrames.
Parameters
----------
io : str or file-like
This can be either a string of raw HTML, a valid URL using the HTTP,
FTP, or FILE protocols or a file-like object.
match : str or regex
The text to match in the document.
attrs : dict
List of HTML <table> element attributes to match.
encoding : str
Encoding to be used by parser
displayed_only : bool
Whether or not items with "display:none" should be ignored
.. versionadded:: 0.23.0
Attributes
----------
io : str or file-like
raw HTML, URL, or file-like object
match : regex
The text to match in the raw HTML
attrs : dict-like
A dictionary of valid table attributes to use to search for table
elements.
encoding : str
Encoding to be used by parser
displayed_only : bool
Whether or not items with "display:none" should be ignored
.. versionadded:: 0.23.0
Notes
-----
To subclass this class effectively you must override the following methods:
* :func:`_build_doc`
* :func:`_text_getter`
* :func:`_parse_td`
* :func:`_parse_tables`
* :func:`_parse_tr`
* :func:`_parse_thead`
* :func:`_parse_tbody`
* :func:`_parse_tfoot`
See each method's respective documentation for details on their
functionality.
"""
def __init__(self, io, match, attrs, encoding, displayed_only):
self.io = io
self.match = match
self.attrs = attrs
self.encoding = encoding
self.displayed_only = displayed_only
def parse_tables(self):
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
return (self._build_table(table) for table in tables)
def _parse_raw_data(self, rows):
"""Parse the raw data into a list of lists.
Parameters
----------
        rows : iterable of node-like
            A list of row elements. Text extraction and column discovery are
            delegated to ``self._text_getter`` and ``self._parse_td``, which
            must be defined by subclasses.
Returns
-------
data : list of list of strings
"""
data = [[_remove_whitespace(self._text_getter(col)) for col in
self._parse_td(row)] for row in rows]
return data
def _text_getter(self, obj):
"""Return the text of an individual DOM node.
Parameters
----------
obj : node-like
A DOM node.
Returns
-------
text : str or unicode
The text from an individual DOM node.
"""
raise com.AbstractMethodError(self)
def _parse_td(self, obj):
"""Return the td elements from a row element.
Parameters
----------
obj : node-like
Returns
-------
columns : list of node-like
These are the elements of each row, i.e., the columns.
"""
raise com.AbstractMethodError(self)
def _parse_tables(self, doc, match, attrs):
"""Return all tables from the parsed DOM.
Parameters
----------
doc : tree-like
The DOM from which to parse the table element.
match : str or regular expression
The text to search for in the DOM tree.
attrs : dict
A dictionary of table attributes that can be used to disambiguate
multiple tables on a page.
Raises
------
ValueError
* If `match` does not match any text in the document.
Returns
-------
tables : list of node-like
A list of <table> elements to be parsed into raw data.
"""
raise com.AbstractMethodError(self)
def _parse_tr(self, table):
"""Return the list of row elements from the parsed table element.
Parameters
----------
table : node-like
A table element that contains row elements.
Returns
-------
rows : list of node-like
            A list of row elements of a table, usually <tr> or <th> elements.
"""
raise com.AbstractMethodError(self)
def _parse_thead(self, table):
"""Return the header of a table.
Parameters
----------
table : node-like
A table element that contains row elements.
Returns
-------
thead : node-like
A <thead>...</thead> element.
"""
raise com.AbstractMethodError(self)
def _parse_tbody(self, table):
"""Return the list of tbody elements from the parsed table element.
Parameters
----------
table : node-like
A table element that contains row elements.
Returns
-------
tbodys : list of node-like
A list of <tbody>...</tbody> elements
"""
raise com.AbstractMethodError(self)
def _parse_tfoot(self, table):
"""Return the footer of the table if any.
Parameters
----------
table : node-like
A table element that contains row elements.
Returns
-------
tfoot : node-like
A <tfoot>...</tfoot> element.
"""
raise com.AbstractMethodError(self)
def _build_doc(self):
"""Return a tree-like object that can be used to iterate over the DOM.
Returns
-------
obj : tree-like
"""
raise com.AbstractMethodError(self)
def _build_table(self, table):
header = self._parse_raw_thead(table)
body = self._parse_raw_tbody(table)
footer = self._parse_raw_tfoot(table)
return header, body, footer
def _parse_raw_thead(self, table):
thead = self._parse_thead(table)
res = []
if thead:
trs = self._parse_tr(thead[0])
for tr in trs:
cols = lmap(self._text_getter, self._parse_td(tr))
if any(col != '' for col in cols):
res.append(cols)
return res
def _parse_raw_tfoot(self, table):
tfoot = self._parse_tfoot(table)
res = []
if tfoot:
res = lmap(self._text_getter, self._parse_td(tfoot[0]))
return np.atleast_1d(
np.array(res).squeeze()) if res and len(res) == 1 else res
def _parse_raw_tbody(self, table):
tbodies = self._parse_tbody(table)
raw_data = []
if tbodies:
for tbody in tbodies:
raw_data.extend(self._parse_tr(tbody))
else:
raw_data.extend(self._parse_tr(table))
return self._parse_raw_data(raw_data)
def _handle_hidden_tables(self, tbl_list, attr_name):
"""Returns list of tables, potentially removing hidden elements
Parameters
----------
tbl_list : list of Tag or list of Element
Type of list elements will vary depending upon parser used
attr_name : str
Name of the accessor for retrieving HTML attributes
Returns
-------
list of Tag or list of Element
Return type matches `tbl_list`
"""
if not self.displayed_only:
return tbl_list
return [x for x in tbl_list if "display:none" not in
getattr(x, attr_name).get('style', '').replace(" ", "")]
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
"""HTML to DataFrame parser that uses BeautifulSoup under the hood.
See Also
--------
pandas.io.html._HtmlFrameParser
pandas.io.html._LxmlFrameParser
Notes
-----
Documentation strings for this class are in the base class
:class:`pandas.io.html._HtmlFrameParser`.
"""
def __init__(self, *args, **kwargs):
super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
**kwargs)
from bs4 import SoupStrainer
self._strainer = SoupStrainer('table')
def _text_getter(self, obj):
return obj.text
def _parse_td(self, row):
return row.find_all(('td', 'th'))
def _parse_tr(self, element):
return element.find_all('tr')
def _parse_th(self, element):
return element.find_all('th')
def _parse_thead(self, table):
return table.find_all('thead')
def _parse_tbody(self, table):
return table.find_all('tbody')
def _parse_tfoot(self, table):
return table.find_all('tfoot')
def _parse_tables(self, doc, match, attrs):
element_name = self._strainer.name
tables = doc.find_all(element_name, attrs=attrs)
if not tables:
raise ValueError('No tables found')
result = []
unique_tables = set()
tables = self._handle_hidden_tables(tables, "attrs")
for table in tables:
if self.displayed_only:
for elem in table.find_all(
style=re.compile(r"display:\s*none")):
elem.decompose()
if (table not in unique_tables and
table.find(text=match) is not None):
result.append(table)
unique_tables.add(table)
if not result:
raise ValueError("No tables found matching pattern {patt!r}"
.format(patt=match.pattern))
return result
def _setup_build_doc(self):
raw_text = _read(self.io)
if not raw_text:
raise ValueError('No text parsed from document: {doc}'
.format(doc=self.io))
return raw_text
def _build_doc(self):
from bs4 import BeautifulSoup
return BeautifulSoup(self._setup_build_doc(), features='html5lib',
from_encoding=self.encoding)
def _build_xpath_expr(attrs):
"""Build an xpath expression to simulate bs4's ability to pass in kwargs to
search for attributes when using the lxml parser.
Parameters
----------
attrs : dict
A dict of HTML attributes. These are NOT checked for validity.
Returns
-------
expr : unicode
An XPath expression that checks for the given HTML attributes.
"""
# give class attribute as class_ because class is a python keyword
if 'class_' in attrs:
attrs['class'] = attrs.pop('class_')
s = [u("@{key}={val!r}").format(key=k, val=v) for k, v in iteritems(attrs)]
return u('[{expr}]').format(expr=' and '.join(s))
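# Illustrative note (not part of the original module): for attrs={'id': 'table'}
# the helper above builds "[@id='table']", which callers append to a broader
# XPath query, e.g.
#     >>> _build_xpath_expr({'id': 'table'})  # doctest: +SKIP
#     "[@id='table']"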
_re_namespace = {'re': 'http://exslt.org/regular-expressions'}
_valid_schemes = 'http', 'file', 'ftp'
class _LxmlFrameParser(_HtmlFrameParser):
"""HTML to DataFrame parser that uses lxml under the hood.
Warning
-------
This parser can only handle HTTP, FTP, and FILE urls.
See Also
--------
_HtmlFrameParser
_BeautifulSoupHtml5LibFrameParser
Notes
-----
Documentation strings for this class are in the base class
:class:`_HtmlFrameParser`.
"""
def __init__(self, *args, **kwargs):
super(_LxmlFrameParser, self).__init__(*args, **kwargs)
def _text_getter(self, obj):
return obj.text_content()
def _parse_td(self, row):
return row.xpath('.//td|.//th')
def _parse_tr(self, table):
return table.xpath('.//tr')
def _parse_tables(self, doc, match, kwargs):
pattern = match.pattern
# 1. check all descendants for the given pattern and only search tables
# 2. go up the tree until we find a table
query = '//table//*[re:test(text(), {patt!r})]/ancestor::table'
xpath_expr = u(query).format(patt=pattern)
# if any table attributes were given build an xpath expression to
# search for them
if kwargs:
xpath_expr += _build_xpath_expr(kwargs)
tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
tables = self._handle_hidden_tables(tables, "attrib")
if self.displayed_only:
for table in tables:
# lxml utilizes XPATH 1.0 which does not have regex
# support. As a result, we find all elements with a style
# attribute and iterate them to check for display:none
for elem in table.xpath('.//*[@style]'):
if "display:none" in elem.attrib.get(
"style", "").replace(" ", ""):
elem.getparent().remove(elem)
if not tables:
raise ValueError("No tables found matching regex {patt!r}"
.format(patt=pattern))
return tables
def _build_doc(self):
"""
Raises
------
ValueError
* If a URL that lxml cannot parse is passed.
Exception
* Any other ``Exception`` thrown. For example, trying to parse a
URL that is syntactically correct on a machine with no internet
connection will fail.
See Also
--------
pandas.io.html._HtmlFrameParser._build_doc
"""
from lxml.html import parse, fromstring, HTMLParser
from lxml.etree import XMLSyntaxError
parser = HTMLParser(recover=True, encoding=self.encoding)
try:
if _is_url(self.io):
with urlopen(self.io) as f:
r = parse(f, parser=parser)
else:
# try to parse the input in the simplest way
r = parse(self.io, parser=parser)
try:
r = r.getroot()
except AttributeError:
pass
except (UnicodeDecodeError, IOError) as e:
# if the input is a blob of html goop
if not _is_url(self.io):
r = fromstring(self.io, parser=parser)
try:
r = r.getroot()
except AttributeError:
pass
else:
raise e
else:
if not hasattr(r, 'text_content'):
raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
return r
def _parse_tbody(self, table):
return table.xpath('.//tbody')
def _parse_thead(self, table):
return table.xpath('.//thead')
def _parse_tfoot(self, table):
return table.xpath('.//tfoot')
def _parse_raw_thead(self, table):
expr = './/thead'
thead = table.xpath(expr)
res = []
if thead:
# Grab any directly descending table headers first
ths = thead[0].xpath('./th')
if ths:
cols = [_remove_whitespace(x.text_content()) for x in ths]
if any(col != '' for col in cols):
res.append(cols)
else:
trs = self._parse_tr(thead[0])
for tr in trs:
cols = [_remove_whitespace(x.text_content()) for x in
self._parse_td(tr)]
if any(col != '' for col in cols):
res.append(cols)
return res
def _parse_raw_tfoot(self, table):
expr = './/tfoot//th|.//tfoot//td'  # keep both branches relative to the table
return [_remove_whitespace(x.text_content()) for x in
table.xpath(expr)]
def _expand_elements(body):
lens = Series(lmap(len, body))
lens_max = lens.max()
not_max = lens[lens != lens_max]
empty = ['']
for ind, length in iteritems(not_max):
body[ind] += empty * (lens_max - length)
def _data_to_frame(**kwargs):
head, body, foot = kwargs.pop('data')
header = kwargs.pop('header')
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
if head:
rows = lrange(len(head))
body = head + body
if header is None: # special case when a table has <th> elements
header = 0 if rows == [0] else rows
if foot:
body += [foot]
# fill out elements of body that are "ragged"
_expand_elements(body)
tp = TextParser(body, header=header, **kwargs)
df = tp.read()
return df
_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser,
'html5lib': _BeautifulSoupHtml5LibFrameParser,
'bs4': _BeautifulSoupHtml5LibFrameParser}
def _parser_dispatch(flavor):
"""Choose the parser based on the input flavor.
Parameters
----------
flavor : str
The type of parser to use. This must be a valid backend.
Returns
-------
cls : _HtmlFrameParser subclass
The parser class based on the requested input flavor.
Raises
------
ValueError
* If `flavor` is not a valid backend.
ImportError
* If you do not have the requested `flavor`
"""
valid_parsers = list(_valid_parsers.keys())
if flavor not in valid_parsers:
raise ValueError('{invalid!r} is not a valid flavor, valid flavors '
'are {valid}'
.format(invalid=flavor, valid=valid_parsers))
if flavor in ('bs4', 'html5lib'):
if not _HAS_HTML5LIB:
raise ImportError("html5lib not found, please install it")
if not _HAS_BS4:
raise ImportError(
"BeautifulSoup4 (bs4) not found, please install it")
import bs4
if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'):
raise ValueError("A minimum version of BeautifulSoup 4.2.1 "
"is required")
else:
if not _HAS_LXML:
raise ImportError("lxml not found, please install it")
return _valid_parsers[flavor]
def _print_as_set(s):
return '{{{arg}}}'.format(arg=', '.join(pprint_thing(el) for el in s))
def _validate_flavor(flavor):
if flavor is None:
flavor = 'lxml', 'bs4'
elif isinstance(flavor, string_types):
flavor = flavor,
elif isinstance(flavor, collections.Iterable):
if not all(isinstance(flav, string_types) for flav in flavor):
raise TypeError('Object of type {typ!r} is not an iterable of '
'strings'
.format(typ=type(flavor).__name__))
else:
fmt = '{flavor!r}' if isinstance(flavor, string_types) else '{flavor}'
fmt += ' is not a valid flavor'
raise ValueError(fmt.format(flavor=flavor))
flavor = tuple(flavor)
valid_flavors = set(_valid_parsers)
flavor_set = set(flavor)
if not flavor_set & valid_flavors:
raise ValueError('{invalid} is not a valid set of flavors, valid '
'flavors are {valid}'
.format(invalid=_print_as_set(flavor_set),
valid=_print_as_set(valid_flavors)))
return flavor
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here
# hack around python 3 deleting the exception variable
retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
p = parser(io, compiled_match, attrs, encoding, displayed_only)
try:
tables = p.parse_tables()
except Exception as caught:
# if `io` is an io-like object, check if it's seekable
# and try to rewind it before trying the next parser
if hasattr(io, 'seekable') and io.seekable():
io.seek(0)
elif hasattr(io, 'seekable') and not io.seekable():
# if we couldn't rewind it, let the user know
raise ValueError('The flavor {} failed to parse your input. '
'Since you passed a non-rewindable file '
'object, we can\'t rewind it to try '
'another parser. Try read_html() with a '
'different flavor.'.format(flav))
retained = caught
else:
break
else:
raise_with_traceback(retained)
ret = []
for table in tables:
try:
ret.append(_data_to_frame(data=table, **kwargs))
except EmptyDataError: # empty table
continue
return ret
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=None, thousands=',', encoding=None,
decimal='.', converters=None, na_values=None,
keep_default_na=True, displayed_only=True):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
Parameters
----------
io : str or file-like
A URL, a file-like object, or a raw string containing HTML. Note that
lxml only accepts the http, ftp and file url protocols. If you have a
URL that starts with ``'https'`` you might try removing the ``'s'``.
match : str or compiled regular expression, optional
The set of tables containing text matching this regex or string will be
returned. Unless the HTML is extremely simple you will probably need to
pass a non-empty string here. Defaults to '.+' (match any non-empty
string). The default value will return all tables contained on a page.
This value is converted to a regular expression so that there is
consistent behavior between Beautiful Soup and lxml.
flavor : str or None, container of strings
The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
each other, they are both there for backwards compatibility. The
default of ``None`` tries to use ``lxml`` to parse and if that fails it
falls back on ``bs4`` + ``html5lib``.
header : int or list-like or None, optional
The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
make the columns headers.
index_col : int or list-like or None, optional
The column (or list of columns) to use to create the index.
skiprows : int or list-like or slice or None, optional
0-based. Number of rows to skip after parsing the column header. If a
sequence of integers or a slice is given, will skip the rows indexed by
that sequence. Note that a single element sequence means 'skip the nth
row' whereas an integer means 'skip n rows'.
attrs : dict or None, optional
This is a dictionary of attributes that you can pass to use to identify
the table in the HTML. These are not checked for validity before being
passed to lxml or Beautiful Soup. However, these attributes must be
valid HTML table attributes to work correctly. For example, ::
attrs = {'id': 'table'}
is a valid attribute dictionary because the 'id' HTML tag attribute is
a valid HTML attribute for *any* HTML tag as per `this document
<http://www.w3.org/TR/html-markup/global-attributes.html>`__. ::
attrs = {'asdf': 'table'}
is *not* a valid attribute dictionary because 'asdf' is not a valid
HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
table attributes can be found `here
<http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
working draft of the HTML 5 spec can be found `here
<http://www.w3.org/TR/html-markup/table.html>`__. It contains the
latest information on table attributes for the modern web.
parse_dates : bool, optional
See :func:`~pandas.read_csv` for more details.
tupleize_cols : bool, optional
If ``False`` try to parse multiple header rows into a
:class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to
``False``.
.. deprecated:: 0.21.0
This argument will be removed and will always convert to MultiIndex
thousands : str, optional
Separator to use to parse thousands. Defaults to ``','``.
encoding : str or None, optional
The encoding used to decode the web page. Defaults to ``None``. ``None``
preserves the previous encoding behavior, which depends on the
underlying parser library (e.g., the parser library will try to use
the encoding provided by the document).
decimal : str, default '.'
Character to recognize as decimal point (e.g. use ',' for European
data).
.. versionadded:: 0.19.0
converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
input argument, the cell (not column) content, and return the
transformed content.
.. versionadded:: 0.19.0
na_values : iterable, default None
Custom NA values
.. versionadded:: 0.19.0
keep_default_na : bool, default True
If na_values are specified and keep_default_na is False the default NaN
values are overridden, otherwise they're appended to
.. versionadded:: 0.19.0
displayed_only : bool, default True
Whether elements with "display: none" should be parsed
.. versionadded:: 0.23.0
Returns
-------
dfs : list of DataFrames
Notes
-----
Before using this function you should read the :ref:`gotchas about the
HTML parsing libraries <io.html.gotchas>`.
Expect to do some cleanup after you call this function. For example, you
might need to manually assign column names if the column names are
converted to NaN when you pass the `header=0` argument. We try to assume as
little as possible about the structure of the table and push the
idiosyncrasies of the HTML contained in the table to the user.
This function searches for ``<table>`` elements and only for ``<tr>``
and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
element in the table. ``<td>`` stands for "table data".
Similar to :func:`~pandas.read_csv` the `header` argument is applied
**after** `skiprows` is applied.
This function will *always* return a list of :class:`DataFrame` *or*
it will fail, e.g., it will *not* return an empty list.
Examples
--------
See the :ref:`read_html documentation in the IO section of the docs
<io.read_html>` for some examples of reading in HTML tables.
See Also
--------
pandas.read_csv
"""
_importers()
# Type check here. We don't want to parse only to fail because of an
# invalid value of an integer skiprows.
if isinstance(skiprows, numbers.Integral) and skiprows < 0:
raise ValueError('cannot skip rows starting from the end of the '
'data (you passed a negative value)')
_validate_header_arg(header)
return _parse(flavor=flavor, io=io, match=match, header=header,
index_col=index_col, skiprows=skiprows,
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, attrs=attrs, encoding=encoding,
decimal=decimal, converters=converters, na_values=na_values,
keep_default_na=keep_default_na,
displayed_only=displayed_only)
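# Usage sketch (illustrative only; the HTML snippet is hypothetical):
#     >>> html = ('<table><tr><th>a</th><th>b</th></tr>'
#     ...         '<tr><td>1</td><td>2</td></tr></table>')
#     >>> read_html(html)[0]  # doctest: +SKIP
#        a  b
#     0  1  2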
@@ -0,0 +1,5 @@
from .json import to_json, read_json, loads, dumps # noqa
from .normalize import json_normalize # noqa
from .table_schema import build_table_schema # noqa
del json, normalize, table_schema # noqa
@@ -0,0 +1,929 @@
# pylint: disable-msg=E1101,W0613,W0603
from itertools import islice
import os
import numpy as np
import pandas._libs.json as json
from pandas._libs.tslib import iNaT
from pandas.compat import StringIO, long, u, to_str
from pandas import compat, isna
from pandas import Series, DataFrame, to_datetime, MultiIndex
from pandas.io.common import (get_filepath_or_buffer, _get_handle,
_infer_compression, _stringify_path,
BaseIterator)
from pandas.io.parsers import _validate_integer
import pandas.core.common as com
from pandas.core.reshape.concat import concat
from pandas.io.formats.printing import pprint_thing
from .normalize import _convert_to_line_delimits
from .table_schema import build_table_schema, parse_table_schema
from pandas.core.dtypes.common import is_period_dtype
loads = json.loads
dumps = json.dumps
TABLE_SCHEMA_VERSION = '0.20.0'
# interface to/from
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
double_precision=10, force_ascii=True, date_unit='ms',
default_handler=None, lines=False, compression=None,
index=True):
if not index and orient not in ['split', 'table']:
raise ValueError("'index=False' is only valid when 'orient' is "
"'split' or 'table'")
path_or_buf = _stringify_path(path_or_buf)
if lines and orient != 'records':
raise ValueError(
"'lines' keyword only valid when 'orient' is records")
if orient == 'table' and isinstance(obj, Series):
obj = obj.to_frame(name=obj.name or 'values')
if orient == 'table' and isinstance(obj, DataFrame):
writer = JSONTableWriter
elif isinstance(obj, Series):
writer = SeriesWriter
elif isinstance(obj, DataFrame):
writer = FrameWriter
else:
raise NotImplementedError("'obj' should be a Series or a DataFrame")
s = writer(
obj, orient=orient, date_format=date_format,
double_precision=double_precision, ensure_ascii=force_ascii,
date_unit=date_unit, default_handler=default_handler,
index=index).write()
if lines:
s = _convert_to_line_delimits(s)
if isinstance(path_or_buf, compat.string_types):
fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
try:
fh.write(s)
finally:
fh.close()
elif path_or_buf is None:
return s
else:
path_or_buf.write(s)
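# Illustrative sketch (not part of the original module): this helper backs
# ``DataFrame.to_json``/``Series.to_json``; with ``path_or_buf=None`` it
# returns the serialized string, e.g.
#     >>> to_json(None, DataFrame({'a': [1, 2]}))  # doctest: +SKIP
#     '{"a":{"0":1,"1":2}}'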
class Writer(object):
def __init__(self, obj, orient, date_format, double_precision,
ensure_ascii, date_unit, index, default_handler=None):
self.obj = obj
if orient is None:
orient = self._default_orient
self.orient = orient
self.date_format = date_format
self.double_precision = double_precision
self.ensure_ascii = ensure_ascii
self.date_unit = date_unit
self.default_handler = default_handler
self.index = index
self.is_copy = None
self._format_axes()
def _format_axes(self):
raise com.AbstractMethodError(self)
def write(self):
return self._write(self.obj, self.orient, self.double_precision,
self.ensure_ascii, self.date_unit,
self.date_format == 'iso', self.default_handler)
def _write(self, obj, orient, double_precision, ensure_ascii,
date_unit, iso_dates, default_handler):
return dumps(
obj,
orient=orient,
double_precision=double_precision,
ensure_ascii=ensure_ascii,
date_unit=date_unit,
iso_dates=iso_dates,
default_handler=default_handler
)
class SeriesWriter(Writer):
_default_orient = 'index'
def _format_axes(self):
if not self.obj.index.is_unique and self.orient == 'index':
raise ValueError("Series index must be unique for orient="
"'{orient}'".format(orient=self.orient))
def _write(self, obj, orient, double_precision, ensure_ascii,
date_unit, iso_dates, default_handler):
if not self.index and orient == 'split':
obj = {"name": obj.name, "data": obj.values}
return super(SeriesWriter, self)._write(obj, orient,
double_precision,
ensure_ascii, date_unit,
iso_dates, default_handler)
class FrameWriter(Writer):
_default_orient = 'columns'
def _format_axes(self):
""" try to axes if they are datelike """
if not self.obj.index.is_unique and self.orient in (
'index', 'columns'):
raise ValueError("DataFrame index must be unique for orient="
"'{orient}'.".format(orient=self.orient))
if not self.obj.columns.is_unique and self.orient in (
'index', 'columns', 'records'):
raise ValueError("DataFrame columns must be unique for orient="
"'{orient}'.".format(orient=self.orient))
def _write(self, obj, orient, double_precision, ensure_ascii,
date_unit, iso_dates, default_handler):
if not self.index and orient == 'split':
obj = obj.to_dict(orient='split')
del obj["index"]
return super(FrameWriter, self)._write(obj, orient,
double_precision,
ensure_ascii, date_unit,
iso_dates, default_handler)
class JSONTableWriter(FrameWriter):
_default_orient = 'records'
def __init__(self, obj, orient, date_format, double_precision,
ensure_ascii, date_unit, index, default_handler=None):
"""
Adds a `schema` attribute with the Table Schema, resets
the index (can't do in caller, because the schema inference needs
to know what the index is, forces orient to records, and forces
date_format to 'iso'.
"""
super(JSONTableWriter, self).__init__(
obj, orient, date_format, double_precision, ensure_ascii,
date_unit, index, default_handler=default_handler)
if date_format != 'iso':
msg = ("Trying to write with `orient='table'` and "
"`date_format='{fmt}'`. Table Schema requires dates "
"to be formatted with `date_format='iso'`"
.format(fmt=date_format))
raise ValueError(msg)
self.schema = build_table_schema(obj, index=self.index)
# NotImplemented on a column MultiIndex
if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
raise NotImplementedError(
"orient='table' is not supported for MultiIndex")
# TODO: Do this timedelta properly in objToJSON.c See GH #15137
if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or
len(obj.columns & obj.index.names)):
msg = "Overlapping names between the index and columns"
raise ValueError(msg)
obj = obj.copy()
timedeltas = obj.select_dtypes(include=['timedelta']).columns
if len(timedeltas):
obj[timedeltas] = obj[timedeltas].applymap(
lambda x: x.isoformat())
# Convert PeriodIndex to datetimes before serializing
if is_period_dtype(obj.index):
obj.index = obj.index.to_timestamp()
# exclude index from obj if index=False
if not self.index:
self.obj = obj.reset_index(drop=True)
else:
self.obj = obj.reset_index(drop=False)
self.date_format = 'iso'
self.orient = 'records'
self.index = index
def _write(self, obj, orient, double_precision, ensure_ascii,
date_unit, iso_dates, default_handler):
data = super(JSONTableWriter, self)._write(obj, orient,
double_precision,
ensure_ascii, date_unit,
iso_dates,
default_handler)
serialized = '{{"schema": {schema}, "data": {data}}}'.format(
schema=dumps(self.schema), data=data)
return serialized
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
convert_axes=True, convert_dates=True, keep_default_dates=True,
numpy=False, precise_float=False, date_unit=None, encoding=None,
lines=False, chunksize=None, compression='infer'):
"""
Convert a JSON string to pandas object
Parameters
----------
path_or_buf : a valid JSON string or file-like, default: None
The string could be a URL. Valid URL schemes include http, ftp, s3, and
file. For file URLs, a host is expected. For instance, a local file
could be ``file://localhost/path/to/table.json``
orient : string,
Indication of expected JSON string format.
Compatible JSON strings can be produced by ``to_json()`` with a
corresponding orient value.
The set of possible orients is:
- ``'split'`` : dict like
``{index -> [index], columns -> [columns], data -> [values]}``
- ``'records'`` : list like
``[{column -> value}, ... , {column -> value}]``
- ``'index'`` : dict like ``{index -> {column -> value}}``
- ``'columns'`` : dict like ``{column -> {index -> value}}``
- ``'values'`` : just the values array
The allowed and default values depend on the value
of the `typ` parameter.
* when ``typ == 'series'``,
- allowed orients are ``{'split','records','index'}``
- default is ``'index'``
- The Series index must be unique for orient ``'index'``.
* when ``typ == 'frame'``,
- allowed orients are ``{'split','records','index',
'columns','values', 'table'}``
- default is ``'columns'``
- The DataFrame index must be unique for orients ``'index'`` and
``'columns'``.
- The DataFrame columns must be unique for orients ``'index'``,
``'columns'``, and ``'records'``.
.. versionadded:: 0.23.0
'table' as an allowed value for the ``orient`` argument
typ : type of object to recover (series or frame), default 'frame'
dtype : boolean or dict, default True
If True, infer dtypes; if a dict of column to dtype, then use those;
if False, then don't infer dtypes at all. Applies only to the data.
convert_axes : boolean, default True
Try to convert the axes to the proper dtypes.
convert_dates : boolean or list of str, default True
If True, try to parse the default datelike columns; if a list of
column names, parse those columns for dates. A column label is datelike if
* it ends with ``'_at'``,
* it ends with ``'_time'``,
* it begins with ``'timestamp'``,
* it is ``'modified'``, or
* it is ``'date'``
keep_default_dates : boolean, default True
If parsing dates, then parse the default datelike columns
numpy : boolean, default False
Direct decoding to numpy arrays. Supports numeric data only, but
non-numeric column and index labels are supported. Note also that the
JSON ordering MUST be the same for each term if numpy=True.
precise_float : boolean, default False
Set to enable usage of higher precision (strtod) function when
decoding string to double values. Default (False) is to use fast but
less precise builtin functionality
date_unit : string, default None
The timestamp unit to detect if converting dates. The default behaviour
is to try and detect the correct precision, but if this is not desired
then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
milliseconds, microseconds or nanoseconds respectively.
lines : boolean, default False
Read the file as a json object per line.
.. versionadded:: 0.19.0
encoding : str, default is 'utf-8'
The encoding to use to decode py3 bytes.
.. versionadded:: 0.19.0
chunksize: integer, default None
Return JsonReader object for iteration.
See the `line-delimited json docs
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
for more information on ``chunksize``.
This can only be passed if `lines=True`.
If this is None, the file will be read into memory all at once.
.. versionadded:: 0.21.0
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, zip or xz if path_or_buf is a string ending in
'.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
otherwise. If using 'zip', the ZIP file must contain only one data
file to be read in. Set to None for no decompression.
.. versionadded:: 0.21.0
Returns
-------
result : Series or DataFrame, depending on the value of `typ`.
Notes
-----
Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
:class:`Index` name of `index` gets written with :func:`to_json`, the
subsequent read operation will incorrectly set the :class:`Index` name to
``None``. This is because `index` is also used by :func:`DataFrame.to_json`
to denote a missing :class:`Index` name, and the subsequent
:func:`read_json` operation cannot distinguish between the two. The same
limitation is encountered with a :class:`MultiIndex` and any names
beginning with ``'level_'``.
See Also
--------
DataFrame.to_json
Examples
--------
>>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
... index=['row 1', 'row 2'],
... columns=['col 1', 'col 2'])
Encoding/decoding a DataFrame using ``'split'`` formatted JSON:
>>> df.to_json(orient='split')
'{"columns":["col 1","col 2"],
"index":["row 1","row 2"],
"data":[["a","b"],["c","d"]]}'
>>> pd.read_json(_, orient='split')
col 1 col 2
row 1 a b
row 2 c d
Encoding/decoding a DataFrame using ``'index'`` formatted JSON:
>>> df.to_json(orient='index')
'{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
>>> pd.read_json(_, orient='index')
col 1 col 2
row 1 a b
row 2 c d
Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
Note that index labels are not preserved with this encoding.
>>> df.to_json(orient='records')
'[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
>>> pd.read_json(_, orient='records')
col 1 col 2
0 a b
1 c d
Encoding with Table Schema
>>> df.to_json(orient='table')
'{"schema": {"fields": [{"name": "index", "type": "string"},
{"name": "col 1", "type": "string"},
{"name": "col 2", "type": "string"}],
"primaryKey": "index",
"pandas_version": "0.20.0"},
"data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
{"index": "row 2", "col 1": "c", "col 2": "d"}]}'
"""
compression = _infer_compression(path_or_buf, compression)
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
path_or_buf, encoding=encoding, compression=compression,
)
json_reader = JsonReader(
filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
convert_axes=convert_axes, convert_dates=convert_dates,
keep_default_dates=keep_default_dates, numpy=numpy,
precise_float=precise_float, date_unit=date_unit, encoding=encoding,
lines=lines, chunksize=chunksize, compression=compression,
)
if chunksize:
return json_reader
result = json_reader.read()
if should_close:
try:
filepath_or_buffer.close()
except: # noqa: flake8
pass
return result
class JsonReader(BaseIterator):
"""
JsonReader provides an interface for reading in a JSON file.
If initialized with ``lines=True`` and ``chunksize``, can be iterated over
``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
whole document.
"""
def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
convert_dates, keep_default_dates, numpy, precise_float,
date_unit, encoding, lines, chunksize, compression):
self.path_or_buf = filepath_or_buffer
self.orient = orient
self.typ = typ
self.dtype = dtype
self.convert_axes = convert_axes
self.convert_dates = convert_dates
self.keep_default_dates = keep_default_dates
self.numpy = numpy
self.precise_float = precise_float
self.date_unit = date_unit
self.encoding = encoding
self.compression = compression
self.lines = lines
self.chunksize = chunksize
self.nrows_seen = 0
self.should_close = False
if self.chunksize is not None:
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
if not self.lines:
raise ValueError("chunksize can only be passed if lines=True")
data = self._get_data_from_filepath(filepath_or_buffer)
self.data = self._preprocess_data(data)
def _preprocess_data(self, data):
"""
At this point, the data either has a `read` attribute (e.g. a file
object or a StringIO) or is a string that is a JSON document.
If self.chunksize, we prepare the data for the `__next__` method.
Otherwise, we read it into memory for the `read` method.
"""
if hasattr(data, 'read') and not self.chunksize:
data = data.read()
if not hasattr(data, 'read') and self.chunksize:
data = StringIO(data)
return data
def _get_data_from_filepath(self, filepath_or_buffer):
"""
read_json accepts three input types:
1. filepath (string-like)
2. file-like object (e.g. open file object, StringIO)
3. JSON string
This method turns (1) into (2) to simplify the rest of the processing.
It returns input types (2) and (3) unchanged.
"""
data = filepath_or_buffer
exists = False
if isinstance(data, compat.string_types):
try:
exists = os.path.exists(filepath_or_buffer)
# gh-5874: if the filepath is too long will raise here
except (TypeError, ValueError):
pass
if exists or self.compression is not None:
data, _ = _get_handle(filepath_or_buffer, 'r',
encoding=self.encoding,
compression=self.compression)
self.should_close = True
self.open_stream = data
return data
def _combine_lines(self, lines):
"""Combines a list of JSON objects into one JSON object"""
lines = filter(None, map(lambda x: x.strip(), lines))
return '[' + ','.join(lines) + ']'
def read(self):
"""Read the whole JSON input into a pandas object"""
if self.lines and self.chunksize:
obj = concat(self)
elif self.lines:
data = to_str(self.data)
obj = self._get_object_parser(
self._combine_lines(data.split('\n'))
)
else:
obj = self._get_object_parser(self.data)
self.close()
return obj
def _get_object_parser(self, json):
"""parses a json document into a pandas object"""
typ = self.typ
dtype = self.dtype
kwargs = {
"orient": self.orient, "dtype": self.dtype,
"convert_axes": self.convert_axes,
"convert_dates": self.convert_dates,
"keep_default_dates": self.keep_default_dates, "numpy": self.numpy,
"precise_float": self.precise_float, "date_unit": self.date_unit
}
obj = None
if typ == 'frame':
obj = FrameParser(json, **kwargs).parse()
if typ == 'series' or obj is None:
if not isinstance(dtype, bool):
dtype = dict(data=dtype)
obj = SeriesParser(json, **kwargs).parse()
return obj
def close(self):
"""
If we opened a stream earlier, in _get_data_from_filepath, we should
close it. If an open stream or file was passed, we leave it open.
"""
if self.should_close:
try:
self.open_stream.close()
except (IOError, AttributeError):
pass
def __next__(self):
lines = list(islice(self.data, self.chunksize))
if lines:
lines_json = self._combine_lines(lines)
obj = self._get_object_parser(lines_json)
# Make sure that the returned objects have the right index.
obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
self.nrows_seen += len(obj)
return obj
self.close()
raise StopIteration
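# Usage sketch (illustrative; the line-delimited input is hypothetical): with
# ``lines=True`` and a ``chunksize``, ``read_json`` returns a JsonReader and
# each iteration yields ``chunksize`` rows, e.g.
#     >>> data = StringIO('{"a": 1}\n{"a": 2}\n{"a": 3}\n')
#     >>> [len(chunk) for chunk in read_json(data, lines=True, chunksize=2)]  # doctest: +SKIP
#     [2, 1]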
class Parser(object):
_STAMP_UNITS = ('s', 'ms', 'us', 'ns')
_MIN_STAMPS = {
's': long(31536000),
'ms': long(31536000000),
'us': long(31536000000000),
'ns': long(31536000000000000)}
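# note (added): _MIN_STAMPS holds roughly one year's worth of each unit past
# the epoch; in _try_convert_to_date, if any numeric value is at or below this
# threshold (and is not NaN/iNaT) the column is left unconverted rather than
# treated as epoch timestamps.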
def __init__(self, json, orient, dtype=True, convert_axes=True,
convert_dates=True, keep_default_dates=False, numpy=False,
precise_float=False, date_unit=None):
self.json = json
if orient is None:
orient = self._default_orient
self.orient = orient
self.dtype = dtype
if orient == "split":
numpy = False
if date_unit is not None:
date_unit = date_unit.lower()
if date_unit not in self._STAMP_UNITS:
raise ValueError('date_unit must be one of {units}'
.format(units=self._STAMP_UNITS))
self.min_stamp = self._MIN_STAMPS[date_unit]
else:
self.min_stamp = self._MIN_STAMPS['s']
self.numpy = numpy
self.precise_float = precise_float
self.convert_axes = convert_axes
self.convert_dates = convert_dates
self.date_unit = date_unit
self.keep_default_dates = keep_default_dates
self.obj = None
def check_keys_split(self, decoded):
"checks that dict has only the appropriate keys for orient='split'"
bad_keys = set(decoded.keys()).difference(set(self._split_keys))
if bad_keys:
bad_keys = ", ".join(bad_keys)
raise ValueError(u("JSON data had unexpected key(s): {bad_keys}")
.format(bad_keys=pprint_thing(bad_keys)))
def parse(self):
# try numpy
numpy = self.numpy
if numpy:
self._parse_numpy()
else:
self._parse_no_numpy()
if self.obj is None:
return None
if self.convert_axes:
self._convert_axes()
self._try_convert_types()
return self.obj
def _convert_axes(self):
""" try to convert axes """
for axis in self.obj._AXIS_NUMBERS.keys():
new_axis, result = self._try_convert_data(
axis, self.obj._get_axis(axis), use_dtypes=False,
convert_dates=True)
if result:
setattr(self.obj, axis, new_axis)
def _try_convert_types(self):
raise com.AbstractMethodError(self)
def _try_convert_data(self, name, data, use_dtypes=True,
convert_dates=True):
""" try to parse a ndarray like into a column by inferring dtype """
# don't try to coerce, unless a force conversion
if use_dtypes:
if self.dtype is False:
return data, False
elif self.dtype is True:
pass
else:
# dtype to force
dtype = (self.dtype.get(name)
if isinstance(self.dtype, dict) else self.dtype)
if dtype is not None:
try:
dtype = np.dtype(dtype)
return data.astype(dtype), True
except (TypeError, ValueError):
return data, False
if convert_dates:
new_data, result = self._try_convert_to_date(data)
if result:
return new_data, True
result = False
if data.dtype == 'object':
# try float
try:
data = data.astype('float64')
result = True
except (TypeError, ValueError):
pass
if data.dtype.kind == 'f':
if data.dtype != 'float64':
# coerce floats to 64
try:
data = data.astype('float64')
result = True
except (TypeError, ValueError):
pass
# don't coerce 0-len data
if len(data) and (data.dtype == 'float' or data.dtype == 'object'):
# coerce ints if we can
try:
new_data = data.astype('int64')
if (new_data == data).all():
data = new_data
result = True
except (TypeError, ValueError):
pass
# coerce ints to 64
if data.dtype == 'int':
# coerce to int64
try:
data = data.astype('int64')
result = True
except (TypeError, ValueError):
pass
return data, result
def _try_convert_to_date(self, data):
""" try to parse a ndarray like into a date column
try to coerce object in epoch/iso formats and
integer/float in epcoh formats, return a boolean if parsing
was successful """
# no conversion on empty
if not len(data):
return data, False
new_data = data
if new_data.dtype == 'object':
try:
new_data = data.astype('int64')
except (TypeError, ValueError, OverflowError):
pass
# ignore numbers that are out of range
if issubclass(new_data.dtype.type, np.number):
in_range = (isna(new_data.values) | (new_data > self.min_stamp) |
(new_data.values == iNaT))
if not in_range.all():
return data, False
date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
for date_unit in date_units:
try:
new_data = to_datetime(new_data, errors='raise',
unit=date_unit)
except ValueError:
continue
except Exception:
break
return new_data, True
return data, False
def _try_convert_dates(self):
raise com.AbstractMethodError(self)
class SeriesParser(Parser):
_default_orient = 'index'
_split_keys = ('name', 'index', 'data')
def _parse_no_numpy(self):
json = self.json
orient = self.orient
if orient == "split":
decoded = {str(k): v for k, v in compat.iteritems(
loads(json, precise_float=self.precise_float))}
self.check_keys_split(decoded)
self.obj = Series(dtype=None, **decoded)
else:
self.obj = Series(
loads(json, precise_float=self.precise_float), dtype=None)
def _parse_numpy(self):
json = self.json
orient = self.orient
if orient == "split":
decoded = loads(json, dtype=None, numpy=True,
precise_float=self.precise_float)
decoded = {str(k): v for k, v in compat.iteritems(decoded)}
self.check_keys_split(decoded)
self.obj = Series(**decoded)
elif orient == "columns" or orient == "index":
self.obj = Series(*loads(json, dtype=None, numpy=True,
labelled=True,
precise_float=self.precise_float))
else:
self.obj = Series(loads(json, dtype=None, numpy=True,
precise_float=self.precise_float))
def _try_convert_types(self):
if self.obj is None:
return
obj, result = self._try_convert_data(
'data', self.obj, convert_dates=self.convert_dates)
if result:
self.obj = obj
class FrameParser(Parser):
_default_orient = 'columns'
_split_keys = ('columns', 'index', 'data')
def _parse_numpy(self):
json = self.json
orient = self.orient
if orient == "columns":
args = loads(json, dtype=None, numpy=True, labelled=True,
precise_float=self.precise_float)
if len(args):
args = (args[0].T, args[2], args[1])
self.obj = DataFrame(*args)
elif orient == "split":
decoded = loads(json, dtype=None, numpy=True,
precise_float=self.precise_float)
decoded = {str(k): v for k, v in compat.iteritems(decoded)}
self.check_keys_split(decoded)
self.obj = DataFrame(**decoded)
elif orient == "values":
self.obj = DataFrame(loads(json, dtype=None, numpy=True,
precise_float=self.precise_float))
else:
self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
labelled=True,
precise_float=self.precise_float))
def _parse_no_numpy(self):
json = self.json
orient = self.orient
if orient == "columns":
self.obj = DataFrame(
loads(json, precise_float=self.precise_float), dtype=None)
elif orient == "split":
decoded = {str(k): v for k, v in compat.iteritems(
loads(json, precise_float=self.precise_float))}
self.check_keys_split(decoded)
self.obj = DataFrame(dtype=None, **decoded)
elif orient == "index":
self.obj = DataFrame(
loads(json, precise_float=self.precise_float), dtype=None).T
elif orient == 'table':
self.obj = parse_table_schema(json,
precise_float=self.precise_float)
else:
self.obj = DataFrame(
loads(json, precise_float=self.precise_float), dtype=None)
def _process_converter(self, f, filt=None):
""" take a conversion function and possibly recreate the frame """
if filt is None:
filt = lambda col, c: True
needs_new_obj = False
new_obj = dict()
for i, (col, c) in enumerate(self.obj.iteritems()):
if filt(col, c):
new_data, result = f(col, c)
if result:
c = new_data
needs_new_obj = True
new_obj[i] = c
if needs_new_obj:
# possibly handle dup columns
new_obj = DataFrame(new_obj, index=self.obj.index)
new_obj.columns = self.obj.columns
self.obj = new_obj
def _try_convert_types(self):
if self.obj is None:
return
if self.convert_dates:
self._try_convert_dates()
self._process_converter(
lambda col, c: self._try_convert_data(col, c, convert_dates=False))
def _try_convert_dates(self):
if self.obj is None:
return
# our columns to parse
convert_dates = self.convert_dates
if convert_dates is True:
convert_dates = []
convert_dates = set(convert_dates)
def is_ok(col):
""" return if this col is ok to try for a date parse """
if not isinstance(col, compat.string_types):
return False
col_lower = col.lower()
if (col_lower.endswith('_at') or
col_lower.endswith('_time') or
col_lower == 'modified' or
col_lower == 'date' or
col_lower == 'datetime' or
col_lower.startswith('timestamp')):
return True
return False
self._process_converter(
lambda col, c: self._try_convert_to_date(c),
lambda col, c: ((self.keep_default_dates and is_ok(col)) or
col in convert_dates))
@@ -0,0 +1,281 @@
# ---------------------------------------------------------------------
# JSON normalization routines
import copy
from collections import defaultdict
import numpy as np
from pandas._libs.writers import convert_json_to_lines
from pandas import compat, DataFrame
def _convert_to_line_delimits(s):
"""Helper function that converts json lists to line delimited json."""
# Determine whether we have a JSON list to turn into lines; otherwise just
# return the json object as-is (only lists can be converted)
if not (s[0] == '[' and s[-1] == ']'):
return s
s = s[1:-1]
return convert_json_to_lines(s)
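# Illustrative note (not part of the original module): a top-level JSON array
# such as '[{"a":1},{"a":2}]' has its enclosing brackets stripped and is handed
# to ``convert_json_to_lines``, which emits one JSON object per line; input
# that is not a bracketed list is returned unchanged.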
def nested_to_record(ds, prefix="", sep=".", level=0):
"""a simplified json_normalize
converts a nested dict into a flat dict ("record"), unlike json_normalize,
it does not attempt to extract a subset of the data.
Parameters
----------
ds : dict or list of dicts
prefix: the prefix, optional, default: ""
sep : string, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
.. versionadded:: 0.20.0
level : int, optional, default 0
The number of levels in the json string.
Returns
-------
d - dict or list of dicts, matching `ds`
Examples
--------
IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
nested=dict(e=dict(c=1,d=2),d=2)))
Out[52]:
{'dict1.c': 1,
'dict1.d': 2,
'flat1': 1,
'nested.d': 2,
'nested.e.c': 1,
'nested.e.d': 2}
"""
singleton = False
if isinstance(ds, dict):
ds = [ds]
singleton = True
new_ds = []
for d in ds:
new_d = copy.deepcopy(d)
for k, v in d.items():
# each key gets renamed with prefix
if not isinstance(k, compat.string_types):
k = str(k)
if level == 0:
newkey = k
else:
newkey = prefix + sep + k
# only dicts get recurse-flattened
# only at level>1 do we rename the rest of the keys
if not isinstance(v, dict):
if level != 0: # so we skip copying for top level, common case
v = new_d.pop(k)
new_d[newkey] = v
continue
else:
v = new_d.pop(k)
new_d.update(nested_to_record(v, newkey, sep, level + 1))
new_ds.append(new_d)
if singleton:
return new_ds[0]
return new_ds
def json_normalize(data, record_path=None, meta=None,
meta_prefix=None,
record_prefix=None,
errors='raise',
sep='.'):
"""
"Normalize" semi-structured JSON data into a flat table
Parameters
----------
data : dict or list of dicts
Unserialized JSON objects
record_path : string or list of strings, default None
Path in each object to list of records. If not passed, data will be
assumed to be an array of records
meta : list of paths (string or list of strings), default None
Fields to use as metadata for each record in resulting table
record_prefix : string, default None
If not None, prefix the record field names with this string, e.g.
'foo.bar.' yields columns like foo.bar.field when the path to records
is ['foo', 'bar']
meta_prefix : string, default None
If not None, prefix the meta field names with this string
errors : {'raise', 'ignore'}, default 'raise'
* 'ignore' : will ignore KeyError if keys listed in meta are not
always present
* 'raise' : will raise KeyError if keys listed in meta are not
always present
.. versionadded:: 0.20.0
sep : string, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
.. versionadded:: 0.20.0
Returns
-------
frame : DataFrame
Examples
--------
>>> from pandas.io.json import json_normalize
>>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
... {'name': {'given': 'Mose', 'family': 'Regner'}},
... {'id': 2, 'name': 'Faye Raker'}]
>>> json_normalize(data)
id name name.family name.first name.given name.last
0 1.0 NaN NaN Coleen NaN Volk
1 NaN NaN Regner NaN Mose NaN
2 2.0 Faye Raker NaN NaN NaN NaN
>>> data = [{'state': 'Florida',
... 'shortname': 'FL',
... 'info': {
... 'governor': 'Rick Scott'
... },
... 'counties': [{'name': 'Dade', 'population': 12345},
... {'name': 'Broward', 'population': 40000},
... {'name': 'Palm Beach', 'population': 60000}]},
... {'state': 'Ohio',
... 'shortname': 'OH',
... 'info': {
... 'governor': 'John Kasich'
... },
... 'counties': [{'name': 'Summit', 'population': 1234},
... {'name': 'Cuyahoga', 'population': 1337}]}]
>>> result = json_normalize(data, 'counties', ['state', 'shortname',
... ['info', 'governor']])
>>> result
name population info.governor state shortname
0 Dade 12345 Rick Scott Florida FL
1 Broward 40000 Rick Scott Florida FL
2 Palm Beach 60000 Rick Scott Florida FL
3 Summit 1234 John Kasich Ohio OH
4 Cuyahoga 1337 John Kasich Ohio OH
>>> data = {'A': [1, 2]}
>>> json_normalize(data, 'A', record_prefix='Prefix.')
Prefix.0
0 1
1 2
"""
def _pull_field(js, spec):
result = js
if isinstance(spec, list):
for field in spec:
result = result[field]
else:
result = result[spec]
return result
if isinstance(data, list) and not data:
return DataFrame()
# A bit of a hackjob
if isinstance(data, dict):
data = [data]
if record_path is None:
if any([[isinstance(x, dict)
for x in compat.itervalues(y)] for y in data]):
# naive normalization, this is idempotent for flat records
# and potentially will inflate the data considerably for
# deeply nested structures:
# {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
#
# TODO: handle record value which are lists, at least error
# reasonably
data = nested_to_record(data, sep=sep)
return DataFrame(data)
elif not isinstance(record_path, list):
record_path = [record_path]
if meta is None:
meta = []
elif not isinstance(meta, list):
meta = [meta]
meta = [m if isinstance(m, list) else [m] for m in meta]
# Disastrously inefficient for now
records = []
lengths = []
meta_vals = defaultdict(list)
if not isinstance(sep, compat.string_types):
sep = str(sep)
meta_keys = [sep.join(val) for val in meta]
def _recursive_extract(data, path, seen_meta, level=0):
if len(path) > 1:
for obj in data:
for val, key in zip(meta, meta_keys):
if level + 1 == len(val):
seen_meta[key] = _pull_field(obj, val[-1])
_recursive_extract(obj[path[0]], path[1:],
seen_meta, level=level + 1)
else:
for obj in data:
recs = _pull_field(obj, path[0])
# For repeating the metadata later
lengths.append(len(recs))
for val, key in zip(meta, meta_keys):
if level + 1 > len(val):
meta_val = seen_meta[key]
else:
try:
meta_val = _pull_field(obj, val[level:])
except KeyError as e:
if errors == 'ignore':
meta_val = np.nan
else:
raise \
KeyError("Try running with "
"errors='ignore' as key "
"{err} is not always present"
.format(err=e))
meta_vals[key].append(meta_val)
records.extend(recs)
_recursive_extract(data, record_path, {}, level=0)
result = DataFrame(records)
if record_prefix is not None:
result = result.rename(
columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))
# Data types, a problem
for k, v in compat.iteritems(meta_vals):
if meta_prefix is not None:
k = meta_prefix + k
if k in result:
raise ValueError('Conflicting metadata name {name}, '
'need distinguishing prefix '.format(name=k))
result[k] = np.array(v).repeat(lengths)
return result
@@ -0,0 +1,324 @@
"""
Table Schema builders
http://specs.frictionlessdata.io/json-table-schema/
"""
import warnings
import pandas._libs.json as json
from pandas import DataFrame
from pandas.api.types import CategoricalDtype
import pandas.core.common as com
from pandas.core.dtypes.common import (
is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype,
is_bool_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
is_categorical_dtype, is_period_dtype, is_string_dtype
)
loads = json.loads
def as_json_table_type(x):
"""
Convert a NumPy / pandas type to its corresponding json_table.
Parameters
----------
x : array or dtype
Returns
-------
t : str
the Table Schema data types
Notes
-----
This table shows the relationship between NumPy / pandas dtypes,
and Table Schema dtypes.
===============  =================
Pandas type      Table Schema type
===============  =================
int64            integer
float64          number
bool             boolean
datetime64[ns]   datetime
timedelta64[ns]  duration
object           string
categorical      any
===============  =================
"""
if is_integer_dtype(x):
return 'integer'
elif is_bool_dtype(x):
return 'boolean'
elif is_numeric_dtype(x):
return 'number'
elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
is_period_dtype(x)):
return 'datetime'
elif is_timedelta64_dtype(x):
return 'duration'
elif is_categorical_dtype(x):
return 'any'
elif is_string_dtype(x):
return 'string'
else:
return 'any'
def set_default_names(data):
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
if com._all_not_none(*data.index.names):
nms = data.index.names
if len(nms) == 1 and data.index.name == 'index':
warnings.warn("Index name of 'index' is not round-trippable")
elif len(nms) > 1 and any(x.startswith('level_') for x in nms):
warnings.warn("Index names beginning with 'level_' are not "
"round-trippable")
return data
data = data.copy()
if data.index.nlevels > 1:
names = [name if name is not None else 'level_{}'.format(i)
for i, name in enumerate(data.index.names)]
data.index.names = names
else:
data.index.name = data.index.name or 'index'
return data
def convert_pandas_type_to_json_field(arr, dtype=None):
dtype = dtype or arr.dtype
if arr.name is None:
name = 'values'
else:
name = arr.name
field = {'name': name,
'type': as_json_table_type(dtype)}
if is_categorical_dtype(arr):
if hasattr(arr, 'categories'):
cats = arr.categories
ordered = arr.ordered
else:
cats = arr.cat.categories
ordered = arr.cat.ordered
field['constraints'] = {"enum": list(cats)}
field['ordered'] = ordered
elif is_period_dtype(arr):
field['freq'] = arr.freqstr
elif is_datetime64tz_dtype(arr):
if hasattr(arr, 'dt'):
field['tz'] = arr.dt.tz.zone
else:
field['tz'] = arr.tz.zone
return field
def convert_json_field_to_pandas_type(field):
"""
Converts a JSON field descriptor into its corresponding NumPy / pandas type
Parameters
----------
field
A JSON field descriptor
Returns
-------
dtype
Raises
------
ValueError
If the type of the provided field is unknown or currently unsupported
Examples
--------
>>> convert_json_field_to_pandas_type({'name': 'an_int',
'type': 'integer'})
'int64'
>>> convert_json_field_to_pandas_type({'name': 'a_categorical',
'type': 'any',
'constraints': {'enum': [
'a', 'b', 'c']},
'ordered': True})
'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
>>> convert_json_field_to_pandas_type({'name': 'a_datetime',
'type': 'datetime'})
'datetime64[ns]'
>>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
'type': 'datetime',
'tz': 'US/Central'})
'datetime64[ns, US/Central]'
"""
typ = field['type']
if typ == 'string':
return 'object'
elif typ == 'integer':
return 'int64'
elif typ == 'number':
return 'float64'
elif typ == 'boolean':
return 'bool'
elif typ == 'duration':
return 'timedelta64'
elif typ == 'datetime':
if field.get('tz'):
return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
else:
return 'datetime64[ns]'
elif typ == 'any':
if 'constraints' in field and 'ordered' in field:
return CategoricalDtype(categories=field['constraints']['enum'],
ordered=field['ordered'])
else:
return 'object'
raise ValueError("Unsupported or invalid field type: {}".format(typ))
def build_table_schema(data, index=True, primary_key=None, version=True):
"""
Create a Table schema from ``data``.
Parameters
----------
data : Series, DataFrame
index : bool, default True
Whether to include ``data.index`` in the schema.
primary_key : bool or None, default None
Column names to designate as the primary key.
The default `None` will set `'primaryKey'` to the index
level or levels if the index is unique.
version : bool, default True
Whether to include a field `pandas_version` with the version
of pandas that generated the schema.
Returns
-------
schema : dict
Examples
--------
>>> df = pd.DataFrame(
... {'A': [1, 2, 3],
... 'B': ['a', 'b', 'c'],
... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
... }, index=pd.Index(range(3), name='idx'))
>>> build_table_schema(df)
{'fields': [{'name': 'idx', 'type': 'integer'},
{'name': 'A', 'type': 'integer'},
{'name': 'B', 'type': 'string'},
{'name': 'C', 'type': 'datetime'}],
'pandas_version': '0.20.0',
'primaryKey': ['idx']}
Notes
-----
See `as_json_table_type` for conversion types.
Timedeltas are converted to ISO 8601 duration format with
9 decimal places after the seconds field for nanosecond precision.
Categoricals are converted to the `any` dtype, and use the `enum` field
constraint to list the allowed values. The `ordered` attribute is included
in an `ordered` field.
"""
if index is True:
data = set_default_names(data)
schema = {}
fields = []
if index:
if data.index.nlevels > 1:
for level in data.index.levels:
fields.append(convert_pandas_type_to_json_field(level))
else:
fields.append(convert_pandas_type_to_json_field(data.index))
if data.ndim > 1:
for column, s in data.iteritems():
fields.append(convert_pandas_type_to_json_field(s))
else:
fields.append(convert_pandas_type_to_json_field(data))
schema['fields'] = fields
if index and data.index.is_unique and primary_key is None:
if data.index.nlevels == 1:
schema['primaryKey'] = [data.index.name]
else:
schema['primaryKey'] = data.index.names
elif primary_key is not None:
schema['primaryKey'] = primary_key
if version:
schema['pandas_version'] = '0.20.0'
return schema
def parse_table_schema(json, precise_float):
"""
Builds a DataFrame from a given schema
Parameters
----------
json : str
A JSON table schema
precise_float : boolean
Flag controlling precision when decoding string to double values, as
dictated by ``read_json``
Returns
-------
df : DataFrame
Raises
------
NotImplementedError
If the JSON table schema contains either timezone or timedelta data
Notes
-----
Because :func:`DataFrame.to_json` uses the string 'index' to denote a
name-less :class:`Index`, this function sets the name of the returned
:class:`DataFrame` to ``None`` when said string is encountered with a
normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
applies to any strings beginning with 'level_'. Therefore, an
:class:`Index` name of 'index' and :class:`MultiIndex` names starting
with 'level_' are not supported.
See also
--------
build_table_schema : inverse function
pandas.read_json
"""
table = loads(json, precise_float=precise_float)
col_order = [field['name'] for field in table['schema']['fields']]
df = DataFrame(table['data'], columns=col_order)[col_order]
dtypes = {field['name']: convert_json_field_to_pandas_type(field)
for field in table['schema']['fields']}
# Cannot directly use as_type with timezone data on object; raise for now
if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
raise NotImplementedError('table="orient" can not yet read timezone '
'data')
# No ISO constructor for Timedelta as of yet, so need to raise
if 'timedelta64' in dtypes.values():
raise NotImplementedError('table="orient" can not yet read '
'ISO-formatted Timedelta data')
df = df.astype(dtypes)
df = df.set_index(table['schema']['primaryKey'])
if len(df.index.names) == 1:
if df.index.name == 'index':
df.index.name = None
else:
df.index.names = [None if x.startswith('level_') else x for x in
df.index.names]
return df
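# Round-trip sketch (illustrative only; assumes pandas imported as ``pd``):
#     >>> df = pd.DataFrame({'a': [1, 2]}, index=pd.Index(['x', 'y'], name='idx'))
#     >>> pd.read_json(df.to_json(orient='table'), orient='table').equals(df)  # doctest: +SKIP
#     True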
@@ -0,0 +1,50 @@
# coding: utf-8
from collections import namedtuple
from pandas.io.msgpack.exceptions import * # noqa
from pandas.io.msgpack._version import version # noqa
class ExtType(namedtuple('ExtType', 'code data')):
"""ExtType represents ext type in msgpack."""
def __new__(cls, code, data):
if not isinstance(code, int):
raise TypeError("code must be int")
if not isinstance(data, bytes):
raise TypeError("data must be bytes")
if not 0 <= code <= 127:
raise ValueError("code must be 0~127")
return super(ExtType, cls).__new__(cls, code, data)
import os # noqa
from pandas.io.msgpack._packer import Packer # noqa
from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa
def pack(o, stream, **kwargs):
"""
Pack object `o` and write it to `stream`
See :class:`Packer` for options.
"""
packer = Packer(**kwargs)
stream.write(packer.pack(o))
def packb(o, **kwargs):
"""
Pack object `o` and return packed bytes
See :class:`Packer` for options.
"""
return Packer(**kwargs).pack(o)
# alias for compatibility to simplejson/marshal/pickle.
load = unpack
loads = unpackb
dump = pack
dumps = packb
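# --- Illustrative usage sketch (not part of the original module) ------------
# A minimal pack/unpack round trip through the thin wrappers above, following
# the vendored msgpack 0.4.x API; the helper name below is hypothetical.
def _msgpack_roundtrip_example():
    from pandas.io.msgpack import packb, unpackb

    raw = packb({'a': 1, 'b': 'text'}, use_bin_type=True)
    # With an explicit encoding the keys/values come back as str objects.
    return unpackb(raw, encoding='utf-8')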
@@ -0,0 +1 @@
version = (0, 4, 6)
@@ -0,0 +1,32 @@
class UnpackException(Exception):
pass
class BufferFull(UnpackException):
pass
class OutOfData(UnpackException):
pass
class UnpackValueError(UnpackException, ValueError):
pass
class ExtraData(ValueError):
def __init__(self, unpacked, extra):
self.unpacked = unpacked
self.extra = extra
def __str__(self):
return "unpack(b) received extra data."
class PackException(Exception):
pass
class PackValueError(PackException, ValueError):
pass
@@ -0,0 +1,820 @@
"""
Msgpack serializer support for reading and writing pandas data structures
to disk
portions of msgpack_numpy package, by Lev Givon were incorporated
into this module (and tests_packers.py)
License
=======
Copyright (c) 2013, Lev Givon.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Lev Givon nor the names of any
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from datetime import datetime, date, timedelta
from dateutil.parser import parse
import os
from textwrap import dedent
import warnings
import numpy as np
from pandas import compat
from pandas.compat import u, u_safe
from pandas.core.dtypes.common import (
is_categorical_dtype, is_object_dtype,
needs_i8_conversion, pandas_dtype)
from pandas import (Timestamp, Period, Series, DataFrame, # noqa
Index, MultiIndex, Float64Index, Int64Index,
Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT,
Categorical, CategoricalIndex, IntervalIndex, Interval,
TimedeltaIndex)
from pandas.core.sparse.api import SparseSeries, SparseDataFrame
from pandas.core.sparse.array import BlockIndex, IntIndex
from pandas.core.generic import NDFrame
from pandas.errors import PerformanceWarning
from pandas.io.common import get_filepath_or_buffer, _stringify_path
from pandas.core.internals import BlockManager, make_block, _safe_reshape
import pandas.core.internals as internals
from pandas.io.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType
from pandas.util._move import (
BadMove as _BadMove,
move_into_mutable_buffer as _move_into_mutable_buffer,
)
# check which compression libs we have installed
try:
import zlib
def _check_zlib():
pass
except ImportError:
def _check_zlib():
raise ImportError('zlib is not installed')
_check_zlib.__doc__ = dedent(
"""\
Check if zlib is installed.
Raises
------
ImportError
Raised when zlib is not installed.
""",
)
try:
import blosc
def _check_blosc():
pass
except ImportError:
def _check_blosc():
raise ImportError('blosc is not installed')
_check_blosc.__doc__ = dedent(
"""\
Check if blosc is installed.
Raises
------
ImportError
Raised when blosc is not installed.
""",
)
# until we can pass this into our conversion functions,
# this is pretty hacky
compressor = None
def to_msgpack(path_or_buf, *args, **kwargs):
"""
msgpack (serialize) object to input file path
THIS IS AN EXPERIMENTAL LIBRARY and the storage format
may not be stable until a future release.
Parameters
----------
path_or_buf : string File path, buffer-like, or None
if None, return generated string
args : an object or objects to serialize
encoding: encoding for unicode objects
append : boolean whether to append to an existing msgpack
(default is False)
compress : type of compressor (zlib or blosc), default to None (no
compression)
"""
global compressor
compressor = kwargs.pop('compress', None)
if compressor:
compressor = u(compressor)
append = kwargs.pop('append', None)
if append:
mode = 'a+b'
else:
mode = 'wb'
def writer(fh):
for a in args:
fh.write(pack(a, **kwargs))
path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, compat.string_types):
with open(path_or_buf, mode) as fh:
writer(fh)
elif path_or_buf is None:
buf = compat.BytesIO()
writer(buf)
return buf.getvalue()
else:
writer(path_or_buf)
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
"""
Load msgpack pandas object from the specified
file path
THIS IS AN EXPERIMENTAL LIBRARY and the storage format
may not be stable until a future release.
Parameters
----------
path_or_buf : string File path, BytesIO like or string
encoding: Encoding for decoding msgpack str type
iterator : boolean, if True, return an iterator to the unpacker
(default is False)
Returns
-------
obj : type of object stored in file
"""
path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
if iterator:
return Iterator(path_or_buf)
def read(fh):
unpacked = list(unpack(fh, encoding=encoding, **kwargs))
if len(unpacked) == 1:
return unpacked[0]
if should_close:
try:
path_or_buf.close()
except: # noqa: flake8
pass
return unpacked
# see if we have an actual file
if isinstance(path_or_buf, compat.string_types):
try:
exists = os.path.exists(path_or_buf)
except (TypeError, ValueError):
exists = False
if exists:
with open(path_or_buf, 'rb') as fh:
return read(fh)
if isinstance(path_or_buf, compat.binary_type):
# treat as a binary-like
fh = None
try:
# We can't distinguish between a path and a buffer of bytes in
# Python 2 so instead assume the first byte of a valid path is
# less than 0x80.
if compat.PY3 or ord(path_or_buf[0]) >= 0x80:
fh = compat.BytesIO(path_or_buf)
return read(fh)
finally:
if fh is not None:
fh.close()
elif hasattr(path_or_buf, 'read') and compat.callable(path_or_buf.read):
# treat as a buffer like
return read(path_or_buf)
raise ValueError('path_or_buf needs to be a string file path or file-like')
dtype_dict = {21: np.dtype('M8[ns]'),
u('datetime64[ns]'): np.dtype('M8[ns]'),
u('datetime64[us]'): np.dtype('M8[us]'),
22: np.dtype('m8[ns]'),
u('timedelta64[ns]'): np.dtype('m8[ns]'),
u('timedelta64[us]'): np.dtype('m8[us]'),
# this is platform int, which we need to remap to np.int64
# for compat on windows platforms
7: np.dtype('int64'),
'category': 'category'
}
def dtype_for(t):
""" return my dtype mapping, whether number or name """
if t in dtype_dict:
return dtype_dict[t]
return np.typeDict.get(t, t)
c2f_dict = {'complex': np.float64,
'complex128': np.float64,
'complex64': np.float32}
# numpy 1.6.1 compat
if hasattr(np, 'float128'):
c2f_dict['complex256'] = np.float128
def c2f(r, i, ctype_name):
"""
Convert strings to complex number instance with specified numpy type.
"""
ftype = c2f_dict[ctype_name]
return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i))
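# --- Illustrative usage sketch (not part of the original module) ------------
# c2f rebuilds a numpy complex scalar from the repr'd real/imag strings that
# encode() emits for np_scalar payloads; assumes a numpy version where
# np.typeDict is still available. The helper name is hypothetical.
def _c2f_example():
    return c2f('1.0', '2.0', 'complex128')   # numpy complex128 equal to (1+2j)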
def convert(values):
""" convert the numpy values to a list """
dtype = values.dtype
if is_categorical_dtype(values):
return values
elif is_object_dtype(dtype):
return values.ravel().tolist()
if needs_i8_conversion(dtype):
values = values.view('i8')
v = values.ravel()
if compressor == 'zlib':
_check_zlib()
# return string arrays like they are
if dtype == np.object_:
return v.tolist()
# convert to a bytes array
v = v.tostring()
return ExtType(0, zlib.compress(v))
elif compressor == 'blosc':
_check_blosc()
# return string arrays like they are
if dtype == np.object_:
return v.tolist()
# convert to a bytes array
v = v.tostring()
return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))
# ndarray (on original dtype)
return ExtType(0, v.tostring())
def unconvert(values, dtype, compress=None):
as_is_ext = isinstance(values, ExtType) and values.code == 0
if as_is_ext:
values = values.data
if is_categorical_dtype(dtype):
return values
elif is_object_dtype(dtype):
return np.array(values, dtype=object)
dtype = pandas_dtype(dtype).base
if not as_is_ext:
values = values.encode('latin1')
if compress:
if compress == u'zlib':
_check_zlib()
decompress = zlib.decompress
elif compress == u'blosc':
_check_blosc()
decompress = blosc.decompress
else:
raise ValueError("compress must be one of 'zlib' or 'blosc'")
try:
return np.frombuffer(
_move_into_mutable_buffer(decompress(values)),
dtype=dtype,
)
except _BadMove as e:
# Pull the decompressed data off of the `_BadMove` exception.
# We don't just store this in the locals because we want to
# minimize the risk of giving users access to a `bytes` object
# whose data is also given to a mutable buffer.
values = e.args[0]
if len(values) > 1:
# The empty string and single characters are memoized in many
# string creating functions in the capi. This case should not
# warn even though we need to make a copy because we are only
# copying at most 1 byte.
warnings.warn(
'copying data after decompressing; this may mean that'
' decompress is caching its result',
PerformanceWarning,
)
# fall through to copying `np.fromstring`
# Copy the bytes into a numpy array.
buf = np.frombuffer(values, dtype=dtype)
buf = buf.copy() # required to not mutate the original data
buf.flags.writeable = True
return buf
def encode(obj):
"""
Data encoder
"""
tobj = type(obj)
if isinstance(obj, Index):
if isinstance(obj, RangeIndex):
return {u'typ': u'range_index',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'start': getattr(obj, '_start', None),
u'stop': getattr(obj, '_stop', None),
u'step': getattr(obj, '_step', None)}
elif isinstance(obj, PeriodIndex):
return {u'typ': u'period_index',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'freq': u_safe(getattr(obj, 'freqstr', None)),
u'dtype': u(obj.dtype.name),
u'data': convert(obj.asi8),
u'compress': compressor}
elif isinstance(obj, DatetimeIndex):
tz = getattr(obj, 'tz', None)
# store tz info and data as UTC
if tz is not None:
tz = u(tz.zone)
obj = obj.tz_convert('UTC')
return {u'typ': u'datetime_index',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'dtype': u(obj.dtype.name),
u'data': convert(obj.asi8),
u'freq': u_safe(getattr(obj, 'freqstr', None)),
u'tz': tz,
u'compress': compressor}
elif isinstance(obj, IntervalIndex):
return {u'typ': u'interval_index',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'left': getattr(obj, '_left', None),
u'right': getattr(obj, '_right', None),
u'closed': getattr(obj, '_closed', None)}
elif isinstance(obj, MultiIndex):
return {u'typ': u'multi_index',
u'klass': u(obj.__class__.__name__),
u'names': getattr(obj, 'names', None),
u'dtype': u(obj.dtype.name),
u'data': convert(obj.values),
u'compress': compressor}
else:
return {u'typ': u'index',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'dtype': u(obj.dtype.name),
u'data': convert(obj.values),
u'compress': compressor}
elif isinstance(obj, Categorical):
return {u'typ': u'category',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'codes': obj.codes,
u'categories': obj.categories,
u'ordered': obj.ordered,
u'compress': compressor}
elif isinstance(obj, Series):
if isinstance(obj, SparseSeries):
raise NotImplementedError(
'msgpack sparse series is not implemented'
)
# d = {'typ': 'sparse_series',
# 'klass': obj.__class__.__name__,
# 'dtype': obj.dtype.name,
# 'index': obj.index,
# 'sp_index': obj.sp_index,
# 'sp_values': convert(obj.sp_values),
# 'compress': compressor}
# for f in ['name', 'fill_value', 'kind']:
# d[f] = getattr(obj, f, None)
# return d
else:
return {u'typ': u'series',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'index': obj.index,
u'dtype': u(obj.dtype.name),
u'data': convert(obj.values),
u'compress': compressor}
elif issubclass(tobj, NDFrame):
if isinstance(obj, SparseDataFrame):
raise NotImplementedError(
'msgpack sparse frame is not implemented'
)
# d = {'typ': 'sparse_dataframe',
# 'klass': obj.__class__.__name__,
# 'columns': obj.columns}
# for f in ['default_fill_value', 'default_kind']:
# d[f] = getattr(obj, f, None)
# d['data'] = dict([(name, ss)
# for name, ss in compat.iteritems(obj)])
# return d
else:
data = obj._data
if not data.is_consolidated():
data = data.consolidate()
# the block manager
return {u'typ': u'block_manager',
u'klass': u(obj.__class__.__name__),
u'axes': data.axes,
u'blocks': [{u'locs': b.mgr_locs.as_array,
u'values': convert(b.values),
u'shape': b.values.shape,
u'dtype': u(b.dtype.name),
u'klass': u(b.__class__.__name__),
u'compress': compressor} for b in data.blocks]
}
elif isinstance(obj, (datetime, date, np.datetime64, timedelta,
np.timedelta64)) or obj is NaT:
if isinstance(obj, Timestamp):
tz = obj.tzinfo
if tz is not None:
tz = u(tz.zone)
freq = obj.freq
if freq is not None:
freq = u(freq.freqstr)
return {u'typ': u'timestamp',
u'value': obj.value,
u'freq': freq,
u'tz': tz}
if obj is NaT:
return {u'typ': u'nat'}
elif isinstance(obj, np.timedelta64):
return {u'typ': u'timedelta64',
u'data': obj.view('i8')}
elif isinstance(obj, timedelta):
return {u'typ': u'timedelta',
u'data': (obj.days, obj.seconds, obj.microseconds)}
elif isinstance(obj, np.datetime64):
return {u'typ': u'datetime64',
u'data': u(str(obj))}
elif isinstance(obj, datetime):
return {u'typ': u'datetime',
u'data': u(obj.isoformat())}
elif isinstance(obj, date):
return {u'typ': u'date',
u'data': u(obj.isoformat())}
raise Exception("cannot encode this datetimelike object: %s" % obj)
elif isinstance(obj, Period):
return {u'typ': u'period',
u'ordinal': obj.ordinal,
u'freq': u_safe(obj.freqstr)}
elif isinstance(obj, Interval):
return {u'typ': u'interval',
u'left': obj.left,
u'right': obj.right,
u'closed': obj.closed}
elif isinstance(obj, BlockIndex):
return {u'typ': u'block_index',
u'klass': u(obj.__class__.__name__),
u'blocs': obj.blocs,
u'blengths': obj.blengths,
u'length': obj.length}
elif isinstance(obj, IntIndex):
return {u'typ': u'int_index',
u'klass': u(obj.__class__.__name__),
u'indices': obj.indices,
u'length': obj.length}
elif isinstance(obj, np.ndarray):
return {u'typ': u'ndarray',
u'shape': obj.shape,
u'ndim': obj.ndim,
u'dtype': u(obj.dtype.name),
u'data': convert(obj),
u'compress': compressor}
elif isinstance(obj, np.number):
if np.iscomplexobj(obj):
return {u'typ': u'np_scalar',
u'sub_typ': u'np_complex',
u'dtype': u(obj.dtype.name),
u'real': u(obj.real.__repr__()),
u'imag': u(obj.imag.__repr__())}
else:
return {u'typ': u'np_scalar',
u'dtype': u(obj.dtype.name),
u'data': u(obj.__repr__())}
elif isinstance(obj, complex):
return {u'typ': u'np_complex',
u'real': u(obj.real.__repr__()),
u'imag': u(obj.imag.__repr__())}
return obj
def decode(obj):
"""
Decoder for deserializing numpy data types.
"""
typ = obj.get(u'typ')
if typ is None:
return obj
elif typ == u'timestamp':
freq = obj[u'freq'] if 'freq' in obj else obj[u'offset']
return Timestamp(obj[u'value'], tz=obj[u'tz'], freq=freq)
elif typ == u'nat':
return NaT
elif typ == u'period':
return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq'])
elif typ == u'index':
dtype = dtype_for(obj[u'dtype'])
data = unconvert(obj[u'data'], dtype,
obj.get(u'compress'))
return globals()[obj[u'klass']](data, dtype=dtype, name=obj[u'name'])
elif typ == u'range_index':
return globals()[obj[u'klass']](obj[u'start'],
obj[u'stop'],
obj[u'step'],
name=obj[u'name'])
elif typ == u'multi_index':
dtype = dtype_for(obj[u'dtype'])
data = unconvert(obj[u'data'], dtype,
obj.get(u'compress'))
data = [tuple(x) for x in data]
return globals()[obj[u'klass']].from_tuples(data, names=obj[u'names'])
elif typ == u'period_index':
data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
d = dict(name=obj[u'name'], freq=obj[u'freq'])
return globals()[obj[u'klass']]._from_ordinals(data, **d)
elif typ == u'datetime_index':
data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
d = dict(name=obj[u'name'], freq=obj[u'freq'], verify_integrity=False)
result = globals()[obj[u'klass']](data, **d)
tz = obj[u'tz']
# reverse tz conversion
if tz is not None:
result = result.tz_localize('UTC').tz_convert(tz)
return result
elif typ == u'interval_index':
return globals()[obj[u'klass']].from_arrays(obj[u'left'],
obj[u'right'],
obj[u'closed'],
name=obj[u'name'])
elif typ == u'category':
from_codes = globals()[obj[u'klass']].from_codes
return from_codes(codes=obj[u'codes'],
categories=obj[u'categories'],
ordered=obj[u'ordered'])
elif typ == u'interval':
return Interval(obj[u'left'], obj[u'right'], obj[u'closed'])
elif typ == u'series':
dtype = dtype_for(obj[u'dtype'])
pd_dtype = pandas_dtype(dtype)
index = obj[u'index']
result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype,
obj[u'compress']),
index=index,
dtype=pd_dtype,
name=obj[u'name'])
return result
elif typ == u'block_manager':
axes = obj[u'axes']
def create_block(b):
values = _safe_reshape(unconvert(
b[u'values'], dtype_for(b[u'dtype']),
b[u'compress']), b[u'shape'])
# locs handles duplicate column names, and should be used instead
# of items; see GH 9618
if u'locs' in b:
placement = b[u'locs']
else:
placement = axes[0].get_indexer(b[u'items'])
return make_block(values=values,
klass=getattr(internals, b[u'klass']),
placement=placement,
dtype=b[u'dtype'])
blocks = [create_block(b) for b in obj[u'blocks']]
return globals()[obj[u'klass']](BlockManager(blocks, axes))
elif typ == u'datetime':
return parse(obj[u'data'])
elif typ == u'datetime64':
return np.datetime64(parse(obj[u'data']))
elif typ == u'date':
return parse(obj[u'data']).date()
elif typ == u'timedelta':
return timedelta(*obj[u'data'])
elif typ == u'timedelta64':
return np.timedelta64(int(obj[u'data']))
# elif typ == 'sparse_series':
# dtype = dtype_for(obj['dtype'])
# return globals()[obj['klass']](
# unconvert(obj['sp_values'], dtype, obj['compress']),
# sparse_index=obj['sp_index'], index=obj['index'],
# fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
# elif typ == 'sparse_dataframe':
# return globals()[obj['klass']](
# obj['data'], columns=obj['columns'],
# default_fill_value=obj['default_fill_value'],
# default_kind=obj['default_kind']
# )
# elif typ == 'sparse_panel':
# return globals()[obj['klass']](
# obj['data'], items=obj['items'],
# default_fill_value=obj['default_fill_value'],
# default_kind=obj['default_kind'])
elif typ == u'block_index':
return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'],
obj[u'blengths'])
elif typ == u'int_index':
return globals()[obj[u'klass']](obj[u'length'], obj[u'indices'])
elif typ == u'ndarray':
return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']],
obj.get(u'compress')).reshape(obj[u'shape'])
elif typ == u'np_scalar':
if obj.get(u'sub_typ') == u'np_complex':
return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype'])
else:
dtype = dtype_for(obj[u'dtype'])
try:
return dtype(obj[u'data'])
except Exception:
return dtype.type(obj[u'data'])
elif typ == u'np_complex':
return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j')
elif isinstance(obj, (dict, list, set)):
return obj
else:
return obj
def pack(o, default=encode,
encoding='utf-8', unicode_errors='strict', use_single_float=False,
autoreset=1, use_bin_type=1):
"""
Pack an object and return the packed bytes.
"""
return Packer(default=default, encoding=encoding,
unicode_errors=unicode_errors,
use_single_float=use_single_float,
autoreset=autoreset,
use_bin_type=use_bin_type).pack(o)
def unpack(packed, object_hook=decode,
list_hook=None, use_list=False, encoding='utf-8',
unicode_errors='strict', object_pairs_hook=None,
max_buffer_size=0, ext_hook=ExtType):
"""
Unpack a packed object, return an iterator
Note: packed lists will be returned as tuples
"""
return Unpacker(packed, object_hook=object_hook,
list_hook=list_hook,
use_list=use_list, encoding=encoding,
unicode_errors=unicode_errors,
object_pairs_hook=object_pairs_hook,
max_buffer_size=max_buffer_size,
ext_hook=ext_hook)
class Packer(_Packer):
def __init__(self, default=encode,
encoding='utf-8',
unicode_errors='strict',
use_single_float=False,
autoreset=1,
use_bin_type=1):
super(Packer, self).__init__(default=default,
encoding=encoding,
unicode_errors=unicode_errors,
use_single_float=use_single_float,
autoreset=autoreset,
use_bin_type=use_bin_type)
class Unpacker(_Unpacker):
def __init__(self, file_like=None, read_size=0, use_list=False,
object_hook=decode,
object_pairs_hook=None, list_hook=None, encoding='utf-8',
unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
super(Unpacker, self).__init__(file_like=file_like,
read_size=read_size,
use_list=use_list,
object_hook=object_hook,
object_pairs_hook=object_pairs_hook,
list_hook=list_hook,
encoding=encoding,
unicode_errors=unicode_errors,
max_buffer_size=max_buffer_size,
ext_hook=ext_hook)
class Iterator(object):
""" manage the unpacking iteration,
close the file on completion """
def __init__(self, path, **kwargs):
self.path = path
self.kwargs = kwargs
def __iter__(self):
needs_closing = True
try:
# see if we have an actual file
if isinstance(self.path, compat.string_types):
try:
path_exists = os.path.exists(self.path)
except TypeError:
path_exists = False
if path_exists:
fh = open(self.path, 'rb')
else:
fh = compat.BytesIO(self.path)
else:
if not hasattr(self.path, 'read'):
fh = compat.BytesIO(self.path)
else:
# a file-like
needs_closing = False
fh = self.path
unpacker = unpack(fh)
for o in unpacker:
yield o
finally:
if needs_closing:
fh.close()
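# --- Illustrative usage sketch (not part of the original module) ------------
# Serialising a DataFrame with the experimental msgpack format defined above.
# `path` is a hypothetical location; compress='zlib' relies on _check_zlib.
def _packers_roundtrip_example(path='frame.msg'):
    import pandas as pd

    df = pd.DataFrame({'a': range(3)})
    pd.to_msgpack(path, df, compress='zlib')
    # A file holding a single object is returned as that object, not a list.
    return pd.read_msgpack(path)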
@@ -0,0 +1,288 @@
""" parquet compat """
from warnings import catch_warnings
from distutils.version import LooseVersion
from pandas import DataFrame, RangeIndex, Int64Index, get_option
from pandas.compat import string_types
import pandas.core.common as com
from pandas.io.common import get_filepath_or_buffer, is_s3_url
def get_engine(engine):
""" return our implementation """
if engine == 'auto':
engine = get_option('io.parquet.engine')
if engine == 'auto':
# try engines in this order
try:
return PyArrowImpl()
except ImportError:
pass
try:
return FastParquetImpl()
except ImportError:
pass
raise ImportError("Unable to find a usable engine; "
"tried using: 'pyarrow', 'fastparquet'.\n"
"pyarrow or fastparquet is required for parquet "
"support")
if engine not in ['pyarrow', 'fastparquet']:
raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
if engine == 'pyarrow':
return PyArrowImpl()
elif engine == 'fastparquet':
return FastParquetImpl()
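# --- Illustrative usage sketch (not part of the original module) ------------
# Engine selection can also be driven by the pandas option consulted above;
# this assumes pyarrow is installed, otherwise get_engine raises ImportError.
def _engine_option_example():
    import pandas as pd

    pd.set_option('io.parquet.engine', 'pyarrow')
    return get_engine('auto')   # resolves to PyArrowImpl via the option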
class BaseImpl(object):
api = None # module
@staticmethod
def validate_dataframe(df):
if not isinstance(df, DataFrame):
raise ValueError("to_parquet only supports IO with DataFrames")
# must have value column names (strings only)
if df.columns.inferred_type not in {'string', 'unicode'}:
raise ValueError("parquet must have string column names")
# index level names must be strings
valid_names = all(
isinstance(name, string_types)
for name in df.index.names
if name is not None
)
if not valid_names:
raise ValueError("Index level names must be strings")
def write(self, df, path, compression, **kwargs):
raise com.AbstractMethodError(self)
def read(self, path, columns=None, **kwargs):
raise com.AbstractMethodError(self)
class PyArrowImpl(BaseImpl):
def __init__(self):
# since pandas is a dependency of pyarrow
# we need to import on first use
try:
import pyarrow
import pyarrow.parquet
except ImportError:
raise ImportError(
"pyarrow is required for parquet support\n\n"
"you can install via conda\n"
"conda install pyarrow -c conda-forge\n"
"\nor via pip\n"
"pip install -U pyarrow\n"
)
if LooseVersion(pyarrow.__version__) < '0.4.1':
raise ImportError(
"pyarrow >= 0.4.1 is required for parquet support\n\n"
"you can install via conda\n"
"conda install pyarrow -c conda-forge\n"
"\nor via pip\n"
"pip install -U pyarrow\n"
)
self._pyarrow_lt_060 = (
LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
self._pyarrow_lt_070 = (
LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
self.api = pyarrow
def write(self, df, path, compression='snappy',
coerce_timestamps='ms', **kwargs):
self.validate_dataframe(df)
if self._pyarrow_lt_070:
self._validate_write_lt_070(df)
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
if self._pyarrow_lt_060:
table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
self.api.parquet.write_table(
table, path, compression=compression, **kwargs)
else:
table = self.api.Table.from_pandas(df)
self.api.parquet.write_table(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)
def read(self, path, columns=None, **kwargs):
path, _, _, should_close = get_filepath_or_buffer(path)
if self._pyarrow_lt_070:
result = self.api.parquet.read_pandas(path, columns=columns,
**kwargs).to_pandas()
else:
kwargs['use_pandas_metadata'] = True
result = self.api.parquet.read_table(path, columns=columns,
**kwargs).to_pandas()
if should_close:
try:
path.close()
except: # noqa: flake8
pass
return result
def _validate_write_lt_070(self, df):
# Compatibility shim for pyarrow < 0.7.0
# TODO: Remove in pandas 0.23.0
from pandas.core.indexes.multi import MultiIndex
if isinstance(df.index, MultiIndex):
msg = (
"Multi-index DataFrames are only supported "
"with pyarrow >= 0.7.0"
)
raise ValueError(msg)
# Validate index
if not isinstance(df.index, Int64Index):
msg = (
"pyarrow < 0.7.0 does not support serializing {} for the "
"index; you can .reset_index() to make the index into "
"column(s), or install the latest version of pyarrow or "
"fastparquet."
)
raise ValueError(msg.format(type(df.index)))
if not df.index.equals(RangeIndex(len(df))):
raise ValueError(
"pyarrow < 0.7.0 does not support serializing a non-default "
"index; you can .reset_index() to make the index into "
"column(s), or install the latest version of pyarrow or "
"fastparquet."
)
if df.index.name is not None:
raise ValueError(
"pyarrow < 0.7.0 does not serialize indexes with a name; you "
"can set the index.name to None or install the latest version "
"of pyarrow or fastparquet."
)
class FastParquetImpl(BaseImpl):
def __init__(self):
# since pandas is a dependency of fastparquet
# we need to import on first use
try:
import fastparquet
except ImportError:
raise ImportError(
"fastparquet is required for parquet support\n\n"
"you can install via conda\n"
"conda install fastparquet -c conda-forge\n"
"\nor via pip\n"
"pip install -U fastparquet"
)
if LooseVersion(fastparquet.__version__) < '0.1.0':
raise ImportError(
"fastparquet >= 0.1.0 is required for parquet "
"support\n\n"
"you can install via conda\n"
"conda install fastparquet -c conda-forge\n"
"\nor via pip\n"
"pip install -U fastparquet"
)
self.api = fastparquet
def write(self, df, path, compression='snappy', **kwargs):
self.validate_dataframe(df)
# thriftpy/protocol/compact.py:339:
# DeprecationWarning: tostring() is deprecated.
# Use tobytes() instead.
if is_s3_url(path):
# path is s3:// so we need to open the s3file in 'wb' mode.
# TODO: Support 'ab'
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
# And pass the opened s3file to the fastparquet internal impl.
kwargs['open_with'] = lambda path, _: path
else:
path, _, _, _ = get_filepath_or_buffer(path)
with catch_warnings(record=True):
self.api.write(path, df,
compression=compression, **kwargs)
def read(self, path, columns=None, **kwargs):
if is_s3_url(path):
# When path is s3:// an S3File is returned.
# We need to retain the original path (str) while also passing
# the S3File().open function to the fastparquet implementation.
s3, _, _, should_close = get_filepath_or_buffer(path)
try:
parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
finally:
s3.close()
else:
path, _, _, _ = get_filepath_or_buffer(path)
parquet_file = self.api.ParquetFile(path)
return parquet_file.to_pandas(columns=columns, **kwargs)
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
"""
Write a DataFrame to the parquet format.
Parameters
----------
df : DataFrame
path : string
File path
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
behavior is to try 'pyarrow', falling back to 'fastparquet' if
'pyarrow' is unavailable.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
kwargs
Additional keyword arguments passed to the engine
"""
impl = get_engine(engine)
return impl.write(df, path, compression=compression, **kwargs)
def read_parquet(path, engine='auto', columns=None, **kwargs):
"""
Load a parquet object from the file path, returning a DataFrame.
.. versionadded:: 0.21.0
Parameters
----------
path : string
File path
columns : list, default=None
If not None, only these columns will be read from the file.
.. versionadded:: 0.21.1
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
behavior is to try 'pyarrow', falling back to 'fastparquet' if
'pyarrow' is unavailable.
kwargs are passed to the engine
Returns
-------
DataFrame
"""
impl = get_engine(engine)
return impl.read(path, columns=columns, **kwargs)
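# --- Illustrative usage sketch (not part of the original module) ------------
# Writing and reading parquet through the dispatch above. Requires pyarrow or
# fastparquet (with snappy support); the file name is hypothetical.
def _parquet_roundtrip_example(path='frame.parquet'):
    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    df.to_parquet(path, engine='auto', compression='snappy')
    # `columns` lets the engine read only a subset of the stored columns.
    return pd.read_parquet(path, columns=['a'])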
File diff suppressed because it is too large
@@ -0,0 +1,205 @@
""" pickle compat """
import warnings
import numpy as np
from numpy.lib.format import read_array, write_array
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
from pandas.core.dtypes.common import is_datetime64_dtype, _NS_DTYPE
from pandas.io.common import _get_handle, _infer_compression, _stringify_path
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
"""
Pickle (serialize) object to file.
Parameters
----------
obj : any object
Any python object.
path : str
File path where the pickled object will be stored.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
A string representing the compression to use in the output file. By
default, infers from the file extension in specified path.
.. versionadded:: 0.20.0
protocol : int
Int which indicates which protocol should be used by the pickler,
default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
values for this parameter depend on the version of Python. For Python
2.x, possible values are 0, 1, 2. For Python >= 3.0, 3 is a valid value.
For Python >= 3.4, 4 is a valid value. A negative value for the
protocol parameter is equivalent to setting its value to
HIGHEST_PROTOCOL.
.. [1] https://docs.python.org/3/library/pickle.html
.. versionadded:: 0.21.0
See Also
--------
read_pickle : Load pickled pandas object (or any object) from file.
DataFrame.to_hdf : Write DataFrame to an HDF5 file.
DataFrame.to_sql : Write DataFrame to a SQL database.
DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
Examples
--------
>>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
>>> original_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> pd.to_pickle(original_df, "./dummy.pkl")
>>> unpickled_df = pd.read_pickle("./dummy.pkl")
>>> unpickled_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> import os
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
inferred_compression = _infer_compression(path, compression)
f, fh = _get_handle(path, 'wb',
compression=inferred_compression,
is_text=False)
if protocol < 0:
protocol = pkl.HIGHEST_PROTOCOL
try:
f.write(pkl.dumps(obj, protocol=protocol))
finally:
for _f in fh:
_f.close()
def read_pickle(path, compression='infer'):
"""
Load pickled pandas object (or any object) from file.
.. warning::
Loading pickled data received from untrusted sources can be
unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.
Parameters
----------
path : str
File path where the pickled object will be loaded.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
or '.zip' respectively, and no decompression otherwise.
Set to None for no decompression.
.. versionadded:: 0.20.0
Returns
-------
unpickled : type of object stored in file
See Also
--------
DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
Series.to_pickle : Pickle (serialize) Series object to file.
read_hdf : Read HDF5 file into a DataFrame.
read_sql : Read SQL query or database table into a DataFrame.
read_parquet : Load a parquet object, returning a DataFrame.
Examples
--------
>>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
>>> original_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> pd.to_pickle(original_df, "./dummy.pkl")
>>> unpickled_df = pd.read_pickle("./dummy.pkl")
>>> unpickled_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> import os
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
inferred_compression = _infer_compression(path, compression)
def read_wrapper(func):
# wrapper file handle open/close operation
f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return func(f)
finally:
for _f in fh:
_f.close()
def try_read(path, encoding=None):
# try with the current (c)pickle first; if we get a TypeError then
# fall back to the compat pickle to handle subclass changes.
# Pass encoding only if it's not None, as py2 doesn't handle the param.
# GH 6899
try:
with warnings.catch_warnings(record=True):
# We want to silence any warnings about, e.g. moved modules.
return read_wrapper(lambda f: pkl.load(f))
except Exception:
# reg/patched pickle
try:
return read_wrapper(
lambda f: pc.load(f, encoding=encoding, compat=False))
# compat pickle
except Exception:
return read_wrapper(
lambda f: pc.load(f, encoding=encoding, compat=True))
try:
return try_read(path)
except Exception:
if PY3:
return try_read(path, encoding='latin1')
raise
# compat with sparse pickle / unpickle
def _pickle_array(arr):
arr = arr.view(np.ndarray)
buf = BytesIO()
write_array(buf, arr)
return buf.getvalue()
def _unpickle_array(bytes):
arr = read_array(BytesIO(bytes))
# All datetimes should be stored as M8[ns]. When unpickling with
# numpy 1.6, it will read these as M8[us]. So this ensures all
# datetime64 types are read as M8[ns]
if is_datetime64_dtype(arr):
arr = arr.view(_NS_DTYPE)
return arr
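# --- Illustrative usage sketch (not part of the original module) ------------
# Round trip through the sparse pickle compat helpers above: the array is
# written in numpy's .npy format to an in-memory buffer and read back.
# The helper name is hypothetical and assumes the helpers above are in scope.
def _pickle_array_roundtrip_example():
    import numpy as np

    arr = np.arange(5, dtype='int64')
    payload = _pickle_array(arr)      # raw bytes in .npy format
    return _unpickle_array(payload)   # array equal to `arr`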
File diff suppressed because it is too large
@@ -0,0 +1,39 @@
""" s3 support for remote file interactivity """
from pandas import compat
try:
import s3fs
from botocore.exceptions import NoCredentialsError
except ImportError:
raise ImportError("The s3fs library is required to handle s3 files")
if compat.PY3:
from urllib.parse import urlparse as parse_url
else:
from urlparse import urlparse as parse_url
def _strip_schema(url):
"""Returns the url without the s3:// part"""
result = parse_url(url)
return result.netloc + result.path
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=None, mode=None):
if mode is None:
mode = 'rb'
fs = s3fs.S3FileSystem(anon=False)
try:
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
except (compat.FileNotFoundError, NoCredentialsError):
# boto3 has troubles when trying to access a public file
# when credentialed...
# An OSError is raised if you have credentials, but they
# aren't valid for that bucket.
# A NoCredentialsError is raised if you don't have creds
# for that bucket.
fs = s3fs.S3FileSystem(anon=True)
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
return filepath_or_buffer, None, compression, True
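# --- Illustrative usage sketch (not part of the original module) ------------
# _strip_schema only removes the scheme; the bucket stays in the key handed
# to s3fs. The bucket/key below are hypothetical.
def _strip_schema_example():
    return _strip_schema('s3://my-bucket/data/frame.csv')   # 'my-bucket/data/frame.csv'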
@@ -0,0 +1 @@
from .sasreader import read_sas # noqa
@@ -0,0 +1,687 @@
"""
Read SAS7BDAT files
Based on code written by Jared Hobbs:
https://bitbucket.org/jaredhobbs/sas7bdat
See also:
https://github.com/BioStatMatt/sas7bdat
Partial documentation of the file format:
https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
Reference for binary data compression:
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
"""
import pandas as pd
from pandas import compat
from pandas.io.common import get_filepath_or_buffer, BaseIterator
from pandas.errors import EmptyDataError
import numpy as np
import struct
import pandas.io.sas.sas_constants as const
from pandas.io.sas._sas import Parser
class _subheader_pointer(object):
pass
class _column(object):
pass
# SAS7BDAT represents a SAS data file in SAS7BDAT format.
class SAS7BDATReader(BaseIterator):
"""
Read SAS files in SAS7BDAT format.
Parameters
----------
path_or_buf : path name or buffer
Name of SAS file or file-like object pointing to SAS file
contents.
index : column identifier, defaults to None
Column to use as index.
convert_dates : boolean, defaults to True
Attempt to convert dates to Pandas datetime values. Note that
some rarely used SAS date formats may be unsupported.
blank_missing : boolean, defaults to True
Convert empty strings to missing values (SAS uses blanks to
indicate missing character variables).
chunksize : int, defaults to None
Return SAS7BDATReader object for iteration; returns chunks with the
given number of lines.
encoding : string, defaults to None
String encoding.
convert_text : bool, defaults to True
If False, text variables are left as raw bytes.
convert_header_text : bool, defaults to True
If False, header text, including column names, is left as raw
bytes.
"""
def __init__(self, path_or_buf, index=None, convert_dates=True,
blank_missing=True, chunksize=None, encoding=None,
convert_text=True, convert_header_text=True):
self.index = index
self.convert_dates = convert_dates
self.blank_missing = blank_missing
self.chunksize = chunksize
self.encoding = encoding
self.convert_text = convert_text
self.convert_header_text = convert_header_text
self.default_encoding = "latin-1"
self.compression = ""
self.column_names_strings = []
self.column_names = []
self.column_types = []
self.column_formats = []
self.columns = []
self._current_page_data_subheader_pointers = []
self._cached_page = None
self._column_data_lengths = []
self._column_data_offsets = []
self._current_row_in_file_index = 0
self._current_row_on_page_index = 0
self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
if isinstance(self._path_or_buf, compat.string_types):
self._path_or_buf = open(self._path_or_buf, 'rb')
self.handle = self._path_or_buf
self._get_properties()
self._parse_metadata()
def close(self):
try:
self.handle.close()
except AttributeError:
pass
def _get_properties(self):
# Check magic number
self._path_or_buf.seek(0)
self._cached_page = self._path_or_buf.read(288)
if self._cached_page[0:len(const.magic)] != const.magic:
self.close()
raise ValueError("magic number mismatch (not a SAS file?)")
# Get alignment information
align1, align2 = 0, 0
buf = self._read_bytes(const.align_1_offset, const.align_1_length)
if buf == const.u64_byte_checker_value:
align2 = const.align_2_value
self.U64 = True
self._int_length = 8
self._page_bit_offset = const.page_bit_offset_x64
self._subheader_pointer_length = const.subheader_pointer_length_x64
else:
self.U64 = False
self._page_bit_offset = const.page_bit_offset_x86
self._subheader_pointer_length = const.subheader_pointer_length_x86
self._int_length = 4
buf = self._read_bytes(const.align_2_offset, const.align_2_length)
if buf == const.align_1_checker_value:
align1 = const.align_2_value
total_align = align1 + align2
# Get endianness information
buf = self._read_bytes(const.endianness_offset,
const.endianness_length)
if buf == b'\x01':
self.byte_order = "<"
else:
self.byte_order = ">"
# Get encoding information
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
if buf in const.encoding_names:
self.file_encoding = const.encoding_names[buf]
else:
self.file_encoding = "unknown (code=%s)" % str(buf)
# Get platform information
buf = self._read_bytes(const.platform_offset, const.platform_length)
if buf == b'1':
self.platform = "unix"
elif buf == b'2':
self.platform = "windows"
else:
self.platform = "unknown"
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
self.name = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.name = self.name.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
self.file_type = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.file_type = self.file_type.decode(
self.encoding or self.default_encoding)
# Timestamp is epoch 01/01/1960
epoch = pd.datetime(1960, 1, 1)
x = self._read_float(const.date_created_offset + align1,
const.date_created_length)
self.date_created = epoch + pd.to_timedelta(x, unit='s')
x = self._read_float(const.date_modified_offset + align1,
const.date_modified_length)
self.date_modified = epoch + pd.to_timedelta(x, unit='s')
self.header_length = self._read_int(const.header_size_offset + align1,
const.header_size_length)
# Read the rest of the header into cached_page.
buf = self._path_or_buf.read(self.header_length - 288)
self._cached_page += buf
if len(self._cached_page) != self.header_length:
self.close()
raise ValueError("The SAS7BDAT file appears to be truncated.")
self._page_length = self._read_int(const.page_size_offset + align1,
const.page_size_length)
self._page_count = self._read_int(const.page_count_offset + align1,
const.page_count_length)
buf = self._read_bytes(const.sas_release_offset + total_align,
const.sas_release_length)
self.sas_release = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.sas_release = self.sas_release.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.sas_server_type_offset + total_align,
const.sas_server_type_length)
self.server_type = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.server_type = self.server_type.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.os_version_number_offset + total_align,
const.os_version_number_length)
self.os_version = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.os_version = self.os_version.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.os_name_offset + total_align,
const.os_name_length)
buf = buf.rstrip(b'\x00 ')
if len(buf) > 0:
self.os_name = buf.decode(self.encoding or self.default_encoding)
else:
buf = self._read_bytes(const.os_maker_offset + total_align,
const.os_maker_length)
self.os_name = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.os_name = self.os_name.decode(
self.encoding or self.default_encoding)
def __next__(self):
da = self.read(nrows=self.chunksize or 1)
if da is None:
raise StopIteration
return da
# Read a single float of the given width (4 or 8).
def _read_float(self, offset, width):
if width not in (4, 8):
self.close()
raise ValueError("invalid float width")
buf = self._read_bytes(offset, width)
fd = "f" if width == 4 else "d"
return struct.unpack(self.byte_order + fd, buf)[0]
# Read a single signed integer of the given width (1, 2, 4 or 8).
def _read_int(self, offset, width):
if width not in (1, 2, 4, 8):
self.close()
raise ValueError("invalid int width")
buf = self._read_bytes(offset, width)
it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
iv = struct.unpack(self.byte_order + it, buf)[0]
return iv
def _read_bytes(self, offset, length):
if self._cached_page is None:
self._path_or_buf.seek(offset)
buf = self._path_or_buf.read(length)
if len(buf) < length:
self.close()
msg = "Unable to read {:d} bytes from file position {:d}."
raise ValueError(msg.format(length, offset))
return buf
else:
if offset + length > len(self._cached_page):
self.close()
raise ValueError("The cached page is too small.")
return self._cached_page[offset:offset + length]
def _parse_metadata(self):
done = False
while not done:
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
break
if len(self._cached_page) != self._page_length:
self.close()
raise ValueError(
"Failed to read a meta data page from the SAS file.")
done = self._process_page_meta()
def _process_page_meta(self):
self._read_page_header()
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
if self._current_page_type in pt:
self._process_page_metadata()
return ((self._current_page_type in [256] + const.page_mix_types) or
(self._current_page_data_subheader_pointers is not None))
def _read_page_header(self):
bit_offset = self._page_bit_offset
tx = const.page_type_offset + bit_offset
self._current_page_type = self._read_int(tx, const.page_type_length)
tx = const.block_count_offset + bit_offset
self._current_page_block_count = self._read_int(
tx, const.block_count_length)
tx = const.subheader_count_offset + bit_offset
self._current_page_subheaders_count = (
self._read_int(tx, const.subheader_count_length))
def _process_page_metadata(self):
bit_offset = self._page_bit_offset
for i in range(self._current_page_subheaders_count):
pointer = self._process_subheader_pointers(
const.subheader_pointers_offset + bit_offset, i)
if pointer.length == 0:
continue
if pointer.compression == const.truncated_subheader_id:
continue
subheader_signature = self._read_subheader_signature(
pointer.offset)
subheader_index = (
self._get_subheader_index(subheader_signature,
pointer.compression, pointer.ptype))
self._process_subheader(subheader_index, pointer)
def _get_subheader_index(self, signature, compression, ptype):
index = const.subheader_signature_to_index.get(signature)
if index is None:
f1 = ((compression == const.compressed_subheader_id) or
(compression == 0))
f2 = (ptype == const.compressed_subheader_type)
if (self.compression != "") and f1 and f2:
index = const.SASIndex.data_subheader_index
else:
self.close()
raise ValueError("Unknown subheader signature")
return index
def _process_subheader_pointers(self, offset, subheader_pointer_index):
subheader_pointer_length = self._subheader_pointer_length
total_offset = (offset +
subheader_pointer_length * subheader_pointer_index)
subheader_offset = self._read_int(total_offset, self._int_length)
total_offset += self._int_length
subheader_length = self._read_int(total_offset, self._int_length)
total_offset += self._int_length
subheader_compression = self._read_int(total_offset, 1)
total_offset += 1
subheader_type = self._read_int(total_offset, 1)
x = _subheader_pointer()
x.offset = subheader_offset
x.length = subheader_length
x.compression = subheader_compression
x.ptype = subheader_type
return x
def _read_subheader_signature(self, offset):
subheader_signature = self._read_bytes(offset, self._int_length)
return subheader_signature
def _process_subheader(self, subheader_index, pointer):
offset = pointer.offset
length = pointer.length
if subheader_index == const.SASIndex.row_size_index:
processor = self._process_rowsize_subheader
elif subheader_index == const.SASIndex.column_size_index:
processor = self._process_columnsize_subheader
elif subheader_index == const.SASIndex.column_text_index:
processor = self._process_columntext_subheader
elif subheader_index == const.SASIndex.column_name_index:
processor = self._process_columnname_subheader
elif subheader_index == const.SASIndex.column_attributes_index:
processor = self._process_columnattributes_subheader
elif subheader_index == const.SASIndex.format_and_label_index:
processor = self._process_format_subheader
elif subheader_index == const.SASIndex.column_list_index:
processor = self._process_columnlist_subheader
elif subheader_index == const.SASIndex.subheader_counts_index:
processor = self._process_subheader_counts
elif subheader_index == const.SASIndex.data_subheader_index:
self._current_page_data_subheader_pointers.append(pointer)
return
else:
raise ValueError("unknown subheader index")
processor(offset, length)
def _process_rowsize_subheader(self, offset, length):
int_len = self._int_length
lcs_offset = offset
lcp_offset = offset
if self.U64:
lcs_offset += 682
lcp_offset += 706
else:
lcs_offset += 354
lcp_offset += 378
self.row_length = self._read_int(
offset + const.row_length_offset_multiplier * int_len, int_len)
self.row_count = self._read_int(
offset + const.row_count_offset_multiplier * int_len, int_len)
self.col_count_p1 = self._read_int(
offset + const.col_count_p1_multiplier * int_len, int_len)
self.col_count_p2 = self._read_int(
offset + const.col_count_p2_multiplier * int_len, int_len)
mx = const.row_count_on_mix_page_offset_multiplier * int_len
self._mix_page_row_count = self._read_int(offset + mx, int_len)
self._lcs = self._read_int(lcs_offset, 2)
self._lcp = self._read_int(lcp_offset, 2)
def _process_columnsize_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
self.column_count = self._read_int(offset, int_len)
if (self.col_count_p1 + self.col_count_p2 !=
self.column_count):
print("Warning: column count mismatch (%d + %d != %d)\n",
self.col_count_p1, self.col_count_p2, self.column_count)
# Unknown purpose
def _process_subheader_counts(self, offset, length):
pass
def _process_columntext_subheader(self, offset, length):
offset += self._int_length
text_block_size = self._read_int(offset, const.text_block_size_length)
buf = self._read_bytes(offset, text_block_size)
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
cname = cname_raw
if self.convert_header_text:
cname = cname.decode(self.encoding or self.default_encoding)
self.column_names_strings.append(cname)
if len(self.column_names_strings) == 1:
compression_literal = ""
for cl in const.compression_literals:
if cl in cname_raw:
compression_literal = cl
self.compression = compression_literal
offset -= self._int_length
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
compression_literal = buf.rstrip(b"\x00")
if compression_literal == b"":
self._lcs = 0
offset1 = offset + 32
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0:self._lcp]
elif compression_literal == const.rle_compression:
offset1 = offset + 40
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0:self._lcp]
elif self._lcs > 0:
self._lcp = 0
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcs)
self.creator_proc = buf[0:self._lcp]
if self.convert_header_text:
if hasattr(self, "creator_proc"):
self.creator_proc = self.creator_proc.decode(
self.encoding or self.default_encoding)
def _process_columnname_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
column_name_pointers_count = (length - 2 * int_len - 12) // 8
for i in range(column_name_pointers_count):
text_subheader = offset + const.column_name_pointer_length * \
(i + 1) + const.column_name_text_subheader_offset
col_name_offset = offset + const.column_name_pointer_length * \
(i + 1) + const.column_name_offset_offset
col_name_length = offset + const.column_name_pointer_length * \
(i + 1) + const.column_name_length_offset
idx = self._read_int(
text_subheader, const.column_name_text_subheader_length)
col_offset = self._read_int(
col_name_offset, const.column_name_offset_length)
col_len = self._read_int(
col_name_length, const.column_name_length_length)
name_str = self.column_names_strings[idx]
self.column_names.append(name_str[col_offset:col_offset + col_len])
def _process_columnattributes_subheader(self, offset, length):
int_len = self._int_length
column_attributes_vectors_count = (
length - 2 * int_len - 12) // (int_len + 8)
self.column_types = np.empty(
column_attributes_vectors_count, dtype=np.dtype('S1'))
self._column_data_lengths = np.empty(
column_attributes_vectors_count, dtype=np.int64)
self._column_data_offsets = np.empty(
column_attributes_vectors_count, dtype=np.int64)
for i in range(column_attributes_vectors_count):
col_data_offset = (offset + int_len +
const.column_data_offset_offset +
i * (int_len + 8))
col_data_len = (offset + 2 * int_len +
const.column_data_length_offset +
i * (int_len + 8))
col_types = (offset + 2 * int_len +
const.column_type_offset + i * (int_len + 8))
x = self._read_int(col_data_offset, int_len)
self._column_data_offsets[i] = x
x = self._read_int(col_data_len, const.column_data_length_length)
self._column_data_lengths[i] = x
x = self._read_int(col_types, const.column_type_length)
if x == 1:
self.column_types[i] = b'd'
else:
self.column_types[i] = b's'
def _process_columnlist_subheader(self, offset, length):
# unknown purpose
pass
def _process_format_subheader(self, offset, length):
int_len = self._int_length
text_subheader_format = (
offset +
const.column_format_text_subheader_index_offset +
3 * int_len)
col_format_offset = (offset +
const.column_format_offset_offset +
3 * int_len)
col_format_len = (offset +
const.column_format_length_offset +
3 * int_len)
text_subheader_label = (
offset +
const.column_label_text_subheader_index_offset +
3 * int_len)
col_label_offset = (offset +
const.column_label_offset_offset +
3 * int_len)
col_label_len = offset + const.column_label_length_offset + 3 * int_len
x = self._read_int(text_subheader_format,
const.column_format_text_subheader_index_length)
format_idx = min(x, len(self.column_names_strings) - 1)
format_start = self._read_int(
col_format_offset, const.column_format_offset_length)
format_len = self._read_int(
col_format_len, const.column_format_length_length)
label_idx = self._read_int(
text_subheader_label,
const.column_label_text_subheader_index_length)
label_idx = min(label_idx, len(self.column_names_strings) - 1)
label_start = self._read_int(
col_label_offset, const.column_label_offset_length)
label_len = self._read_int(col_label_len,
const.column_label_length_length)
label_names = self.column_names_strings[label_idx]
column_label = label_names[label_start: label_start + label_len]
format_names = self.column_names_strings[format_idx]
column_format = format_names[format_start: format_start + format_len]
current_column_number = len(self.columns)
col = _column()
col.col_id = current_column_number
col.name = self.column_names[current_column_number]
col.label = column_label
col.format = column_format
col.ctype = self.column_types[current_column_number]
col.length = self._column_data_lengths[current_column_number]
self.column_formats.append(column_format)
self.columns.append(col)
def read(self, nrows=None):
if (nrows is None) and (self.chunksize is not None):
nrows = self.chunksize
elif nrows is None:
nrows = self.row_count
if len(self.column_types) == 0:
self.close()
raise EmptyDataError("No columns to parse from file")
if self._current_row_in_file_index >= self.row_count:
return None
m = self.row_count - self._current_row_in_file_index
if nrows > m:
nrows = m
nd = (self.column_types == b'd').sum()
ns = (self.column_types == b's').sum()
self._string_chunk = np.empty((ns, nrows), dtype=np.object)
self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8)
self._current_row_in_chunk_index = 0
p = Parser(self)
p.read(nrows)
rslt = self._chunk_to_dataframe()
if self.index is not None:
rslt = rslt.set_index(self.index)
return rslt
def _read_next_page(self):
self._current_page_data_subheader_pointers = []
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
return True
elif len(self._cached_page) != self._page_length:
self.close()
msg = ("failed to read complete page from file "
"(read {:d} of {:d} bytes)")
raise ValueError(msg.format(len(self._cached_page),
self._page_length))
self._read_page_header()
if self._current_page_type == const.page_meta_type:
self._process_page_metadata()
pt = [const.page_meta_type, const.page_data_type]
pt += const.page_mix_types
if self._current_page_type not in pt:
return self._read_next_page()
return False
def _chunk_to_dataframe(self):
n = self._current_row_in_chunk_index
m = self._current_row_in_file_index
ix = range(m - n, m)
rslt = pd.DataFrame(index=ix)
js, jb = 0, 0
for j in range(self.column_count):
name = self.column_names[j]
if self.column_types[j] == b'd':
rslt[name] = self._byte_chunk[jb, :].view(
dtype=self.byte_order + 'd')
rslt[name] = np.asarray(rslt[name], dtype=np.float64)
if self.convert_dates:
unit = None
if self.column_formats[j] in const.sas_date_formats:
unit = 'd'
elif self.column_formats[j] in const.sas_datetime_formats:
unit = 's'
if unit:
rslt[name] = pd.to_datetime(rslt[name], unit=unit,
origin="1960-01-01")
jb += 1
elif self.column_types[j] == b's':
rslt[name] = self._string_chunk[js, :]
if self.convert_text and (self.encoding is not None):
rslt[name] = rslt[name].str.decode(
self.encoding or self.default_encoding)
if self.blank_missing:
ii = rslt[name].str.len() == 0
rslt.loc[ii, name] = np.nan
js += 1
else:
self.close()
raise ValueError("unknown column type %s" %
self.column_types[j])
return rslt
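# Illustrative sketch, not part of the reader itself: driving the chunked
# ``read`` method above through the public pandas API.  "example.sas7bdat"
# is a hypothetical path; as coded above, ``read()`` returns None once
# ``row_count`` rows have been consumed.
def _example_chunked_sas7bdat_read():
    rdr = pd.read_sas("example.sas7bdat", chunksize=5000)
    chunks = []
    while True:
        chunk = rdr.read()        # at most 5000 rows per call
        if chunk is None:         # file exhausted
            break
        chunks.append(chunk)
    rdr.close()
    return pd.concat(chunks)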
@@ -0,0 +1,171 @@
magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")
align_1_checker_value = b'3'
align_1_offset = 32
align_1_length = 1
align_1_value = 4
u64_byte_checker_value = b'3'
align_2_offset = 35
align_2_length = 1
align_2_value = 4
endianness_offset = 37
endianness_length = 1
platform_offset = 39
platform_length = 1
encoding_offset = 70
encoding_length = 1
dataset_offset = 92
dataset_length = 64
file_type_offset = 156
file_type_length = 8
date_created_offset = 164
date_created_length = 8
date_modified_offset = 172
date_modified_length = 8
header_size_offset = 196
header_size_length = 4
page_size_offset = 200
page_size_length = 4
page_count_offset = 204
page_count_length = 4
sas_release_offset = 216
sas_release_length = 8
sas_server_type_offset = 224
sas_server_type_length = 16
os_version_number_offset = 240
os_version_number_length = 16
os_maker_offset = 256
os_maker_length = 16
os_name_offset = 272
os_name_length = 16
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24
page_type_offset = 0
page_type_length = 2
block_count_offset = 2
block_count_length = 2
subheader_count_offset = 4
subheader_count_length = 2
page_meta_type = 0
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15
column_name_pointer_length = 8
column_name_text_subheader_offset = 0
column_name_text_subheader_length = 2
column_name_offset_offset = 2
column_name_offset_length = 2
column_name_length_offset = 4
column_name_length_length = 2
column_data_offset_offset = 8
column_data_length_offset = 8
column_data_length_length = 4
column_type_offset = 14
column_type_length = 1
column_format_text_subheader_index_offset = 22
column_format_text_subheader_index_length = 2
column_format_offset_offset = 24
column_format_offset_length = 2
column_format_length_offset = 26
column_format_length_length = 2
column_label_text_subheader_index_offset = 28
column_label_text_subheader_index_length = 2
column_label_offset_offset = 30
column_label_offset_length = 2
column_label_length_offset = 32
column_label_length_length = 2
rle_compression = b'SASYZCRL'
rdc_compression = b'SASYZCR2'
compression_literals = [rle_compression, rdc_compression]
# Incomplete list of encodings, using SAS nomenclature:
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}
class SASIndex(object):
row_size_index = 0
column_size_index = 1
subheader_counts_index = 2
column_text_index = 3
column_name_index = 4
column_attributes_index = 5
format_and_label_index = 6
column_list_index = 7
data_subheader_index = 8
subheader_signature_to_index = {
b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index}
# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN",
"MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS",
"MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR",
"NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV",
"WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD",
"YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ",
"YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC",
"YYQRD", "YYQRP", "YYQRS", "YYQRN",
"YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC",
"MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN",
"YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB",
"MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS",
"MINGUO")
sas_datetime_formats = ("DATETIME", "DTWKDATX",
"B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX",
"E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX",
"DATEAMPM", "DTDATE", "DTMONYY", "DTMONYY", "DTWKDATX",
"DTYEAR", "TOD", "MDYAMPM")
@@ -0,0 +1,465 @@
"""
Read a SAS XPort format file into a Pandas DataFrame.
Based on code from Jack Cushman (github.com/jcushman/xport).
The file format is defined here:
https://support.sas.com/techsup/technote/ts140.pdf
"""
from datetime import datetime
import pandas as pd
from pandas.io.common import get_filepath_or_buffer, BaseIterator
from pandas import compat
import struct
import numpy as np
from pandas.util._decorators import Appender
import warnings
_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
"000000000000000000000000000000 ")
_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!"
"000000000000000001600000000")
_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
"000000000000000000000000000000 ")
_correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
"000000000000000000000000000000 ")
_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label',
'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
'nifl', 'nifd', 'npos', '_']
_base_params_doc = """\
Parameters
----------
filepath_or_buffer : string or file-like object
Path to SAS file or object implementing binary read method."""
_params2_doc = """\
index : identifier of index column
Identifier of column that should be used as index of the DataFrame.
encoding : string
Encoding for text data.
chunksize : int
Read file `chunksize` lines at a time, returns iterator."""
_format_params_doc = """\
format : string
File format, only `xport` is currently supported."""
_iterator_doc = """\
iterator : boolean, default False
Return XportReader object for reading file incrementally."""
_read_sas_doc = """Read a SAS file into a DataFrame.
%(_base_params_doc)s
%(_format_params_doc)s
%(_params2_doc)s
%(_iterator_doc)s
Returns
-------
DataFrame or XportReader
Examples
--------
Read a SAS Xport file:
>>> df = pandas.read_sas('filename.XPT')
Read an Xport file in 10,000 line chunks:
>>> itr = pandas.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
...     do_something(chunk)
""" % {"_base_params_doc": _base_params_doc,
"_format_params_doc": _format_params_doc,
"_params2_doc": _params2_doc,
"_iterator_doc": _iterator_doc}
_xport_reader_doc = """\
Class for reading SAS Xport files.
%(_base_params_doc)s
%(_params2_doc)s
Attributes
----------
member_info : list
Contains information about the file
fields : list
Contains information about the variables in the file
""" % {"_base_params_doc": _base_params_doc,
"_params2_doc": _params2_doc}
_read_method_doc = """\
Read observations from SAS Xport file, returning as data frame.
Parameters
----------
nrows : int
Number of rows to read from data file; if None, read whole
file.
Returns
-------
A DataFrame.
"""
def _parse_date(datestr):
""" Given a date in xport format, return Python date. """
try:
# e.g. "16FEB11:10:07:55"
return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
except ValueError:
return pd.NaT
def _split_line(s, parts):
"""
Parameters
----------
s: string
Fixed-length string to split
parts: list of (name, length) pairs
Used to break up the string; the part named '_' is dropped from the output.
Returns
-------
Dict of name:contents of string at given location.
"""
out = {}
start = 0
for name, length in parts:
out[name] = s[start:start + length].strip()
start += length
del out['_']
return out
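# Illustrative sketch (made-up field layout): _split_line cuts a fixed-width
# record into named pieces, strips the blank padding, and drops the
# throwaway '_' field.
def _example_split_line():
    line = "height".ljust(8) + " " * 4 + "1.75"
    parts = [("name", 8), ("_", 4), ("value", 4)]
    return _split_line(line, parts)   # -> {'name': 'height', 'value': '1.75'}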
def _handle_truncated_float_vec(vec, nbytes):
# This feature is not well documented, but some SAS XPORT files
# have 2-7 byte "truncated" floats. To read these truncated
# floats, pad them with zeros on the right to make 8 byte floats.
#
# References:
# https://github.com/jcushman/xport/pull/3
# The R "foreign" library
if nbytes != 8:
vec1 = np.zeros(len(vec), np.dtype('S8'))
dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
vec2 = vec1.view(dtype=dtype)
vec2['f0'] = vec
return vec2
return vec
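# Illustrative sketch (hypothetical bytes): a column stored as 4-byte
# truncated floats is right-padded with zero bytes to the full 8-byte width
# so that _parse_float_vec can treat every value uniformly.
def _example_pad_truncated_floats():
    vec = np.array([b"ABCD"], dtype="S4")        # one 4-byte field
    padded = _handle_truncated_float_vec(vec, 4)
    return padded.tobytes()                      # -> b"ABCD\x00\x00\x00\x00"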
def _parse_float_vec(vec):
"""
Parse a vector of float values representing IBM 8 byte floats into
native 8 byte floats.
"""
dtype = np.dtype('>u4,>u4')
vec1 = vec.view(dtype=dtype)
xport1 = vec1['f0']
xport2 = vec1['f1']
# Start by setting first half of ieee number to first half of IBM
# number sans exponent
ieee1 = xport1 & 0x00ffffff
# Get the second half of the ibm number into the second half of
# the ieee number
ieee2 = xport2
# The fraction bit to the left of the binary point in the ieee
# format was set and the number was shifted 0, 1, 2, or 3
# places. This will tell us how to adjust the ibm exponent to be a
# power of 2 ieee exponent and how to shift the fraction bits to
# restore the correct magnitude.
shift = np.zeros(len(vec), dtype=np.uint8)
shift[np.where(xport1 & 0x00200000)] = 1
shift[np.where(xport1 & 0x00400000)] = 2
shift[np.where(xport1 & 0x00800000)] = 3
# shift the ieee number down the correct number of places then
# set the second half of the ieee number to be the second half
# of the ibm number shifted appropriately, ored with the bits
# from the first half that would have been shifted in if we
# could shift a double. All we are worried about are the low
# order 3 bits of the first half since we're only shifting by
# 1, 2, or 3.
ieee1 >>= shift
ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
# clear the 1 bit to the left of the binary point
ieee1 &= 0xffefffff
# set the exponent of the ieee number to be the actual exponent
# plus the shift count + 1023. Or this into the first half of the
# ieee number. The ibm exponent is excess 64 but is adjusted by 65
# since during conversion to ibm format the exponent is
# incremented by 1 and the fraction bits left 4 positions to the
# right of the radix point. (had to add >> 24 because C treats &
# 0x7f as 0x7f000000 and Python doesn't)
ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +
shift + 1023) << 20) | (xport1 & 0x80000000)
ieee = np.empty((len(ieee1),), dtype='>u4,>u4')
ieee['f0'] = ieee1
ieee['f1'] = ieee2
ieee = ieee.view(dtype='>f8')
ieee = ieee.astype('f8')
return ieee
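# Worked example (single value): the IBM representation of 1.0 is the
# excess-64 exponent byte 0x41 (16 ** 1) followed by the fraction
# 0x10000000000000 (1/16), so the eight bytes below should decode to the
# IEEE double 1.0.
def _example_parse_ibm_float():
    vec = np.frombuffer(b"\x41\x10\x00\x00\x00\x00\x00\x00", dtype="S8")
    return _parse_float_vec(vec)   # -> array([1.])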
class XportReader(BaseIterator):
__doc__ = _xport_reader_doc
def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
chunksize=None):
self._encoding = encoding
self._lines_read = 0
self._index = index
self._chunksize = chunksize
if isinstance(filepath_or_buffer, str):
(filepath_or_buffer, encoding,
compression, should_close) = get_filepath_or_buffer(
filepath_or_buffer, encoding=encoding)
if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
else:
# Copy to BytesIO, and ensure no encoding
contents = filepath_or_buffer.read()
try:
contents = contents.encode(self._encoding)
except (AttributeError, UnicodeError):
# contents is already bytes, or cannot be re-encoded; keep it as-is
pass
self.filepath_or_buffer = compat.BytesIO(contents)
self._read_header()
def close(self):
self.filepath_or_buffer.close()
def _get_row(self):
return self.filepath_or_buffer.read(80).decode()
def _read_header(self):
self.filepath_or_buffer.seek(0)
# read file header
line1 = self._get_row()
if line1 != _correct_line1:
self.close()
raise ValueError("Header record is not an XPORT file.")
line2 = self._get_row()
fif = [['prefix', 24], ['version', 8], ['OS', 8],
['_', 24], ['created', 16]]
file_info = _split_line(line2, fif)
if file_info['prefix'] != "SAS     SAS     SASLIB":
self.close()
raise ValueError("Header record has invalid prefix.")
file_info['created'] = _parse_date(file_info['created'])
self.file_info = file_info
line3 = self._get_row()
file_info['modified'] = _parse_date(line3[:16])
# read member header
header1 = self._get_row()
header2 = self._get_row()
headflag1 = header1.startswith(_correct_header1)
headflag2 = (header2 == _correct_header2)
if not (headflag1 and headflag2):
self.close()
raise ValueError("Member header not found")
# usually 140, could be 135
fieldnamelength = int(header1[-5:-2])
# member info
mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
member_info = _split_line(self._get_row(), mem)
mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
member_info.update(_split_line(self._get_row(), mem))
member_info['modified'] = _parse_date(member_info['modified'])
member_info['created'] = _parse_date(member_info['created'])
self.member_info = member_info
# read field names
types = {1: 'numeric', 2: 'char'}
fieldcount = int(self._get_row()[54:58])
datalength = fieldnamelength * fieldcount
# round up to nearest 80
if datalength % 80:
datalength += 80 - datalength % 80
fielddata = self.filepath_or_buffer.read(datalength)
fields = []
obs_length = 0
while len(fielddata) >= fieldnamelength:
# pull data for one field
field, fielddata = (fielddata[:fieldnamelength],
fielddata[fieldnamelength:])
# rest at end gets ignored, so if field is short, pad out
# to match struct pattern below
field = field.ljust(140)
fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field)
field = dict(zip(_fieldkeys, fieldstruct))
del field['_']
field['ntype'] = types[field['ntype']]
fl = field['field_length']
if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
self.close()
msg = "Floating field width {0} is not between 2 and 8."
raise TypeError(msg.format(fl))
for k, v in field.items():
try:
field[k] = v.strip()
except AttributeError:
pass
obs_length += field['field_length']
fields += [field]
header = self._get_row()
if header != _correct_obs_header:
self.close()
raise ValueError("Observation header not found.")
self.fields = fields
self.record_length = obs_length
self.record_start = self.filepath_or_buffer.tell()
self.nobs = self._record_count()
self.columns = [x['name'].decode() for x in self.fields]
# Setup the dtype.
dtypel = []
for i, field in enumerate(self.fields):
dtypel.append(('s' + str(i), "S" + str(field['field_length'])))
dtype = np.dtype(dtypel)
self._dtype = dtype
def __next__(self):
return self.read(nrows=self._chunksize or 1)
def _record_count(self):
"""
Get number of records in file.
This is maybe suboptimal because we have to seek to the end of
the file.
Side effect: returns file position to record_start.
"""
self.filepath_or_buffer.seek(0, 2)
total_records_length = (self.filepath_or_buffer.tell() -
self.record_start)
if total_records_length % 80 != 0:
warnings.warn("xport file may be corrupted")
if self.record_length > 80:
self.filepath_or_buffer.seek(self.record_start)
return total_records_length // self.record_length
self.filepath_or_buffer.seek(-80, 2)
last_card = self.filepath_or_buffer.read(80)
last_card = np.frombuffer(last_card, dtype=np.uint64)
# 8 byte blank
ix = np.flatnonzero(last_card == 2314885530818453536)
if len(ix) == 0:
tail_pad = 0
else:
tail_pad = 8 * len(ix)
self.filepath_or_buffer.seek(self.record_start)
return (total_records_length - tail_pad) // self.record_length
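# Worked example of the padding heuristic above (hypothetical numbers,
# assuming no data word is itself eight blanks): with record_length == 40
# and three observations, the data occupy 120 bytes but the file is
# blank-padded to two full 80-byte cards (160 bytes).  The last card then
# ends in five all-blank 8-byte words, so tail_pad == 40 and
# (160 - 40) // 40 == 3 records are reported.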
def get_chunk(self, size=None):
"""
Reads lines from Xport file and returns as dataframe
Parameters
----------
size : int, defaults to None
Number of lines to read. If None, reads whole file.
Returns
-------
DataFrame
"""
if size is None:
size = self._chunksize
return self.read(nrows=size)
def _missing_double(self, vec):
v = vec.view(dtype='u1,u1,u2,u4')
miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
(v['f0'] == 0x5f) | (v['f0'] == 0x2e))
miss &= miss1
return miss
@Appender(_read_method_doc)
def read(self, nrows=None):
if nrows is None:
nrows = self.nobs
read_lines = min(nrows, self.nobs - self._lines_read)
read_len = read_lines * self.record_length
if read_len <= 0:
self.close()
raise StopIteration
raw = self.filepath_or_buffer.read(read_len)
data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
df = pd.DataFrame(index=range(read_lines))
for j, x in enumerate(self.columns):
vec = data['s%d' % j]
ntype = self.fields[j]['ntype']
if ntype == "numeric":
vec = _handle_truncated_float_vec(
vec, self.fields[j]['field_length'])
miss = self._missing_double(vec)
v = _parse_float_vec(vec)
v[miss] = np.nan
elif self.fields[j]['ntype'] == 'char':
v = [y.rstrip() for y in vec]
if compat.PY3:
if self._encoding is not None:
v = [y.decode(self._encoding) for y in v]
df[x] = v
if self._index is None:
df.index = range(self._lines_read, self._lines_read + read_lines)
else:
df = df.set_index(self._index)
self._lines_read += read_lines
return df
@@ -0,0 +1,70 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas import compat
from pandas.io.common import _stringify_path
def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
chunksize=None, iterator=False):
"""
Read SAS files stored as either XPORT or SAS7BDAT format files.
Parameters
----------
filepath_or_buffer : string or file-like object
Path to the SAS file.
format : string {'xport', 'sas7bdat'} or None
If None, file format is inferred. If 'xport' or 'sas7bdat',
uses the corresponding format.
index : identifier of index column, defaults to None
Identifier of column that should be used as index of the DataFrame.
encoding : string, default is None
Encoding for text data. If None, text data are stored as raw bytes.
chunksize : int
Read file `chunksize` lines at a time, returns iterator.
iterator : bool, defaults to False
If True, returns an iterator for reading the file incrementally.
Returns
-------
DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
or XportReader
"""
if format is None:
buffer_error_msg = ("If this is a buffer object rather "
"than a string name, you must specify "
"a format string")
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, compat.string_types):
raise ValueError(buffer_error_msg)
fname = filepath_or_buffer.lower()
if fname.endswith(".xpt"):
format = "xport"
elif fname.endswith(".sas7bdat"):
format = "sas7bdat"
else:
raise ValueError("unable to infer format of SAS file")
if format.lower() == 'xport':
from pandas.io.sas.sas_xport import XportReader
reader = XportReader(filepath_or_buffer, index=index,
encoding=encoding,
chunksize=chunksize)
elif format.lower() == 'sas7bdat':
from pandas.io.sas.sas7bdat import SAS7BDATReader
reader = SAS7BDATReader(filepath_or_buffer, index=index,
encoding=encoding,
chunksize=chunksize)
else:
raise ValueError('unknown SAS format')
if iterator or chunksize:
return reader
data = reader.read()
reader.close()
return data
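# Illustrative usage sketch ("airline.sas7bdat" and "airline.xpt" are
# hypothetical file names): the format is inferred from the extension, and
# passing ``chunksize`` returns the underlying reader instead of a DataFrame.
def _example_read_sas_usage():
    import pandas as pd   # local import; this module only imports pandas.compat
    df = pd.read_sas("airline.sas7bdat", encoding="latin-1")
    rdr = pd.read_sas("airline.xpt", chunksize=1000)
    pieces = [chunk for chunk in rdr]   # each chunk is a <=1000-row DataFrame
    rdr.close()
    return df, pd.concat(pieces)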
File diff suppressed because it is too large
File diff suppressed because it is too large