pruned venvs
Binary files not shown.
@@ -1,20 +0,0 @@
"""
Data IO api
"""

# flake8: noqa

from pandas.io.clipboards import read_clipboard
from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
from pandas.io.feather_format import read_feather
from pandas.io.gbq import read_gbq
from pandas.io.html import read_html
from pandas.io.json import read_json
from pandas.io.packers import read_msgpack, to_msgpack
from pandas.io.parquet import read_parquet
from pandas.io.parsers import read_csv, read_fwf, read_table
from pandas.io.pickle import read_pickle, to_pickle
from pandas.io.pytables import HDFStore, read_hdf
from pandas.io.sas import read_sas
from pandas.io.sql import read_sql, read_sql_query, read_sql_table
from pandas.io.stata import read_stata
@@ -1,125 +0,0 @@
"""
Pyperclip

A cross-platform clipboard module for Python. (only handles plain text for now)
By Al Sweigart al@inventwithpython.com
BSD License

Usage:
  import pyperclip
  pyperclip.copy('The text to be copied to the clipboard.')
  spam = pyperclip.paste()

  if not pyperclip.copy:
    print("Copy functionality unavailable!")

On Windows, no additional modules are needed.
On Mac, the module uses pbcopy and pbpaste, which should come with the os.
On Linux, install xclip or xsel via package manager. For example, in Debian:
sudo apt-get install xclip

Otherwise on Linux, you will need the gtk, qtpy or PyQt modules installed.
qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2

gtk and PyQt4 modules are not available for Python 3,
and this module does not work with PyGObject yet.
"""
__version__ = '1.5.27'

import platform
import os
import subprocess
from .clipboards import (init_osx_clipboard,
                         init_gtk_clipboard, init_qt_clipboard,
                         init_xclip_clipboard, init_xsel_clipboard,
                         init_klipper_clipboard, init_no_clipboard)
from .windows import init_windows_clipboard

# `import qtpy` sys.exit()s if DISPLAY is not in the environment.
# Thus, we need to detect the presence of $DISPLAY manually
# and not load qtpy if it is absent.
HAS_DISPLAY = os.getenv("DISPLAY", False)
CHECK_CMD = "where" if platform.system() == "Windows" else "which"


def _executable_exists(name):
    return subprocess.call([CHECK_CMD, name],
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0


def determine_clipboard():
    # Determine the OS/platform and set
    # the copy() and paste() functions accordingly.
    if 'cygwin' in platform.system().lower():
        # FIXME: pyperclip currently does not support Cygwin,
        # see https://github.com/asweigart/pyperclip/issues/55
        pass
    elif os.name == 'nt' or platform.system() == 'Windows':
        return init_windows_clipboard()
    if os.name == 'mac' or platform.system() == 'Darwin':
        return init_osx_clipboard()
    if HAS_DISPLAY:
        # Determine which command/module is installed, if any.
        try:
            # Check if gtk is installed
            import gtk  # noqa
        except ImportError:
            pass
        else:
            return init_gtk_clipboard()

        try:
            # qtpy is a small abstraction layer that lets you write
            # applications using a single api call to either PyQt or PySide
            # https://pypi.org/project/QtPy
            import qtpy  # noqa
        except ImportError:
            # If qtpy isn't installed, fall back on importing PyQt5, or PyQt4
            try:
                import PyQt5  # noqa
            except ImportError:
                try:
                    import PyQt4  # noqa
                except ImportError:
                    pass  # fail fast for all non-ImportError exceptions.
                else:
                    return init_qt_clipboard()
            else:
                return init_qt_clipboard()
            pass
        else:
            return init_qt_clipboard()

        if _executable_exists("xclip"):
            return init_xclip_clipboard()
        if _executable_exists("xsel"):
            return init_xsel_clipboard()
        if _executable_exists("klipper") and _executable_exists("qdbus"):
            return init_klipper_clipboard()

    return init_no_clipboard()


def set_clipboard(clipboard):
    global copy, paste

    clipboard_types = {'osx': init_osx_clipboard,
                       'gtk': init_gtk_clipboard,
                       'qt': init_qt_clipboard,
                       'xclip': init_xclip_clipboard,
                       'xsel': init_xsel_clipboard,
                       'klipper': init_klipper_clipboard,
                       'windows': init_windows_clipboard,
                       'no': init_no_clipboard}

    copy, paste = clipboard_types[clipboard]()


copy, paste = determine_clipboard()

__all__ = ["copy", "paste"]


# pandas aliases
clipboard_get = paste
clipboard_set = copy
Binary files not shown.
@@ -1,145 +0,0 @@
import subprocess

from pandas.compat import PY2, text_type

from .exceptions import PyperclipException

EXCEPT_MSG = """
Pyperclip could not find a copy/paste mechanism for your system.
For more information, please visit https://pyperclip.readthedocs.org """


def init_osx_clipboard():
    def copy_osx(text):
        p = subprocess.Popen(['pbcopy', 'w'],
                             stdin=subprocess.PIPE, close_fds=True)
        p.communicate(input=text.encode('utf-8'))

    def paste_osx():
        p = subprocess.Popen(['pbpaste', 'r'],
                             stdout=subprocess.PIPE, close_fds=True)
        stdout, stderr = p.communicate()
        return stdout.decode('utf-8')

    return copy_osx, paste_osx


def init_gtk_clipboard():
    import gtk

    def copy_gtk(text):
        global cb
        cb = gtk.Clipboard()
        cb.set_text(text)
        cb.store()

    def paste_gtk():
        clipboardContents = gtk.Clipboard().wait_for_text()
        # for python 2, returns None if the clipboard is blank.
        if clipboardContents is None:
            return ''
        else:
            return clipboardContents

    return copy_gtk, paste_gtk


def init_qt_clipboard():
    # $DISPLAY should exist

    # Try to import from qtpy, but if that fails try PyQt5 then PyQt4
    try:
        from qtpy.QtWidgets import QApplication
    except ImportError:
        try:
            from PyQt5.QtWidgets import QApplication
        except ImportError:
            from PyQt4.QtGui import QApplication

    app = QApplication.instance()
    if app is None:
        app = QApplication([])

    def copy_qt(text):
        cb = app.clipboard()
        cb.setText(text)

    def paste_qt():
        cb = app.clipboard()
        return text_type(cb.text())

    return copy_qt, paste_qt


def init_xclip_clipboard():
    def copy_xclip(text):
        p = subprocess.Popen(['xclip', '-selection', 'c'],
                             stdin=subprocess.PIPE, close_fds=True)
        p.communicate(input=text.encode('utf-8'))

    def paste_xclip():
        p = subprocess.Popen(['xclip', '-selection', 'c', '-o'],
                             stdout=subprocess.PIPE, close_fds=True)
        stdout, stderr = p.communicate()
        return stdout.decode('utf-8')

    return copy_xclip, paste_xclip


def init_xsel_clipboard():
    def copy_xsel(text):
        p = subprocess.Popen(['xsel', '-b', '-i'],
                             stdin=subprocess.PIPE, close_fds=True)
        p.communicate(input=text.encode('utf-8'))

    def paste_xsel():
        p = subprocess.Popen(['xsel', '-b', '-o'],
                             stdout=subprocess.PIPE, close_fds=True)
        stdout, stderr = p.communicate()
        return stdout.decode('utf-8')

    return copy_xsel, paste_xsel


def init_klipper_clipboard():
    def copy_klipper(text):
        p = subprocess.Popen(
            ['qdbus', 'org.kde.klipper', '/klipper', 'setClipboardContents',
             text.encode('utf-8')],
            stdin=subprocess.PIPE, close_fds=True)
        p.communicate(input=None)

    def paste_klipper():
        p = subprocess.Popen(
            ['qdbus', 'org.kde.klipper', '/klipper', 'getClipboardContents'],
            stdout=subprocess.PIPE, close_fds=True)
        stdout, stderr = p.communicate()

        # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
        # TODO: https://github.com/asweigart/pyperclip/issues/43
        clipboardContents = stdout.decode('utf-8')
        # even if blank, Klipper will append a newline at the end
        assert len(clipboardContents) > 0
        # make sure that newline is there
        assert clipboardContents.endswith('\n')
        if clipboardContents.endswith('\n'):
            clipboardContents = clipboardContents[:-1]
        return clipboardContents

    return copy_klipper, paste_klipper


def init_no_clipboard():
    class ClipboardUnavailable(object):

        def __call__(self, *args, **kwargs):
            raise PyperclipException(EXCEPT_MSG)

        if PY2:
            def __nonzero__(self):
                return False
        else:
            def __bool__(self):
                return False

    return ClipboardUnavailable(), ClipboardUnavailable()
@@ -1,12 +0,0 @@
import ctypes


class PyperclipException(RuntimeError):
    pass


class PyperclipWindowsException(PyperclipException):

    def __init__(self, message):
        message += " ({err})".format(err=ctypes.WinError())
        super(PyperclipWindowsException, self).__init__(message)
@@ -1,154 +0,0 @@
"""
This module implements clipboard handling on Windows using ctypes.
"""
import contextlib
import ctypes
from ctypes import c_size_t, c_wchar, c_wchar_p, get_errno, sizeof
import time

from .exceptions import PyperclipWindowsException


class CheckedCall(object):

    def __init__(self, f):
        super(CheckedCall, self).__setattr__("f", f)

    def __call__(self, *args):
        ret = self.f(*args)
        if not ret and get_errno():
            raise PyperclipWindowsException("Error calling " + self.f.__name__)
        return ret

    def __setattr__(self, key, value):
        setattr(self.f, key, value)


def init_windows_clipboard():
    from ctypes.wintypes import (HGLOBAL, LPVOID, DWORD, LPCSTR, INT, HWND,
                                 HINSTANCE, HMENU, BOOL, UINT, HANDLE)

    windll = ctypes.windll

    safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
    safeCreateWindowExA.argtypes = [DWORD, LPCSTR, LPCSTR, DWORD, INT, INT,
                                    INT, INT, HWND, HMENU, HINSTANCE, LPVOID]
    safeCreateWindowExA.restype = HWND

    safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
    safeDestroyWindow.argtypes = [HWND]
    safeDestroyWindow.restype = BOOL

    OpenClipboard = windll.user32.OpenClipboard
    OpenClipboard.argtypes = [HWND]
    OpenClipboard.restype = BOOL

    safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
    safeCloseClipboard.argtypes = []
    safeCloseClipboard.restype = BOOL

    safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
    safeEmptyClipboard.argtypes = []
    safeEmptyClipboard.restype = BOOL

    safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
    safeGetClipboardData.argtypes = [UINT]
    safeGetClipboardData.restype = HANDLE

    safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
    safeSetClipboardData.argtypes = [UINT, HANDLE]
    safeSetClipboardData.restype = HANDLE

    safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
    safeGlobalAlloc.argtypes = [UINT, c_size_t]
    safeGlobalAlloc.restype = HGLOBAL

    safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
    safeGlobalLock.argtypes = [HGLOBAL]
    safeGlobalLock.restype = LPVOID

    safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
    safeGlobalUnlock.argtypes = [HGLOBAL]
    safeGlobalUnlock.restype = BOOL

    GMEM_MOVEABLE = 0x0002
    CF_UNICODETEXT = 13

    @contextlib.contextmanager
    def window():
        """
        Context that provides a valid Windows hwnd.
        """
        # we really just need the hwnd, so setting "STATIC"
        # as predefined lpClass is just fine.
        hwnd = safeCreateWindowExA(0, b"STATIC", None, 0, 0, 0, 0, 0,
                                   None, None, None, None)
        try:
            yield hwnd
        finally:
            safeDestroyWindow(hwnd)

    @contextlib.contextmanager
    def clipboard(hwnd):
        """
        Context manager that opens the clipboard and prevents
        other applications from modifying the clipboard content.
        """
        # We may not get the clipboard handle immediately because
        # some other application is accessing it (?)
        # We try for at least 500ms to get the clipboard.
        t = time.time() + 0.5
        success = False
        while time.time() < t:
            success = OpenClipboard(hwnd)
            if success:
                break
            time.sleep(0.01)
        if not success:
            raise PyperclipWindowsException("Error calling OpenClipboard")

        try:
            yield
        finally:
            safeCloseClipboard()

    def copy_windows(text):
        # This function is heavily based on
        # http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard
        with window() as hwnd:
            # http://msdn.com/ms649048
            # If an application calls OpenClipboard with hwnd set to NULL,
            # EmptyClipboard sets the clipboard owner to NULL;
            # this causes SetClipboardData to fail.
            # => We need a valid hwnd to copy something.
            with clipboard(hwnd):
                safeEmptyClipboard()

                if text:
                    # http://msdn.com/ms649051
                    # If the hMem parameter identifies a memory object,
                    # the object must have been allocated using the
                    # function with the GMEM_MOVEABLE flag.
                    count = len(text) + 1
                    handle = safeGlobalAlloc(GMEM_MOVEABLE,
                                             count * sizeof(c_wchar))
                    locked_handle = safeGlobalLock(handle)

                    ctypes.memmove(c_wchar_p(locked_handle),
                                   c_wchar_p(text), count * sizeof(c_wchar))

                    safeGlobalUnlock(handle)
                    safeSetClipboardData(CF_UNICODETEXT, handle)

    def paste_windows():
        with clipboard(None):
            handle = safeGetClipboardData(CF_UNICODETEXT)
            if not handle:
                # GetClipboardData may return NULL with errno == NO_ERROR
                # if the clipboard is empty.
                # (Also, it may return a handle to an empty buffer,
                # but technically that's not empty)
                return ""
            return c_wchar_p(handle).value

    return copy_windows, paste_windows
@@ -1,145 +0,0 @@
""" io on the clipboard """
import warnings

import pandas.compat as compat
from pandas.compat import PY2, PY3, StringIO

from pandas.core.dtypes.generic import ABCDataFrame

from pandas import get_option, option_context


def read_clipboard(sep=r'\s+', **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_csv. See read_csv for the
    full argument list

    Parameters
    ----------
    sep : str, default '\s+'
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    Returns
    -------
    parsed : DataFrame
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise NotImplementedError(
            'reading from clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_csv
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if PY3:
        try:
            text = compat.bytes_to_str(
                text, encoding=(kwargs.get('encoding') or
                                get_option('display.encoding'))
            )
        except AttributeError:
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more than the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_csv
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4

    counts = {x.lstrip().count('\t') for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = '\t'

    # Edge case where sep is specified to be None, return to default
    if sep is None and kwargs.get('delim_whitespace') is None:
        sep = r'\s+'

    # Regex separator currently only works with python engine.
    # Default to python if separator is multi-character (regex)
    if len(sep) > 1 and kwargs.get('engine') is None:
        kwargs['engine'] = 'python'
    elif len(sep) > 1 and kwargs.get('engine') == 'c':
        warnings.warn('read_clipboard with regex separator does not work'
                      ' properly with c engine')

    # In PY2, the c table reader first encodes text with UTF-8 but Python
    # table reader uses the format of the passed string. For consistency,
    # encode strings for python engine so that output from python and c
    # engines produce consistent results
    if kwargs.get('engine') == 'python' and PY2:
        text = text.encode('utf-8')

    return read_csv(StringIO(text), sep=sep, **kwargs)


def to_clipboard(obj, excel=True, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
            if True, use the provided separator, writing in a csv
            format for allowing easy pasting into excel.
            if False, write a string representation of the object
            to the clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with gtk or PyQt4 modules)
      - Windows:
      - OS X:
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # testing if an invalid encoding is passed to clipboard
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise ValueError('clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_set
    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = '\t'
            buf = StringIO()
            # clipboard_set (pyperclip) expects unicode
            obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)
            text = buf.getvalue()
            if PY2:
                text = text.decode('utf-8')
            clipboard_set(text)
            return
        except TypeError:
            warnings.warn('to_clipboard in excel mode requires a single '
                          'character separator.')
    elif sep is not None:
        warnings.warn('to_clipboard with excel=False ignores the sep argument')

    if isinstance(obj, ABCDataFrame):
        # str(df) has various unhelpful defaults, like truncation
        with option_context('display.max_colwidth', 999999):
            objstr = obj.to_string(**kwargs)
    else:
        objstr = str(obj)
    clipboard_set(objstr)
@@ -1,617 +0,0 @@
"""Common IO api utilities"""

import codecs
from contextlib import closing, contextmanager
import csv
import mmap
import os
import zipfile

import pandas.compat as compat
from pandas.compat import BytesIO, StringIO, string_types, text_type
from pandas.errors import (  # noqa
    AbstractMethodError, DtypeWarning, EmptyDataError, ParserError,
    ParserWarning)

from pandas.core.dtypes.common import is_file_like, is_number

from pandas.io.formats.printing import pprint_thing

# gh-12665: Alias for now and remove later.
CParserError = ParserError

# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
_NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A',
              'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan',
              '-nan', ''}


if compat.PY3:
    from urllib.request import urlopen, pathname2url
    _urlopen = urlopen
    from urllib.parse import urlparse as parse_url
    from urllib.parse import (uses_relative, uses_netloc, uses_params,
                              urlencode, urljoin)
    from urllib.error import URLError
    from http.client import HTTPException  # noqa
else:
    from urllib2 import urlopen as _urlopen
    from urllib import urlencode, pathname2url  # noqa
    from urlparse import urlparse as parse_url
    from urlparse import uses_relative, uses_netloc, uses_params, urljoin
    from urllib2 import URLError  # noqa
    from httplib import HTTPException  # noqa
    from contextlib import contextmanager, closing  # noqa
    from functools import wraps  # noqa

    # @wraps(_urlopen)
    @contextmanager
    def urlopen(*args, **kwargs):
        with closing(_urlopen(*args, **kwargs)) as f:
            yield f


_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')


class BaseIterator(object):
    """Subclass this and provide a "__next__()" method to obtain an iterator.
    Useful only when the object being iterated is non-reusable (e.g. OK for a
    parser, not for an in-memory table, yes for its iterator)."""

    def __iter__(self):
        return self

    def __next__(self):
        raise AbstractMethodError(self)


if not compat.PY3:
    BaseIterator.next = lambda self: self.__next__()


def _is_url(url):
    """Check to see if a URL has a valid protocol.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        If `url` has a valid protocol return True otherwise False.
    """
    try:
        return parse_url(url).scheme in _VALID_URLS
    except Exception:
        return False


def _expand_user(filepath_or_buffer):
    """Return the argument with an initial component of ~ or ~user
    replaced by that user's home directory.

    Parameters
    ----------
    filepath_or_buffer : object to be converted if possible

    Returns
    -------
    expanded_filepath_or_buffer : an expanded filepath or the
                                  input if not expandable
    """
    if isinstance(filepath_or_buffer, string_types):
        return os.path.expanduser(filepath_or_buffer)
    return filepath_or_buffer


def _validate_header_arg(header):
    if isinstance(header, bool):
        raise TypeError("Passing a bool to header is invalid. "
                        "Use header=None for no header or "
                        "header=int or list-like of ints to specify "
                        "the row(s) making up the column names")


def _stringify_path(filepath_or_buffer):
    """Attempt to convert a path-like object to a string.

    Parameters
    ----------
    filepath_or_buffer : object to be converted

    Returns
    -------
    str_filepath_or_buffer : maybe a string version of the object

    Notes
    -----
    Objects supporting the fspath protocol (python 3.6+) are coerced
    according to its __fspath__ method.

    For backwards compatibility with older pythons, pathlib.Path and
    py.path objects are specially coerced.

    Any other object is passed through unchanged, which includes bytes,
    strings, buffers, or anything else that's not even path-like.
    """
    try:
        import pathlib
        _PATHLIB_INSTALLED = True
    except ImportError:
        _PATHLIB_INSTALLED = False

    try:
        from py.path import local as LocalPath
        _PY_PATH_INSTALLED = True
    except ImportError:
        _PY_PATH_INSTALLED = False

    if hasattr(filepath_or_buffer, '__fspath__'):
        return filepath_or_buffer.__fspath__()
    if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path):
        return text_type(filepath_or_buffer)
    if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath):
        return filepath_or_buffer.strpath
    return _expand_user(filepath_or_buffer)


def is_s3_url(url):
    """Check for an s3, s3n, or s3a url"""
    try:
        return parse_url(url).scheme in ['s3', 's3n', 's3a']
    except Exception:
        return False


def is_gcs_url(url):
    """Check for a gcs url"""
    try:
        return parse_url(url).scheme in ['gcs', 'gs']
    except Exception:
        return False


def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
    mode : str, optional

    Returns
    -------
    tuple of ({a filepath_ or buffer or S3File instance},
              encoding, str,
              compression, str,
              should_close, bool)
    """
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if _is_url(filepath_or_buffer):
        req = _urlopen(filepath_or_buffer)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        req.close()
        return reader, encoding, compression, True

    if is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression,
                                         mode=mode)

    if is_gcs_url(filepath_or_buffer):
        from pandas.io import gcs
        return gcs.get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=compression,
                                          mode=mode)

    if isinstance(filepath_or_buffer, (compat.string_types,
                                       compat.binary_type,
                                       mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression, False


def file_path_to_url(path):
    """
    converts an absolute native path to a FILE URL.

    Parameters
    ----------
    path : a path in native format

    Returns
    -------
    a valid FILE URL
    """
    return urljoin('file:', pathname2url(path))


_compression_to_extension = {
    'gzip': '.gz',
    'bz2': '.bz2',
    'zip': '.zip',
    'xz': '.xz',
}


def _infer_compression(filepath_or_buffer, compression):
    """
    Get the compression method for filepath_or_buffer. If compression='infer',
    the inferred compression method is returned. Otherwise, the input
    compression method is returned unchanged, unless it's invalid, in which
    case an error is raised.

    Parameters
    ----------
    filepath_or_buffer :
        a path (str) or buffer
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
        If 'infer' and `filepath_or_buffer` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip',
        or '.xz' (otherwise no compression).

    Returns
    -------
    string or None :
        compression method

    Raises
    ------
    ValueError on invalid compression specified
    """

    # No compression has been explicitly specified
    if compression is None:
        return None

    # Infer compression
    if compression == 'infer':
        # Convert all path types (e.g. pathlib.Path) to strings
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            # Cannot infer compression of a buffer, assume no compression
            return None

        # Infer compression from the filename/URL extension
        for compression, extension in _compression_to_extension.items():
            if filepath_or_buffer.endswith(extension):
                return compression
        return None

    # Compression has been specified. Check that it's valid
    if compression in _compression_to_extension:
        return compression

    msg = 'Unrecognized compression type: {}'.format(compression)
    valid = ['infer', None] + sorted(_compression_to_extension)
    msg += '\nValid compression types are {}'.format(valid)
    raise ValueError(msg)


def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                memory_map=False, is_text=True):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf :
        a path (str) or buffer
    mode : str
        mode to open path_or_buf with
    encoding : str or None
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
        If 'infer' and `filepath_or_buffer` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip',
        or '.xz' (otherwise no compression).
    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.)

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like objects
        A list of file-like object that were opened in this function.
    """
    try:
        from s3fs import S3File
        need_text_wrapping = (BytesIO, S3File)
    except ImportError:
        need_text_wrapping = (BytesIO,)

    handles = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, compat.string_types)

    if is_path:
        compression = _infer_compression(path_or_buf, compression)

    if compression:

        if compat.PY2 and not is_path and encoding:
            msg = 'compression with encoding is not yet supported in Python 2'
            raise ValueError(msg)

        # GZ Compression
        if compression == 'gzip':
            import gzip
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == 'bz2':
            import bz2
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            elif compat.PY2:
                # Python 2's bz2 module can't take file objects, so have to
                # run through decompress manually
                f = StringIO(bz2.decompress(path_or_buf.read()))
                path_or_buf.close()
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == 'zip':
            zf = BytesZipFile(path_or_buf, mode)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == 'w':
                f = zf
            elif zf.mode == 'r':
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError('Zero files found in ZIP file {}'
                                     .format(path_or_buf))
                else:
                    raise ValueError('Multiple files found in ZIP file.'
                                     ' Only one file per ZIP: {}'
                                     .format(zip_names))

        # XZ Compression
        elif compression == 'xz':
            lzma = compat.import_lzma()
            f = lzma.LZMAFile(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = 'Unrecognized compression type: {}'.format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if compat.PY2:
            # Python 2
            mode = "wb" if mode == "w" else mode
            f = open(path_or_buf, mode)
        elif encoding:
            # Python 3 and encoding
            f = open(path_or_buf, mode, encoding=encoding, newline="")
        elif is_text:
            # Python 3 and no explicit encoding
            f = open(path_or_buf, mode, errors='replace', newline="")
        else:
            # Python 3 and binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # in Python 3, convert BytesIO or fileobjects passed with an encoding
    if (compat.PY3 and is_text and
            (compression or isinstance(f, need_text_wrapping))):
        from io import TextIOWrapper
        f = TextIOWrapper(f, encoding=encoding)
        handles.append(f)

    if memory_map and hasattr(f, 'fileno'):
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles


class BytesZipFile(zipfile.ZipFile, BytesIO):
    """
    Wrapper for standard library class ZipFile and allow the returned file-like
    handle to accept byte strings via `write` method.

    BytesIO provides attributes of file-like object and ZipFile.writestr writes
    bytes strings into a member of the archive.
    """
    # GH 17778
    def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
        if mode in ['wb', 'rb']:
            mode = mode.replace('b', '')
        super(BytesZipFile, self).__init__(file, mode, compression, **kwargs)

    def write(self, data):
        super(BytesZipFile, self).writestr(self.filename, data)

    @property
    def closed(self):
        return self.fp is None


class MMapWrapper(BaseIterator):
    """
    Wrapper for the Python's mmap class so that it can be properly read in
    by Python's csv.reader class.

    Parameters
    ----------
    f : file object
        File object to be mapped onto memory. Must support the 'fileno'
        method or have an equivalent attribute

    """

    def __init__(self, f):
        self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

    def __getattr__(self, name):
        return getattr(self.mmap, name)

    def __iter__(self):
        return self

    def __next__(self):
        newline = self.mmap.readline()

        # readline returns bytes, not str, in Python 3,
        # but Python's CSV reader expects str, so convert
        # the output to str before continuing
        if compat.PY3:
            newline = compat.bytes_to_str(newline)

        # mmap doesn't raise if reading past the allocated
        # data but instead returns an empty string, so raise
        # if that is returned
        if newline == '':
            raise StopIteration
        return newline


if not compat.PY3:
    MMapWrapper.next = lambda self: self.__next__()


class UTF8Recoder(BaseIterator):

    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8
    """

    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def read(self, bytes=-1):
        return self.reader.read(bytes).encode("utf-8")

    def readline(self):
        return self.reader.readline().encode("utf-8")

    def next(self):
        return next(self.reader).encode("utf-8")


if compat.PY3:  # pragma: no cover
    def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
        # ignore encoding
        return csv.reader(f, dialect=dialect, **kwds)

    def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
        return csv.writer(f, dialect=dialect, **kwds)
else:
    class UnicodeReader(BaseIterator):

        """
        A CSV reader which will iterate over lines in the CSV file "f",
        which is encoded in the given encoding.

        On Python 3, this is replaced (below) by csv.reader, which handles
        unicode.
        """

        def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
            f = UTF8Recoder(f, encoding)
            self.reader = csv.reader(f, dialect=dialect, **kwds)

        def __next__(self):
            row = next(self.reader)
            return [compat.text_type(s, "utf-8") for s in row]

    class UnicodeWriter(object):

        """
        A CSV writer which will write rows to CSV file "f",
        which is encoded in the given encoding.
        """

        def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
            # Redirect output to a queue
            self.queue = StringIO()
            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
            self.stream = f
            self.encoder = codecs.getincrementalencoder(encoding)()
            self.quoting = kwds.get("quoting", None)

        def writerow(self, row):
            def _check_as_is(x):
                return (self.quoting == csv.QUOTE_NONNUMERIC and
                        is_number(x)) or isinstance(x, str)

            row = [x if _check_as_is(x)
                   else pprint_thing(x).encode("utf-8") for x in row]

            self.writer.writerow([s for s in row])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and re-encode it into the target encoding
            data = self.encoder.encode(data)
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)

        def writerows(self, rows):
            def _check_as_is(x):
                return (self.quoting == csv.QUOTE_NONNUMERIC and
                        is_number(x)) or isinstance(x, str)

            for i, row in enumerate(rows):
                rows[i] = [x if _check_as_is(x)
                           else pprint_thing(x).encode("utf-8") for x in row]

            self.writer.writerows([[s for s in row] for row in rows])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and re-encode it into the target encoding
            data = self.encoder.encode(data)
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)
@@ -1,64 +0,0 @@
"""This module is designed for community supported date conversion functions"""
import numpy as np

from pandas._libs.tslibs import parsing
from pandas.compat import map, range


def parse_date_time(date_col, time_col):
    date_col = _maybe_cast(date_col)
    time_col = _maybe_cast(time_col)
    return parsing.try_parse_date_and_time(date_col, time_col)


def parse_date_fields(year_col, month_col, day_col):
    year_col = _maybe_cast(year_col)
    month_col = _maybe_cast(month_col)
    day_col = _maybe_cast(day_col)
    return parsing.try_parse_year_month_day(year_col, month_col, day_col)


def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col,
                     second_col):
    year_col = _maybe_cast(year_col)
    month_col = _maybe_cast(month_col)
    day_col = _maybe_cast(day_col)
    hour_col = _maybe_cast(hour_col)
    minute_col = _maybe_cast(minute_col)
    second_col = _maybe_cast(second_col)
    return parsing.try_parse_datetime_components(year_col, month_col, day_col,
                                                 hour_col, minute_col,
                                                 second_col)


def generic_parser(parse_func, *cols):
    N = _check_columns(cols)
    results = np.empty(N, dtype=object)

    for i in range(N):
        args = [c[i] for c in cols]
        results[i] = parse_func(*args)

    return results


def _maybe_cast(arr):
    if not arr.dtype.type == np.object_:
        arr = np.array(arr, dtype=object)
    return arr


def _check_columns(cols):
    if not len(cols):
        raise AssertionError("There must be at least 1 column")

    head, tail = cols[0], cols[1:]

    N = len(head)

    for i, n in enumerate(map(len, tail)):
        if n != N:
            raise AssertionError('All columns must have the same length: {0}; '
                                 'column {1} has length {2}'.format(N, i, n))

    return N
File diff suppressed because it is too large
@@ -1,127 +0,0 @@
""" feather-format compat """

from distutils.version import LooseVersion

from pandas.compat import range
from pandas.util._decorators import deprecate_kwarg

from pandas import DataFrame, Int64Index, RangeIndex

from pandas.io.common import _stringify_path


def _try_import():
    # since pandas is a dependency of pyarrow
    # we need to import on first use
    try:
        import pyarrow
        from pyarrow import feather
    except ImportError:
        # give a nice error message
        raise ImportError("pyarrow is not installed\n\n"
                          "you can install via conda\n"
                          "conda install pyarrow -c conda-forge\n"
                          "or via pip\n"
                          "pip install -U pyarrow\n")

    if LooseVersion(pyarrow.__version__) < LooseVersion('0.9.0'):
        raise ImportError("pyarrow >= 0.9.0 required for feather support\n\n"
                          "you can install via conda\n"
                          "conda install pyarrow -c conda-forge"
                          "or via pip\n"
                          "pip install -U pyarrow\n")

    return feather, pyarrow


def to_feather(df, path):
    """
    Write a DataFrame to the feather-format

    Parameters
    ----------
    df : DataFrame
    path : string file path, or file-like object

    """
    path = _stringify_path(path)
    if not isinstance(df, DataFrame):
        raise ValueError("feather only support IO with DataFrames")

    feather = _try_import()[0]
    valid_types = {'string', 'unicode'}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        raise ValueError("feather does not support serializing {} "
                         "for the index; you can .reset_index()"
                         "to make the index into column(s)".format(
                             type(df.index)))

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError("feather does not support serializing a "
                         "non-default index for the index; you "
                         "can .reset_index() to make the index "
                         "into column(s)")

    if df.index.name is not None:
        raise ValueError("feather does not serialize index meta-data on a "
                         "default index")

    # validate columns
    # ----------------

    # must have value column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    feather.write_feather(df, path)


@deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads')
def read_feather(path, columns=None, use_threads=True):
    """
    Load a feather-format object from the file path

    .. versionadded 0.20.0

    Parameters
    ----------
    path : string file path, or file-like object
    columns : sequence, default None
        If not provided, all columns are read

        .. versionadded 0.24.0
    nthreads : int, default 1
        Number of CPU threads to use when reading to pandas.DataFrame

       .. versionadded 0.21.0
       .. deprecated 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads

       .. versionadded 0.24.0

    Returns
    -------
    type of object stored in file

    """

    feather, pyarrow = _try_import()
    path = _stringify_path(path)

    if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
        int_use_threads = int(use_threads)
        if int_use_threads < 1:
            int_use_threads = 1
        return feather.read_feather(path, columns=columns,
                                    nthreads=int_use_threads)

    return feather.read_feather(path, columns=columns,
                                use_threads=bool(use_threads))
Binary files not shown.
@@ -1,159 +0,0 @@
"""
Internal module for console introspection
"""

import locale
import sys

from pandas.io.formats.terminal import get_terminal_size

# -----------------------------------------------------------------------------
# Global formatting options
_initial_defencoding = None


def detect_console_encoding():
    """
    Try to find the most capable encoding supported by the console.
    slightly modified from the way IPython handles the same issue.
    """
    global _initial_defencoding

    encoding = None
    try:
        encoding = sys.stdout.encoding or sys.stdin.encoding
    except (AttributeError, IOError):
        pass

    # try again for something better
    if not encoding or 'ascii' in encoding.lower():
        try:
            encoding = locale.getpreferredencoding()
        except Exception:
            pass

    # when all else fails. this will usually be "ascii"
    if not encoding or 'ascii' in encoding.lower():
        encoding = sys.getdefaultencoding()

    # GH3360, save the reported defencoding at import time
    # MPL backends may change it. Make available for debugging.
    if not _initial_defencoding:
        _initial_defencoding = sys.getdefaultencoding()

    return encoding


def get_console_size():
    """Return console size as tuple = (width, height).

    Returns (None,None) in non-interactive session.
    """
    from pandas import get_option

    display_width = get_option('display.width')
    # deprecated.
    display_height = get_option('display.max_rows')

    # Consider
    # interactive shell terminal, can detect term size
    # interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term
    # size non-interactive script, should disregard term size

    # in addition
    # width,height have default values, but setting to 'None' signals
    # should use Auto-Detection, But only in interactive shell-terminal.
    # Simple. yeah.

    if in_interactive_session():
        if in_ipython_frontend():
            # sane defaults for interactive non-shell terminal
            # match default for width,height in config_init
            from pandas.core.config import get_default_val
            terminal_width = get_default_val('display.width')
            terminal_height = get_default_val('display.max_rows')
        else:
            # pure terminal
            terminal_width, terminal_height = get_terminal_size()
    else:
        terminal_width, terminal_height = None, None

    # Note if the User sets width/Height to None (auto-detection)
    # and we're in a script (non-inter), this will return (None,None)
    # caller needs to deal.
    return (display_width or terminal_width, display_height or terminal_height)


# ----------------------------------------------------------------------
# Detect our environment

def in_interactive_session():
    """ check if we're running in an interactive shell

    returns True if running under python/ipython interactive shell
    """
    from pandas import get_option

    def check_main():
        try:
            import __main__ as main
        except ModuleNotFoundError:
            return get_option('mode.sim_interactive')
        return (not hasattr(main, '__file__') or
                get_option('mode.sim_interactive'))

    try:
        return __IPYTHON__ or check_main()  # noqa
    except NameError:
        return check_main()


def in_qtconsole():
    """
    check if we're inside an IPython qtconsole

    .. deprecated:: 0.14.1
       This is no longer needed, or working, in IPython 3 and above.
    """
    try:
        ip = get_ipython()  # noqa
        front_end = (
            ip.config.get('KernelApp', {}).get('parent_appname', "") or
            ip.config.get('IPKernelApp', {}).get('parent_appname', ""))
        if 'qtconsole' in front_end.lower():
            return True
    except NameError:
        return False
    return False


def in_ipnb():
    """
    check if we're inside an IPython Notebook

    .. deprecated:: 0.14.1
       This is no longer needed, or working, in IPython 3 and above.
    """
    try:
        ip = get_ipython()  # noqa
        front_end = (
            ip.config.get('KernelApp', {}).get('parent_appname', "") or
            ip.config.get('IPKernelApp', {}).get('parent_appname', ""))
        if 'notebook' in front_end.lower():
            return True
    except NameError:
        return False
    return False


def in_ipython_frontend():
    """
    check if we're inside an IPython zmq frontend
    """
    try:
        ip = get_ipython()  # noqa
        return 'zmq' in str(type(ip)).lower()
    except NameError:
        pass

    return False
@@ -1,250 +0,0 @@
|
||||
"""Utilities for interpreting CSS from Stylers for formatting non-HTML outputs
|
||||
"""
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
|
||||
class CSSWarning(UserWarning):
|
||||
"""This CSS syntax cannot currently be parsed"""
|
||||
pass
|
||||
|
||||
|
||||
class CSSResolver(object):
|
||||
"""A callable for parsing and resolving CSS to atomic properties
|
||||
|
||||
"""
|
||||
|
||||
INITIAL_STYLE = {
|
||||
}
|
||||
|
||||
def __call__(self, declarations_str, inherited=None):
|
||||
""" the given declarations to atomic properties
|
||||
|
||||
Parameters
|
||||
----------
|
||||
declarations_str : str
|
||||
A list of CSS declarations
|
||||
inherited : dict, optional
|
||||
Atomic properties indicating the inherited style context in which
|
||||
declarations_str is to be resolved. ``inherited`` should already
|
||||
        be resolved, i.e. valid output of this method.

        Returns
        -------
        props : dict
            Atomic CSS 2.2 properties

        Examples
        --------
        >>> resolve = CSSResolver()
        >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'}
        >>> out = resolve('''
        ...               border-color: BLUE RED;
        ...               font-size: 1em;
        ...               font-size: 2em;
        ...               font-weight: normal;
        ...               font-weight: inherit;
        ...               ''', inherited)
        >>> sorted(out.items())  # doctest: +NORMALIZE_WHITESPACE
        [('border-bottom-color', 'blue'),
         ('border-left-color', 'red'),
         ('border-right-color', 'red'),
         ('border-top-color', 'blue'),
         ('font-family', 'serif'),
         ('font-size', '24pt'),
         ('font-weight', 'bold')]
        """

        props = dict(self.atomize(self.parse(declarations_str)))
        if inherited is None:
            inherited = {}

        # 1. resolve inherited, initial
        for prop, val in inherited.items():
            if prop not in props:
                props[prop] = val

        for prop, val in list(props.items()):
            if val == 'inherit':
                val = inherited.get(prop, 'initial')
            if val == 'initial':
                val = self.INITIAL_STYLE.get(prop)

            if val is None:
                # we do not define a complete initial stylesheet
                del props[prop]
            else:
                props[prop] = val

        # 2. resolve relative font size
        if props.get('font-size'):
            if 'font-size' in inherited:
                em_pt = inherited['font-size']
                assert em_pt[-2:] == 'pt'
                em_pt = float(em_pt[:-2])
            else:
                em_pt = None
            props['font-size'] = self.size_to_pt(
                props['font-size'], em_pt, conversions=self.FONT_SIZE_RATIOS)

            font_size = float(props['font-size'][:-2])
        else:
            font_size = None

        # 3. TODO: resolve other font-relative units
        for side in self.SIDES:
            prop = 'border-{side}-width'.format(side=side)
            if prop in props:
                props[prop] = self.size_to_pt(
                    props[prop], em_pt=font_size,
                    conversions=self.BORDER_WIDTH_RATIOS)
            for prop in ['margin-{side}'.format(side=side),
                         'padding-{side}'.format(side=side)]:
                if prop in props:
                    # TODO: support %
                    props[prop] = self.size_to_pt(
                        props[prop], em_pt=font_size,
                        conversions=self.MARGIN_RATIOS)

        return props

    UNIT_RATIOS = {
        'rem': ('pt', 12),
        'ex': ('em', .5),
        # 'ch':
        'px': ('pt', .75),
        'pc': ('pt', 12),
        'in': ('pt', 72),
        'cm': ('in', 1 / 2.54),
        'mm': ('in', 1 / 25.4),
        'q': ('mm', .25),
        '!!default': ('em', 0),
    }

    FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
    FONT_SIZE_RATIOS.update({
        '%': ('em', .01),
        'xx-small': ('rem', .5),
        'x-small': ('rem', .625),
        'small': ('rem', .8),
        'medium': ('rem', 1),
        'large': ('rem', 1.125),
        'x-large': ('rem', 1.5),
        'xx-large': ('rem', 2),
        'smaller': ('em', 1 / 1.2),
        'larger': ('em', 1.2),
        '!!default': ('em', 1),
    })

    MARGIN_RATIOS = UNIT_RATIOS.copy()
    MARGIN_RATIOS.update({
        'none': ('pt', 0),
    })

    BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
    BORDER_WIDTH_RATIOS.update({
        'none': ('pt', 0),
        'thick': ('px', 4),
        'medium': ('px', 2),
        'thin': ('px', 1),
        # Default: medium only if solid
    })

    def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS):
        def _error():
            warnings.warn('Unhandled size: {val!r}'.format(val=in_val),
                          CSSWarning)
            return self.size_to_pt('1!!default', conversions=conversions)

        try:
            val, unit = re.match(r'^(\S*?)([a-zA-Z%!].*)', in_val).groups()
        except AttributeError:
            return _error()
        if val == '':
            # hack for 'large' etc.
            val = 1
        else:
            try:
                val = float(val)
            except ValueError:
                return _error()

        while unit != 'pt':
            if unit == 'em':
                if em_pt is None:
                    unit = 'rem'
                else:
                    val *= em_pt
                    unit = 'pt'
                continue

            try:
                unit, mul = conversions[unit]
            except KeyError:
                return _error()
            val *= mul

        val = round(val, 5)
        if int(val) == val:
            size_fmt = '{fmt:d}pt'.format(fmt=int(val))
        else:
            size_fmt = '{fmt:f}pt'.format(fmt=val)
        return size_fmt

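    # --- Editor's note (illustrative sketch, not part of the original file).
    # How size_to_pt chains the ratio tables above until a value lands in
    # 'pt'; assumes the CSSResolver class defined in this module:
    #
    #     >>> resolve = CSSResolver()
    #     >>> resolve.size_to_pt('1in')               # 1 * 72
    #     '72pt'
    #     >>> resolve.size_to_pt('1.5em', em_pt=12)   # 1.5 * 12
    #     '18pt'
    #     >>> resolve.size_to_pt('medium',
    #     ...                    conversions=CSSResolver.FONT_SIZE_RATIOS)
    #     '12pt'
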
    def atomize(self, declarations):
        for prop, value in declarations:
            attr = 'expand_' + prop.replace('-', '_')
            try:
                expand = getattr(self, attr)
            except AttributeError:
                yield prop, value
            else:
                for prop, value in expand(prop, value):
                    yield prop, value

    SIDE_SHORTHANDS = {
        1: [0, 0, 0, 0],
        2: [0, 1, 0, 1],
        3: [0, 1, 2, 1],
        4: [0, 1, 2, 3],
    }
    SIDES = ('top', 'right', 'bottom', 'left')

    def _side_expander(prop_fmt):
        def expand(self, prop, value):
            tokens = value.split()
            try:
                mapping = self.SIDE_SHORTHANDS[len(tokens)]
            except KeyError:
                warnings.warn('Could not expand "{prop}: {val}"'
                              .format(prop=prop, val=value), CSSWarning)
                return
            for key, idx in zip(self.SIDES, mapping):
                yield prop_fmt.format(key), tokens[idx]

        return expand

    expand_border_color = _side_expander('border-{:s}-color')
    expand_border_style = _side_expander('border-{:s}-style')
    expand_border_width = _side_expander('border-{:s}-width')
    expand_margin = _side_expander('margin-{:s}')
    expand_padding = _side_expander('padding-{:s}')

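    # --- Editor's note (illustrative sketch, not part of the original file).
    # The expanders above fan shorthands out per the CSS side rules encoded
    # in SIDE_SHORTHANDS: one token applies to all four sides, two tokens
    # give the first to (top, bottom) and the second to (right, left), etc.:
    #
    #     >>> resolve = CSSResolver()
    #     >>> list(resolve.expand_margin('margin', '1pt 2pt'))
    ...     # doctest: +NORMALIZE_WHITESPACE
    #     [('margin-top', '1pt'), ('margin-right', '2pt'),
    #      ('margin-bottom', '1pt'), ('margin-left', '2pt')]
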
    def parse(self, declarations_str):
        """Generates (prop, value) pairs from declarations

        In a future version may generate parsed tokens from tinycss/tinycss2
        """
        for decl in declarations_str.split(';'):
            if not decl.strip():
                continue
            prop, sep, val = decl.partition(':')
            prop = prop.strip().lower()
            # TODO: don't lowercase case sensitive parts of values (strings)
            val = val.strip().lower()
            if sep:
                yield prop, val
            else:
                warnings.warn('Ill-formatted attribute: expected a colon '
                              'in {decl!r}'.format(decl=decl), CSSWarning)
@@ -1,315 +0,0 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data into CSV files.
"""

from __future__ import print_function

import csv as csvlib
import os
import warnings
from zipfile import ZipFile

import numpy as np

from pandas._libs import writers as libwriters
from pandas.compat import StringIO, range, zip

from pandas.core.dtypes.generic import (
    ABCDatetimeIndex, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex)
from pandas.core.dtypes.missing import notna

from pandas import compat

from pandas.io.common import (
    UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer)


class CSVFormatter(object):

    def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                 float_format=None, cols=None, header=True, index=True,
                 index_label=None, mode='w', nanRep=None, encoding=None,
                 compression='infer', quoting=None, line_terminator='\n',
                 chunksize=None, tupleize_cols=False, quotechar='"',
                 date_format=None, doublequote=True, escapechar=None,
                 decimal='.'):

        self.obj = obj

        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf, _, _, _ = get_filepath_or_buffer(
            path_or_buf, encoding=encoding, compression=compression, mode=mode
        )
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        if encoding is None:
            encoding = 'ascii' if compat.PY2 else 'utf-8'
        self.encoding = encoding
        self.compression = _infer_compression(self.path_or_buf, compression)

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        self.line_terminator = line_terminator or os.linesep

        self.date_format = date_format

        self.tupleize_cols = tupleize_cols
        self.has_mi_columns = (isinstance(obj.columns, ABCMultiIndex) and
                               not self.tupleize_cols)

        # validate mi options
        if self.has_mi_columns:
            if cols is not None:
                raise TypeError("cannot specify cols with a MultiIndex on the "
                                "columns")

        if cols is not None:
            if isinstance(cols, ABCIndexClass):
                cols = cols.to_native_types(na_rep=na_rep,
                                            float_format=float_format,
                                            date_format=date_format,
                                            quoting=self.quoting)
            else:
                cols = list(cols)
            self.obj = self.obj.loc[:, cols]

        # update columns to include possible multiplicity of dupes
        # and make sure cols is just a list of labels
        cols = self.obj.columns
        if isinstance(cols, ABCIndexClass):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)

        # save it
        self.cols = cols

        # preallocate data 2d list
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        if chunksize is None:
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = int(chunksize)

        self.data_index = obj.index
        if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and
                date_format is not None):
            from pandas import Index
            self.data_index = Index([x.strftime(date_format) if notna(x) else
                                     '' for x in self.data_index])

        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0

    def save(self):
        """
        Create the writer & save
        """
        # GH21227 internal compression is not used when a file-like object
        # is passed.
        if self.compression and hasattr(self.path_or_buf, 'write'):
            msg = ("compression has no effect when passing file-like "
                   "object as input.")
            warnings.warn(msg, RuntimeWarning, stacklevel=2)

        # handle the case where zip compression is requested
        is_zip = isinstance(self.path_or_buf, ZipFile) or (
            not hasattr(self.path_or_buf, 'write')
            and self.compression == 'zip')

        if is_zip:
            # zipfile doesn't support writing strings directly to an
            # archive: buffer the csv output in memory, then dump it into
            # the zip file handle. GH21241, GH21118
            f = StringIO()
            close = False
        elif hasattr(self.path_or_buf, 'write'):
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(self.path_or_buf, self.mode,
                                     encoding=self.encoding,
                                     compression=self.compression)
            close = True

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if self.encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = self.encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if is_zip:
                # GH17778 handles zip compression separately.
                buf = f.getvalue()
                if hasattr(self.path_or_buf, 'write'):
                    self.path_or_buf.write(buf)
                else:
                    f, handles = _get_handle(self.path_or_buf, self.mode,
                                             encoding=self.encoding,
                                             compression=self.compression)
                    f.write(buf)
                    close = True
            if close:
                f.close()
                for _fh in handles:
                    _fh.close()

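    # --- Editor's note (illustrative sketch, not part of the original file).
    # The zip branch above is what runs when a user requests zip compression
    # through the public to_csv API (the file name is just an example):
    #
    #     >>> import pandas as pd
    #     >>> df = pd.DataFrame({'a': [1, 2]})
    #     >>> df.to_csv('out.csv.zip', compression='zip')
    #     # the csv text is buffered in memory, then written to the archive
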
    def _save_header(self):

        writer = self.writer
        obj = self.obj
        index_label = self.index_label
        cols = self.cols
        has_mi_columns = self.has_mi_columns
        header = self.header
        encoded_labels = []

        has_aliases = isinstance(header, (tuple, list, np.ndarray,
                                          ABCIndexClass))
        if not (has_aliases or self.header):
            return
        if has_aliases:
            if len(header) != len(cols):
                raise ValueError(('Writing {ncols} cols but got {nalias} '
                                  'aliases'.format(ncols=len(cols),
                                                   nalias=len(header))))
            else:
                write_cols = header
        else:
            write_cols = cols

        if self.index:
            # should write something for index label
            if index_label is not False:
                if index_label is None:
                    if isinstance(obj.index, ABCMultiIndex):
                        index_label = []
                        for i, name in enumerate(obj.index.names):
                            if name is None:
                                name = ''
                            index_label.append(name)
                    else:
                        index_label = obj.index.name
                        if index_label is None:
                            index_label = ['']
                        else:
                            index_label = [index_label]
                elif not isinstance(index_label,
                                    (list, tuple, np.ndarray, ABCIndexClass)):
                    # given a string for a DF with Index
                    index_label = [index_label]

                encoded_labels = list(index_label)
        else:
            encoded_labels = []

        if not has_mi_columns or has_aliases:
            encoded_labels += list(write_cols)
            writer.writerow(encoded_labels)
        else:
            # write out the mi
            columns = obj.columns

            # write out the names for each level, then ALL of the values for
            # each level
            for i in range(columns.nlevels):

                # we need at least 1 index column to write our col names
                col_line = []
                if self.index:

                    # name is the first column
                    col_line.append(columns.names[i])

                    if isinstance(index_label, list) and len(index_label) > 1:
                        col_line.extend([''] * (len(index_label) - 1))

                col_line.extend(columns._get_level_values(i))

                writer.writerow(col_line)

            # Write out the index line if it's not empty.
            # Otherwise, we will print out an extraneous
            # blank line between the mi and the data rows.
            if encoded_labels and set(encoded_labels) != {''}:
                encoded_labels.extend([''] * len(columns))
                writer.writerow(encoded_labels)

    def _save(self):

        self._save_header()

        nrows = len(self.data_index)

        # write in chunksize bites
        chunksize = self.chunksize
        chunks = int(nrows / chunksize) + 1

        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            if start_i >= end_i:
                break

            self._save_chunk(start_i, end_i)

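    # --- Editor's note (worked example, not part of the original file).
    # With the default chunksize of 100000 // ncols, a frame of 250,000 rows
    # and 5 columns is written in 13 slices of at most 20,000 rows:
    #
    #     >>> nrows, chunksize = 250000, 100000 // 5
    #     >>> bounds = [(i * chunksize, min((i + 1) * chunksize, nrows))
    #     ...           for i in range(int(nrows / chunksize) + 1)]
    #     >>> bounds[0], bounds[-1], len(bounds)
    #     ((0, 20000), (240000, 250000), 13)
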
    def _save_chunk(self, start_i, end_i):

        data_index = self.data_index

        # create the data for a chunk
        slicer = slice(start_i, end_i)
        for i in range(len(self.blocks)):
            b = self.blocks[i]
            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                  float_format=self.float_format,
                                  decimal=self.decimal,
                                  date_format=self.date_format,
                                  quoting=self.quoting)

            for col_loc, col in zip(b.mgr_locs, d):
                # self.data is a preallocated list
                self.data[col_loc] = col

        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                        float_format=self.float_format,
                                        decimal=self.decimal,
                                        date_format=self.date_format,
                                        quoting=self.quoting)

        libwriters.write_csv_rows(self.data, ix, self.nlevels,
                                  self.cols, self.writer)
@@ -1,664 +0,0 @@
"""Utilities for conversion to writer-agnostic Excel representation
"""

import itertools
import re
import warnings

import numpy as np

from pandas.compat import reduce

from pandas.core.dtypes import missing
from pandas.core.dtypes.common import is_float, is_scalar
from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex

from pandas import Index
import pandas.core.common as com

from pandas.io.formats.css import CSSResolver, CSSWarning
from pandas.io.formats.format import get_level_lengths
from pandas.io.formats.printing import pprint_thing


class ExcelCell(object):
    __fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend')
    __slots__ = __fields__

    def __init__(self, row, col, val, style=None, mergestart=None,
                 mergeend=None):
        self.row = row
        self.col = col
        self.val = val
        self.style = style
        self.mergestart = mergestart
        self.mergeend = mergeend


class CSSToExcelConverter(object):
    """A callable for converting CSS declarations to ExcelWriter styles

    Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
    focusing on font styling, backgrounds, borders and alignment.

    Operates by first computing CSS styles in a fairly generic
    way (see :meth:`compute_css`) then determining Excel style
    properties from CSS properties (see :meth:`build_xlstyle`).

    Parameters
    ----------
    inherited : str, optional
        CSS declarations understood to be the containing scope for the
        CSS processed by :meth:`__call__`.
    """
    # NB: Most of the methods here could be classmethods, as only __init__
    # and __call__ make use of instance attributes.  We leave them as
    # instancemethods so that users can easily experiment with extensions
    # without monkey-patching.

    def __init__(self, inherited=None):
        if inherited is not None:
            inherited = self.compute_css(inherited,
                                         self.compute_css.INITIAL_STYLE)

        self.inherited = inherited

    compute_css = CSSResolver()

    def __call__(self, declarations_str):
        """Convert CSS declarations to ExcelWriter style

        Parameters
        ----------
        declarations_str : str
            List of CSS declarations.
            e.g. "font-weight: bold; background: blue"

        Returns
        -------
        xlstyle : dict
            A style as interpreted by ExcelWriter when found in
            ExcelCell.style.
        """
        # TODO: memoize?
        properties = self.compute_css(declarations_str, self.inherited)
        return self.build_xlstyle(properties)

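    # --- Editor's note (illustrative sketch, not part of the original file).
    # The converter as a callable, assuming the classes defined in this
    # module; build_xlstyle (below) prunes any nested keys that resolve to
    # None:
    #
    #     >>> converter = CSSToExcelConverter()
    #     >>> converter("font-weight: bold; color: red")
    #     {'font': {'bold': True, 'color': 'FF0000'}}
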
    def build_xlstyle(self, props):
        out = {
            'alignment': self.build_alignment(props),
            'border': self.build_border(props),
            'fill': self.build_fill(props),
            'font': self.build_font(props),
            'number_format': self.build_number_format(props),
        }
        # TODO: handle cell width and height: needs support in pandas.io.excel

        def remove_none(d):
            """Remove key where value is None, through nested dicts"""
            for k, v in list(d.items()):
                if v is None:
                    del d[k]
                elif isinstance(v, dict):
                    remove_none(v)
                    if not v:
                        del d[k]

        remove_none(out)
        return out

    VERTICAL_MAP = {
        'top': 'top',
        'text-top': 'top',
        'middle': 'center',
        'baseline': 'bottom',
        'bottom': 'bottom',
        'text-bottom': 'bottom',
        # OpenXML also has 'justify', 'distributed'
    }

    def build_alignment(self, props):
        # TODO: text-indent, padding-left -> alignment.indent
        return {'horizontal': props.get('text-align'),
                'vertical': self.VERTICAL_MAP.get(props.get('vertical-align')),
                'wrap_text': (None if props.get('white-space') is None else
                              props['white-space'] not in
                              ('nowrap', 'pre', 'pre-line'))
                }

    def build_border(self, props):
        return {side: {
            'style': self._border_style(props.get('border-{side}-style'
                                                  .format(side=side)),
                                        props.get('border-{side}-width'
                                                  .format(side=side))),
            'color': self.color_to_excel(
                props.get('border-{side}-color'.format(side=side))),
        } for side in ['top', 'right', 'bottom', 'left']}

    def _border_style(self, style, width):
        # convert styles and widths to openxml, one of:
        #     'dashDot'
        #     'dashDotDot'
        #     'dashed'
        #     'dotted'
        #     'double'
        #     'hair'
        #     'medium'
        #     'mediumDashDot'
        #     'mediumDashDotDot'
        #     'mediumDashed'
        #     'slantDashDot'
        #     'thick'
        #     'thin'
        if width is None and style is None:
            return None
        if style == 'none' or style == 'hidden':
            return None

        if width is None:
            width = '2pt'
        width = float(width[:-2])
        if width < 1e-5:
            return None
        elif width < 1.3:
            width_name = 'thin'
        elif width < 2.8:
            width_name = 'medium'
        else:
            width_name = 'thick'

        if style in (None, 'groove', 'ridge', 'inset', 'outset'):
            # not handled
            style = 'solid'

        if style == 'double':
            return 'double'
        if style == 'solid':
            return width_name
        if style == 'dotted':
            if width_name in ('hair', 'thin'):
                return 'dotted'
            return 'mediumDashDotDot'
        if style == 'dashed':
            if width_name in ('hair', 'thin'):
                return 'dashed'
            return 'mediumDashed'

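    # --- Editor's note (illustrative sketch, not part of the original file).
    # CSS border widths collapse into the three OpenXML weights at the 1.3pt
    # and 2.8pt cut-offs above, then the dash pattern picks the final name:
    #
    #     >>> conv = CSSToExcelConverter()
    #     >>> conv._border_style('solid', '1pt')    # < 1.3pt
    #     'thin'
    #     >>> conv._border_style('dashed', '4pt')   # >= 2.8pt
    #     'mediumDashed'
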
    def build_fill(self, props):
        # TODO: perhaps allow for special properties
        #       -excel-pattern-bgcolor and -excel-pattern-type
        fill_color = props.get('background-color')
        if fill_color not in (None, 'transparent', 'none'):
            return {
                'fgColor': self.color_to_excel(fill_color),
                'patternType': 'solid',
            }

    BOLD_MAP = {'bold': True, 'bolder': True, '600': True, '700': True,
                '800': True, '900': True,
                'normal': False, 'lighter': False, '100': False, '200': False,
                '300': False, '400': False, '500': False}
    ITALIC_MAP = {'normal': False, 'italic': True, 'oblique': True}

    def build_font(self, props):
        size = props.get('font-size')
        if size is not None:
            assert size.endswith('pt')
            size = float(size[:-2])

        font_names_tmp = re.findall(r'''(?x)
            (
            "(?:[^"]|\\")+"
            |
            '(?:[^']|\\')+'
            |
            [^'",]+
            )(?=,|\s*$)
        ''', props.get('font-family', ''))
        font_names = []
        for name in font_names_tmp:
            if name[:1] == '"':
                name = name[1:-1].replace('\\"', '"')
            elif name[:1] == '\'':
                name = name[1:-1].replace('\\\'', '\'')
            else:
                name = name.strip()
            if name:
                font_names.append(name)

        family = None
        for name in font_names:
            if name == 'serif':
                family = 1  # roman
                break
            elif name == 'sans-serif':
                family = 2  # swiss
                break
            elif name == 'cursive':
                family = 4  # script
                break
            elif name == 'fantasy':
                family = 5  # decorative
                break

        decoration = props.get('text-decoration')
        if decoration is not None:
            decoration = decoration.split()
        else:
            decoration = ()

        return {
            'name': font_names[0] if font_names else None,
            'family': family,
            'size': size,
            'bold': self.BOLD_MAP.get(props.get('font-weight')),
            'italic': self.ITALIC_MAP.get(props.get('font-style')),
            'underline': ('single' if
                          'underline' in decoration
                          else None),
            'strike': ('line-through' in decoration) or None,
            'color': self.color_to_excel(props.get('color')),
            # shadow if nonzero digit before shadow color
            'shadow': (bool(re.search('^[^#(]*[1-9]',
                                      props['text-shadow']))
                       if 'text-shadow' in props else None),
            # 'vertAlign':,
            # 'charset': ,
            # 'scheme': ,
            # 'outline': ,
            # 'condense': ,
        }

    NAMED_COLORS = {
        'maroon': '800000',
        'brown': 'A52A2A',
        'red': 'FF0000',
        'pink': 'FFC0CB',
        'orange': 'FFA500',
        'yellow': 'FFFF00',
        'olive': '808000',
        'green': '008000',
        'purple': '800080',
        'fuchsia': 'FF00FF',
        'lime': '00FF00',
        'teal': '008080',
        'aqua': '00FFFF',
        'blue': '0000FF',
        'navy': '000080',
        'black': '000000',
        'gray': '808080',
        'grey': '808080',
        'silver': 'C0C0C0',
        'white': 'FFFFFF',
    }

    def color_to_excel(self, val):
        if val is None:
            return None
        if val.startswith('#') and len(val) == 7:
            return val[1:].upper()
        if val.startswith('#') and len(val) == 4:
            return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper()
        try:
            return self.NAMED_COLORS[val]
        except KeyError:
            warnings.warn('Unhandled color format: {val!r}'.format(val=val),
                          CSSWarning)

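    # --- Editor's note (illustrative sketch, not part of the original file).
    # Colour conversion above: 6-digit hex passes through, 3-digit hex
    # doubles each channel, and names are looked up in NAMED_COLORS:
    #
    #     >>> conv = CSSToExcelConverter()
    #     >>> conv.color_to_excel('#f0e')
    #     'FF00EE'
    #     >>> conv.color_to_excel('teal')
    #     '008080'
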
    def build_number_format(self, props):
        return {'format_code': props.get('number-format')}


class ExcelFormatter(object):
    """
    Class for formatting a DataFrame to a list of ExcelCells.

    Parameters
    ----------
    df : DataFrame or Styler
    na_rep : string, default ''
        Missing data representation
    float_format : string, default None
        Format string for floating point numbers
    cols : sequence, optional
        Columns to write
    header : boolean or list of string, default True
        Write out column names. If a list of string is given it is
        assumed to be aliases for the column names
    index : boolean, default True
        output row names (index)
    index_label : string or sequence, default None
        Column label for index column(s) if desired. If None is given, and
        `header` and `index` are True, then the index names are used. A
        sequence should be given if the DataFrame uses MultiIndex.
    merge_cells : boolean, default False
        Format MultiIndex and Hierarchical Rows as merged cells.
    inf_rep : string, default `'inf'`
        representation for np.inf values (which aren't representable in Excel)
        A `'-'` sign will be added in front of -inf.
    style_converter : callable, optional
        This translates Styler styles (CSS) into ExcelWriter styles.
        Defaults to ``CSSToExcelConverter()``.
        It should have signature css_declarations string -> excel style.
        This is only called for body cells.
    """

    def __init__(self, df, na_rep='', float_format=None, cols=None,
                 header=True, index=True, index_label=None, merge_cells=False,
                 inf_rep='inf', style_converter=None):
        self.rowcounter = 0
        self.na_rep = na_rep
        if hasattr(df, 'render'):
            self.styler = df
            df = df.data
            if style_converter is None:
                style_converter = CSSToExcelConverter()
            self.style_converter = style_converter
        else:
            self.styler = None
        self.df = df
        if cols is not None:

            # all missing, raise
            if not len(Index(cols) & df.columns):
                raise KeyError(
                    "passed columns are not ALL present in dataframe")

            # deprecated in gh-17295
            # 1 missing is ok (for now)
            if len(Index(cols) & df.columns) != len(cols):
                warnings.warn(
                    "Not all names specified in 'columns' are found; "
                    "this will raise a KeyError in the future",
                    FutureWarning)

            self.df = df.reindex(columns=cols)
        self.columns = self.df.columns
        self.float_format = float_format
        self.index = index
        self.index_label = index_label
        self.header = header
        self.merge_cells = merge_cells
        self.inf_rep = inf_rep

    @property
    def header_style(self):
        return {"font": {"bold": True},
                "borders": {"top": "thin",
                            "right": "thin",
                            "bottom": "thin",
                            "left": "thin"},
                "alignment": {"horizontal": "center",
                              "vertical": "top"}}

    def _format_value(self, val):
        if is_scalar(val) and missing.isna(val):
            val = self.na_rep
        elif is_float(val):
            if missing.isposinf_scalar(val):
                val = self.inf_rep
            elif missing.isneginf_scalar(val):
                val = '-{inf}'.format(inf=self.inf_rep)
            elif self.float_format is not None:
                val = float(self.float_format % val)
        return val

    def _format_header_mi(self):
        if self.columns.nlevels > 1:
            if not self.index:
                raise NotImplementedError("Writing to Excel with MultiIndex"
                                          " columns and no index "
                                          "('index'=False) is not yet "
                                          "implemented.")

        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
        if not (has_aliases or self.header):
            return

        columns = self.columns
        level_strs = columns.format(sparsify=self.merge_cells, adjoin=False,
                                    names=False)
        level_lengths = get_level_lengths(level_strs)
        coloffset = 0
        lnum = 0

        if self.index and isinstance(self.df.index, ABCMultiIndex):
            coloffset = len(self.df.index[0]) - 1

        if self.merge_cells:
            # Format multi-index as a merged cells.
            for lnum in range(len(level_lengths)):
                name = columns.names[lnum]
                yield ExcelCell(lnum, coloffset, name, self.header_style)

            for lnum, (spans, levels, level_codes) in enumerate(zip(
                    level_lengths, columns.levels, columns.codes)):
                values = levels.take(level_codes)
                for i in spans:
                    if spans[i] > 1:
                        yield ExcelCell(lnum, coloffset + i + 1, values[i],
                                        self.header_style, lnum,
                                        coloffset + i + spans[i])
                    else:
                        yield ExcelCell(lnum, coloffset + i + 1, values[i],
                                        self.header_style)
        else:
            # Format in legacy format with dots to indicate levels.
            for i, values in enumerate(zip(*level_strs)):
                v = ".".join(map(pprint_thing, values))
                yield ExcelCell(lnum, coloffset + i + 1, v, self.header_style)

        self.rowcounter = lnum

    def _format_header_regular(self):
        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
        if has_aliases or self.header:
            coloffset = 0

            if self.index:
                coloffset = 1
                if isinstance(self.df.index, ABCMultiIndex):
                    coloffset = len(self.df.index[0])

            colnames = self.columns
            if has_aliases:
                if len(self.header) != len(self.columns):
                    raise ValueError('Writing {cols} cols but got {alias} '
                                     'aliases'.format(cols=len(self.columns),
                                                      alias=len(self.header)))
                else:
                    colnames = self.header

            for colindex, colname in enumerate(colnames):
                yield ExcelCell(self.rowcounter, colindex + coloffset, colname,
                                self.header_style)

    def _format_header(self):
        if isinstance(self.columns, ABCMultiIndex):
            gen = self._format_header_mi()
        else:
            gen = self._format_header_regular()

        gen2 = ()
        if self.df.index.names:
            row = [x if x is not None else ''
                   for x in self.df.index.names] + [''] * len(self.columns)
            if reduce(lambda x, y: x and y, map(lambda x: x != '', row)):
                gen2 = (ExcelCell(self.rowcounter, colindex, val,
                                  self.header_style)
                        for colindex, val in enumerate(row))
                self.rowcounter += 1
        return itertools.chain(gen, gen2)

    def _format_body(self):

        if isinstance(self.df.index, ABCMultiIndex):
            return self._format_hierarchical_rows()
        else:
            return self._format_regular_rows()

    def _format_regular_rows(self):
        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
        if has_aliases or self.header:
            self.rowcounter += 1

        # output index and index_label?
        if self.index:
            # check aliases
            # if list only take first as this is not a MultiIndex
            if (self.index_label and
                    isinstance(self.index_label, (list, tuple, np.ndarray,
                                                  Index))):
                index_label = self.index_label[0]
            # if string good to go
            elif self.index_label and isinstance(self.index_label, str):
                index_label = self.index_label
            else:
                index_label = self.df.index.names[0]

            if isinstance(self.columns, ABCMultiIndex):
                self.rowcounter += 1

            if index_label and self.header is not False:
                yield ExcelCell(self.rowcounter - 1, 0, index_label,
                                self.header_style)

            # write index_values
            index_values = self.df.index
            if isinstance(self.df.index, ABCPeriodIndex):
                index_values = self.df.index.to_timestamp()

            for idx, idxval in enumerate(index_values):
                yield ExcelCell(self.rowcounter + idx, 0, idxval,
                                self.header_style)

            coloffset = 1
        else:
            coloffset = 0

        for cell in self._generate_body(coloffset):
            yield cell

    def _format_hierarchical_rows(self):
        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
        if has_aliases or self.header:
            self.rowcounter += 1

        gcolidx = 0

        if self.index:
            index_labels = self.df.index.names
            # check for aliases
            if (self.index_label and
                    isinstance(self.index_label, (list, tuple, np.ndarray,
                                                  Index))):
                index_labels = self.index_label

            # MultiIndex columns require an extra row
            # with index names (blank if None) for
            # unambiguous round-trip, unless not merging,
            # in which case the names all go on one row. Issue #11328
            if isinstance(self.columns, ABCMultiIndex) and self.merge_cells:
                self.rowcounter += 1

            # if index labels are not empty go ahead and dump
            if com._any_not_none(*index_labels) and self.header is not False:

                for cidx, name in enumerate(index_labels):
                    yield ExcelCell(self.rowcounter - 1, cidx, name,
                                    self.header_style)

            if self.merge_cells:
                # Format hierarchical rows as merged cells.
                level_strs = self.df.index.format(sparsify=True, adjoin=False,
                                                  names=False)
                level_lengths = get_level_lengths(level_strs)

                for spans, levels, level_codes in zip(level_lengths,
                                                      self.df.index.levels,
                                                      self.df.index.codes):

                    values = levels.take(level_codes,
                                         allow_fill=levels._can_hold_na,
                                         fill_value=True)

                    for i in spans:
                        if spans[i] > 1:
                            yield ExcelCell(self.rowcounter + i, gcolidx,
                                            values[i], self.header_style,
                                            self.rowcounter + i + spans[i] - 1,
                                            gcolidx)
                        else:
                            yield ExcelCell(self.rowcounter + i, gcolidx,
                                            values[i], self.header_style)
                    gcolidx += 1

            else:
                # Format hierarchical rows with non-merged values.
                for indexcolvals in zip(*self.df.index):
                    for idx, indexcolval in enumerate(indexcolvals):
                        yield ExcelCell(self.rowcounter + idx, gcolidx,
                                        indexcolval, self.header_style)
                    gcolidx += 1

        for cell in self._generate_body(gcolidx):
            yield cell

    def _generate_body(self, coloffset):
        if self.styler is None:
            styles = None
        else:
            styles = self.styler._compute().ctx
            if not styles:
                styles = None
        xlstyle = None

        # Write the body of the frame data series by series.
        for colidx in range(len(self.columns)):
            series = self.df.iloc[:, colidx]
            for i, val in enumerate(series):
                if styles is not None:
                    xlstyle = self.style_converter(';'.join(styles[i, colidx]))
                yield ExcelCell(self.rowcounter + i, colidx + coloffset, val,
                                xlstyle)

    def get_formatted_cells(self):
        for cell in itertools.chain(self._format_header(),
                                    self._format_body()):
            cell.val = self._format_value(cell.val)
            yield cell

    def write(self, writer, sheet_name='Sheet1', startrow=0,
              startcol=0, freeze_panes=None, engine=None):
        """
        writer : string or ExcelWriter object
            File path or existing ExcelWriter
        sheet_name : string, default 'Sheet1'
            Name of sheet which will contain DataFrame
        startrow : int, default 0
            upper left cell row to dump data frame
        startcol : int, default 0
            upper left cell column to dump data frame
        freeze_panes : tuple of integer (length 2), default None
            Specifies the one-based bottommost row and rightmost column that
            is to be frozen
        engine : string, default None
            write engine to use if writer is a path - you can also set this
            via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
            and ``io.excel.xlsm.writer``.
        """
        from pandas.io.excel import ExcelWriter
        from pandas.io.common import _stringify_path

        if isinstance(writer, ExcelWriter):
            need_save = False
        else:
            writer = ExcelWriter(_stringify_path(writer), engine=engine)
            need_save = True

        formatted_cells = self.get_formatted_cells()
        writer.write_cells(formatted_cells, sheet_name,
                           startrow=startrow, startcol=startcol,
                           freeze_panes=freeze_panes)
        if need_save:
            writer.save()

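# --- Editor's note (illustrative sketch, not part of the original file).
# ExcelFormatter is the engine behind DataFrame.to_excel and Styler.to_excel:
# it walks the frame into ExcelCell objects and hands them to an ExcelWriter.
# A minimal usage sketch through the public API (the file name is just an
# example):
#
#     >>> import pandas as pd
#     >>> df = pd.DataFrame({'a': [1.0, 2.0]})
#     >>> df.style.applymap(lambda v: 'font-weight: bold').to_excel(
#     ...     'styled.xlsx')  # each cell's CSS is converted as above
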
File diff suppressed because it is too large
@@ -1,531 +0,0 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data in HTML.
"""

from __future__ import print_function

from textwrap import dedent

from pandas.compat import OrderedDict, lzip, map, range, u, unichr, zip

from pandas.core.dtypes.generic import ABCMultiIndex

from pandas import compat
import pandas.core.common as com
from pandas.core.config import get_option

from pandas.io.common import _is_url
from pandas.io.formats.format import TableFormatter, get_level_lengths
from pandas.io.formats.printing import pprint_thing


class HTMLFormatter(TableFormatter):
    """
    Internal class for formatting output data in html.
    This class is intended for shared functionality between
    DataFrame.to_html() and DataFrame._repr_html_().
    Any logic in common with other output formatting methods
    should ideally be inherited from classes in format.py,
    with this class responsible only for producing html markup.
    """

    indent_delta = 2

    def __init__(self, formatter, classes=None, border=None):
        self.fmt = formatter
        self.classes = classes

        self.frame = self.fmt.frame
        self.columns = self.fmt.tr_frame.columns
        self.elements = []
        self.bold_rows = self.fmt.kwds.get('bold_rows', False)
        self.escape = self.fmt.kwds.get('escape', True)
        self.show_dimensions = self.fmt.show_dimensions
        if border is None:
            border = get_option('display.html.border')
        self.border = border
        self.table_id = self.fmt.table_id
        self.render_links = self.fmt.render_links

    @property
    def show_row_idx_names(self):
        return self.fmt.show_row_idx_names

    @property
    def show_col_idx_names(self):
        return self.fmt.show_col_idx_names

    @property
    def row_levels(self):
        if self.fmt.index:
            # showing (row) index
            return self.frame.index.nlevels
        elif self.show_col_idx_names:
            # see gh-22579
            # Column misalignment also occurs for
            # a standard index when the columns index is named.
            # If the row index is not displayed a column of
            # blank cells need to be included before the DataFrame values.
            return 1
        # not showing (row) index
        return 0

    @property
    def is_truncated(self):
        return self.fmt.is_truncated

    @property
    def ncols(self):
        return len(self.fmt.tr_frame.columns)

    def write(self, s, indent=0):
        rs = pprint_thing(s)
        self.elements.append(' ' * indent + rs)

    def write_th(self, s, indent=0, tags=None):
        if self.fmt.col_space is not None and self.fmt.col_space > 0:
            tags = (tags or "")
            tags += ('style="min-width: {colspace};"'
                     .format(colspace=self.fmt.col_space))

        return self._write_cell(s, kind='th', indent=indent, tags=tags)

    def write_td(self, s, indent=0, tags=None):
        return self._write_cell(s, kind='td', indent=indent, tags=tags)

    def _write_cell(self, s, kind='td', indent=0, tags=None):
        if tags is not None:
            start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags)
        else:
            start_tag = '<{kind}>'.format(kind=kind)

        if self.escape:
            # escape & first to prevent double escaping of &
            esc = OrderedDict([('&', r'&amp;'), ('<', r'&lt;'),
                               ('>', r'&gt;')])
        else:
            esc = {}

        rs = pprint_thing(s, escape_chars=esc).strip()

        if self.render_links and _is_url(rs):
            rs_unescaped = pprint_thing(s, escape_chars={}).strip()
            start_tag += '<a href="{url}" target="_blank">'.format(
                url=rs_unescaped)
            end_a = '</a>'
        else:
            end_a = ''

        self.write(u'{start}{rs}{end_a}</{kind}>'.format(
            start=start_tag, rs=rs, end_a=end_a, kind=kind), indent)

    def write_tr(self, line, indent=0, indent_delta=0, header=False,
                 align=None, tags=None, nindex_levels=0):
        if tags is None:
            tags = {}

        if align is None:
            self.write('<tr>', indent)
        else:
            self.write('<tr style="text-align: {align};">'
                       .format(align=align), indent)
        indent += indent_delta

        for i, s in enumerate(line):
            val_tag = tags.get(i, None)
            if header or (self.bold_rows and i < nindex_levels):
                self.write_th(s, indent, tags=val_tag)
            else:
                self.write_td(s, indent, tags=val_tag)

        indent -= indent_delta
        self.write('</tr>', indent)

    def render(self):
        self._write_table()

        if self.should_show_dimensions:
            by = chr(215) if compat.PY3 else unichr(215)  # ×
            self.write(u('<p>{rows} rows {by} {cols} columns</p>')
                       .format(rows=len(self.frame),
                               by=by,
                               cols=len(self.frame.columns)))

        return self.elements

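    # --- Editor's note (illustrative sketch, not part of the original file).
    # render() returns the accumulated markup lines; the public to_html
    # entry point joins them, so the opening tag written by _write_table
    # (below) leads the output:
    #
    #     >>> import pandas as pd
    #     >>> html = pd.DataFrame({'a': [1]}).to_html(border=0)
    #     >>> html.startswith('<table border="0" class="dataframe">')
    #     True
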
    def _write_table(self, indent=0):
        _classes = ['dataframe']  # Default class.
        use_mathjax = get_option("display.html.use_mathjax")
        if not use_mathjax:
            _classes.append('tex2jax_ignore')
        if self.classes is not None:
            if isinstance(self.classes, str):
                self.classes = self.classes.split()
            if not isinstance(self.classes, (list, tuple)):
                raise AssertionError('classes must be list or tuple, not {typ}'
                                     .format(typ=type(self.classes)))
            _classes.extend(self.classes)

        if self.table_id is None:
            id_section = ""
        else:
            id_section = ' id="{table_id}"'.format(table_id=self.table_id)

        self.write('<table border="{border}" class="{cls}"{id_section}>'
                   .format(border=self.border, cls=' '.join(_classes),
                           id_section=id_section), indent)

        if self.fmt.header or self.show_row_idx_names:
            self._write_header(indent + self.indent_delta)

        self._write_body(indent + self.indent_delta)

        self.write('</table>', indent)

    def _write_col_header(self, indent):
        truncate_h = self.fmt.truncate_h
        if isinstance(self.columns, ABCMultiIndex):
            template = 'colspan="{span:d}" halign="left"'

            if self.fmt.sparsify:
                # GH3547
                sentinel = com.sentinel_factory()
            else:
                sentinel = False
            levels = self.columns.format(sparsify=sentinel, adjoin=False,
                                         names=False)
            level_lengths = get_level_lengths(levels, sentinel)
            inner_lvl = len(level_lengths) - 1
            for lnum, (records, values) in enumerate(zip(level_lengths,
                                                         levels)):
                if truncate_h:
                    # modify the header lines
                    ins_col = self.fmt.tr_col_num
                    if self.fmt.sparsify:
                        recs_new = {}
                        # Increment tags after ... col.
                        for tag, span in list(records.items()):
                            if tag >= ins_col:
                                recs_new[tag + 1] = span
                            elif tag + span > ins_col:
                                recs_new[tag] = span + 1
                                if lnum == inner_lvl:
                                    values = (values[:ins_col] + (u('...'),) +
                                              values[ins_col:])
                                else:
                                    # sparse col headers do not receive a ...
                                    values = (values[:ins_col] +
                                              (values[ins_col - 1], ) +
                                              values[ins_col:])
                            else:
                                recs_new[tag] = span
                            # if ins_col lies between tags, all col headers
                            # get ...
                            if tag + span == ins_col:
                                recs_new[ins_col] = 1
                                values = (values[:ins_col] + (u('...'),) +
                                          values[ins_col:])
                        records = recs_new
                        inner_lvl = len(level_lengths) - 1
                        if lnum == inner_lvl:
                            records[ins_col] = 1
                    else:
                        recs_new = {}
                        for tag, span in list(records.items()):
                            if tag >= ins_col:
                                recs_new[tag + 1] = span
                            else:
                                recs_new[tag] = span
                        recs_new[ins_col] = 1
                        records = recs_new
                        values = (values[:ins_col] + [u('...')] +
                                  values[ins_col:])

                # see gh-22579
                # Column Offset Bug with to_html(index=False) with
                # MultiIndex Columns and Index.
                # Initially fill row with blank cells before column names.
                # TODO: Refactor to remove code duplication with code
                # block below for standard columns index.
                row = [''] * (self.row_levels - 1)
                if self.fmt.index or self.show_col_idx_names:
                    # see gh-22747
                    # If to_html(index_names=False) do not show columns
                    # index names.
                    # TODO: Refactor to use _get_column_name_list from
                    # DataFrameFormatter class and create a
                    # _get_formatted_column_labels function for code
                    # parity with DataFrameFormatter class.
                    if self.fmt.show_index_names:
                        name = self.columns.names[lnum]
                        row.append(pprint_thing(name or ''))
                    else:
                        row.append('')

                tags = {}
                j = len(row)
                for i, v in enumerate(values):
                    if i in records:
                        if records[i] > 1:
                            tags[j] = template.format(span=records[i])
                    else:
                        continue
                    j += 1
                    row.append(v)
                self.write_tr(row, indent, self.indent_delta, tags=tags,
                              header=True)
        else:
            # see gh-22579
            # Column misalignment also occurs for
            # a standard index when the columns index is named.
            # Initially fill row with blank cells before column names.
            # TODO: Refactor to remove code duplication with code block
            # above for columns MultiIndex.
            row = [''] * (self.row_levels - 1)
            if self.fmt.index or self.show_col_idx_names:
                # see gh-22747
                # If to_html(index_names=False) do not show columns
                # index names.
                # TODO: Refactor to use _get_column_name_list from
                # DataFrameFormatter class.
                if self.fmt.show_index_names:
                    row.append(self.columns.name or '')
                else:
                    row.append('')
            row.extend(self.columns)
            align = self.fmt.justify

            if truncate_h:
                ins_col = self.row_levels + self.fmt.tr_col_num
                row.insert(ins_col, '...')

            self.write_tr(row, indent, self.indent_delta, header=True,
                          align=align)

    def _write_row_header(self, indent):
        truncate_h = self.fmt.truncate_h
        row = ([x if x is not None else '' for x in self.frame.index.names]
               + [''] * (self.ncols + (1 if truncate_h else 0)))
        self.write_tr(row, indent, self.indent_delta, header=True)

    def _write_header(self, indent):
        self.write('<thead>', indent)

        if self.fmt.header:
            self._write_col_header(indent + self.indent_delta)

        if self.show_row_idx_names:
            self._write_row_header(indent + self.indent_delta)

        self.write('</thead>', indent)

    def _write_body(self, indent):
        self.write('<tbody>', indent)
        fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)}

        # write values
        if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
            self._write_hierarchical_rows(
                fmt_values, indent + self.indent_delta)
        else:
            self._write_regular_rows(
                fmt_values, indent + self.indent_delta)

        self.write('</tbody>', indent)

    def _write_regular_rows(self, fmt_values, indent):
        truncate_h = self.fmt.truncate_h
        truncate_v = self.fmt.truncate_v

        nrows = len(self.fmt.tr_frame)

        if self.fmt.index:
            fmt = self.fmt._get_formatter('__index__')
            if fmt is not None:
                index_values = self.fmt.tr_frame.index.map(fmt)
            else:
                index_values = self.fmt.tr_frame.index.format()

        row = []
        for i in range(nrows):

            if truncate_v and i == (self.fmt.tr_row_num):
                str_sep_row = ['...'] * len(row)
                self.write_tr(str_sep_row, indent, self.indent_delta,
                              tags=None, nindex_levels=self.row_levels)

            row = []
            if self.fmt.index:
                row.append(index_values[i])
            # see gh-22579
            # Column misalignment also occurs for
            # a standard index when the columns index is named.
            # Add blank cell before data cells.
            elif self.show_col_idx_names:
                row.append('')
            row.extend(fmt_values[j][i] for j in range(self.ncols))

            if truncate_h:
                dot_col_ix = self.fmt.tr_col_num + self.row_levels
                row.insert(dot_col_ix, '...')
            self.write_tr(row, indent, self.indent_delta, tags=None,
                          nindex_levels=self.row_levels)

    def _write_hierarchical_rows(self, fmt_values, indent):
        template = 'rowspan="{span}" valign="top"'

        truncate_h = self.fmt.truncate_h
        truncate_v = self.fmt.truncate_v
        frame = self.fmt.tr_frame
        nrows = len(frame)

        idx_values = frame.index.format(sparsify=False, adjoin=False,
                                        names=False)
        idx_values = lzip(*idx_values)

        if self.fmt.sparsify:
            # GH3547
            sentinel = com.sentinel_factory()
            levels = frame.index.format(sparsify=sentinel, adjoin=False,
                                        names=False)

            level_lengths = get_level_lengths(levels, sentinel)
            inner_lvl = len(level_lengths) - 1
            if truncate_v:
                # Insert ... row and adjust idx_values and
                # level_lengths to take this into account.
                ins_row = self.fmt.tr_row_num
                inserted = False
                for lnum, records in enumerate(level_lengths):
                    rec_new = {}
                    for tag, span in list(records.items()):
                        if tag >= ins_row:
                            rec_new[tag + 1] = span
                        elif tag + span > ins_row:
                            rec_new[tag] = span + 1

                            # GH 14882 - Make sure insertion done once
                            if not inserted:
                                dot_row = list(idx_values[ins_row - 1])
                                dot_row[-1] = u('...')
                                idx_values.insert(ins_row, tuple(dot_row))
                                inserted = True
                            else:
                                dot_row = list(idx_values[ins_row])
                                dot_row[inner_lvl - lnum] = u('...')
                                idx_values[ins_row] = tuple(dot_row)
                        else:
                            rec_new[tag] = span
                        # If ins_row lies between tags, all cols idx cols
                        # receive ...
                        if tag + span == ins_row:
                            rec_new[ins_row] = 1
                            if lnum == 0:
                                idx_values.insert(ins_row, tuple(
                                    [u('...')] * len(level_lengths)))

                            # GH 14882 - Place ... in correct level
                            elif inserted:
                                dot_row = list(idx_values[ins_row])
                                dot_row[inner_lvl - lnum] = u('...')
                                idx_values[ins_row] = tuple(dot_row)
                    level_lengths[lnum] = rec_new

                level_lengths[inner_lvl][ins_row] = 1
                for ix_col in range(len(fmt_values)):
                    fmt_values[ix_col].insert(ins_row, '...')
                nrows += 1

            for i in range(nrows):
                row = []
                tags = {}

                sparse_offset = 0
                j = 0
                for records, v in zip(level_lengths, idx_values[i]):
                    if i in records:
                        if records[i] > 1:
                            tags[j] = template.format(span=records[i])
                    else:
                        sparse_offset += 1
                        continue

                    j += 1
                    row.append(v)

                row.extend(fmt_values[j][i] for j in range(self.ncols))
                if truncate_h:
                    row.insert(self.row_levels - sparse_offset +
                               self.fmt.tr_col_num, '...')
                self.write_tr(row, indent, self.indent_delta, tags=tags,
                              nindex_levels=len(levels) - sparse_offset)
        else:
            row = []
            for i in range(len(frame)):
                if truncate_v and i == (self.fmt.tr_row_num):
                    str_sep_row = ['...'] * len(row)
                    self.write_tr(str_sep_row, indent, self.indent_delta,
                                  tags=None, nindex_levels=self.row_levels)

                idx_values = list(zip(*frame.index.format(
                    sparsify=False, adjoin=False, names=False)))
                row = []
                row.extend(idx_values[i])
                row.extend(fmt_values[j][i] for j in range(self.ncols))
                if truncate_h:
                    row.insert(self.row_levels + self.fmt.tr_col_num, '...')
                self.write_tr(row, indent, self.indent_delta, tags=None,
                              nindex_levels=frame.index.nlevels)


class NotebookFormatter(HTMLFormatter):
    """
    Internal class for formatting output data in html for display in Jupyter
    Notebooks. This class is intended for functionality specific to
    DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
    """

    def write_style(self):
        # We use the "scoped" attribute here so that the desired
        # style properties for the data frame are not then applied
        # throughout the entire notebook.
        template_first = """\
            <style scoped>"""
        template_last = """\
            </style>"""
        template_select = """\
                .dataframe %s {
                    %s: %s;
                }"""
        element_props = [('tbody tr th:only-of-type',
                          'vertical-align',
                          'middle'),
                         ('tbody tr th',
                          'vertical-align',
                          'top')]
        if isinstance(self.columns, ABCMultiIndex):
            element_props.append(('thead tr th',
                                  'text-align',
                                  'left'))
            if self.show_row_idx_names:
                element_props.append(('thead tr:last-of-type th',
                                      'text-align',
                                      'right'))
        else:
            element_props.append(('thead th',
                                  'text-align',
                                  'right'))
        template_mid = '\n\n'.join(map(lambda t: template_select % t,
                                       element_props))
        template = dedent('\n'.join((template_first,
                                     template_mid,
                                     template_last)))
        self.write(template)

    def render(self):
        self.write('<div>')
        self.write_style()
        super(NotebookFormatter, self).render()
        self.write('</div>')
        return self.elements
@@ -1,246 +0,0 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data in Latex.
"""
from __future__ import print_function

import numpy as np

from pandas.compat import map, range, u, zip

from pandas.core.dtypes.generic import ABCMultiIndex

from pandas import compat

from pandas.io.formats.format import TableFormatter


class LatexFormatter(TableFormatter):
    """ Used to render a DataFrame to a LaTeX tabular/longtable environment
    output.

    Parameters
    ----------
    formatter : `DataFrameFormatter`
    column_format : str, default None
        The columns format as specified in `LaTeX table format
        <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
        columns
    longtable : boolean, default False
        Use a longtable environment instead of tabular.

    See Also
    --------
    HTMLFormatter
    """

    def __init__(self, formatter, column_format=None, longtable=False,
                 multicolumn=False, multicolumn_format=None, multirow=False):
        self.fmt = formatter
        self.frame = self.fmt.frame
        self.bold_rows = self.fmt.kwds.get('bold_rows', False)
        self.column_format = column_format
        self.longtable = longtable
        self.multicolumn = multicolumn
        self.multicolumn_format = multicolumn_format
        self.multirow = multirow

    def write_result(self, buf):
        """
        Render a DataFrame to a LaTeX tabular/longtable environment output.
        """

        # string representation of the columns
        if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
            info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}')
                         .format(name=type(self.frame).__name__,
                                 col=self.frame.columns,
                                 idx=self.frame.index))
            strcols = [[info_line]]
        else:
            strcols = self.fmt._to_str_columns()

        def get_col_type(dtype):
            if issubclass(dtype.type, np.number):
                return 'r'
            else:
                return 'l'

        # reestablish the MultiIndex that has been joined by _to_str_column
        if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
            out = self.frame.index.format(
                adjoin=False, sparsify=self.fmt.sparsify,
                names=self.fmt.has_index_names, na_rep=self.fmt.na_rep
            )

            # index.format will sparsify repeated entries with empty strings
            # so pad these with some empty space
            def pad_empties(x):
                for pad in reversed(x):
                    if pad:
                        break
                return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]
            out = (pad_empties(i) for i in out)

            # Add empty spaces for each column level
            clevels = self.frame.columns.nlevels
            out = [[' ' * len(i[-1])] * clevels + i for i in out]

            # Add the column names to the last index column
            cnames = self.frame.columns.names
            if any(cnames):
                new_names = [i if i else '{}' for i in cnames]
                out[self.frame.index.nlevels - 1][:clevels] = new_names

            # Get rid of old multiindex column and add new ones
            strcols = out + strcols[1:]

        column_format = self.column_format
        if column_format is None:
            dtypes = self.frame.dtypes._values
            column_format = ''.join(map(get_col_type, dtypes))
            if self.fmt.index:
                index_format = 'l' * self.frame.index.nlevels
                column_format = index_format + column_format
        elif not isinstance(column_format,
                            compat.string_types):  # pragma: no cover
            raise AssertionError('column_format must be str or unicode, '
                                 'not {typ}'.format(typ=type(column_format)))

        if not self.longtable:
            buf.write('\\begin{{tabular}}{{{fmt}}}\n'
                      .format(fmt=column_format))
            buf.write('\\toprule\n')
        else:
            buf.write('\\begin{{longtable}}{{{fmt}}}\n'
                      .format(fmt=column_format))
            buf.write('\\toprule\n')

        ilevels = self.frame.index.nlevels
        clevels = self.frame.columns.nlevels
        nlevels = clevels
        if self.fmt.has_index_names and self.fmt.show_index_names:
            nlevels += 1
        strrows = list(zip(*strcols))
        self.clinebuf = []

        for i, row in enumerate(strrows):
            if i == nlevels and self.fmt.header:
                buf.write('\\midrule\n')  # End of header
                if self.longtable:
                    buf.write('\\endhead\n')
                    buf.write('\\midrule\n')
                    buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next '
                              'page}}}} \\\\\n'.format(n=len(row)))
                    buf.write('\\midrule\n')
                    buf.write('\\endfoot\n\n')
                    buf.write('\\bottomrule\n')
                    buf.write('\\endlastfoot\n')
            if self.fmt.kwds.get('escape', True):
                # escape backslashes first
                crow = [(x.replace('\\', '\\textbackslash ')
                         .replace('_', '\\_')
                         .replace('%', '\\%').replace('$', '\\$')
                         .replace('#', '\\#').replace('{', '\\{')
                         .replace('}', '\\}').replace('~', '\\textasciitilde ')
                         .replace('^', '\\textasciicircum ')
                         .replace('&', '\\&')
                         if (x and x != '{}') else '{}') for x in row]
            else:
                crow = [x if x else '{}' for x in row]
            if self.bold_rows and self.fmt.index:
                # bold row labels
                crow = ['\\textbf{{{x}}}'.format(x=x)
                        if j < ilevels and x.strip() not in ['', '{}'] else x
                        for j, x in enumerate(crow)]
            if i < clevels and self.fmt.header and self.multicolumn:
                # sum up columns to multicolumns
                crow = self._format_multicolumn(crow, ilevels)
            if (i >= nlevels and self.fmt.index and self.multirow and
                    ilevels > 1):
                # sum up rows to multirows
                crow = self._format_multirow(crow, ilevels, i, strrows)
            buf.write(' & '.join(crow))
            buf.write(' \\\\\n')
            if self.multirow and i < len(strrows) - 1:
                self._print_cline(buf, i, len(strcols))

        if not self.longtable:
            buf.write('\\bottomrule\n')
            buf.write('\\end{tabular}\n')
        else:
            buf.write('\\end{longtable}\n')

def _format_multicolumn(self, row, ilevels):
|
||||
r"""
|
||||
Combine columns belonging to a group into a single multicolumn entry
|
||||
according to self.multicolumn_format
|
||||
|
||||
e.g.:
|
||||
a & & & b & c &
|
||||
will become
|
||||
\multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
|
||||
"""
|
||||
row2 = list(row[:ilevels])
|
||||
ncol = 1
|
||||
coltext = ''
|
||||
|
||||
def append_col():
|
||||
# write multicolumn if needed
|
||||
if ncol > 1:
|
||||
row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}'
|
||||
.format(ncol=ncol, fmt=self.multicolumn_format,
|
||||
txt=coltext.strip()))
|
||||
# don't modify where not needed
|
||||
else:
|
||||
row2.append(coltext)
|
||||
for c in row[ilevels:]:
|
||||
# if next col has text, write the previous
|
||||
if c.strip():
|
||||
if coltext:
|
||||
append_col()
|
||||
coltext = c
|
||||
ncol = 1
|
||||
# if not, add it to the previous multicolumn
|
||||
else:
|
||||
ncol += 1
|
||||
# write last column name
|
||||
if coltext:
|
||||
append_col()
|
||||
return row2
|
||||
|
||||
def _format_multirow(self, row, ilevels, i, rows):
|
||||
r"""
|
||||
Check the following rows to see whether this row should start a multirow
|
||||
|
||||
e.g.: becomes:
|
||||
a & 0 & \multirow{2}{*}{a} & 0 &
|
||||
& 1 & & 1 &
|
||||
b & 0 & \cline{1-2}
|
||||
b & 0 &
|
||||
"""
|
||||
for j in range(ilevels):
|
||||
if row[j].strip():
|
||||
nrow = 1
|
||||
for r in rows[i + 1:]:
|
||||
if not r[j].strip():
|
||||
nrow += 1
|
||||
else:
|
||||
break
|
||||
if nrow > 1:
|
||||
# overwrite non-multirow entry
|
||||
row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format(
|
||||
nrow=nrow, row=row[j].strip())
|
||||
# save when to end the current block with \cline
|
||||
self.clinebuf.append([i + nrow - 1, j + 1])
|
||||
return row
|
||||
|
||||
def _print_cline(self, buf, i, icol):
|
||||
"""
|
||||
Print clines after multirow-blocks are finished
|
||||
"""
|
||||
for cl in self.clinebuf:
|
||||
if cl[0] == i:
|
||||
buf.write('\\cline{{{cl:d}-{icol:d}}}\n'
|
||||
.format(cl=cl[1], icol=icol))
|
||||
# remove entries that have been written to buffer
|
||||
self.clinebuf = [x for x in self.clinebuf if x[0] != i]
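# Illustrative sketch (added here; not part of the original module): the
# escaping and multicolumn handling above are reached through the public
# DataFrame.to_latex() API. Exact output strings depend on the pandas
# version, so only the shape of the calls is shown.
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({'a_b': ['50%', 'x&y']})
# >>> print(df.to_latex())              # '_', '%' and '&' get escaped
# >>> print(df.to_latex(escape=False))  # characters are passed through as-is
# >>> cols = pd.MultiIndex.from_product([['A'], ['x', 'y']])
# >>> df2 = pd.DataFrame([[1, 2]], columns=cols)
# >>> print(df2.to_latex(multicolumn=True, multicolumn_format='c'))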
|
||||
@@ -1,435 +0,0 @@
|
||||
"""
|
||||
printing tools
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
from pandas.compat import u
|
||||
|
||||
from pandas.core.dtypes.inference import is_sequence
|
||||
|
||||
from pandas import compat
|
||||
from pandas.core.config import get_option
|
||||
|
||||
|
||||
def adjoin(space, *lists, **kwargs):
|
||||
"""
|
||||
Glues together two sets of strings using the amount of space requested.
|
||||
The idea is to prettify.

Parameters
----------
|
||||
space : int
|
||||
number of spaces for padding
|
||||
lists : str
list of str to be joined
|
||||
strlen : callable
|
||||
function used to calculate the length of each str. Needed for unicode
|
||||
handling.
|
||||
justfunc : callable
|
||||
function used to justify str. Needed for unicode handling.
|
||||
"""
|
||||
strlen = kwargs.pop('strlen', len)
|
||||
justfunc = kwargs.pop('justfunc', justify)
|
||||
|
||||
out_lines = []
|
||||
newLists = []
|
||||
lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
|
||||
# not the last one
|
||||
lengths.append(max(map(len, lists[-1])))
|
||||
maxLen = max(map(len, lists))
|
||||
for i, lst in enumerate(lists):
|
||||
nl = justfunc(lst, lengths[i], mode='left')
|
||||
nl.extend([' ' * lengths[i]] * (maxLen - len(lst)))
|
||||
newLists.append(nl)
|
||||
toJoin = zip(*newLists)
|
||||
for lines in toJoin:
|
||||
out_lines.append(_join_unicode(lines))
|
||||
return _join_unicode(out_lines, sep='\n')
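# Usage sketch (added for illustration): each column of strings is
# left-justified and the columns are glued together with the requested
# padding in between.
#
# >>> print(adjoin(2, ['a', 'bb'], ['ccc', 'd']))
# a   ccc
# bb  d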
|
||||
|
||||
|
||||
def justify(texts, max_len, mode='right'):
|
||||
"""
|
||||
Perform ljust, center, rjust against string or list-like
|
||||
"""
|
||||
if mode == 'left':
|
||||
return [x.ljust(max_len) for x in texts]
|
||||
elif mode == 'center':
|
||||
return [x.center(max_len) for x in texts]
|
||||
else:
|
||||
return [x.rjust(max_len) for x in texts]
|
||||
|
||||
|
||||
def _join_unicode(lines, sep=''):
|
||||
try:
|
||||
return sep.join(lines)
|
||||
except UnicodeDecodeError:
|
||||
sep = compat.text_type(sep)
|
||||
return sep.join([x.decode('utf-8') if isinstance(x, str) else x
|
||||
for x in lines])
|
||||
|
||||
|
||||
# Unicode consolidation
|
||||
# ---------------------
|
||||
#
|
||||
# pprinting utility functions for generating Unicode text or
|
||||
# bytes(3.x)/str(2.x) representations of objects.
|
||||
# Try to use these as much as possible rather than rolling your own.
|
||||
#
|
||||
# When to use
|
||||
# -----------
|
||||
#
|
||||
# 1) If you're writing code internal to pandas (no I/O directly involved),
|
||||
# use pprint_thing().
|
||||
#
|
||||
# It will always return unicode text which can be handled by other
|
||||
# parts of the package without breakage.
|
||||
#
|
||||
# 2) if you need to write something out to file, use
|
||||
# pprint_thing_encoded(encoding).
|
||||
#
|
||||
# If no encoding is specified, it defaults to utf-8. Since encoding pure
|
||||
# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
|
||||
# working with straight ascii.
|
||||
|
||||
|
||||
def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
|
||||
"""
|
||||
internal. pprinter for iterables. you should probably use pprint_thing()
rather than calling this directly.
|
||||
|
||||
bounds length of printed sequence, depending on options
|
||||
"""
|
||||
if isinstance(seq, set):
|
||||
fmt = u("{{{body}}}")
|
||||
else:
|
||||
fmt = u("[{body}]") if hasattr(seq, '__setitem__') else u("({body})")
|
||||
|
||||
if max_seq_items is False:
|
||||
nitems = len(seq)
|
||||
else:
|
||||
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
|
||||
|
||||
s = iter(seq)
|
||||
# handle sets, no slicing
|
||||
r = [pprint_thing(next(s),
|
||||
_nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
|
||||
for i in range(min(nitems, len(seq)))]
|
||||
body = ", ".join(r)
|
||||
|
||||
if nitems < len(seq):
|
||||
body += ", ..."
|
||||
elif isinstance(seq, tuple) and len(seq) == 1:
|
||||
body += ','
|
||||
|
||||
return fmt.format(body=body)
|
||||
|
||||
|
||||
def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
|
||||
"""
|
||||
internal. pprinter for dict-likes. you should probably use pprint_thing()
rather than calling this directly.
|
||||
"""
|
||||
fmt = u("{{{things}}}")
|
||||
pairs = []
|
||||
|
||||
pfmt = u("{key}: {val}")
|
||||
|
||||
if max_seq_items is False:
|
||||
nitems = len(seq)
|
||||
else:
|
||||
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
|
||||
|
||||
for k, v in list(seq.items())[:nitems]:
|
||||
pairs.append(
|
||||
pfmt.format(
|
||||
key=pprint_thing(k, _nest_lvl + 1,
|
||||
max_seq_items=max_seq_items, **kwds),
|
||||
val=pprint_thing(v, _nest_lvl + 1,
|
||||
max_seq_items=max_seq_items, **kwds)))
|
||||
|
||||
if nitems < len(seq):
|
||||
return fmt.format(things=", ".join(pairs) + ", ...")
|
||||
else:
|
||||
return fmt.format(things=", ".join(pairs))
|
||||
|
||||
|
||||
def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False,
|
||||
quote_strings=False, max_seq_items=None):
|
||||
"""
|
||||
This function is the sanctioned way of converting objects
|
||||
to a unicode representation.
|
||||
|
||||
properly handles nested sequences containing unicode strings
|
||||
(unicode(object) does not)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
thing : anything to be formatted
|
||||
_nest_lvl : internal use only. pprint_thing() is mutually-recursive
|
||||
with pprint_sequence, this argument is used to keep track of the
|
||||
current nesting level, and limit it.
|
||||
escape_chars : list or dict, optional
|
||||
Characters to escape. If a dict is passed the values are the
|
||||
replacements
|
||||
default_escapes : bool, default False
|
||||
Whether the input escape characters replace or add to the defaults
|
||||
max_seq_items : False, int, default None
|
||||
Pass thru to other pretty printers to limit sequence printing
|
||||
|
||||
Returns
|
||||
-------
|
||||
result - unicode object on py2, str on py3. Always Unicode.
|
||||
|
||||
"""
|
||||
|
||||
def as_escaped_unicode(thing, escape_chars=escape_chars):
|
||||
# Unicode is fine, else we try to decode using utf-8 and 'replace'
|
||||
# if that's not it either, we have no way of knowing and the user
|
||||
# should deal with it himself.
|
||||
|
||||
try:
|
||||
result = compat.text_type(thing) # we should try this first
|
||||
except UnicodeDecodeError:
|
||||
# either utf-8 or we replace errors
|
||||
result = str(thing).decode('utf-8', "replace")
|
||||
|
||||
translate = {'\t': r'\t', '\n': r'\n', '\r': r'\r', }
|
||||
if isinstance(escape_chars, dict):
|
||||
if default_escapes:
|
||||
translate.update(escape_chars)
|
||||
else:
|
||||
translate = escape_chars
|
||||
escape_chars = list(escape_chars.keys())
|
||||
else:
|
||||
escape_chars = escape_chars or tuple()
|
||||
for c in escape_chars:
|
||||
result = result.replace(c, translate[c])
|
||||
|
||||
return compat.text_type(result)
|
||||
|
||||
if (compat.PY3 and hasattr(thing, '__next__')) or hasattr(thing, 'next'):
|
||||
return compat.text_type(thing)
|
||||
elif (isinstance(thing, dict) and
|
||||
_nest_lvl < get_option("display.pprint_nest_depth")):
|
||||
result = _pprint_dict(thing, _nest_lvl, quote_strings=True,
|
||||
max_seq_items=max_seq_items)
|
||||
elif (is_sequence(thing) and
|
||||
_nest_lvl < get_option("display.pprint_nest_depth")):
|
||||
result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars,
|
||||
quote_strings=quote_strings,
|
||||
max_seq_items=max_seq_items)
|
||||
elif isinstance(thing, compat.string_types) and quote_strings:
|
||||
if compat.PY3:
|
||||
fmt = u("'{thing}'")
|
||||
else:
|
||||
fmt = u("u'{thing}'")
|
||||
result = fmt.format(thing=as_escaped_unicode(thing))
|
||||
else:
|
||||
result = as_escaped_unicode(thing)
|
||||
|
||||
return compat.text_type(result) # always unicode
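# Illustrative examples (added; not in the original source). pprint_thing
# always returns text, escapes control characters on request, and bounds
# long sequences via the display.max_seq_items option.
#
# >>> pprint_thing('a\tb', escape_chars=('\t',))
# 'a\\tb'
# >>> pprint_thing([1, 2, 3], max_seq_items=2)
# '[1, 2, ...]'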
|
||||
|
||||
|
||||
def pprint_thing_encoded(object, encoding='utf-8', errors='replace', **kwds):
|
||||
value = pprint_thing(object) # get unicode representation of object
|
||||
return value.encode(encoding, errors, **kwds)
|
||||
|
||||
|
||||
def _enable_data_resource_formatter(enable):
|
||||
if 'IPython' not in sys.modules:
|
||||
# definitely not in IPython
|
||||
return
|
||||
from IPython import get_ipython
|
||||
ip = get_ipython()
|
||||
if ip is None:
|
||||
# still not in IPython
|
||||
return
|
||||
|
||||
formatters = ip.display_formatter.formatters
|
||||
mimetype = "application/vnd.dataresource+json"
|
||||
|
||||
if enable:
|
||||
if mimetype not in formatters:
|
||||
# define tableschema formatter
|
||||
from IPython.core.formatters import BaseFormatter
|
||||
|
||||
class TableSchemaFormatter(BaseFormatter):
|
||||
print_method = '_repr_data_resource_'
|
||||
_return_type = (dict,)
|
||||
# register it:
|
||||
formatters[mimetype] = TableSchemaFormatter()
|
||||
# enable it if it's been disabled:
|
||||
formatters[mimetype].enabled = True
|
||||
else:
|
||||
# unregister tableschema mime-type
|
||||
if mimetype in formatters:
|
||||
formatters[mimetype].enabled = False
|
||||
|
||||
|
||||
default_pprint = lambda x, max_seq_items=None: \
|
||||
pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True,
|
||||
max_seq_items=max_seq_items)
|
||||
|
||||
|
||||
def format_object_summary(obj, formatter, is_justify=True, name=None,
|
||||
indent_for_name=True):
|
||||
"""
|
||||
Return the formatted obj as a unicode string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj : object
|
||||
must be iterable and support __getitem__
|
||||
formatter : callable
|
||||
string formatter for an element
|
||||
is_justify : boolean
|
||||
should justify the display
|
||||
name : name, optional
|
||||
defaults to the class name of the obj
|
||||
indent_for_name : bool, default True
|
||||
Whether subsequent lines should be indented to
|
||||
align with the name.
|
||||
|
||||
Returns
|
||||
-------
|
||||
summary string
|
||||
|
||||
"""
|
||||
from pandas.io.formats.console import get_console_size
|
||||
from pandas.io.formats.format import _get_adjustment
|
||||
|
||||
display_width, _ = get_console_size()
|
||||
if display_width is None:
|
||||
display_width = get_option('display.width') or 80
|
||||
if name is None:
|
||||
name = obj.__class__.__name__
|
||||
|
||||
if indent_for_name:
|
||||
name_len = len(name)
|
||||
space1 = "\n%s" % (' ' * (name_len + 1))
|
||||
space2 = "\n%s" % (' ' * (name_len + 2))
|
||||
else:
|
||||
space1 = "\n"
|
||||
space2 = "\n " # space for the opening '['
|
||||
|
||||
n = len(obj)
|
||||
sep = ','
|
||||
max_seq_items = get_option('display.max_seq_items') or n
|
||||
|
||||
# are we a truncated display
|
||||
is_truncated = n > max_seq_items
|
||||
|
||||
# adj can optionally handle unicode eastern asian width
|
||||
adj = _get_adjustment()
|
||||
|
||||
def _extend_line(s, line, value, display_width, next_line_prefix):
|
||||
|
||||
if (adj.len(line.rstrip()) + adj.len(value.rstrip()) >=
|
||||
display_width):
|
||||
s += line.rstrip()
|
||||
line = next_line_prefix
|
||||
line += value
|
||||
return s, line
|
||||
|
||||
def best_len(values):
|
||||
if values:
|
||||
return max(adj.len(x) for x in values)
|
||||
else:
|
||||
return 0
|
||||
|
||||
close = u', '
|
||||
|
||||
if n == 0:
|
||||
summary = u'[]{}'.format(close)
|
||||
elif n == 1:
|
||||
first = formatter(obj[0])
|
||||
summary = u'[{}]{}'.format(first, close)
|
||||
elif n == 2:
|
||||
first = formatter(obj[0])
|
||||
last = formatter(obj[-1])
|
||||
summary = u'[{}, {}]{}'.format(first, last, close)
|
||||
else:
|
||||
|
||||
if n > max_seq_items:
|
||||
n = min(max_seq_items // 2, 10)
|
||||
head = [formatter(x) for x in obj[:n]]
|
||||
tail = [formatter(x) for x in obj[-n:]]
|
||||
else:
|
||||
head = []
|
||||
tail = [formatter(x) for x in obj]
|
||||
|
||||
# adjust all values to max length if needed
|
||||
if is_justify:
|
||||
|
||||
# however, if we are not truncated and we are only a single
|
||||
# line, then don't justify
|
||||
if (is_truncated or
|
||||
not (len(', '.join(head)) < display_width and
|
||||
len(', '.join(tail)) < display_width)):
|
||||
max_len = max(best_len(head), best_len(tail))
|
||||
head = [x.rjust(max_len) for x in head]
|
||||
tail = [x.rjust(max_len) for x in tail]
|
||||
|
||||
summary = ""
|
||||
line = space2
|
||||
|
||||
for i in range(len(head)):
|
||||
word = head[i] + sep + ' '
|
||||
summary, line = _extend_line(summary, line, word,
|
||||
display_width, space2)
|
||||
|
||||
if is_truncated:
|
||||
# remove trailing space of last line
|
||||
summary += line.rstrip() + space2 + '...'
|
||||
line = space2
|
||||
|
||||
for i in range(len(tail) - 1):
|
||||
word = tail[i] + sep + ' '
|
||||
summary, line = _extend_line(summary, line, word,
|
||||
display_width, space2)
|
||||
|
||||
# last value: no sep added + 1 space of width used for trailing ','
|
||||
summary, line = _extend_line(summary, line, tail[-1],
|
||||
display_width - 2, space2)
|
||||
summary += line
|
||||
|
||||
# right now close is either '' or ', '
|
||||
# Now we want to include the ']', but not the maybe space.
|
||||
close = ']' + close.rstrip(' ')
|
||||
summary += close
|
||||
|
||||
if len(summary) > (display_width):
|
||||
summary += space1
|
||||
else: # one row
|
||||
summary += ' '
|
||||
|
||||
# remove initial space
|
||||
summary = '[' + summary[len(space2):]
|
||||
|
||||
return summary
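# Hypothetical usage sketch (added): this helper is what Index reprs build
# on; a minimal direct call could look like the following, with pprint_thing
# standing in for the element formatter.
#
# >>> import pandas as pd
# >>> idx = pd.Index(['a', 'bb', 'ccc'])
# >>> format_object_summary(idx, formatter=pprint_thing, name='Index')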
|
||||
|
||||
|
||||
def format_object_attrs(obj):
|
||||
"""
|
||||
Return a list of tuples of the (attr, formatted_value)
|
||||
for common attrs, including dtype, name, length
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj : object
|
||||
must be iterable
|
||||
|
||||
Returns
|
||||
-------
|
||||
list
|
||||
|
||||
"""
|
||||
attrs = []
|
||||
if hasattr(obj, 'dtype'):
|
||||
attrs.append(('dtype', "'{}'".format(obj.dtype)))
|
||||
if getattr(obj, 'name', None) is not None:
|
||||
attrs.append(('name', default_pprint(obj.name)))
|
||||
max_seq_items = get_option('display.max_seq_items') or len(obj)
|
||||
if len(obj) > max_seq_items:
|
||||
attrs.append(('length', len(obj)))
|
||||
return attrs
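# Illustration (added): on Python 3 this typically yields the dtype and name
# pairs, with a 'length' entry appended only when the repr would be truncated.
#
# >>> idx = pd.Index(['a', 'b'], name='foo')
# >>> format_object_attrs(idx)
# [('dtype', "'object'"), ('name', "'foo'")]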
|
||||
File diff suppressed because it is too large
@@ -1,70 +0,0 @@
|
||||
{# Update the template_structure.html document too #}
|
||||
{%- block before_style -%}{%- endblock before_style -%}
|
||||
{% block style %}
|
||||
<style type="text/css" >
|
||||
{% block table_styles %}
|
||||
{% for s in table_styles %}
|
||||
#T_{{uuid}} {{s.selector}} {
|
||||
{% for p,val in s.props %}
|
||||
{{p}}: {{val}};
|
||||
{% endfor -%}
|
||||
}
|
||||
{%- endfor -%}
|
||||
{% endblock table_styles %}
|
||||
{% block before_cellstyle %}{% endblock before_cellstyle %}
|
||||
{% block cellstyle %}
|
||||
{%- for s in cellstyle %}
|
||||
#T_{{uuid}}{{s.selector}} {
|
||||
{% for p,val in s.props %}
|
||||
{{p}}: {{val}};
|
||||
{% endfor %}
|
||||
}
|
||||
{%- endfor -%}
|
||||
{%- endblock cellstyle %}
|
||||
</style>
|
||||
{%- endblock style %}
|
||||
{%- block before_table %}{% endblock before_table %}
|
||||
{%- block table %}
|
||||
<table id="T_{{uuid}}" {% if table_attributes %}{{ table_attributes }}{% endif %}>
|
||||
{%- block caption %}
|
||||
{%- if caption -%}
|
||||
<caption>{{caption}}</caption>
|
||||
{%- endif -%}
|
||||
{%- endblock caption %}
|
||||
{%- block thead %}
|
||||
<thead>
|
||||
{%- block before_head_rows %}{% endblock %}
|
||||
{%- for r in head %}
|
||||
{%- block head_tr scoped %}
|
||||
<tr>
|
||||
{%- for c in r %}
|
||||
{%- if c.is_visible != False %}
|
||||
<{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}}</{{ c.type }}>
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
</tr>
|
||||
{%- endblock head_tr %}
|
||||
{%- endfor %}
|
||||
{%- block after_head_rows %}{% endblock %}
|
||||
</thead>
|
||||
{%- endblock thead %}
|
||||
{%- block tbody %}
|
||||
<tbody>
|
||||
{% block before_rows %}{% endblock before_rows %}
|
||||
{% for r in body %}
|
||||
{% block tr scoped %}
|
||||
<tr>
|
||||
{% for c in r %}
|
||||
{% if c.is_visible != False %}
|
||||
<{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }}</{{ c.type }}>
|
||||
{% endif %}
|
||||
{%- endfor %}
|
||||
</tr>
|
||||
{% endblock tr %}
|
||||
{%- endfor %}
|
||||
{%- block after_rows %}{%- endblock after_rows %}
|
||||
</tbody>
|
||||
{%- endblock tbody %}
|
||||
</table>
|
||||
{%- endblock table %}
|
||||
{%- block after_table %}{% endblock after_table %}
|
||||
@@ -1,144 +0,0 @@
|
||||
"""
|
||||
get_terminal_size() -- return width and height of terminal as a tuple
|
||||
|
||||
code from:
|
||||
http://stackoverflow.com/questions/566746/how-to-get-console-window-width-in-
|
||||
python
|
||||
|
||||
written by
|
||||
Harco Kuppens (http://stackoverflow.com/users/825214/harco-kuppens)
|
||||
|
||||
It is mentioned in the stackoverflow response that this code works
|
||||
on linux, os x, windows and cygwin (windows).
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from pandas.compat import PY3
|
||||
|
||||
__all__ = ['get_terminal_size', 'is_terminal']
|
||||
|
||||
|
||||
def get_terminal_size():
|
||||
"""
|
||||
Detect terminal size and return tuple = (width, height).
|
||||
|
||||
Only to be used when running in a terminal. Note that the IPython notebook,
|
||||
IPython zmq frontends, or IDLE do not run in a terminal.
|
||||
"""
|
||||
import platform
|
||||
|
||||
if PY3:
|
||||
return shutil.get_terminal_size()
|
||||
|
||||
current_os = platform.system()
|
||||
tuple_xy = None
|
||||
if current_os == 'Windows':
|
||||
tuple_xy = _get_terminal_size_windows()
|
||||
if tuple_xy is None:
|
||||
tuple_xy = _get_terminal_size_tput()
|
||||
# needed for Windows' Python in Cygwin's xterm!
|
||||
if (current_os == 'Linux' or current_os == 'Darwin' or
|
||||
current_os.startswith('CYGWIN')):
|
||||
tuple_xy = _get_terminal_size_linux()
|
||||
if tuple_xy is None:
|
||||
tuple_xy = (80, 25) # default value
|
||||
return tuple_xy
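# Usage sketch (added; the import path assumes this file lives at
# pandas/io/formats/terminal.py, as the __main__ block below suggests):
#
# >>> from pandas.io.formats.terminal import get_terminal_size
# >>> width, height = get_terminal_size()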
|
||||
|
||||
|
||||
def is_terminal():
|
||||
"""
|
||||
Detect if Python is running in a terminal.
|
||||
|
||||
Returns True if Python is running in a terminal or False if not.
|
||||
"""
|
||||
try:
|
||||
ip = get_ipython()
|
||||
except NameError: # assume standard Python interpreter in a terminal
|
||||
return True
|
||||
else:
|
||||
if hasattr(ip, 'kernel'): # IPython as a Jupyter kernel
|
||||
return False
|
||||
else: # IPython in a terminal
|
||||
return True
|
||||
|
||||
|
||||
def _get_terminal_size_windows():
|
||||
|
||||
try:
|
||||
from ctypes import windll, create_string_buffer
|
||||
|
||||
# stdin handle is -10
|
||||
# stdout handle is -11
|
||||
# stderr handle is -12
|
||||
|
||||
h = windll.kernel32.GetStdHandle(-12)
|
||||
csbi = create_string_buffer(22)
|
||||
res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi)
|
||||
except (AttributeError, ValueError):
|
||||
return None
|
||||
if res:
|
||||
import struct
|
||||
(bufx, bufy, curx, cury, wattr, left, top, right, bottom, maxx,
|
||||
maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw)
|
||||
sizex = right - left + 1
|
||||
sizey = bottom - top + 1
|
||||
return sizex, sizey
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def _get_terminal_size_tput():
|
||||
# get terminal width
|
||||
# src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width
|
||||
# -height-of-a-terminal-window
|
||||
try:
|
||||
import subprocess
|
||||
proc = subprocess.Popen(["tput", "cols"],
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE)
|
||||
output = proc.communicate(input=None)
|
||||
cols = int(output[0])
|
||||
proc = subprocess.Popen(["tput", "lines"],
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE)
|
||||
output = proc.communicate(input=None)
|
||||
rows = int(output[0])
|
||||
return (cols, rows)
|
||||
except OSError:
|
||||
return None
|
||||
|
||||
|
||||
def _get_terminal_size_linux():
|
||||
def ioctl_GWINSZ(fd):
|
||||
try:
|
||||
import fcntl
|
||||
import termios
|
||||
import struct
|
||||
cr = struct.unpack(
|
||||
'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234'))
|
||||
except (struct.error, IOError):
|
||||
return None
|
||||
return cr
|
||||
cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
|
||||
if not cr:
|
||||
try:
|
||||
fd = os.open(os.ctermid(), os.O_RDONLY)
|
||||
cr = ioctl_GWINSZ(fd)
|
||||
os.close(fd)
|
||||
except OSError:
|
||||
pass
|
||||
if not cr or cr == (0, 0):
|
||||
try:
|
||||
from os import environ as env
|
||||
cr = (env['LINES'], env['COLUMNS'])
|
||||
except (ValueError, KeyError):
|
||||
return None
|
||||
return int(cr[1]), int(cr[0])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sizex, sizey = get_terminal_size()
|
||||
print('width = {w} height = {h}'.format(w=sizex, h=sizey))
|
||||
@@ -1,162 +0,0 @@
|
||||
""" Google BigQuery support """
|
||||
|
||||
import warnings
|
||||
|
||||
|
||||
def _try_import():
|
||||
# since pandas is a dependency of pandas-gbq
|
||||
# we need to import on first use
|
||||
try:
|
||||
import pandas_gbq
|
||||
except ImportError:
|
||||
|
||||
# give a nice error message
|
||||
raise ImportError("Load data from Google BigQuery\n"
|
||||
"\n"
|
||||
"the pandas-gbq package is not installed\n"
|
||||
"see the docs: https://pandas-gbq.readthedocs.io\n"
|
||||
"\n"
|
||||
"you can install via pip or conda:\n"
|
||||
"pip install pandas-gbq\n"
|
||||
"conda install pandas-gbq -c conda-forge\n")
|
||||
|
||||
return pandas_gbq
|
||||
|
||||
|
||||
def read_gbq(query, project_id=None, index_col=None, col_order=None,
|
||||
reauth=False, auth_local_webserver=False, dialect=None,
|
||||
location=None, configuration=None, credentials=None,
|
||||
private_key=None, verbose=None):
|
||||
"""
|
||||
Load data from Google BigQuery.
|
||||
|
||||
This function requires the `pandas-gbq package
|
||||
<https://pandas-gbq.readthedocs.io>`__.
|
||||
|
||||
See the `How to authenticate with Google BigQuery
|
||||
<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
|
||||
guide for authentication instructions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
SQL-like query to return data values.
|
||||
project_id : str, optional
|
||||
Google BigQuery Account project ID. Optional when available from
|
||||
the environment.
|
||||
index_col : str, optional
|
||||
Name of result column to use for index in results DataFrame.
|
||||
col_order : list(str), optional
|
||||
List of BigQuery column names in the desired order for results
|
||||
DataFrame.
|
||||
reauth : boolean, default False
|
||||
Force Google BigQuery to re-authenticate the user. This is useful
|
||||
if multiple accounts are used.
|
||||
auth_local_webserver : boolean, default False
|
||||
Use the `local webserver flow`_ instead of the `console flow`_
|
||||
when getting user credentials.
|
||||
|
||||
.. _local webserver flow:
|
||||
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
|
||||
.. _console flow:
|
||||
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
|
||||
|
||||
*New in version 0.2.0 of pandas-gbq*.
|
||||
dialect : str, default 'legacy'
|
||||
Note: The default value is changing to 'standard' in a future version.
|
||||
|
||||
SQL syntax dialect to use. Value can be one of:
|
||||
|
||||
``'legacy'``
|
||||
Use BigQuery's legacy SQL dialect. For more information see
|
||||
`BigQuery Legacy SQL Reference
|
||||
<https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
|
||||
``'standard'``
|
||||
Use BigQuery's standard SQL, which is
|
||||
compliant with the SQL 2011 standard. For more information
|
||||
see `BigQuery Standard SQL Reference
|
||||
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
|
||||
|
||||
.. versionchanged:: 0.24.0
|
||||
location : str, optional
|
||||
Location where the query job should run. See the `BigQuery locations
|
||||
documentation
|
||||
<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
|
||||
list of available locations. The location must match that of any
|
||||
datasets used in the query.
|
||||
|
||||
*New in version 0.5.0 of pandas-gbq*.
|
||||
configuration : dict, optional
|
||||
Query config parameters for job processing.
|
||||
For example:
|
||||
|
||||
configuration = {'query': {'useQueryCache': False}}
|
||||
|
||||
For more information see `BigQuery REST API Reference
|
||||
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
|
||||
credentials : google.auth.credentials.Credentials, optional
|
||||
Credentials for accessing Google APIs. Use this parameter to override
|
||||
default credentials, such as to use Compute Engine
|
||||
:class:`google.auth.compute_engine.Credentials` or Service Account
|
||||
:class:`google.oauth2.service_account.Credentials` directly.
|
||||
|
||||
*New in version 0.8.0 of pandas-gbq*.
|
||||
|
||||
.. versionadded:: 0.24.0
|
||||
private_key : str, deprecated
|
||||
Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
|
||||
parameter and
|
||||
:func:`google.oauth2.service_account.Credentials.from_service_account_info`
|
||||
or
|
||||
:func:`google.oauth2.service_account.Credentials.from_service_account_file`
|
||||
instead.
|
||||
|
||||
Service account private key in JSON format. Can be file path
|
||||
or string contents. This is useful for remote server
|
||||
authentication (eg. Jupyter/IPython notebook on remote host).
|
||||
verbose : None, deprecated
|
||||
Deprecated in pandas-gbq version 0.4.0. Use the `logging module to
|
||||
adjust verbosity instead
|
||||
<https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df: DataFrame
|
||||
DataFrame representing results of query.
|
||||
|
||||
See Also
|
||||
--------
|
||||
pandas_gbq.read_gbq : This function in the pandas-gbq library.
|
||||
pandas.DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
|
||||
"""
|
||||
pandas_gbq = _try_import()
|
||||
|
||||
if dialect is None:
|
||||
dialect = "legacy"
|
||||
warnings.warn(
|
||||
'The default value for dialect is changing to "standard" in a '
|
||||
'future version of pandas-gbq. Pass in dialect="legacy" to '
|
||||
"disable this warning.",
|
||||
FutureWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
return pandas_gbq.read_gbq(
|
||||
query, project_id=project_id, index_col=index_col,
|
||||
col_order=col_order, reauth=reauth,
|
||||
auth_local_webserver=auth_local_webserver, dialect=dialect,
|
||||
location=location, configuration=configuration,
|
||||
credentials=credentials, verbose=verbose, private_key=private_key)
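# Usage sketch (added for illustration; the project and table names are
# hypothetical placeholders):
#
# >>> import pandas as pd
# >>> df = pd.read_gbq("SELECT name FROM `my-project.my_dataset.my_table`",
# ...                  project_id="my-project", dialect="standard")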
|
||||
|
||||
|
||||
def to_gbq(dataframe, destination_table, project_id=None, chunksize=None,
|
||||
reauth=False, if_exists='fail', auth_local_webserver=False,
|
||||
table_schema=None, location=None, progress_bar=True,
|
||||
credentials=None, verbose=None, private_key=None):
|
||||
pandas_gbq = _try_import()
|
||||
return pandas_gbq.to_gbq(
|
||||
dataframe, destination_table, project_id=project_id,
|
||||
chunksize=chunksize, reauth=reauth, if_exists=if_exists,
|
||||
auth_local_webserver=auth_local_webserver, table_schema=table_schema,
|
||||
location=location, progress_bar=progress_bar,
|
||||
credentials=credentials, verbose=verbose, private_key=private_key)
|
||||
@@ -1,16 +0,0 @@
|
||||
""" GCS support for remote file interactivity """
|
||||
try:
|
||||
import gcsfs
|
||||
except ImportError:
|
||||
raise ImportError("The gcsfs library is required to handle GCS files")
|
||||
|
||||
|
||||
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
|
||||
compression=None, mode=None):
|
||||
|
||||
if mode is None:
|
||||
mode = 'rb'
|
||||
|
||||
fs = gcsfs.GCSFileSystem()
|
||||
filepath_or_buffer = fs.open(filepath_or_buffer, mode)
|
||||
return filepath_or_buffer, None, compression, True
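# Usage sketch (added; the bucket path is a hypothetical placeholder). This
# helper is normally called indirectly by readers such as read_csv when they
# encounter a gs:// URL, but it can also be exercised directly:
#
# >>> fobj, _, compression, should_close = get_filepath_or_buffer(
# ...     'gs://my-bucket/data.csv')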
|
||||
File diff suppressed because it is too large
@@ -1,5 +0,0 @@
|
||||
from .json import to_json, read_json, loads, dumps # noqa
|
||||
from .normalize import json_normalize # noqa
|
||||
from .table_schema import build_table_schema # noqa
|
||||
|
||||
del json, normalize, table_schema # noqa
|
||||
BIN
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1,951 +0,0 @@
|
||||
# pylint: disable-msg=E1101,W0613,W0603
|
||||
from itertools import islice
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
import pandas._libs.json as json
|
||||
from pandas._libs.tslibs import iNaT
|
||||
from pandas.compat import StringIO, long, to_str, u
|
||||
from pandas.errors import AbstractMethodError
|
||||
|
||||
from pandas.core.dtypes.common import is_period_dtype
|
||||
|
||||
from pandas import DataFrame, MultiIndex, Series, compat, isna, to_datetime
|
||||
from pandas.core.reshape.concat import concat
|
||||
|
||||
from pandas.io.common import (
|
||||
BaseIterator, _get_handle, _infer_compression, _stringify_path,
|
||||
get_filepath_or_buffer)
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
from pandas.io.parsers import _validate_integer
|
||||
|
||||
from .normalize import _convert_to_line_delimits
|
||||
from .table_schema import build_table_schema, parse_table_schema
|
||||
|
||||
loads = json.loads
|
||||
dumps = json.dumps
|
||||
|
||||
TABLE_SCHEMA_VERSION = '0.20.0'
|
||||
|
||||
|
||||
# interface to/from
|
||||
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
|
||||
double_precision=10, force_ascii=True, date_unit='ms',
|
||||
default_handler=None, lines=False, compression='infer',
|
||||
index=True):
|
||||
|
||||
if not index and orient not in ['split', 'table']:
|
||||
raise ValueError("'index=False' is only valid when 'orient' is "
|
||||
"'split' or 'table'")
|
||||
|
||||
path_or_buf = _stringify_path(path_or_buf)
|
||||
if lines and orient != 'records':
|
||||
raise ValueError(
|
||||
"'lines' keyword only valid when 'orient' is records")
|
||||
|
||||
if orient == 'table' and isinstance(obj, Series):
|
||||
obj = obj.to_frame(name=obj.name or 'values')
|
||||
if orient == 'table' and isinstance(obj, DataFrame):
|
||||
writer = JSONTableWriter
|
||||
elif isinstance(obj, Series):
|
||||
writer = SeriesWriter
|
||||
elif isinstance(obj, DataFrame):
|
||||
writer = FrameWriter
|
||||
else:
|
||||
raise NotImplementedError("'obj' should be a Series or a DataFrame")
|
||||
|
||||
s = writer(
|
||||
obj, orient=orient, date_format=date_format,
|
||||
double_precision=double_precision, ensure_ascii=force_ascii,
|
||||
date_unit=date_unit, default_handler=default_handler,
|
||||
index=index).write()
|
||||
|
||||
if lines:
|
||||
s = _convert_to_line_delimits(s)
|
||||
|
||||
if isinstance(path_or_buf, compat.string_types):
|
||||
fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
|
||||
try:
|
||||
fh.write(s)
|
||||
finally:
|
||||
fh.close()
|
||||
elif path_or_buf is None:
|
||||
return s
|
||||
else:
|
||||
path_or_buf.write(s)
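# Sketch of how this writer is reached from the public API (added for
# illustration; the output shown is the usual records/lines form):
#
# >>> import pandas as pd
# >>> pd.DataFrame({'a': [1, 2]}).to_json(orient='records', lines=True)
# '{"a":1}\n{"a":2}'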
|
||||
|
||||
|
||||
class Writer(object):
|
||||
def __init__(self, obj, orient, date_format, double_precision,
|
||||
ensure_ascii, date_unit, index, default_handler=None):
|
||||
self.obj = obj
|
||||
|
||||
if orient is None:
|
||||
orient = self._default_orient
|
||||
|
||||
self.orient = orient
|
||||
self.date_format = date_format
|
||||
self.double_precision = double_precision
|
||||
self.ensure_ascii = ensure_ascii
|
||||
self.date_unit = date_unit
|
||||
self.default_handler = default_handler
|
||||
self.index = index
|
||||
|
||||
self.is_copy = None
|
||||
self._format_axes()
|
||||
|
||||
def _format_axes(self):
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
def write(self):
|
||||
return self._write(self.obj, self.orient, self.double_precision,
|
||||
self.ensure_ascii, self.date_unit,
|
||||
self.date_format == 'iso', self.default_handler)
|
||||
|
||||
def _write(self, obj, orient, double_precision, ensure_ascii,
|
||||
date_unit, iso_dates, default_handler):
|
||||
return dumps(
|
||||
obj,
|
||||
orient=orient,
|
||||
double_precision=double_precision,
|
||||
ensure_ascii=ensure_ascii,
|
||||
date_unit=date_unit,
|
||||
iso_dates=iso_dates,
|
||||
default_handler=default_handler
|
||||
)
|
||||
|
||||
|
||||
class SeriesWriter(Writer):
|
||||
_default_orient = 'index'
|
||||
|
||||
def _format_axes(self):
|
||||
if not self.obj.index.is_unique and self.orient == 'index':
|
||||
raise ValueError("Series index must be unique for orient="
|
||||
"'{orient}'".format(orient=self.orient))
|
||||
|
||||
def _write(self, obj, orient, double_precision, ensure_ascii,
|
||||
date_unit, iso_dates, default_handler):
|
||||
if not self.index and orient == 'split':
|
||||
obj = {"name": obj.name, "data": obj.values}
|
||||
return super(SeriesWriter, self)._write(obj, orient,
|
||||
double_precision,
|
||||
ensure_ascii, date_unit,
|
||||
iso_dates, default_handler)
|
||||
|
||||
|
||||
class FrameWriter(Writer):
|
||||
_default_orient = 'columns'
|
||||
|
||||
def _format_axes(self):
|
||||
"""
|
||||
Try to format axes if they are datelike.
|
||||
"""
|
||||
if not self.obj.index.is_unique and self.orient in (
|
||||
'index', 'columns'):
|
||||
raise ValueError("DataFrame index must be unique for orient="
|
||||
"'{orient}'.".format(orient=self.orient))
|
||||
if not self.obj.columns.is_unique and self.orient in (
|
||||
'index', 'columns', 'records'):
|
||||
raise ValueError("DataFrame columns must be unique for orient="
|
||||
"'{orient}'.".format(orient=self.orient))
|
||||
|
||||
def _write(self, obj, orient, double_precision, ensure_ascii,
|
||||
date_unit, iso_dates, default_handler):
|
||||
if not self.index and orient == 'split':
|
||||
obj = obj.to_dict(orient='split')
|
||||
del obj["index"]
|
||||
return super(FrameWriter, self)._write(obj, orient,
|
||||
double_precision,
|
||||
ensure_ascii, date_unit,
|
||||
iso_dates, default_handler)
|
||||
|
||||
|
||||
class JSONTableWriter(FrameWriter):
|
||||
_default_orient = 'records'
|
||||
|
||||
def __init__(self, obj, orient, date_format, double_precision,
|
||||
ensure_ascii, date_unit, index, default_handler=None):
|
||||
"""
|
||||
Adds a `schema` attribute with the Table Schema, resets
|
||||
the index (can't do in caller, because the schema inference needs
to know what the index is), forces orient to records, and forces
|
||||
date_format to 'iso'.
|
||||
"""
|
||||
super(JSONTableWriter, self).__init__(
|
||||
obj, orient, date_format, double_precision, ensure_ascii,
|
||||
date_unit, index, default_handler=default_handler)
|
||||
|
||||
if date_format != 'iso':
|
||||
msg = ("Trying to write with `orient='table'` and "
|
||||
"`date_format='{fmt}'`. Table Schema requires dates "
|
||||
"to be formatted with `date_format='iso'`"
|
||||
.format(fmt=date_format))
|
||||
raise ValueError(msg)
|
||||
|
||||
self.schema = build_table_schema(obj, index=self.index)
|
||||
|
||||
# NotImplemented on a column MultiIndex
|
||||
if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
|
||||
raise NotImplementedError(
|
||||
"orient='table' is not supported for MultiIndex")
|
||||
|
||||
# TODO: Do this timedelta properly in objToJSON.c See GH #15137
|
||||
if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or
|
||||
len(obj.columns & obj.index.names)):
|
||||
msg = "Overlapping names between the index and columns"
|
||||
raise ValueError(msg)
|
||||
|
||||
obj = obj.copy()
|
||||
timedeltas = obj.select_dtypes(include=['timedelta']).columns
|
||||
if len(timedeltas):
|
||||
obj[timedeltas] = obj[timedeltas].applymap(
|
||||
lambda x: x.isoformat())
|
||||
# Convert PeriodIndex to datetimes before serializing
|
||||
if is_period_dtype(obj.index):
|
||||
obj.index = obj.index.to_timestamp()
|
||||
|
||||
# exclude index from obj if index=False
|
||||
if not self.index:
|
||||
self.obj = obj.reset_index(drop=True)
|
||||
else:
|
||||
self.obj = obj.reset_index(drop=False)
|
||||
self.date_format = 'iso'
|
||||
self.orient = 'records'
|
||||
self.index = index
|
||||
|
||||
def _write(self, obj, orient, double_precision, ensure_ascii,
|
||||
date_unit, iso_dates, default_handler):
|
||||
data = super(JSONTableWriter, self)._write(obj, orient,
|
||||
double_precision,
|
||||
ensure_ascii, date_unit,
|
||||
iso_dates,
|
||||
default_handler)
|
||||
serialized = '{{"schema": {schema}, "data": {data}}}'.format(
|
||||
schema=dumps(self.schema), data=data)
|
||||
return serialized
|
||||
|
||||
|
||||
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
|
||||
convert_axes=True, convert_dates=True, keep_default_dates=True,
|
||||
numpy=False, precise_float=False, date_unit=None, encoding=None,
|
||||
lines=False, chunksize=None, compression='infer'):
|
||||
"""
|
||||
Convert a JSON string to pandas object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf : a valid JSON string or file-like, default: None
|
||||
The string could be a URL. Valid URL schemes include http, ftp, s3,
|
||||
gcs, and file. For file URLs, a host is expected. For instance, a local
|
||||
file could be ``file://localhost/path/to/table.json``
|
||||
|
||||
orient : string,
|
||||
Indication of expected JSON string format.
|
||||
Compatible JSON strings can be produced by ``to_json()`` with a
|
||||
corresponding orient value.
|
||||
The set of possible orients is:
|
||||
|
||||
- ``'split'`` : dict like
|
||||
``{index -> [index], columns -> [columns], data -> [values]}``
|
||||
- ``'records'`` : list like
|
||||
``[{column -> value}, ... , {column -> value}]``
|
||||
- ``'index'`` : dict like ``{index -> {column -> value}}``
|
||||
- ``'columns'`` : dict like ``{column -> {index -> value}}``
|
||||
- ``'values'`` : just the values array
|
||||
|
||||
The allowed and default values depend on the value
|
||||
of the `typ` parameter.
|
||||
|
||||
* when ``typ == 'series'``,
|
||||
|
||||
- allowed orients are ``{'split','records','index'}``
|
||||
- default is ``'index'``
|
||||
- The Series index must be unique for orient ``'index'``.
|
||||
|
||||
* when ``typ == 'frame'``,
|
||||
|
||||
- allowed orients are ``{'split','records','index',
|
||||
'columns','values', 'table'}``
|
||||
- default is ``'columns'``
|
||||
- The DataFrame index must be unique for orients ``'index'`` and
|
||||
``'columns'``.
|
||||
- The DataFrame columns must be unique for orients ``'index'``,
|
||||
``'columns'``, and ``'records'``.
|
||||
|
||||
.. versionadded:: 0.23.0
|
||||
'table' as an allowed value for the ``orient`` argument
|
||||
|
||||
typ : type of object to recover (series or frame), default 'frame'
|
||||
dtype : boolean or dict, default True
|
||||
If True, infer dtypes, if a dict of column to dtype, then use those,
|
||||
if False, then don't infer dtypes at all, applies only to the data.
|
||||
convert_axes : boolean, default True
|
||||
Try to convert the axes to the proper dtypes.
|
||||
convert_dates : boolean, default True
|
||||
List of columns to parse for dates; if True, then try to parse
the default datelike columns. A column label is datelike if
|
||||
|
||||
* it ends with ``'_at'``,
|
||||
|
||||
* it ends with ``'_time'``,
|
||||
|
||||
* it begins with ``'timestamp'``,
|
||||
|
||||
* it is ``'modified'``, or
|
||||
|
||||
* it is ``'date'``
|
||||
|
||||
keep_default_dates : boolean, default True
|
||||
If parsing dates, then parse the default datelike columns
|
||||
numpy : boolean, default False
|
||||
Direct decoding to numpy arrays. Supports numeric data only, but
|
||||
non-numeric column and index labels are supported. Note also that the
|
||||
JSON ordering MUST be the same for each term if numpy=True.
|
||||
precise_float : boolean, default False
|
||||
Set to enable usage of higher precision (strtod) function when
|
||||
decoding string to double values. Default (False) is to use fast but
|
||||
less precise builtin functionality
|
||||
date_unit : string, default None
|
||||
The timestamp unit to detect if converting dates. The default behaviour
|
||||
is to try and detect the correct precision, but if this is not desired
|
||||
then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
|
||||
milliseconds, microseconds or nanoseconds respectively.
|
||||
encoding : str, default is 'utf-8'
|
||||
The encoding to use to decode py3 bytes.
|
||||
|
||||
.. versionadded:: 0.19.0
|
||||
|
||||
lines : boolean, default False
|
||||
Read the file as a json object per line.
|
||||
|
||||
.. versionadded:: 0.19.0
|
||||
|
||||
chunksize : integer, default None
|
||||
Return JsonReader object for iteration.
|
||||
See the `line-delimited json docs
|
||||
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
|
||||
for more information on ``chunksize``.
|
||||
This can only be passed if `lines=True`.
|
||||
If this is None, the file will be read into memory all at once.
|
||||
|
||||
.. versionadded:: 0.21.0
|
||||
|
||||
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
|
||||
For on-the-fly decompression of on-disk data. If 'infer', then use
|
||||
gzip, bz2, zip or xz if path_or_buf is a string ending in
|
||||
'.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
|
||||
otherwise. If using 'zip', the ZIP file must contain only one data
|
||||
file to be read in. Set to None for no decompression.
|
||||
|
||||
.. versionadded:: 0.21.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : Series or DataFrame, depending on the value of `typ`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.to_json
|
||||
|
||||
Notes
|
||||
-----
|
||||
Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
|
||||
:class:`Index` name of `index` gets written with :func:`to_json`, the
|
||||
subsequent read operation will incorrectly set the :class:`Index` name to
|
||||
``None``. This is because `index` is also used by :func:`DataFrame.to_json`
|
||||
to denote a missing :class:`Index` name, and the subsequent
|
||||
:func:`read_json` operation cannot distinguish between the two. The same
|
||||
limitation is encountered with a :class:`MultiIndex` and any names
|
||||
beginning with ``'level_'``.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
|
||||
... index=['row 1', 'row 2'],
|
||||
... columns=['col 1', 'col 2'])
|
||||
|
||||
Encoding/decoding a Dataframe using ``'split'`` formatted JSON:
|
||||
|
||||
>>> df.to_json(orient='split')
|
||||
'{"columns":["col 1","col 2"],
|
||||
"index":["row 1","row 2"],
|
||||
"data":[["a","b"],["c","d"]]}'
|
||||
>>> pd.read_json(_, orient='split')
|
||||
col 1 col 2
|
||||
row 1 a b
|
||||
row 2 c d
|
||||
|
||||
Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
|
||||
|
||||
>>> df.to_json(orient='index')
|
||||
'{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
|
||||
>>> pd.read_json(_, orient='index')
|
||||
col 1 col 2
|
||||
row 1 a b
|
||||
row 2 c d
|
||||
|
||||
Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
|
||||
Note that index labels are not preserved with this encoding.
|
||||
|
||||
>>> df.to_json(orient='records')
|
||||
'[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
|
||||
>>> pd.read_json(_, orient='records')
|
||||
col 1 col 2
|
||||
0 a b
|
||||
1 c d
|
||||
|
||||
Encoding with Table Schema
|
||||
|
||||
>>> df.to_json(orient='table')
|
||||
'{"schema": {"fields": [{"name": "index", "type": "string"},
|
||||
{"name": "col 1", "type": "string"},
|
||||
{"name": "col 2", "type": "string"}],
|
||||
"primaryKey": "index",
|
||||
"pandas_version": "0.20.0"},
|
||||
"data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
|
||||
{"index": "row 2", "col 1": "c", "col 2": "d"}]}'
|
||||
"""
|
||||
|
||||
compression = _infer_compression(path_or_buf, compression)
|
||||
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
|
||||
path_or_buf, encoding=encoding, compression=compression,
|
||||
)
|
||||
|
||||
json_reader = JsonReader(
|
||||
filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
|
||||
convert_axes=convert_axes, convert_dates=convert_dates,
|
||||
keep_default_dates=keep_default_dates, numpy=numpy,
|
||||
precise_float=precise_float, date_unit=date_unit, encoding=encoding,
|
||||
lines=lines, chunksize=chunksize, compression=compression,
|
||||
)
|
||||
|
||||
if chunksize:
|
||||
return json_reader
|
||||
|
||||
result = json_reader.read()
|
||||
if should_close:
|
||||
try:
|
||||
filepath_or_buffer.close()
|
||||
except: # noqa: flake8
|
||||
pass
|
||||
return result
|
||||
|
||||
|
||||
class JsonReader(BaseIterator):
|
||||
"""
|
||||
JsonReader provides an interface for reading in a JSON file.
|
||||
|
||||
If initialized with ``lines=True`` and ``chunksize``, can be iterated over
|
||||
``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
|
||||
whole document.
|
||||
"""
|
||||
def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
|
||||
convert_dates, keep_default_dates, numpy, precise_float,
|
||||
date_unit, encoding, lines, chunksize, compression):
|
||||
|
||||
self.path_or_buf = filepath_or_buffer
|
||||
self.orient = orient
|
||||
self.typ = typ
|
||||
self.dtype = dtype
|
||||
self.convert_axes = convert_axes
|
||||
self.convert_dates = convert_dates
|
||||
self.keep_default_dates = keep_default_dates
|
||||
self.numpy = numpy
|
||||
self.precise_float = precise_float
|
||||
self.date_unit = date_unit
|
||||
self.encoding = encoding
|
||||
self.compression = compression
|
||||
self.lines = lines
|
||||
self.chunksize = chunksize
|
||||
self.nrows_seen = 0
|
||||
self.should_close = False
|
||||
|
||||
if self.chunksize is not None:
|
||||
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
|
||||
if not self.lines:
|
||||
raise ValueError("chunksize can only be passed if lines=True")
|
||||
|
||||
data = self._get_data_from_filepath(filepath_or_buffer)
|
||||
self.data = self._preprocess_data(data)
|
||||
|
||||
def _preprocess_data(self, data):
|
||||
"""
|
||||
At this point, the data either has a `read` attribute (e.g. a file
|
||||
object or a StringIO) or is a string that is a JSON document.
|
||||
|
||||
If self.chunksize, we prepare the data for the `__next__` method.
|
||||
Otherwise, we read it into memory for the `read` method.
|
||||
"""
|
||||
if hasattr(data, 'read') and not self.chunksize:
|
||||
data = data.read()
|
||||
if not hasattr(data, 'read') and self.chunksize:
|
||||
data = StringIO(data)
|
||||
|
||||
return data
|
||||
|
||||
def _get_data_from_filepath(self, filepath_or_buffer):
|
||||
"""
|
||||
The function read_json accepts three input types:
|
||||
1. filepath (string-like)
|
||||
2. file-like object (e.g. open file object, StringIO)
|
||||
3. JSON string
|
||||
|
||||
This method turns (1) into (2) to simplify the rest of the processing.
|
||||
It returns input types (2) and (3) unchanged.
|
||||
"""
|
||||
data = filepath_or_buffer
|
||||
|
||||
exists = False
|
||||
if isinstance(data, compat.string_types):
|
||||
try:
|
||||
exists = os.path.exists(filepath_or_buffer)
|
||||
# gh-5874: if the filepath is too long will raise here
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
if exists or self.compression is not None:
|
||||
data, _ = _get_handle(filepath_or_buffer, 'r',
|
||||
encoding=self.encoding,
|
||||
compression=self.compression)
|
||||
self.should_close = True
|
||||
self.open_stream = data
|
||||
|
||||
return data
|
||||
|
||||
def _combine_lines(self, lines):
|
||||
"""
|
||||
Combines a list of JSON objects into one JSON object.
|
||||
"""
|
||||
lines = filter(None, map(lambda x: x.strip(), lines))
|
||||
return '[' + ','.join(lines) + ']'
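# Illustration (added): blank lines are dropped and the remaining JSON
# objects are wrapped into a single JSON array string. On a JsonReader
# instance `reader` (hypothetical):
#
# >>> reader._combine_lines(['{"a": 1}', '', ' {"a": 2}'])
# '[{"a": 1},{"a": 2}]'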
|
||||
|
||||
def read(self):
|
||||
"""
|
||||
Read the whole JSON input into a pandas object.
|
||||
"""
|
||||
if self.lines and self.chunksize:
|
||||
obj = concat(self)
|
||||
elif self.lines:
|
||||
|
||||
data = to_str(self.data)
|
||||
obj = self._get_object_parser(
|
||||
self._combine_lines(data.split('\n'))
|
||||
)
|
||||
else:
|
||||
obj = self._get_object_parser(self.data)
|
||||
self.close()
|
||||
return obj
|
||||
|
||||
def _get_object_parser(self, json):
|
||||
"""
|
||||
Parses a json document into a pandas object.
|
||||
"""
|
||||
typ = self.typ
|
||||
dtype = self.dtype
|
||||
kwargs = {
|
||||
"orient": self.orient, "dtype": self.dtype,
|
||||
"convert_axes": self.convert_axes,
|
||||
"convert_dates": self.convert_dates,
|
||||
"keep_default_dates": self.keep_default_dates, "numpy": self.numpy,
|
||||
"precise_float": self.precise_float, "date_unit": self.date_unit
|
||||
}
|
||||
obj = None
|
||||
if typ == 'frame':
|
||||
obj = FrameParser(json, **kwargs).parse()
|
||||
|
||||
if typ == 'series' or obj is None:
|
||||
if not isinstance(dtype, bool):
|
||||
kwargs['dtype'] = dtype
|
||||
obj = SeriesParser(json, **kwargs).parse()
|
||||
|
||||
return obj
|
||||
|
||||
def close(self):
|
||||
"""
|
||||
If we opened a stream earlier, in _get_data_from_filepath, we should
|
||||
close it.
|
||||
|
||||
If an open stream or file was passed, we leave it open.
|
||||
"""
|
||||
if self.should_close:
|
||||
try:
|
||||
self.open_stream.close()
|
||||
except (IOError, AttributeError):
|
||||
pass
|
||||
|
||||
def __next__(self):
|
||||
lines = list(islice(self.data, self.chunksize))
|
||||
if lines:
|
||||
lines_json = self._combine_lines(lines)
|
||||
obj = self._get_object_parser(lines_json)
|
||||
|
||||
# Make sure that the returned objects have the right index.
|
||||
obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
|
||||
self.nrows_seen += len(obj)
|
||||
|
||||
return obj
|
||||
|
||||
self.close()
|
||||
raise StopIteration
|
||||
|
||||
|
||||
class Parser(object):
|
||||
|
||||
_STAMP_UNITS = ('s', 'ms', 'us', 'ns')
|
||||
_MIN_STAMPS = {
|
||||
's': long(31536000),
|
||||
'ms': long(31536000000),
|
||||
'us': long(31536000000000),
|
||||
'ns': long(31536000000000000)}
|
||||
|
||||
def __init__(self, json, orient, dtype=True, convert_axes=True,
|
||||
convert_dates=True, keep_default_dates=False, numpy=False,
|
||||
precise_float=False, date_unit=None):
|
||||
self.json = json
|
||||
|
||||
if orient is None:
|
||||
orient = self._default_orient
|
||||
|
||||
self.orient = orient
|
||||
self.dtype = dtype
|
||||
|
||||
if orient == "split":
|
||||
numpy = False
|
||||
|
||||
if date_unit is not None:
|
||||
date_unit = date_unit.lower()
|
||||
if date_unit not in self._STAMP_UNITS:
|
||||
raise ValueError('date_unit must be one of {units}'
|
||||
.format(units=self._STAMP_UNITS))
|
||||
self.min_stamp = self._MIN_STAMPS[date_unit]
|
||||
else:
|
||||
self.min_stamp = self._MIN_STAMPS['s']
|
||||
|
||||
self.numpy = numpy
|
||||
self.precise_float = precise_float
|
||||
self.convert_axes = convert_axes
|
||||
self.convert_dates = convert_dates
|
||||
self.date_unit = date_unit
|
||||
self.keep_default_dates = keep_default_dates
|
||||
self.obj = None
|
||||
|
||||
def check_keys_split(self, decoded):
|
||||
"""
|
||||
Checks that dict has only the appropriate keys for orient='split'.
|
||||
"""
|
||||
bad_keys = set(decoded.keys()).difference(set(self._split_keys))
|
||||
if bad_keys:
|
||||
bad_keys = ", ".join(bad_keys)
|
||||
raise ValueError(u("JSON data had unexpected key(s): {bad_keys}")
|
||||
.format(bad_keys=pprint_thing(bad_keys)))
|
||||
|
||||
def parse(self):
|
||||
|
||||
# try numpy
|
||||
numpy = self.numpy
|
||||
if numpy:
|
||||
self._parse_numpy()
|
||||
|
||||
else:
|
||||
self._parse_no_numpy()
|
||||
|
||||
if self.obj is None:
|
||||
return None
|
||||
if self.convert_axes:
|
||||
self._convert_axes()
|
||||
self._try_convert_types()
|
||||
return self.obj
|
||||
|
||||
def _convert_axes(self):
|
||||
"""
|
||||
Try to convert axes.
|
||||
"""
|
||||
for axis in self.obj._AXIS_NUMBERS.keys():
|
||||
new_axis, result = self._try_convert_data(
|
||||
axis, self.obj._get_axis(axis), use_dtypes=False,
|
||||
convert_dates=True)
|
||||
if result:
|
||||
setattr(self.obj, axis, new_axis)
|
||||
|
||||
def _try_convert_types(self):
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
def _try_convert_data(self, name, data, use_dtypes=True,
|
||||
convert_dates=True):
|
||||
"""
|
||||
Try to parse a ndarray like into a column by inferring dtype.
|
||||
"""
|
||||
|
||||
# don't try to coerce, unless a force conversion
|
||||
if use_dtypes:
|
||||
if self.dtype is False:
|
||||
return data, False
|
||||
elif self.dtype is True:
|
||||
pass
|
||||
else:
|
||||
# dtype to force
|
||||
dtype = (self.dtype.get(name)
|
||||
if isinstance(self.dtype, dict) else self.dtype)
|
||||
if dtype is not None:
|
||||
try:
|
||||
dtype = np.dtype(dtype)
|
||||
return data.astype(dtype), True
|
||||
except (TypeError, ValueError):
|
||||
return data, False
|
||||
|
||||
if convert_dates:
|
||||
new_data, result = self._try_convert_to_date(data)
|
||||
if result:
|
||||
return new_data, True
|
||||
|
||||
result = False
|
||||
|
||||
if data.dtype == 'object':
|
||||
|
||||
# try float
|
||||
try:
|
||||
data = data.astype('float64')
|
||||
result = True
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
if data.dtype.kind == 'f':
|
||||
|
||||
if data.dtype != 'float64':
|
||||
|
||||
# coerce floats to 64
|
||||
try:
|
||||
data = data.astype('float64')
|
||||
result = True
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
# don't coerce 0-len data
|
||||
if len(data) and (data.dtype == 'float' or data.dtype == 'object'):
|
||||
|
||||
# coerce ints if we can
|
||||
try:
|
||||
new_data = data.astype('int64')
|
||||
if (new_data == data).all():
|
||||
data = new_data
|
||||
result = True
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
# coerce ints to 64
|
||||
if data.dtype == 'int':
|
||||
|
||||
# coerce ints to int64
|
||||
try:
|
||||
data = data.astype('int64')
|
||||
result = True
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
return data, result
|
||||
|
||||
def _try_convert_to_date(self, data):
|
||||
"""
|
||||
Try to parse a ndarray like into a date column.
|
||||
|
||||
Try to coerce object in epoch/iso formats and integer/float in epoch
|
||||
formats. Return a boolean if parsing was successful.
|
||||
"""
|
||||
|
||||
# no conversion on empty
|
||||
if not len(data):
|
||||
return data, False
|
||||
|
||||
new_data = data
|
||||
if new_data.dtype == 'object':
|
||||
try:
|
||||
new_data = data.astype('int64')
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
pass
|
||||
|
||||
# ignore numbers that are out of range
|
||||
if issubclass(new_data.dtype.type, np.number):
|
||||
in_range = (isna(new_data.values) | (new_data > self.min_stamp) |
|
||||
(new_data.values == iNaT))
|
||||
if not in_range.all():
|
||||
return data, False
|
||||
|
||||
date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
|
||||
for date_unit in date_units:
|
||||
try:
|
||||
new_data = to_datetime(new_data, errors='raise',
|
||||
unit=date_unit)
|
||||
except ValueError:
|
||||
continue
|
||||
except Exception:
|
||||
break
|
||||
return new_data, True
|
||||
return data, False
|
||||
|
||||
def _try_convert_dates(self):
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
|
||||
class SeriesParser(Parser):
|
||||
_default_orient = 'index'
|
||||
_split_keys = ('name', 'index', 'data')
|
||||
|
||||
def _parse_no_numpy(self):
|
||||
|
||||
json = self.json
|
||||
orient = self.orient
|
||||
if orient == "split":
|
||||
decoded = {str(k): v for k, v in compat.iteritems(
|
||||
loads(json, precise_float=self.precise_float))}
|
||||
self.check_keys_split(decoded)
|
||||
self.obj = Series(dtype=None, **decoded)
|
||||
else:
|
||||
self.obj = Series(
|
||||
loads(json, precise_float=self.precise_float), dtype=None)
|
||||
|
||||
def _parse_numpy(self):
|
||||
|
||||
json = self.json
|
||||
orient = self.orient
|
||||
if orient == "split":
|
||||
decoded = loads(json, dtype=None, numpy=True,
|
||||
precise_float=self.precise_float)
|
||||
decoded = {str(k): v for k, v in compat.iteritems(decoded)}
|
||||
self.check_keys_split(decoded)
|
||||
self.obj = Series(**decoded)
|
||||
elif orient == "columns" or orient == "index":
|
||||
self.obj = Series(*loads(json, dtype=None, numpy=True,
|
||||
labelled=True,
|
||||
precise_float=self.precise_float))
|
||||
else:
|
||||
self.obj = Series(loads(json, dtype=None, numpy=True,
|
||||
precise_float=self.precise_float))
|
||||
|
||||
def _try_convert_types(self):
|
||||
if self.obj is None:
|
||||
return
|
||||
obj, result = self._try_convert_data(
|
||||
'data', self.obj, convert_dates=self.convert_dates)
|
||||
if result:
|
||||
self.obj = obj
|
||||
|
||||
|
||||
class FrameParser(Parser):
|
||||
_default_orient = 'columns'
|
||||
_split_keys = ('columns', 'index', 'data')
|
||||
|
||||
def _parse_numpy(self):
|
||||
|
||||
json = self.json
|
||||
orient = self.orient
|
||||
|
||||
if orient == "columns":
|
||||
args = loads(json, dtype=None, numpy=True, labelled=True,
|
||||
precise_float=self.precise_float)
|
||||
if len(args):
|
||||
args = (args[0].T, args[2], args[1])
|
||||
self.obj = DataFrame(*args)
|
||||
elif orient == "split":
|
||||
decoded = loads(json, dtype=None, numpy=True,
|
||||
precise_float=self.precise_float)
|
||||
decoded = {str(k): v for k, v in compat.iteritems(decoded)}
|
||||
self.check_keys_split(decoded)
|
||||
self.obj = DataFrame(**decoded)
|
||||
elif orient == "values":
|
||||
self.obj = DataFrame(loads(json, dtype=None, numpy=True,
|
||||
precise_float=self.precise_float))
|
||||
else:
|
||||
self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
|
||||
labelled=True,
|
||||
precise_float=self.precise_float))
|
||||
|
||||
def _parse_no_numpy(self):
|
||||
|
||||
json = self.json
|
||||
orient = self.orient
|
||||
|
||||
if orient == "columns":
|
||||
self.obj = DataFrame(
|
||||
loads(json, precise_float=self.precise_float), dtype=None)
|
||||
elif orient == "split":
|
||||
decoded = {str(k): v for k, v in compat.iteritems(
|
||||
loads(json, precise_float=self.precise_float))}
|
||||
self.check_keys_split(decoded)
|
||||
self.obj = DataFrame(dtype=None, **decoded)
|
||||
elif orient == "index":
|
||||
self.obj = DataFrame(
|
||||
loads(json, precise_float=self.precise_float), dtype=None).T
|
||||
elif orient == 'table':
|
||||
self.obj = parse_table_schema(json,
|
||||
precise_float=self.precise_float)
|
||||
else:
|
||||
self.obj = DataFrame(
|
||||
loads(json, precise_float=self.precise_float), dtype=None)
|
||||
|
||||
def _process_converter(self, f, filt=None):
|
||||
"""
|
||||
Take a conversion function and possibly recreate the frame.
|
||||
"""
|
||||
|
||||
if filt is None:
|
||||
filt = lambda col, c: True
|
||||
|
||||
needs_new_obj = False
|
||||
new_obj = dict()
|
||||
for i, (col, c) in enumerate(self.obj.iteritems()):
|
||||
if filt(col, c):
|
||||
new_data, result = f(col, c)
|
||||
if result:
|
||||
c = new_data
|
||||
needs_new_obj = True
|
||||
new_obj[i] = c
|
||||
|
||||
if needs_new_obj:
|
||||
|
||||
# possibly handle dup columns
|
||||
new_obj = DataFrame(new_obj, index=self.obj.index)
|
||||
new_obj.columns = self.obj.columns
|
||||
self.obj = new_obj
|
||||
|
||||
def _try_convert_types(self):
|
||||
if self.obj is None:
|
||||
return
|
||||
if self.convert_dates:
|
||||
self._try_convert_dates()
|
||||
|
||||
self._process_converter(
|
||||
lambda col, c: self._try_convert_data(col, c, convert_dates=False))
|
||||
|
||||
def _try_convert_dates(self):
|
||||
if self.obj is None:
|
||||
return
|
||||
|
||||
# our columns to parse
|
||||
convert_dates = self.convert_dates
|
||||
if convert_dates is True:
|
||||
convert_dates = []
|
||||
convert_dates = set(convert_dates)
|
||||
|
||||
def is_ok(col):
|
||||
"""
|
||||
Return if this col is ok to try for a date parse.
|
||||
"""
|
||||
if not isinstance(col, compat.string_types):
|
||||
return False
|
||||
|
||||
col_lower = col.lower()
|
||||
if (col_lower.endswith('_at') or
|
||||
col_lower.endswith('_time') or
|
||||
col_lower == 'modified' or
|
||||
col_lower == 'date' or
|
||||
col_lower == 'datetime' or
|
||||
col_lower.startswith('timestamp')):
|
||||
return True
|
||||
return False
|
||||
|
||||
self._process_converter(
|
||||
lambda col, c: self._try_convert_to_date(c),
|
||||
lambda col, c: ((self.keep_default_dates and is_ok(col)) or
|
||||
col in convert_dates))
|
||||
@@ -1,286 +0,0 @@
|
||||
# ---------------------------------------------------------------------
|
||||
# JSON normalization routines
|
||||
|
||||
from collections import defaultdict
|
||||
import copy
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs.writers import convert_json_to_lines
|
||||
|
||||
from pandas import DataFrame, compat
|
||||
|
||||
|
||||
def _convert_to_line_delimits(s):
|
||||
"""
|
||||
Helper function that converts JSON lists to line delimited JSON.
|
||||
"""
|
||||
|
||||
    # Determine whether we have a JSON list to turn into lines; otherwise
    # return the JSON object as-is, since only lists can be converted.
    if not (s[0] == '[' and s[-1] == ']'):
        return s
    s = s[1:-1]

    return convert_json_to_lines(s)
|
||||
|
||||
|
||||
def nested_to_record(ds, prefix="", sep=".", level=0):
|
||||
"""
|
||||
A simplified json_normalize.
|
||||
|
||||
Converts a nested dict into a flat dict ("record"), unlike json_normalize,
|
||||
it does not attempt to extract a subset of the data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ds : dict or list of dicts
|
||||
prefix: the prefix, optional, default: ""
|
||||
sep : string, default '.'
|
||||
Nested records will generate names separated by sep,
|
||||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
level: the number of levels in the json string, optional, default: 0
|
||||
|
||||
Returns
|
||||
-------
|
||||
d - dict or list of dicts, matching `ds`
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
|
||||
nested=dict(e=dict(c=1,d=2),d=2)))
|
||||
Out[52]:
|
||||
{'dict1.c': 1,
|
||||
'dict1.d': 2,
|
||||
'flat1': 1,
|
||||
'nested.d': 2,
|
||||
'nested.e.c': 1,
|
||||
'nested.e.d': 2}
|
||||
"""
|
||||
singleton = False
|
||||
if isinstance(ds, dict):
|
||||
ds = [ds]
|
||||
singleton = True
|
||||
|
||||
new_ds = []
|
||||
for d in ds:
|
||||
|
||||
new_d = copy.deepcopy(d)
|
||||
for k, v in d.items():
|
||||
# each key gets renamed with prefix
|
||||
if not isinstance(k, compat.string_types):
|
||||
k = str(k)
|
||||
if level == 0:
|
||||
newkey = k
|
||||
else:
|
||||
newkey = prefix + sep + k
|
||||
|
||||
# only dicts get recurse-flattened
|
||||
# only at level>1 do we rename the rest of the keys
|
||||
if not isinstance(v, dict):
|
||||
if level != 0: # so we skip copying for top level, common case
|
||||
v = new_d.pop(k)
|
||||
new_d[newkey] = v
|
||||
continue
|
||||
else:
|
||||
v = new_d.pop(k)
|
||||
new_d.update(nested_to_record(v, newkey, sep, level + 1))
|
||||
new_ds.append(new_d)
|
||||
|
||||
if singleton:
|
||||
return new_ds[0]
|
||||
return new_ds
|
||||
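# Illustrative usage (added note, not part of the original module): the
# ``sep`` argument controls how nested keys are joined when flattening.
#
#     nested_to_record({'a': {'b': 1, 'c': 2}}, sep='_')
#     # -> {'a_b': 1, 'a_c': 2}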
|
||||
|
||||
def json_normalize(data, record_path=None, meta=None,
|
||||
meta_prefix=None,
|
||||
record_prefix=None,
|
||||
errors='raise',
|
||||
sep='.'):
|
||||
"""
|
||||
Normalize semi-structured JSON data into a flat table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : dict or list of dicts
|
||||
Unserialized JSON objects
|
||||
record_path : string or list of strings, default None
|
||||
Path in each object to list of records. If not passed, data will be
|
||||
assumed to be an array of records
|
||||
meta : list of paths (string or list of strings), default None
|
||||
Fields to use as metadata for each record in resulting table
|
||||
meta_prefix : string, default None
|
||||
record_prefix : string, default None
|
||||
If not None, prefix records with dotted path, e.g. foo.bar.field if
|
||||
path to records is ['foo', 'bar']
|
||||
errors : {'raise', 'ignore'}, default 'raise'
|
||||
|
||||
* 'ignore' : will ignore KeyError if keys listed in meta are not
|
||||
always present
|
||||
* 'raise' : will raise KeyError if keys listed in meta are not
|
||||
always present
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
sep : string, default '.'
|
||||
Nested records will generate names separated by sep,
|
||||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
frame : DataFrame
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> from pandas.io.json import json_normalize
|
||||
>>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
|
||||
... {'name': {'given': 'Mose', 'family': 'Regner'}},
|
||||
... {'id': 2, 'name': 'Faye Raker'}]
|
||||
>>> json_normalize(data)
|
||||
id name name.family name.first name.given name.last
|
||||
0 1.0 NaN NaN Coleen NaN Volk
|
||||
1 NaN NaN Regner NaN Mose NaN
|
||||
2 2.0 Faye Raker NaN NaN NaN NaN
|
||||
|
||||
>>> data = [{'state': 'Florida',
|
||||
... 'shortname': 'FL',
|
||||
... 'info': {
|
||||
... 'governor': 'Rick Scott'
|
||||
... },
|
||||
... 'counties': [{'name': 'Dade', 'population': 12345},
|
||||
... {'name': 'Broward', 'population': 40000},
|
||||
... {'name': 'Palm Beach', 'population': 60000}]},
|
||||
... {'state': 'Ohio',
|
||||
... 'shortname': 'OH',
|
||||
... 'info': {
|
||||
... 'governor': 'John Kasich'
|
||||
... },
|
||||
... 'counties': [{'name': 'Summit', 'population': 1234},
|
||||
... {'name': 'Cuyahoga', 'population': 1337}]}]
|
||||
>>> result = json_normalize(data, 'counties', ['state', 'shortname',
|
||||
... ['info', 'governor']])
|
||||
>>> result
|
||||
name population info.governor state shortname
|
||||
0 Dade 12345 Rick Scott Florida FL
|
||||
1 Broward 40000 Rick Scott Florida FL
|
||||
2 Palm Beach 60000 Rick Scott Florida FL
|
||||
3 Summit 1234 John Kasich Ohio OH
|
||||
4 Cuyahoga 1337 John Kasich Ohio OH
|
||||
|
||||
>>> data = {'A': [1, 2]}
|
||||
>>> json_normalize(data, 'A', record_prefix='Prefix.')
|
||||
Prefix.0
|
||||
0 1
|
||||
1 2
|
||||
"""
|
||||
def _pull_field(js, spec):
|
||||
result = js
|
||||
if isinstance(spec, list):
|
||||
for field in spec:
|
||||
result = result[field]
|
||||
else:
|
||||
result = result[spec]
|
||||
|
||||
return result
|
||||
|
||||
if isinstance(data, list) and not data:
|
||||
return DataFrame()
|
||||
|
||||
# A bit of a hackjob
|
||||
if isinstance(data, dict):
|
||||
data = [data]
|
||||
|
||||
if record_path is None:
|
||||
if any([isinstance(x, dict)
|
||||
for x in compat.itervalues(y)] for y in data):
|
||||
# naive normalization, this is idempotent for flat records
|
||||
# and potentially will inflate the data considerably for
|
||||
# deeply nested structures:
|
||||
# {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
|
||||
#
|
||||
# TODO: handle record value which are lists, at least error
|
||||
# reasonably
|
||||
data = nested_to_record(data, sep=sep)
|
||||
return DataFrame(data)
|
||||
elif not isinstance(record_path, list):
|
||||
record_path = [record_path]
|
||||
|
||||
if meta is None:
|
||||
meta = []
|
||||
elif not isinstance(meta, list):
|
||||
meta = [meta]
|
||||
|
||||
meta = [m if isinstance(m, list) else [m] for m in meta]
|
||||
|
||||
# Disastrously inefficient for now
|
||||
records = []
|
||||
lengths = []
|
||||
|
||||
meta_vals = defaultdict(list)
|
||||
if not isinstance(sep, compat.string_types):
|
||||
sep = str(sep)
|
||||
meta_keys = [sep.join(val) for val in meta]
|
||||
|
||||
def _recursive_extract(data, path, seen_meta, level=0):
|
||||
if isinstance(data, dict):
|
||||
data = [data]
|
||||
if len(path) > 1:
|
||||
for obj in data:
|
||||
for val, key in zip(meta, meta_keys):
|
||||
if level + 1 == len(val):
|
||||
seen_meta[key] = _pull_field(obj, val[-1])
|
||||
|
||||
_recursive_extract(obj[path[0]], path[1:],
|
||||
seen_meta, level=level + 1)
|
||||
else:
|
||||
for obj in data:
|
||||
recs = _pull_field(obj, path[0])
|
||||
|
||||
# For repeating the metadata later
|
||||
lengths.append(len(recs))
|
||||
|
||||
for val, key in zip(meta, meta_keys):
|
||||
if level + 1 > len(val):
|
||||
meta_val = seen_meta[key]
|
||||
else:
|
||||
try:
|
||||
meta_val = _pull_field(obj, val[level:])
|
||||
except KeyError as e:
|
||||
if errors == 'ignore':
|
||||
meta_val = np.nan
|
||||
else:
|
||||
raise KeyError("Try running with "
|
||||
"errors='ignore' as key "
|
||||
"{err} is not always present"
|
||||
.format(err=e))
|
||||
meta_vals[key].append(meta_val)
|
||||
|
||||
records.extend(recs)
|
||||
|
||||
_recursive_extract(data, record_path, {}, level=0)
|
||||
|
||||
result = DataFrame(records)
|
||||
|
||||
if record_prefix is not None:
|
||||
result = result.rename(
|
||||
columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))
|
||||
|
||||
# Data types, a problem
|
||||
for k, v in compat.iteritems(meta_vals):
|
||||
if meta_prefix is not None:
|
||||
k = meta_prefix + k
|
||||
|
||||
if k in result:
|
||||
raise ValueError('Conflicting metadata name {name}, '
|
||||
'need distinguishing prefix '.format(name=k))
|
||||
|
||||
result[k] = np.array(v).repeat(lengths)
|
||||
|
||||
return result
|
||||
@@ -1,325 +0,0 @@
|
||||
"""
|
||||
Table Schema builders
|
||||
|
||||
http://specs.frictionlessdata.io/json-table-schema/
|
||||
"""
|
||||
import warnings
|
||||
|
||||
import pandas._libs.json as json
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
|
||||
is_datetime64tz_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype,
|
||||
is_string_dtype, is_timedelta64_dtype)
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.api.types import CategoricalDtype
|
||||
import pandas.core.common as com
|
||||
|
||||
loads = json.loads
|
||||
|
||||
|
||||
def as_json_table_type(x):
|
||||
"""
|
||||
Convert a NumPy / pandas type to its corresponding json_table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array or dtype
|
||||
|
||||
Returns
|
||||
-------
|
||||
t : str
|
||||
the Table Schema data types
|
||||
|
||||
Notes
|
||||
-----
|
||||
This table shows the relationship between NumPy / pandas dtypes,
|
||||
and Table Schema dtypes.
|
||||
|
||||
    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           str
    categorical      any
    ===============  =================
|
||||
"""
|
||||
if is_integer_dtype(x):
|
||||
return 'integer'
|
||||
elif is_bool_dtype(x):
|
||||
return 'boolean'
|
||||
elif is_numeric_dtype(x):
|
||||
return 'number'
|
||||
elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
|
||||
is_period_dtype(x)):
|
||||
return 'datetime'
|
||||
elif is_timedelta64_dtype(x):
|
||||
return 'duration'
|
||||
elif is_categorical_dtype(x):
|
||||
return 'any'
|
||||
elif is_string_dtype(x):
|
||||
return 'string'
|
||||
else:
|
||||
return 'any'
|
||||
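# Quick sketch of the mapping above (added for illustration; assumes numpy
# and pandas are importable, as in the rest of this module):
#
#     import numpy as np
#     as_json_table_type(np.dtype('int64'))           # -> 'integer'
#     as_json_table_type(np.dtype('datetime64[ns]'))  # -> 'datetime'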
|
||||
|
||||
def set_default_names(data):
|
||||
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
|
||||
if com._all_not_none(*data.index.names):
|
||||
nms = data.index.names
|
||||
if len(nms) == 1 and data.index.name == 'index':
|
||||
warnings.warn("Index name of 'index' is not round-trippable")
|
||||
elif len(nms) > 1 and any(x.startswith('level_') for x in nms):
|
||||
warnings.warn("Index names beginning with 'level_' are not "
|
||||
"round-trippable")
|
||||
return data
|
||||
|
||||
data = data.copy()
|
||||
if data.index.nlevels > 1:
|
||||
names = [name if name is not None else 'level_{}'.format(i)
|
||||
for i, name in enumerate(data.index.names)]
|
||||
data.index.names = names
|
||||
else:
|
||||
data.index.name = data.index.name or 'index'
|
||||
return data
|
||||
|
||||
|
||||
def convert_pandas_type_to_json_field(arr, dtype=None):
|
||||
dtype = dtype or arr.dtype
|
||||
if arr.name is None:
|
||||
name = 'values'
|
||||
else:
|
||||
name = arr.name
|
||||
field = {'name': name,
|
||||
'type': as_json_table_type(dtype)}
|
||||
|
||||
if is_categorical_dtype(arr):
|
||||
if hasattr(arr, 'categories'):
|
||||
cats = arr.categories
|
||||
ordered = arr.ordered
|
||||
else:
|
||||
cats = arr.cat.categories
|
||||
ordered = arr.cat.ordered
|
||||
field['constraints'] = {"enum": list(cats)}
|
||||
field['ordered'] = ordered
|
||||
elif is_period_dtype(arr):
|
||||
field['freq'] = arr.freqstr
|
||||
elif is_datetime64tz_dtype(arr):
|
||||
if hasattr(arr, 'dt'):
|
||||
field['tz'] = arr.dt.tz.zone
|
||||
else:
|
||||
field['tz'] = arr.tz.zone
|
||||
return field
|
||||
|
||||
|
||||
def convert_json_field_to_pandas_type(field):
|
||||
"""
|
||||
Converts a JSON field descriptor into its corresponding NumPy / pandas type
|
||||
|
||||
Parameters
|
||||
----------
|
||||
field
|
||||
A JSON field descriptor
|
||||
|
||||
Returns
|
||||
-------
|
||||
dtype
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the type of the provided field is unknown or currently unsupported
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> convert_json_field_to_pandas_type({'name': 'an_int',
|
||||
'type': 'integer'})
|
||||
'int64'
|
||||
>>> convert_json_field_to_pandas_type({'name': 'a_categorical',
|
||||
'type': 'any',
|
||||
'constraints': {'enum': [
|
||||
'a', 'b', 'c']},
|
||||
'ordered': True})
|
||||
'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
|
||||
>>> convert_json_field_to_pandas_type({'name': 'a_datetime',
|
||||
'type': 'datetime'})
|
||||
'datetime64[ns]'
|
||||
>>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
|
||||
'type': 'datetime',
|
||||
'tz': 'US/Central'})
|
||||
'datetime64[ns, US/Central]'
|
||||
"""
|
||||
typ = field['type']
|
||||
if typ == 'string':
|
||||
return 'object'
|
||||
elif typ == 'integer':
|
||||
return 'int64'
|
||||
elif typ == 'number':
|
||||
return 'float64'
|
||||
elif typ == 'boolean':
|
||||
return 'bool'
|
||||
elif typ == 'duration':
|
||||
return 'timedelta64'
|
||||
elif typ == 'datetime':
|
||||
if field.get('tz'):
|
||||
return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
|
||||
else:
|
||||
return 'datetime64[ns]'
|
||||
elif typ == 'any':
|
||||
if 'constraints' in field and 'ordered' in field:
|
||||
return CategoricalDtype(categories=field['constraints']['enum'],
|
||||
ordered=field['ordered'])
|
||||
else:
|
||||
return 'object'
|
||||
|
||||
raise ValueError("Unsupported or invalid field type: {}".format(typ))
|
||||
|
||||
|
||||
def build_table_schema(data, index=True, primary_key=None, version=True):
|
||||
"""
|
||||
Create a Table schema from ``data``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Series, DataFrame
|
||||
index : bool, default True
|
||||
Whether to include ``data.index`` in the schema.
|
||||
primary_key : bool or None, default None
|
||||
column names to designate as the primary key.
|
||||
The default `None` will set `'primaryKey'` to the index
|
||||
level or levels if the index is unique.
|
||||
version : bool, default True
|
||||
Whether to include a field `pandas_version` with the version
|
||||
of pandas that generated the schema.
|
||||
|
||||
Returns
|
||||
-------
|
||||
schema : dict
|
||||
|
||||
Notes
|
||||
-----
|
||||
See `as_json_table_type` for conversion types.
|
||||
Timedeltas are converted to ISO 8601 duration format with
|
||||
9 decimal places after the seconds field for nanosecond precision.
|
||||
|
||||
Categoricals are converted to the `any` dtype, and use the `enum` field
|
||||
constraint to list the allowed values. The `ordered` attribute is included
|
||||
in an `ordered` field.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = pd.DataFrame(
|
||||
... {'A': [1, 2, 3],
|
||||
... 'B': ['a', 'b', 'c'],
|
||||
... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
|
||||
... }, index=pd.Index(range(3), name='idx'))
|
||||
>>> build_table_schema(df)
|
||||
{'fields': [{'name': 'idx', 'type': 'integer'},
|
||||
{'name': 'A', 'type': 'integer'},
|
||||
{'name': 'B', 'type': 'string'},
|
||||
{'name': 'C', 'type': 'datetime'}],
|
||||
'pandas_version': '0.20.0',
|
||||
'primaryKey': ['idx']}
|
||||
"""
|
||||
if index is True:
|
||||
data = set_default_names(data)
|
||||
|
||||
schema = {}
|
||||
fields = []
|
||||
|
||||
if index:
|
||||
if data.index.nlevels > 1:
|
||||
for level in data.index.levels:
|
||||
fields.append(convert_pandas_type_to_json_field(level))
|
||||
else:
|
||||
fields.append(convert_pandas_type_to_json_field(data.index))
|
||||
|
||||
if data.ndim > 1:
|
||||
for column, s in data.iteritems():
|
||||
fields.append(convert_pandas_type_to_json_field(s))
|
||||
else:
|
||||
fields.append(convert_pandas_type_to_json_field(data))
|
||||
|
||||
schema['fields'] = fields
|
||||
if index and data.index.is_unique and primary_key is None:
|
||||
if data.index.nlevels == 1:
|
||||
schema['primaryKey'] = [data.index.name]
|
||||
else:
|
||||
schema['primaryKey'] = data.index.names
|
||||
elif primary_key is not None:
|
||||
schema['primaryKey'] = primary_key
|
||||
|
||||
if version:
|
||||
schema['pandas_version'] = '0.20.0'
|
||||
return schema
|
||||
|
||||
|
||||
def parse_table_schema(json, precise_float):
|
||||
"""
|
||||
Builds a DataFrame from a given schema
|
||||
|
||||
Parameters
|
||||
----------
|
||||
json : str
|
||||
A JSON table schema
|
||||
precise_float : boolean
|
||||
Flag controlling precision when decoding string to double values, as
|
||||
dictated by ``read_json``
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
|
||||
Raises
|
||||
------
|
||||
NotImplementedError
|
||||
If the JSON table schema contains either timezone or timedelta data
|
||||
|
||||
Notes
|
||||
-----
|
||||
Because :func:`DataFrame.to_json` uses the string 'index' to denote a
|
||||
name-less :class:`Index`, this function sets the name of the returned
|
||||
:class:`DataFrame` to ``None`` when said string is encountered with a
|
||||
normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
|
||||
applies to any strings beginning with 'level_'. Therefore, an
|
||||
:class:`Index` name of 'index' and :class:`MultiIndex` names starting
|
||||
with 'level_' are not supported.
|
||||
|
||||
See Also
|
||||
--------
|
||||
build_table_schema : Inverse function.
|
||||
pandas.read_json
|
||||
"""
|
||||
table = loads(json, precise_float=precise_float)
|
||||
col_order = [field['name'] for field in table['schema']['fields']]
|
||||
df = DataFrame(table['data'], columns=col_order)[col_order]
|
||||
|
||||
dtypes = {field['name']: convert_json_field_to_pandas_type(field)
|
||||
for field in table['schema']['fields']}
|
||||
|
||||
# Cannot directly use as_type with timezone data on object; raise for now
|
||||
if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
|
||||
raise NotImplementedError('table="orient" can not yet read timezone '
|
||||
'data')
|
||||
|
||||
# No ISO constructor for Timedelta as of yet, so need to raise
|
||||
if 'timedelta64' in dtypes.values():
|
||||
raise NotImplementedError('table="orient" can not yet read '
|
||||
'ISO-formatted Timedelta data')
|
||||
|
||||
df = df.astype(dtypes)
|
||||
|
||||
df = df.set_index(table['schema']['primaryKey'])
|
||||
if len(df.index.names) == 1:
|
||||
if df.index.name == 'index':
|
||||
df.index.name = None
|
||||
else:
|
||||
df.index.names = [None if x.startswith('level_') else x for x in
|
||||
df.index.names]
|
||||
|
||||
return df
|
||||
@@ -1,50 +0,0 @@
|
||||
# coding: utf-8
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
from pandas.io.msgpack.exceptions import * # noqa
|
||||
from pandas.io.msgpack._version import version # noqa
|
||||
|
||||
|
||||
class ExtType(namedtuple('ExtType', 'code data')):
    """ExtType represents ext type in msgpack."""
    def __new__(cls, code, data):
        if not isinstance(code, int):
            raise TypeError("code must be int")
        if not isinstance(data, bytes):
            raise TypeError("data must be bytes")
        if not 0 <= code <= 127:
            raise ValueError("code must be 0~127")
        return super(ExtType, cls).__new__(cls, code, data)


import os  # noqa

from pandas.io.msgpack._packer import Packer  # noqa
from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker  # noqa


def pack(o, stream, **kwargs):
    """
    Pack object `o` and write it to `stream`

    See :class:`Packer` for options.
    """
    packer = Packer(**kwargs)
    stream.write(packer.pack(o))


def packb(o, **kwargs):
    """
    Pack object `o` and return packed bytes

    See :class:`Packer` for options.
    """
    return Packer(**kwargs).pack(o)
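# Usage sketch (added for illustration; not in the original file). packb
# round-trips Python objects through msgpack bytes; see Packer/Unpacker for
# the available options.
#
#     from pandas.io.msgpack import packb, unpackb
#     payload = packb({'a': 1, 'b': [1, 2, 3]})  # -> bytes
#     unpackb(payload)                           # decode back to Python objects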
|
||||
|
||||
# alias for compatibility to simplejson/marshal/pickle.
|
||||
load = unpack
|
||||
loads = unpackb
|
||||
|
||||
dump = pack
|
||||
dumps = packb
|
||||
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1 +0,0 @@
|
||||
version = (0, 4, 6)
|
||||
@@ -1,32 +0,0 @@
|
||||
class UnpackException(Exception):
    pass


class BufferFull(UnpackException):
    pass


class OutOfData(UnpackException):
    pass


class UnpackValueError(UnpackException, ValueError):
    pass


class ExtraData(ValueError):

    def __init__(self, unpacked, extra):
        self.unpacked = unpacked
        self.extra = extra

    def __str__(self):
        return "unpack(b) received extra data."


class PackException(Exception):
    pass


class PackValueError(PackException, ValueError):
    pass
@@ -1,830 +0,0 @@
|
||||
"""
|
||||
Msgpack serializer support for reading and writing pandas data structures
|
||||
to disk
|
||||
|
||||
portions of msgpack_numpy package, by Lev Givon were incorporated
|
||||
into this module (and tests_packers.py)
|
||||
|
||||
License
|
||||
=======
|
||||
|
||||
Copyright (c) 2013, Lev Givon.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following
|
||||
disclaimer in the documentation and/or other materials provided
|
||||
with the distribution.
|
||||
* Neither the name of Lev Givon nor the names of any
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
"""
|
||||
|
||||
from datetime import date, datetime, timedelta
|
||||
import os
|
||||
from textwrap import dedent
|
||||
import warnings
|
||||
|
||||
from dateutil.parser import parse
|
||||
import numpy as np
|
||||
|
||||
import pandas.compat as compat
|
||||
from pandas.compat import u, u_safe
|
||||
from pandas.errors import PerformanceWarning
|
||||
from pandas.util._move import (
|
||||
BadMove as _BadMove, move_into_mutable_buffer as _move_into_mutable_buffer)
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype,
|
||||
needs_i8_conversion, pandas_dtype)
|
||||
|
||||
from pandas import ( # noqa:F401
|
||||
Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
|
||||
Index, Int64Index, Interval, IntervalIndex, MultiIndex, NaT, Panel, Period,
|
||||
PeriodIndex, RangeIndex, Series, TimedeltaIndex, Timestamp)
|
||||
from pandas.core import internals
|
||||
from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray
|
||||
from pandas.core.arrays.sparse import BlockIndex, IntIndex
|
||||
from pandas.core.generic import NDFrame
|
||||
from pandas.core.internals import BlockManager, _safe_reshape, make_block
|
||||
from pandas.core.sparse.api import SparseDataFrame, SparseSeries
|
||||
|
||||
from pandas.io.common import _stringify_path, get_filepath_or_buffer
|
||||
from pandas.io.msgpack import ExtType, Packer as _Packer, Unpacker as _Unpacker
|
||||
|
||||
# check which compression libs we have installed
|
||||
try:
|
||||
import zlib
|
||||
|
||||
def _check_zlib():
|
||||
pass
|
||||
except ImportError:
|
||||
def _check_zlib():
|
||||
raise ImportError('zlib is not installed')
|
||||
|
||||
_check_zlib.__doc__ = dedent(
|
||||
"""\
|
||||
Check if zlib is installed.
|
||||
|
||||
Raises
|
||||
------
|
||||
ImportError
|
||||
Raised when zlib is not installed.
|
||||
""",
|
||||
)
|
||||
|
||||
try:
|
||||
import blosc
|
||||
|
||||
def _check_blosc():
|
||||
pass
|
||||
except ImportError:
|
||||
def _check_blosc():
|
||||
raise ImportError('blosc is not installed')
|
||||
|
||||
_check_blosc.__doc__ = dedent(
|
||||
"""\
|
||||
Check if blosc is installed.
|
||||
|
||||
Raises
|
||||
------
|
||||
ImportError
|
||||
Raised when blosc is not installed.
|
||||
""",
|
||||
)
|
||||
|
||||
# until we can pass this into our conversion functions,
|
||||
# this is pretty hacky
|
||||
compressor = None
|
||||
|
||||
|
||||
def to_msgpack(path_or_buf, *args, **kwargs):
    """
    msgpack (serialize) object to input file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string File path, buffer-like, or None
        if None, return generated string
    args : an object or objects to serialize
    encoding : encoding for unicode objects
    append : boolean whether to append to an existing msgpack
        (default is False)
    compress : type of compressor (zlib or blosc), default to None (no
        compression)
    """
    global compressor
    compressor = kwargs.pop('compress', None)
    if compressor:
        compressor = u(compressor)
    append = kwargs.pop('append', None)
    if append:
        mode = 'a+b'
    else:
        mode = 'wb'

    def writer(fh):
        for a in args:
            fh.write(pack(a, **kwargs))

    path_or_buf = _stringify_path(path_or_buf)
    if isinstance(path_or_buf, compat.string_types):
        with open(path_or_buf, mode) as fh:
            writer(fh)
    elif path_or_buf is None:
        buf = compat.BytesIO()
        writer(buf)
        return buf.getvalue()
    else:
        writer(path_or_buf)
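# Minimal round-trip sketch (added for illustration; 'frame.msg' is a
# hypothetical local path, and read_msgpack is defined further below):
#
#     import pandas as pd
#     df = pd.DataFrame({'a': [1, 2, 3]})
#     to_msgpack('frame.msg', df, compress='zlib')
#     df2 = read_msgpack('frame.msg')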
|
||||
|
||||
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
|
||||
"""
|
||||
Load msgpack pandas object from the specified
|
||||
file path
|
||||
|
||||
THIS IS AN EXPERIMENTAL LIBRARY and the storage format
|
||||
may not be stable until a future release.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf : string File path, BytesIO like or string
|
||||
encoding : Encoding for decoding msgpack str type
|
||||
iterator : boolean, if True, return an iterator to the unpacker
|
||||
(default is False)
|
||||
|
||||
Returns
|
||||
-------
|
||||
obj : same type as object stored in file
|
||||
"""
|
||||
path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
|
||||
if iterator:
|
||||
return Iterator(path_or_buf)
|
||||
|
||||
def read(fh):
|
||||
unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs))
|
||||
if len(unpacked_obj) == 1:
|
||||
return unpacked_obj[0]
|
||||
|
||||
if should_close:
|
||||
try:
|
||||
path_or_buf.close()
|
||||
except IOError:
|
||||
pass
|
||||
return unpacked_obj
|
||||
|
||||
# see if we have an actual file
|
||||
if isinstance(path_or_buf, compat.string_types):
|
||||
try:
|
||||
exists = os.path.exists(path_or_buf)
|
||||
except (TypeError, ValueError):
|
||||
exists = False
|
||||
|
||||
if exists:
|
||||
with open(path_or_buf, 'rb') as fh:
|
||||
return read(fh)
|
||||
|
||||
if isinstance(path_or_buf, compat.binary_type):
|
||||
# treat as a binary-like
|
||||
fh = None
|
||||
try:
|
||||
# We can't distinguish between a path and a buffer of bytes in
|
||||
# Python 2 so instead assume the first byte of a valid path is
|
||||
# less than 0x80.
|
||||
if compat.PY3 or ord(path_or_buf[0]) >= 0x80:
|
||||
fh = compat.BytesIO(path_or_buf)
|
||||
return read(fh)
|
||||
finally:
|
||||
if fh is not None:
|
||||
fh.close()
|
||||
elif hasattr(path_or_buf, 'read') and compat.callable(path_or_buf.read):
|
||||
# treat as a buffer like
|
||||
return read(path_or_buf)
|
||||
|
||||
raise ValueError('path_or_buf needs to be a string file path or file-like')
|
||||
|
||||
|
||||
dtype_dict = {21: np.dtype('M8[ns]'),
|
||||
u('datetime64[ns]'): np.dtype('M8[ns]'),
|
||||
u('datetime64[us]'): np.dtype('M8[us]'),
|
||||
22: np.dtype('m8[ns]'),
|
||||
u('timedelta64[ns]'): np.dtype('m8[ns]'),
|
||||
u('timedelta64[us]'): np.dtype('m8[us]'),
|
||||
|
||||
# this is platform int, which we need to remap to np.int64
|
||||
# for compat on windows platforms
|
||||
7: np.dtype('int64'),
|
||||
'category': 'category'
|
||||
}
|
||||
|
||||
|
||||
def dtype_for(t):
|
||||
""" return my dtype mapping, whether number or name """
|
||||
if t in dtype_dict:
|
||||
return dtype_dict[t]
|
||||
return np.typeDict.get(t, t)
|
||||
|
||||
|
||||
c2f_dict = {'complex': np.float64,
|
||||
'complex128': np.float64,
|
||||
'complex64': np.float32}
|
||||
|
||||
# windows (32 bit) compat
|
||||
if hasattr(np, 'float128'):
|
||||
c2f_dict['complex256'] = np.float128
|
||||
|
||||
|
||||
def c2f(r, i, ctype_name):
|
||||
"""
|
||||
Convert strings to complex number instance with specified numpy type.
|
||||
"""
|
||||
|
||||
ftype = c2f_dict[ctype_name]
|
||||
return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i))
|
||||
|
||||
|
||||
def convert(values):
|
||||
""" convert the numpy values to a list """
|
||||
|
||||
dtype = values.dtype
|
||||
|
||||
if is_categorical_dtype(values):
|
||||
return values
|
||||
|
||||
elif is_object_dtype(dtype):
|
||||
return values.ravel().tolist()
|
||||
|
||||
if needs_i8_conversion(dtype):
|
||||
values = values.view('i8')
|
||||
v = values.ravel()
|
||||
|
||||
if compressor == 'zlib':
|
||||
_check_zlib()
|
||||
|
||||
# return string arrays like they are
|
||||
if dtype == np.object_:
|
||||
return v.tolist()
|
||||
|
||||
# convert to a bytes array
|
||||
v = v.tostring()
|
||||
return ExtType(0, zlib.compress(v))
|
||||
|
||||
elif compressor == 'blosc':
|
||||
_check_blosc()
|
||||
|
||||
# return string arrays like they are
|
||||
if dtype == np.object_:
|
||||
return v.tolist()
|
||||
|
||||
# convert to a bytes array
|
||||
v = v.tostring()
|
||||
return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))
|
||||
|
||||
# ndarray (on original dtype)
|
||||
return ExtType(0, v.tostring())
|
||||
|
||||
|
||||
def unconvert(values, dtype, compress=None):
|
||||
|
||||
as_is_ext = isinstance(values, ExtType) and values.code == 0
|
||||
|
||||
if as_is_ext:
|
||||
values = values.data
|
||||
|
||||
if is_categorical_dtype(dtype):
|
||||
return values
|
||||
|
||||
elif is_object_dtype(dtype):
|
||||
return np.array(values, dtype=object)
|
||||
|
||||
dtype = pandas_dtype(dtype).base
|
||||
|
||||
if not as_is_ext:
|
||||
values = values.encode('latin1')
|
||||
|
||||
if compress:
|
||||
if compress == u'zlib':
|
||||
_check_zlib()
|
||||
decompress = zlib.decompress
|
||||
elif compress == u'blosc':
|
||||
_check_blosc()
|
||||
decompress = blosc.decompress
|
||||
else:
|
||||
raise ValueError("compress must be one of 'zlib' or 'blosc'")
|
||||
|
||||
try:
|
||||
return np.frombuffer(
|
||||
_move_into_mutable_buffer(decompress(values)),
|
||||
dtype=dtype,
|
||||
)
|
||||
except _BadMove as e:
|
||||
# Pull the decompressed data off of the `_BadMove` exception.
|
||||
# We don't just store this in the locals because we want to
|
||||
# minimize the risk of giving users access to a `bytes` object
|
||||
# whose data is also given to a mutable buffer.
|
||||
values = e.args[0]
|
||||
if len(values) > 1:
|
||||
# The empty string and single characters are memoized in many
|
||||
# string creating functions in the capi. This case should not
|
||||
# warn even though we need to make a copy because we are only
|
||||
# copying at most 1 byte.
|
||||
warnings.warn(
|
||||
'copying data after decompressing; this may mean that'
|
||||
' decompress is caching its result',
|
||||
PerformanceWarning,
|
||||
)
|
||||
# fall through to copying via np.frombuffer below
|
||||
|
||||
# Copy the bytes into a numpy array.
|
||||
buf = np.frombuffer(values, dtype=dtype)
|
||||
buf = buf.copy() # required to not mutate the original data
|
||||
buf.flags.writeable = True
|
||||
return buf
|
||||
|
||||
|
||||
def encode(obj):
|
||||
"""
|
||||
Data encoder
|
||||
"""
|
||||
tobj = type(obj)
|
||||
if isinstance(obj, Index):
|
||||
if isinstance(obj, RangeIndex):
|
||||
return {u'typ': u'range_index',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'name': getattr(obj, 'name', None),
|
||||
u'start': getattr(obj, '_start', None),
|
||||
u'stop': getattr(obj, '_stop', None),
|
||||
u'step': getattr(obj, '_step', None)}
|
||||
elif isinstance(obj, PeriodIndex):
|
||||
return {u'typ': u'period_index',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'name': getattr(obj, 'name', None),
|
||||
u'freq': u_safe(getattr(obj, 'freqstr', None)),
|
||||
u'dtype': u(obj.dtype.name),
|
||||
u'data': convert(obj.asi8),
|
||||
u'compress': compressor}
|
||||
elif isinstance(obj, DatetimeIndex):
|
||||
tz = getattr(obj, 'tz', None)
|
||||
|
||||
# store tz info and data as UTC
|
||||
if tz is not None:
|
||||
tz = u(tz.zone)
|
||||
obj = obj.tz_convert('UTC')
|
||||
return {u'typ': u'datetime_index',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'name': getattr(obj, 'name', None),
|
||||
u'dtype': u(obj.dtype.name),
|
||||
u'data': convert(obj.asi8),
|
||||
u'freq': u_safe(getattr(obj, 'freqstr', None)),
|
||||
u'tz': tz,
|
||||
u'compress': compressor}
|
||||
elif isinstance(obj, (IntervalIndex, IntervalArray)):
|
||||
if isinstance(obj, IntervalIndex):
|
||||
typ = u'interval_index'
|
||||
else:
|
||||
typ = u'interval_array'
|
||||
return {u'typ': typ,
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'name': getattr(obj, 'name', None),
|
||||
u'left': getattr(obj, 'left', None),
|
||||
u'right': getattr(obj, 'right', None),
|
||||
u'closed': getattr(obj, 'closed', None)}
|
||||
elif isinstance(obj, MultiIndex):
|
||||
return {u'typ': u'multi_index',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'names': getattr(obj, 'names', None),
|
||||
u'dtype': u(obj.dtype.name),
|
||||
u'data': convert(obj.values),
|
||||
u'compress': compressor}
|
||||
else:
|
||||
return {u'typ': u'index',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'name': getattr(obj, 'name', None),
|
||||
u'dtype': u(obj.dtype.name),
|
||||
u'data': convert(obj.values),
|
||||
u'compress': compressor}
|
||||
|
||||
elif isinstance(obj, Categorical):
|
||||
return {u'typ': u'category',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'name': getattr(obj, 'name', None),
|
||||
u'codes': obj.codes,
|
||||
u'categories': obj.categories,
|
||||
u'ordered': obj.ordered,
|
||||
u'compress': compressor}
|
||||
|
||||
elif isinstance(obj, Series):
|
||||
if isinstance(obj, SparseSeries):
|
||||
raise NotImplementedError(
|
||||
'msgpack sparse series is not implemented'
|
||||
)
|
||||
# d = {'typ': 'sparse_series',
|
||||
# 'klass': obj.__class__.__name__,
|
||||
# 'dtype': obj.dtype.name,
|
||||
# 'index': obj.index,
|
||||
# 'sp_index': obj.sp_index,
|
||||
# 'sp_values': convert(obj.sp_values),
|
||||
# 'compress': compressor}
|
||||
# for f in ['name', 'fill_value', 'kind']:
|
||||
# d[f] = getattr(obj, f, None)
|
||||
# return d
|
||||
else:
|
||||
return {u'typ': u'series',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'name': getattr(obj, 'name', None),
|
||||
u'index': obj.index,
|
||||
u'dtype': u(obj.dtype.name),
|
||||
u'data': convert(obj.values),
|
||||
u'compress': compressor}
|
||||
elif issubclass(tobj, NDFrame):
|
||||
if isinstance(obj, SparseDataFrame):
|
||||
raise NotImplementedError(
|
||||
'msgpack sparse frame is not implemented'
|
||||
)
|
||||
# d = {'typ': 'sparse_dataframe',
|
||||
# 'klass': obj.__class__.__name__,
|
||||
# 'columns': obj.columns}
|
||||
# for f in ['default_fill_value', 'default_kind']:
|
||||
# d[f] = getattr(obj, f, None)
|
||||
# d['data'] = dict([(name, ss)
|
||||
# for name, ss in compat.iteritems(obj)])
|
||||
# return d
|
||||
else:
|
||||
|
||||
data = obj._data
|
||||
if not data.is_consolidated():
|
||||
data = data.consolidate()
|
||||
|
||||
# the block manager
|
||||
return {u'typ': u'block_manager',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'axes': data.axes,
|
||||
u'blocks': [{u'locs': b.mgr_locs.as_array,
|
||||
u'values': convert(b.values),
|
||||
u'shape': b.values.shape,
|
||||
u'dtype': u(b.dtype.name),
|
||||
u'klass': u(b.__class__.__name__),
|
||||
u'compress': compressor} for b in data.blocks]
|
||||
}
|
||||
|
||||
elif isinstance(obj, (datetime, date, np.datetime64, timedelta,
|
||||
np.timedelta64)) or obj is NaT:
|
||||
if isinstance(obj, Timestamp):
|
||||
tz = obj.tzinfo
|
||||
if tz is not None:
|
||||
tz = u(tz.zone)
|
||||
freq = obj.freq
|
||||
if freq is not None:
|
||||
freq = u(freq.freqstr)
|
||||
return {u'typ': u'timestamp',
|
||||
u'value': obj.value,
|
||||
u'freq': freq,
|
||||
u'tz': tz}
|
||||
if obj is NaT:
|
||||
return {u'typ': u'nat'}
|
||||
elif isinstance(obj, np.timedelta64):
|
||||
return {u'typ': u'timedelta64',
|
||||
u'data': obj.view('i8')}
|
||||
elif isinstance(obj, timedelta):
|
||||
return {u'typ': u'timedelta',
|
||||
u'data': (obj.days, obj.seconds, obj.microseconds)}
|
||||
elif isinstance(obj, np.datetime64):
|
||||
return {u'typ': u'datetime64',
|
||||
u'data': u(str(obj))}
|
||||
elif isinstance(obj, datetime):
|
||||
return {u'typ': u'datetime',
|
||||
u'data': u(obj.isoformat())}
|
||||
elif isinstance(obj, date):
|
||||
return {u'typ': u'date',
|
||||
u'data': u(obj.isoformat())}
|
||||
raise Exception(
|
||||
"cannot encode this datetimelike object: {obj}".format(obj=obj))
|
||||
elif isinstance(obj, Period):
|
||||
return {u'typ': u'period',
|
||||
u'ordinal': obj.ordinal,
|
||||
u'freq': u_safe(obj.freqstr)}
|
||||
elif isinstance(obj, Interval):
|
||||
return {u'typ': u'interval',
|
||||
u'left': obj.left,
|
||||
u'right': obj.right,
|
||||
u'closed': obj.closed}
|
||||
elif isinstance(obj, BlockIndex):
|
||||
return {u'typ': u'block_index',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'blocs': obj.blocs,
|
||||
u'blengths': obj.blengths,
|
||||
u'length': obj.length}
|
||||
elif isinstance(obj, IntIndex):
|
||||
return {u'typ': u'int_index',
|
||||
u'klass': u(obj.__class__.__name__),
|
||||
u'indices': obj.indices,
|
||||
u'length': obj.length}
|
||||
elif isinstance(obj, np.ndarray):
|
||||
return {u'typ': u'ndarray',
|
||||
u'shape': obj.shape,
|
||||
u'ndim': obj.ndim,
|
||||
u'dtype': u(obj.dtype.name),
|
||||
u'data': convert(obj),
|
||||
u'compress': compressor}
|
||||
elif isinstance(obj, np.number):
|
||||
if np.iscomplexobj(obj):
|
||||
return {u'typ': u'np_scalar',
|
||||
u'sub_typ': u'np_complex',
|
||||
u'dtype': u(obj.dtype.name),
|
||||
u'real': u(obj.real.__repr__()),
|
||||
u'imag': u(obj.imag.__repr__())}
|
||||
else:
|
||||
return {u'typ': u'np_scalar',
|
||||
u'dtype': u(obj.dtype.name),
|
||||
u'data': u(obj.__repr__())}
|
||||
elif isinstance(obj, complex):
|
||||
return {u'typ': u'np_complex',
|
||||
u'real': u(obj.real.__repr__()),
|
||||
u'imag': u(obj.imag.__repr__())}
|
||||
|
||||
return obj
|
||||
|
||||
|
||||
def decode(obj):
|
||||
"""
|
||||
Decoder for deserializing numpy data types.
|
||||
"""
|
||||
|
||||
typ = obj.get(u'typ')
|
||||
if typ is None:
|
||||
return obj
|
||||
elif typ == u'timestamp':
|
||||
freq = obj[u'freq'] if 'freq' in obj else obj[u'offset']
|
||||
return Timestamp(obj[u'value'], tz=obj[u'tz'], freq=freq)
|
||||
elif typ == u'nat':
|
||||
return NaT
|
||||
elif typ == u'period':
|
||||
return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq'])
|
||||
elif typ == u'index':
|
||||
dtype = dtype_for(obj[u'dtype'])
|
||||
data = unconvert(obj[u'data'], dtype,
|
||||
obj.get(u'compress'))
|
||||
return Index(data, dtype=dtype, name=obj[u'name'])
|
||||
elif typ == u'range_index':
|
||||
return RangeIndex(obj[u'start'],
|
||||
obj[u'stop'],
|
||||
obj[u'step'],
|
||||
name=obj[u'name'])
|
||||
elif typ == u'multi_index':
|
||||
dtype = dtype_for(obj[u'dtype'])
|
||||
data = unconvert(obj[u'data'], dtype,
|
||||
obj.get(u'compress'))
|
||||
data = [tuple(x) for x in data]
|
||||
return MultiIndex.from_tuples(data, names=obj[u'names'])
|
||||
elif typ == u'period_index':
|
||||
data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
|
||||
d = dict(name=obj[u'name'], freq=obj[u'freq'])
|
||||
freq = d.pop('freq', None)
|
||||
return PeriodIndex(PeriodArray(data, freq), **d)
|
||||
|
||||
elif typ == u'datetime_index':
|
||||
data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
|
||||
d = dict(name=obj[u'name'], freq=obj[u'freq'])
|
||||
result = DatetimeIndex(data, **d)
|
||||
tz = obj[u'tz']
|
||||
|
||||
# reverse tz conversion
|
||||
if tz is not None:
|
||||
result = result.tz_localize('UTC').tz_convert(tz)
|
||||
return result
|
||||
|
||||
elif typ in (u'interval_index', 'interval_array'):
|
||||
return globals()[obj[u'klass']].from_arrays(obj[u'left'],
|
||||
obj[u'right'],
|
||||
obj[u'closed'],
|
||||
name=obj[u'name'])
|
||||
elif typ == u'category':
|
||||
from_codes = globals()[obj[u'klass']].from_codes
|
||||
return from_codes(codes=obj[u'codes'],
|
||||
categories=obj[u'categories'],
|
||||
ordered=obj[u'ordered'])
|
||||
|
||||
elif typ == u'interval':
|
||||
return Interval(obj[u'left'], obj[u'right'], obj[u'closed'])
|
||||
elif typ == u'series':
|
||||
dtype = dtype_for(obj[u'dtype'])
|
||||
pd_dtype = pandas_dtype(dtype)
|
||||
|
||||
index = obj[u'index']
|
||||
result = Series(unconvert(obj[u'data'], dtype, obj[u'compress']),
|
||||
index=index,
|
||||
dtype=pd_dtype,
|
||||
name=obj[u'name'])
|
||||
return result
|
||||
|
||||
elif typ == u'block_manager':
|
||||
axes = obj[u'axes']
|
||||
|
||||
def create_block(b):
|
||||
values = _safe_reshape(unconvert(
|
||||
b[u'values'], dtype_for(b[u'dtype']),
|
||||
b[u'compress']), b[u'shape'])
|
||||
|
||||
# locs handles duplicate column names, and should be used instead
|
||||
# of items; see GH 9618
|
||||
if u'locs' in b:
|
||||
placement = b[u'locs']
|
||||
else:
|
||||
placement = axes[0].get_indexer(b[u'items'])
|
||||
|
||||
if is_datetime64tz_dtype(b[u'dtype']):
|
||||
assert isinstance(values, np.ndarray), type(values)
|
||||
assert values.dtype == 'M8[ns]', values.dtype
|
||||
values = DatetimeArray(values, dtype=b[u'dtype'])
|
||||
|
||||
return make_block(values=values,
|
||||
klass=getattr(internals, b[u'klass']),
|
||||
placement=placement,
|
||||
dtype=b[u'dtype'])
|
||||
|
||||
blocks = [create_block(b) for b in obj[u'blocks']]
|
||||
return globals()[obj[u'klass']](BlockManager(blocks, axes))
|
||||
elif typ == u'datetime':
|
||||
return parse(obj[u'data'])
|
||||
elif typ == u'datetime64':
|
||||
return np.datetime64(parse(obj[u'data']))
|
||||
elif typ == u'date':
|
||||
return parse(obj[u'data']).date()
|
||||
elif typ == u'timedelta':
|
||||
return timedelta(*obj[u'data'])
|
||||
elif typ == u'timedelta64':
|
||||
return np.timedelta64(int(obj[u'data']))
|
||||
# elif typ == 'sparse_series':
|
||||
# dtype = dtype_for(obj['dtype'])
|
||||
# return SparseSeries(
|
||||
# unconvert(obj['sp_values'], dtype, obj['compress']),
|
||||
# sparse_index=obj['sp_index'], index=obj['index'],
|
||||
# fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
|
||||
# elif typ == 'sparse_dataframe':
|
||||
# return SparseDataFrame(
|
||||
# obj['data'], columns=obj['columns'],
|
||||
# default_fill_value=obj['default_fill_value'],
|
||||
# default_kind=obj['default_kind']
|
||||
# )
|
||||
# elif typ == 'sparse_panel':
|
||||
# return SparsePanel(
|
||||
# obj['data'], items=obj['items'],
|
||||
# default_fill_value=obj['default_fill_value'],
|
||||
# default_kind=obj['default_kind'])
|
||||
elif typ == u'block_index':
|
||||
        return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'],
                                        obj[u'blengths'])
    elif typ == u'int_index':
        return globals()[obj[u'klass']](obj[u'length'], obj[u'indices'])
    elif typ == u'ndarray':
        return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']],
                         obj.get(u'compress')).reshape(obj[u'shape'])
    elif typ == u'np_scalar':
        if obj.get(u'sub_typ') == u'np_complex':
            return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype'])
        else:
            dtype = dtype_for(obj[u'dtype'])
            try:
                return dtype(obj[u'data'])
            except (ValueError, TypeError):
                return dtype.type(obj[u'data'])
    elif typ == u'np_complex':
        return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j')
    elif isinstance(obj, (dict, list, set)):
        return obj
    else:
        return obj


def pack(o, default=encode,
         encoding='utf-8', unicode_errors='strict', use_single_float=False,
         autoreset=1, use_bin_type=1):
    """
    Pack an object and return the packed bytes.
    """

    return Packer(default=default, encoding=encoding,
                  unicode_errors=unicode_errors,
                  use_single_float=use_single_float,
                  autoreset=autoreset,
                  use_bin_type=use_bin_type).pack(o)


def unpack(packed, object_hook=decode,
           list_hook=None, use_list=False, encoding='utf-8',
           unicode_errors='strict', object_pairs_hook=None,
           max_buffer_size=0, ext_hook=ExtType):
    """
    Unpack a packed object, return an iterator

    Note: packed lists will be returned as tuples
    """

    return Unpacker(packed, object_hook=object_hook,
                    list_hook=list_hook,
                    use_list=use_list, encoding=encoding,
                    unicode_errors=unicode_errors,
                    object_pairs_hook=object_pairs_hook,
                    max_buffer_size=max_buffer_size,
                    ext_hook=ext_hook)


class Packer(_Packer):

    def __init__(self, default=encode,
                 encoding='utf-8',
                 unicode_errors='strict',
                 use_single_float=False,
                 autoreset=1,
                 use_bin_type=1):
        super(Packer, self).__init__(default=default,
                                     encoding=encoding,
                                     unicode_errors=unicode_errors,
                                     use_single_float=use_single_float,
                                     autoreset=autoreset,
                                     use_bin_type=use_bin_type)


class Unpacker(_Unpacker):

    def __init__(self, file_like=None, read_size=0, use_list=False,
                 object_hook=decode,
                 object_pairs_hook=None, list_hook=None, encoding='utf-8',
                 unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
        super(Unpacker, self).__init__(file_like=file_like,
                                       read_size=read_size,
                                       use_list=use_list,
                                       object_hook=object_hook,
                                       object_pairs_hook=object_pairs_hook,
                                       list_hook=list_hook,
                                       encoding=encoding,
                                       unicode_errors=unicode_errors,
                                       max_buffer_size=max_buffer_size,
                                       ext_hook=ext_hook)


class Iterator(object):

    """ manage the unpacking iteration,
        close the file on completion """

    def __init__(self, path, **kwargs):
        self.path = path
        self.kwargs = kwargs

    def __iter__(self):

        needs_closing = True
        try:

            # see if we have an actual file
            if isinstance(self.path, compat.string_types):

                try:
                    path_exists = os.path.exists(self.path)
                except TypeError:
                    path_exists = False

                if path_exists:
                    fh = open(self.path, 'rb')
                else:
                    fh = compat.BytesIO(self.path)

            else:

                if not hasattr(self.path, 'read'):
                    fh = compat.BytesIO(self.path)

                else:

                    # a file-like
                    needs_closing = False
                    fh = self.path

            unpacker = unpack(fh)
            for o in unpacker:
                yield o
        finally:
            if needs_closing:
                fh.close()
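For context, a minimal round-trip sketch of the wrappers above (illustrative only, not part of the diff; it assumes this module is still importable as pandas.io.packers):

# A minimal round-trip sketch; pack() serializes via encode(), and Iterator
# streams objects back, accepting a path, raw bytes, or a file-like object.
from pandas.io import packers

buf = packers.pack([1, 2, 3])      # bytes; Packer(default=encode, ...).pack(obj)
for obj in packers.Iterator(buf):  # raw bytes are wrapped in BytesIO internally
    print(obj)                     # -> (1, 2, 3); use_list=False yields tuples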
@@ -1,282 +0,0 @@
""" parquet compat """

from distutils.version import LooseVersion
from warnings import catch_warnings

from pandas.compat import string_types
from pandas.errors import AbstractMethodError

from pandas import DataFrame, get_option

from pandas.io.common import get_filepath_or_buffer, is_s3_url


def get_engine(engine):
    """ return our implementation """

    if engine == 'auto':
        engine = get_option('io.parquet.engine')

    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        try:
            return FastParquetImpl()
        except ImportError:
            pass

        raise ImportError("Unable to find a usable engine; "
                          "tried using: 'pyarrow', 'fastparquet'.\n"
                          "pyarrow or fastparquet is required for parquet "
                          "support")

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()


class BaseImpl(object):

    api = None  # module

    @staticmethod
    def validate_dataframe(df):

        if not isinstance(df, DataFrame):
            raise ValueError("to_parquet only supports IO with DataFrames")

        # must have value column names (strings only)
        if df.columns.inferred_type not in {'string', 'unicode'}:
            raise ValueError("parquet must have string column names")

        # index level names must be strings
        valid_names = all(
            isinstance(name, string_types)
            for name in df.index.names
            if name is not None
        )
        if not valid_names:
            raise ValueError("Index level names must be strings")

    def write(self, df, path, compression, **kwargs):
        raise AbstractMethodError(self)

    def read(self, path, columns=None, **kwargs):
        raise AbstractMethodError(self)


class PyArrowImpl(BaseImpl):

    def __init__(self):
        # since pandas is a dependency of pyarrow
        # we need to import on first use
        try:
            import pyarrow
            import pyarrow.parquet
        except ImportError:
            raise ImportError(
                "pyarrow is required for parquet support\n\n"
                "you can install via conda\n"
                "conda install pyarrow -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U pyarrow\n"
            )
        if LooseVersion(pyarrow.__version__) < '0.9.0':
            raise ImportError(
                "pyarrow >= 0.9.0 is required for parquet support\n\n"
                "you can install via conda\n"
                "conda install pyarrow -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U pyarrow\n"
            )

        self.api = pyarrow

    def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', index=None, partition_cols=None,
              **kwargs):
        self.validate_dataframe(df)
        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

        if index is None:
            from_pandas_kwargs = {}
        else:
            from_pandas_kwargs = {'preserve_index': index}
        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
        if partition_cols is not None:
            self.api.parquet.write_to_dataset(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps,
                partition_cols=partition_cols, **kwargs)
        else:
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs)

    def read(self, path, columns=None, **kwargs):
        path, _, _, should_close = get_filepath_or_buffer(path)

        kwargs['use_pandas_metadata'] = True
        result = self.api.parquet.read_table(path, columns=columns,
                                             **kwargs).to_pandas()
        if should_close:
            try:
                path.close()
            except:  # noqa: flake8
                pass

        return result


class FastParquetImpl(BaseImpl):

    def __init__(self):
        # since pandas is a dependency of fastparquet
        # we need to import on first use
        try:
            import fastparquet
        except ImportError:
            raise ImportError(
                "fastparquet is required for parquet support\n\n"
                "you can install via conda\n"
                "conda install fastparquet -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U fastparquet"
            )
        if LooseVersion(fastparquet.__version__) < '0.2.1':
            raise ImportError(
                "fastparquet >= 0.2.1 is required for parquet "
                "support\n\n"
                "you can install via conda\n"
                "conda install fastparquet -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U fastparquet"
            )
        self.api = fastparquet

    def write(self, df, path, compression='snappy', index=None,
              partition_cols=None, **kwargs):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if 'partition_on' in kwargs and partition_cols is not None:
            raise ValueError("Cannot use both partition_on and "
                             "partition_cols. Use partition_cols for "
                             "partitioning data")
        elif 'partition_on' in kwargs:
            partition_cols = kwargs.pop('partition_on')

        if partition_cols is not None:
            kwargs['file_scheme'] = 'hive'

        if is_s3_url(path):
            # path is s3:// so we need to open the s3file in 'wb' mode.
            # TODO: Support 'ab'

            path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
            # And pass the opened s3file to the fastparquet internal impl.
            kwargs['open_with'] = lambda path, _: path
        else:
            path, _, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(path, df, compression=compression,
                           write_index=index, partition_on=partition_cols,
                           **kwargs)

    def read(self, path, columns=None, **kwargs):
        if is_s3_url(path):
            # When path is s3:// an S3File is returned.
            # We need to retain the original path (a str) while also
            # passing the S3File().open function to the fastparquet impl.
            s3, _, _, should_close = get_filepath_or_buffer(path)
            try:
                parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
            finally:
                s3.close()
        else:
            path, _, _, _ = get_filepath_or_buffer(path)
            parquet_file = self.api.ParquetFile(path)

        return parquet_file.to_pandas(columns=columns, **kwargs)


def to_parquet(df, path, engine='auto', compression='snappy', index=None,
               partition_cols=None, **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    path : str
        File path or Root Directory path. Will be used as Root Directory path
        while writing a partitioned dataset.

        .. versionchanged:: 0.24.0

    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    index : bool, default None
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file. If ``None``, the
        engine's default behavior will be used.

        .. versionadded:: 0.24.0

    partition_cols : list, optional, default None
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given.

        .. versionadded:: 0.24.0

    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, index=index,
                      partition_cols=partition_cols, **kwargs)


def read_parquet(path, engine='auto', columns=None, **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    columns : list, default=None
        If not None, only these columns will be read from the file.

        .. versionadded:: 0.21.1
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    kwargs are passed to the engine

    Returns
    -------
    DataFrame
    """

    impl = get_engine(engine)
    return impl.read(path, columns=columns, **kwargs)
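A minimal usage sketch for the functions above (illustrative, not part of the diff; file names are hypothetical):

# to_parquet/read_parquet resolve an engine via get_engine() and delegate.
import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 3]})
to_parquet(df, 'out.parquet', engine='auto', compression='snappy')
subset = read_parquet('out.parquet', columns=['val'])
# partition_cols writes a hive-style directory tree instead of a single file:
to_parquet(df, 'out_dir', partition_cols=['key'])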
File diff suppressed because it is too large
@@ -1,201 +0,0 @@
""" pickle compat """
import warnings

import numpy as np
from numpy.lib.format import read_array, write_array

from pandas.compat import PY3, BytesIO, cPickle as pkl, pickle_compat as pc

from pandas.io.common import _get_handle, _stringify_path


def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python >= 3.0, 3 is a valid
        value. For Python >= 3.4, 4 is a valid value. A negative value for
        the protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'wb',
                        compression=compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        f.write(pkl.dumps(obj, protocol=protocol))
    finally:
        for _f in fh:
            _f.close()


def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path from which the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a TypeError then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if it's not None, as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            with warnings.catch_warnings(record=True):
                # We want to silence any warnings about, e.g. moved modules.
                warnings.simplefilter("ignore", Warning)
                return read_wrapper(lambda f: pkl.load(f))
        except Exception:  # noqa: E722
            # reg/patched pickle
            # compat not used in pandas/compat/pickle_compat.py::load
            # TODO: remove except block OR modify pc.load to use compat
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except Exception:  # noqa: E722
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except Exception:  # noqa: E722
        if PY3:
            return try_read(path, encoding='latin1')
        raise


# compat with sparse pickle / unpickle


def _pickle_array(arr):
    arr = arr.view(np.ndarray)

    buf = BytesIO()
    write_array(buf, arr)

    return buf.getvalue()


def _unpickle_array(bytes):
    arr = read_array(BytesIO(bytes))

    return arr
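A quick consistency sketch for the sparse-compat helpers above (illustrative, not part of the diff):

# _pickle_array emits bytes in numpy's .npy wire format; _unpickle_array
# reverses it, so a round trip should preserve the array exactly.
import numpy as np

arr = np.arange(3.0)
buf = _pickle_array(arr)
assert (_unpickle_array(buf) == arr).all()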
File diff suppressed because it is too large
@@ -1,40 +0,0 @@
""" s3 support for remote file interactivity """
from pandas import compat

try:
    import s3fs
    from botocore.exceptions import NoCredentialsError
except ImportError:
    raise ImportError("The s3fs library is required to handle s3 files")

if compat.PY3:
    from urllib.parse import urlparse as parse_url
else:
    from urlparse import urlparse as parse_url


def _strip_schema(url):
    """Returns the url without the s3:// part"""
    result = parse_url(url)
    return result.netloc + result.path


def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):

    if mode is None:
        mode = 'rb'

    fs = s3fs.S3FileSystem(anon=False)
    try:
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    except (compat.FileNotFoundError, NoCredentialsError):
        # boto3 has troubles when trying to access a public file
        # when credentialed...
        # An OSError is raised if you have credentials, but they
        # aren't valid for that bucket.
        # A NoCredentialsError is raised if you don't have creds
        # for that bucket.
        fs = s3fs.S3FileSystem(anon=True)
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    return filepath_or_buffer, None, compression, True
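A minimal sketch of how the helpers above behave (illustrative only; the bucket name is hypothetical, and the open call needs s3fs plus network access):

# _strip_schema drops the scheme; get_filepath_or_buffer retries anonymously
# when credentials are missing or rejected.
assert _strip_schema('s3://bucket/key.csv') == 'bucket/key.csv'
fh, _, compression, should_close = get_filepath_or_buffer('s3://bucket/key.csv')
# fh is an s3fs file object opened in 'rb' mode; the caller must close it.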
@@ -1 +0,0 @@
from .sasreader import read_sas  # noqa
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1,703 +0,0 @@
"""
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
"""
from datetime import datetime
import struct

import numpy as np

from pandas.errors import EmptyDataError

import pandas as pd
from pandas import compat

from pandas.io.common import BaseIterator, get_filepath_or_buffer
from pandas.io.sas._sas import Parser
import pandas.io.sas.sas_constants as const


class _subheader_pointer(object):
    pass


class _column(object):
    pass


# SAS7BDAT represents a SAS data file in SAS7BDAT format.
class SAS7BDATReader(BaseIterator):
    """
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : boolean, defaults to True
        Attempt to convert dates to Pandas datetime values. Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : boolean, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return a SAS7BDATReader object for iteration; returns chunks
        with the given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, is left as raw
        bytes.
    """

    def __init__(self, path_or_buf, index=None, convert_dates=True,
                 blank_missing=True, chunksize=None, encoding=None,
                 convert_text=True, convert_header_text=True):

        self.index = index
        self.convert_dates = convert_dates
        self.blank_missing = blank_missing
        self.chunksize = chunksize
        self.encoding = encoding
        self.convert_text = convert_text
        self.convert_header_text = convert_header_text

        self.default_encoding = "latin-1"
        self.compression = ""
        self.column_names_strings = []
        self.column_names = []
        self.column_formats = []
        self.columns = []

        self._current_page_data_subheader_pointers = []
        self._cached_page = None
        self._column_data_lengths = []
        self._column_data_offsets = []
        self._column_types = []

        self._current_row_in_file_index = 0
        self._current_row_on_page_index = 0
        self._current_row_in_file_index = 0

        self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
        if isinstance(self._path_or_buf, compat.string_types):
            self._path_or_buf = open(self._path_or_buf, 'rb')
            self.handle = self._path_or_buf

        self._get_properties()
        self._parse_metadata()

    def column_data_lengths(self):
        """Return a numpy int64 array of the column data lengths"""
        return np.asarray(self._column_data_lengths, dtype=np.int64)

    def column_data_offsets(self):
        """Return a numpy int64 array of the column offsets"""
        return np.asarray(self._column_data_offsets, dtype=np.int64)

    def column_types(self):
        """Returns a numpy character array of the column types:
           s (string) or d (double)"""
        return np.asarray(self._column_types, dtype=np.dtype('S1'))

    def close(self):
        try:
            self.handle.close()
        except AttributeError:
            pass

    def _get_properties(self):

        # Check magic number
        self._path_or_buf.seek(0)
        self._cached_page = self._path_or_buf.read(288)
        if self._cached_page[0:len(const.magic)] != const.magic:
            self.close()
            raise ValueError("magic number mismatch (not a SAS file?)")

        # Get alignment information
        align1, align2 = 0, 0
        buf = self._read_bytes(const.align_1_offset, const.align_1_length)
        if buf == const.u64_byte_checker_value:
            align2 = const.align_2_value
            self.U64 = True
            self._int_length = 8
            self._page_bit_offset = const.page_bit_offset_x64
            self._subheader_pointer_length = const.subheader_pointer_length_x64
        else:
            self.U64 = False
            self._page_bit_offset = const.page_bit_offset_x86
            self._subheader_pointer_length = const.subheader_pointer_length_x86
            self._int_length = 4
        buf = self._read_bytes(const.align_2_offset, const.align_2_length)
        if buf == const.align_1_checker_value:
            align1 = const.align_2_value
        total_align = align1 + align2

        # Get endianness information
        buf = self._read_bytes(const.endianness_offset,
                               const.endianness_length)
        if buf == b'\x01':
            self.byte_order = "<"
        else:
            self.byte_order = ">"

        # Get encoding information
        buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
        if buf in const.encoding_names:
            self.file_encoding = const.encoding_names[buf]
        else:
            self.file_encoding = "unknown (code={name!s})".format(name=buf)

        # Get platform information
        buf = self._read_bytes(const.platform_offset, const.platform_length)
        if buf == b'1':
            self.platform = "unix"
        elif buf == b'2':
            self.platform = "windows"
        else:
            self.platform = "unknown"

        buf = self._read_bytes(const.dataset_offset, const.dataset_length)
        self.name = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.name = self.name.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.file_type_offset, const.file_type_length)
        self.file_type = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.file_type = self.file_type.decode(
                self.encoding or self.default_encoding)

        # Timestamp is epoch 01/01/1960
        epoch = datetime(1960, 1, 1)
        x = self._read_float(const.date_created_offset + align1,
                             const.date_created_length)
        self.date_created = epoch + pd.to_timedelta(x, unit='s')
        x = self._read_float(const.date_modified_offset + align1,
                             const.date_modified_length)
        self.date_modified = epoch + pd.to_timedelta(x, unit='s')

        self.header_length = self._read_int(const.header_size_offset + align1,
                                            const.header_size_length)

        # Read the rest of the header into cached_page.
        buf = self._path_or_buf.read(self.header_length - 288)
        self._cached_page += buf
        if len(self._cached_page) != self.header_length:
            self.close()
            raise ValueError("The SAS7BDAT file appears to be truncated.")

        self._page_length = self._read_int(const.page_size_offset + align1,
                                           const.page_size_length)
        self._page_count = self._read_int(const.page_count_offset + align1,
                                          const.page_count_length)

        buf = self._read_bytes(const.sas_release_offset + total_align,
                               const.sas_release_length)
        self.sas_release = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.sas_release = self.sas_release.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.sas_server_type_offset + total_align,
                               const.sas_server_type_length)
        self.server_type = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.server_type = self.server_type.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.os_version_number_offset + total_align,
                               const.os_version_number_length)
        self.os_version = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.os_version = self.os_version.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.os_name_offset + total_align,
                               const.os_name_length)
        buf = buf.rstrip(b'\x00 ')
        if len(buf) > 0:
            self.os_name = buf.decode(self.encoding or self.default_encoding)
        else:
            buf = self._read_bytes(const.os_maker_offset + total_align,
                                   const.os_maker_length)
            self.os_name = buf.rstrip(b'\x00 ')
            if self.convert_header_text:
                self.os_name = self.os_name.decode(
                    self.encoding or self.default_encoding)

    def __next__(self):
        da = self.read(nrows=self.chunksize or 1)
        if da is None:
            raise StopIteration
        return da

    # Read a single float of the given width (4 or 8).
    def _read_float(self, offset, width):
        if width not in (4, 8):
            self.close()
            raise ValueError("invalid float width")
        buf = self._read_bytes(offset, width)
        fd = "f" if width == 4 else "d"
        return struct.unpack(self.byte_order + fd, buf)[0]

    # Read a single signed integer of the given width (1, 2, 4 or 8).
    def _read_int(self, offset, width):
        if width not in (1, 2, 4, 8):
            self.close()
            raise ValueError("invalid int width")
        buf = self._read_bytes(offset, width)
        it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
        iv = struct.unpack(self.byte_order + it, buf)[0]
        return iv

    def _read_bytes(self, offset, length):
        if self._cached_page is None:
            self._path_or_buf.seek(offset)
            buf = self._path_or_buf.read(length)
            if len(buf) < length:
                self.close()
                msg = "Unable to read {:d} bytes from file position {:d}."
                raise ValueError(msg.format(length, offset))
            return buf
        else:
            if offset + length > len(self._cached_page):
                self.close()
                raise ValueError("The cached page is too small.")
            return self._cached_page[offset:offset + length]

    def _parse_metadata(self):
        done = False
        while not done:
            self._cached_page = self._path_or_buf.read(self._page_length)
            if len(self._cached_page) <= 0:
                break
            if len(self._cached_page) != self._page_length:
                self.close()
                raise ValueError(
                    "Failed to read a metadata page from the SAS file.")
            done = self._process_page_meta()

    def _process_page_meta(self):
        self._read_page_header()
        pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
        if self._current_page_type in pt:
            self._process_page_metadata()
        is_data_page = self._current_page_type & const.page_data_type
        is_mix_page = self._current_page_type in const.page_mix_types
        return (is_data_page or is_mix_page
                or self._current_page_data_subheader_pointers != [])

    def _read_page_header(self):
        bit_offset = self._page_bit_offset
        tx = const.page_type_offset + bit_offset
        self._current_page_type = self._read_int(tx, const.page_type_length)
        tx = const.block_count_offset + bit_offset
        self._current_page_block_count = self._read_int(
            tx, const.block_count_length)
        tx = const.subheader_count_offset + bit_offset
        self._current_page_subheaders_count = (
            self._read_int(tx, const.subheader_count_length))

    def _process_page_metadata(self):
        bit_offset = self._page_bit_offset

        for i in range(self._current_page_subheaders_count):
            pointer = self._process_subheader_pointers(
                const.subheader_pointers_offset + bit_offset, i)
            if pointer.length == 0:
                continue
            if pointer.compression == const.truncated_subheader_id:
                continue
            subheader_signature = self._read_subheader_signature(
                pointer.offset)
            subheader_index = (
                self._get_subheader_index(subheader_signature,
                                          pointer.compression, pointer.ptype))
            self._process_subheader(subheader_index, pointer)

    def _get_subheader_index(self, signature, compression, ptype):
        index = const.subheader_signature_to_index.get(signature)
        if index is None:
            f1 = ((compression == const.compressed_subheader_id) or
                  (compression == 0))
            f2 = (ptype == const.compressed_subheader_type)
            if (self.compression != "") and f1 and f2:
                index = const.SASIndex.data_subheader_index
            else:
                self.close()
                raise ValueError("Unknown subheader signature")
        return index

    def _process_subheader_pointers(self, offset, subheader_pointer_index):

        subheader_pointer_length = self._subheader_pointer_length
        total_offset = (offset +
                        subheader_pointer_length * subheader_pointer_index)

        subheader_offset = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_length = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_compression = self._read_int(total_offset, 1)
        total_offset += 1

        subheader_type = self._read_int(total_offset, 1)

        x = _subheader_pointer()
        x.offset = subheader_offset
        x.length = subheader_length
        x.compression = subheader_compression
        x.ptype = subheader_type

        return x

    def _read_subheader_signature(self, offset):
        subheader_signature = self._read_bytes(offset, self._int_length)
        return subheader_signature

    def _process_subheader(self, subheader_index, pointer):
        offset = pointer.offset
        length = pointer.length

        if subheader_index == const.SASIndex.row_size_index:
            processor = self._process_rowsize_subheader
        elif subheader_index == const.SASIndex.column_size_index:
            processor = self._process_columnsize_subheader
        elif subheader_index == const.SASIndex.column_text_index:
            processor = self._process_columntext_subheader
        elif subheader_index == const.SASIndex.column_name_index:
            processor = self._process_columnname_subheader
        elif subheader_index == const.SASIndex.column_attributes_index:
            processor = self._process_columnattributes_subheader
        elif subheader_index == const.SASIndex.format_and_label_index:
            processor = self._process_format_subheader
        elif subheader_index == const.SASIndex.column_list_index:
            processor = self._process_columnlist_subheader
        elif subheader_index == const.SASIndex.subheader_counts_index:
            processor = self._process_subheader_counts
        elif subheader_index == const.SASIndex.data_subheader_index:
            self._current_page_data_subheader_pointers.append(pointer)
            return
        else:
            raise ValueError("unknown subheader index")

        processor(offset, length)

    def _process_rowsize_subheader(self, offset, length):

        int_len = self._int_length
        lcs_offset = offset
        lcp_offset = offset
        if self.U64:
            lcs_offset += 682
            lcp_offset += 706
        else:
            lcs_offset += 354
            lcp_offset += 378

        self.row_length = self._read_int(
            offset + const.row_length_offset_multiplier * int_len, int_len)
        self.row_count = self._read_int(
            offset + const.row_count_offset_multiplier * int_len, int_len)
        self.col_count_p1 = self._read_int(
            offset + const.col_count_p1_multiplier * int_len, int_len)
        self.col_count_p2 = self._read_int(
            offset + const.col_count_p2_multiplier * int_len, int_len)
        mx = const.row_count_on_mix_page_offset_multiplier * int_len
        self._mix_page_row_count = self._read_int(offset + mx, int_len)
        self._lcs = self._read_int(lcs_offset, 2)
        self._lcp = self._read_int(lcp_offset, 2)

    def _process_columnsize_subheader(self, offset, length):
        int_len = self._int_length
        offset += int_len
        self.column_count = self._read_int(offset, int_len)
        if (self.col_count_p1 + self.col_count_p2 !=
                self.column_count):
            print(
                "Warning: column count mismatch ({p1} + {p2} != "
                "{column_count})\n".format(
                    p1=self.col_count_p1, p2=self.col_count_p2,
                    column_count=self.column_count))

    # Unknown purpose
    def _process_subheader_counts(self, offset, length):
        pass

    def _process_columntext_subheader(self, offset, length):

        offset += self._int_length
        text_block_size = self._read_int(offset, const.text_block_size_length)

        buf = self._read_bytes(offset, text_block_size)
        cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
        cname = cname_raw
        if self.convert_header_text:
            cname = cname.decode(self.encoding or self.default_encoding)
        self.column_names_strings.append(cname)

        if len(self.column_names_strings) == 1:
            compression_literal = ""
            for cl in const.compression_literals:
                if cl in cname_raw:
                    compression_literal = cl
            self.compression = compression_literal
            offset -= self._int_length

            offset1 = offset + 16
            if self.U64:
                offset1 += 4

            buf = self._read_bytes(offset1, self._lcp)
            compression_literal = buf.rstrip(b"\x00")
            if compression_literal == "":
                self._lcs = 0
                offset1 = offset + 32
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0:self._lcp]
            elif compression_literal == const.rle_compression:
                offset1 = offset + 40
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0:self._lcp]
            elif self._lcs > 0:
                self._lcp = 0
                offset1 = offset + 16
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcs)
                self.creator_proc = buf[0:self._lcp]
            if self.convert_header_text:
                if hasattr(self, "creator_proc"):
                    self.creator_proc = self.creator_proc.decode(
                        self.encoding or self.default_encoding)

    def _process_columnname_subheader(self, offset, length):
        int_len = self._int_length
        offset += int_len
        column_name_pointers_count = (length - 2 * int_len - 12) // 8
        for i in range(column_name_pointers_count):
            text_subheader = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_text_subheader_offset
            col_name_offset = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_offset_offset
            col_name_length = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_length_offset

            idx = self._read_int(
                text_subheader, const.column_name_text_subheader_length)
            col_offset = self._read_int(
                col_name_offset, const.column_name_offset_length)
            col_len = self._read_int(
                col_name_length, const.column_name_length_length)

            name_str = self.column_names_strings[idx]
            self.column_names.append(name_str[col_offset:col_offset + col_len])

    def _process_columnattributes_subheader(self, offset, length):
        int_len = self._int_length
        column_attributes_vectors_count = (
            length - 2 * int_len - 12) // (int_len + 8)
        for i in range(column_attributes_vectors_count):
            col_data_offset = (offset + int_len +
                               const.column_data_offset_offset +
                               i * (int_len + 8))
            col_data_len = (offset + 2 * int_len +
                            const.column_data_length_offset +
                            i * (int_len + 8))
            col_types = (offset + 2 * int_len +
                         const.column_type_offset + i * (int_len + 8))

            x = self._read_int(col_data_offset, int_len)
            self._column_data_offsets.append(x)

            x = self._read_int(col_data_len, const.column_data_length_length)
            self._column_data_lengths.append(x)

            x = self._read_int(col_types, const.column_type_length)
            self._column_types.append(b'd' if x == 1 else b's')

    def _process_columnlist_subheader(self, offset, length):
        # unknown purpose
        pass

    def _process_format_subheader(self, offset, length):
        int_len = self._int_length
        text_subheader_format = (
            offset +
            const.column_format_text_subheader_index_offset +
            3 * int_len)
        col_format_offset = (offset +
                             const.column_format_offset_offset +
                             3 * int_len)
        col_format_len = (offset +
                          const.column_format_length_offset +
                          3 * int_len)
        text_subheader_label = (
            offset +
            const.column_label_text_subheader_index_offset +
            3 * int_len)
        col_label_offset = (offset +
                            const.column_label_offset_offset +
                            3 * int_len)
        col_label_len = offset + const.column_label_length_offset + 3 * int_len

        x = self._read_int(text_subheader_format,
                           const.column_format_text_subheader_index_length)
        format_idx = min(x, len(self.column_names_strings) - 1)

        format_start = self._read_int(
            col_format_offset, const.column_format_offset_length)
        format_len = self._read_int(
            col_format_len, const.column_format_length_length)

        label_idx = self._read_int(
            text_subheader_label,
            const.column_label_text_subheader_index_length)
        label_idx = min(label_idx, len(self.column_names_strings) - 1)

        label_start = self._read_int(
            col_label_offset, const.column_label_offset_length)
        label_len = self._read_int(col_label_len,
                                   const.column_label_length_length)

        label_names = self.column_names_strings[label_idx]
        column_label = label_names[label_start: label_start + label_len]
        format_names = self.column_names_strings[format_idx]
        column_format = format_names[format_start: format_start + format_len]
        current_column_number = len(self.columns)

        col = _column()
        col.col_id = current_column_number
        col.name = self.column_names[current_column_number]
        col.label = column_label
        col.format = column_format
        col.ctype = self._column_types[current_column_number]
        col.length = self._column_data_lengths[current_column_number]

        self.column_formats.append(column_format)
        self.columns.append(col)

    def read(self, nrows=None):

        if (nrows is None) and (self.chunksize is not None):
            nrows = self.chunksize
        elif nrows is None:
            nrows = self.row_count

        if len(self._column_types) == 0:
            self.close()
            raise EmptyDataError("No columns to parse from file")

        if self._current_row_in_file_index >= self.row_count:
            return None

        m = self.row_count - self._current_row_in_file_index
        if nrows > m:
            nrows = m

        nd = self._column_types.count(b'd')
        ns = self._column_types.count(b's')

        self._string_chunk = np.empty((ns, nrows), dtype=np.object)
        self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)

        self._current_row_in_chunk_index = 0
        p = Parser(self)
        p.read(nrows)

        rslt = self._chunk_to_dataframe()
        if self.index is not None:
            rslt = rslt.set_index(self.index)

        return rslt

    def _read_next_page(self):
        self._current_page_data_subheader_pointers = []
        self._cached_page = self._path_or_buf.read(self._page_length)
        if len(self._cached_page) <= 0:
            return True
        elif len(self._cached_page) != self._page_length:
            self.close()
            msg = ("failed to read complete page from file "
                   "(read {:d} of {:d} bytes)")
            raise ValueError(msg.format(len(self._cached_page),
                                        self._page_length))

        self._read_page_header()
        page_type = self._current_page_type
        if page_type == const.page_meta_type:
            self._process_page_metadata()

        is_data_page = page_type & const.page_data_type
        pt = [const.page_meta_type] + const.page_mix_types
        if not is_data_page and self._current_page_type not in pt:
            return self._read_next_page()

        return False

    def _chunk_to_dataframe(self):

        n = self._current_row_in_chunk_index
        m = self._current_row_in_file_index
        ix = range(m - n, m)
        rslt = pd.DataFrame(index=ix)

        js, jb = 0, 0
        for j in range(self.column_count):

            name = self.column_names[j]

            if self._column_types[j] == b'd':
                rslt[name] = self._byte_chunk[jb, :].view(
                    dtype=self.byte_order + 'd')
                rslt[name] = np.asarray(rslt[name], dtype=np.float64)
                if self.convert_dates:
                    unit = None
                    if self.column_formats[j] in const.sas_date_formats:
                        unit = 'd'
                    elif self.column_formats[j] in const.sas_datetime_formats:
                        unit = 's'
                    if unit:
                        rslt[name] = pd.to_datetime(rslt[name], unit=unit,
                                                    origin="1960-01-01")
                jb += 1
            elif self._column_types[j] == b's':
                rslt[name] = self._string_chunk[js, :]
                if self.convert_text and (self.encoding is not None):
                    rslt[name] = rslt[name].str.decode(
                        self.encoding or self.default_encoding)
                if self.blank_missing:
                    ii = rslt[name].str.len() == 0
                    rslt.loc[ii, name] = np.nan
                js += 1
            else:
                self.close()
                raise ValueError("unknown column type {type}".format(
                    type=self._column_types[j]))

        return rslt
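A minimal usage sketch for the reader above (illustrative, not part of the diff; 'data.sas7bdat' and process() are placeholders): with chunksize set, the reader iterates chunk by chunk via __next__/read().

rdr = SAS7BDATReader('data.sas7bdat', chunksize=1000, convert_dates=True)
for chunk in rdr:       # each chunk is a DataFrame of up to 1000 rows
    process(chunk)      # process() is a hypothetical consumer
rdr.close()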
@@ -1,171 +0,0 @@
magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
         b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
         b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
         b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")

align_1_checker_value = b'3'
align_1_offset = 32
align_1_length = 1
align_1_value = 4
u64_byte_checker_value = b'3'
align_2_offset = 35
align_2_length = 1
align_2_value = 4
endianness_offset = 37
endianness_length = 1
platform_offset = 39
platform_length = 1
encoding_offset = 70
encoding_length = 1
dataset_offset = 92
dataset_length = 64
file_type_offset = 156
file_type_length = 8
date_created_offset = 164
date_created_length = 8
date_modified_offset = 172
date_modified_length = 8
header_size_offset = 196
header_size_length = 4
page_size_offset = 200
page_size_length = 4
page_count_offset = 204
page_count_length = 4
sas_release_offset = 216
sas_release_length = 8
sas_server_type_offset = 224
sas_server_type_length = 16
os_version_number_offset = 240
os_version_number_length = 16
os_maker_offset = 256
os_maker_length = 16
os_name_offset = 272
os_name_length = 16
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24
page_type_offset = 0
page_type_length = 2
block_count_offset = 2
block_count_length = 2
subheader_count_offset = 4
subheader_count_length = 2
page_meta_type = 0
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15
column_name_pointer_length = 8
column_name_text_subheader_offset = 0
column_name_text_subheader_length = 2
column_name_offset_offset = 2
column_name_offset_length = 2
column_name_length_offset = 4
column_name_length_length = 2
column_data_offset_offset = 8
column_data_length_offset = 8
column_data_length_length = 4
column_type_offset = 14
column_type_length = 1
column_format_text_subheader_index_offset = 22
column_format_text_subheader_index_length = 2
column_format_offset_offset = 24
column_format_offset_length = 2
column_format_length_offset = 26
column_format_length_length = 2
column_label_text_subheader_index_offset = 28
column_label_text_subheader_index_length = 2
column_label_offset_offset = 30
column_label_offset_length = 2
column_label_length_offset = 32
column_label_length_length = 2
rle_compression = b'SASYZCRL'
rdc_compression = b'SASYZCR2'

compression_literals = [rle_compression, rdc_compression]

# Incomplete list of encodings, using SAS nomenclature:
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
                  61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}


class SASIndex(object):
    row_size_index = 0
    column_size_index = 1
    subheader_counts_index = 2
    column_text_index = 3
    column_name_index = 4
    column_attributes_index = 5
    format_and_label_index = 6
    column_list_index = 7
    data_subheader_index = 8


subheader_signature_to_index = {
    b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
    b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
    b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
    b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
    b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
    b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
    b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
    b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
    b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
    b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
    b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
    b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
    b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
    b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
    b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
    b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
    b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
    b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
    b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
    b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
    b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
    b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
    b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
    b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index}


# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN",
                    "MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS",
                    "MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR",
                    "NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV",
                    "WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD",
                    "YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ",
                    "YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC",
                    "YYQRD", "YYQRP", "YYQRS", "YYQRN",
                    "YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC",
                    "MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN",
                    "YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB",
                    "MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS",
                    "MINGUO")

sas_datetime_formats = ("DATETIME", "DTWKDATX",
                        "B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX",
                        "E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX",
                        "DATEAMPM", "DTDATE", "DTMONYY", "DTMONYY", "DTWKDATX",
                        "DTYEAR", "TOD", "MDYAMPM")
@@ -1,464 +0,0 @@
"""
Read a SAS XPort format file into a Pandas DataFrame.

Based on code from Jack Cushman (github.com/jcushman/xport).

The file format is defined here:

https://support.sas.com/techsup/technote/ts140.pdf
"""

from datetime import datetime
import struct
import warnings

import numpy as np

from pandas.util._decorators import Appender

import pandas as pd
from pandas import compat

from pandas.io.common import BaseIterator, get_filepath_or_buffer

_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
                  "000000000000000000000000000000 ")
_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!"
                    "000000000000000001600000000")
_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
                    "000000000000000000000000000000 ")
_correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
                       "000000000000000000000000000000 ")
_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label',
              'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
              'nifl', 'nifd', 'npos', '_']


_base_params_doc = """\
Parameters
----------
filepath_or_buffer : string or file-like object
    Path to SAS file or object implementing binary read method."""

_params2_doc = """\
index : identifier of index column
    Identifier of column that should be used as index of the DataFrame.
encoding : string
    Encoding for text data.
chunksize : int
    Read file `chunksize` lines at a time, returns iterator."""

_format_params_doc = """\
format : string
    File format, only `xport` is currently supported."""

_iterator_doc = """\
iterator : boolean, default False
    Return XportReader object for reading file incrementally."""


_read_sas_doc = """Read a SAS file into a DataFrame.

%(_base_params_doc)s
%(_format_params_doc)s
%(_params2_doc)s
%(_iterator_doc)s

Returns
-------
DataFrame or XportReader

Examples
--------
Read a SAS Xport file:

>>> df = pd.read_sas('filename.XPT')

Read a Xport file in 10,000 line chunks:

>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
>>>     do_something(chunk)

""" % {"_base_params_doc": _base_params_doc,
       "_format_params_doc": _format_params_doc,
       "_params2_doc": _params2_doc,
       "_iterator_doc": _iterator_doc}


_xport_reader_doc = """\
Class for reading SAS Xport files.

%(_base_params_doc)s
%(_params2_doc)s

Attributes
----------
member_info : list
    Contains information about the file
fields : list
    Contains information about the variables in the file
""" % {"_base_params_doc": _base_params_doc,
       "_params2_doc": _params2_doc}


_read_method_doc = """\
Read observations from SAS Xport file, returning as data frame.

Parameters
----------
nrows : int
    Number of rows to read from data file; if None, read whole
    file.

Returns
-------
A DataFrame.
"""


def _parse_date(datestr):
    """ Given a date in xport format, return Python date. """
    try:
        # e.g. "16FEB11:10:07:55"
        return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
    except ValueError:
        return pd.NaT


def _split_line(s, parts):
    """
    Parameters
    ----------
    s : string
        Fixed-length string to split
    parts : list of (name, length) pairs
        Used to break up string, name '_' will be filtered from output.

    Returns
    -------
    Dict of name:contents of string at given location.
    """
    out = {}
    start = 0
    for name, length in parts:
        out[name] = s[start:start + length].strip()
        start += length
    del out['_']
    return out


def _handle_truncated_float_vec(vec, nbytes):
    # This feature is not well documented, but some SAS XPORT files
    # have 2-7 byte "truncated" floats. To read these truncated
    # floats, pad them with zeros on the right to make 8 byte floats.
    #
    # References:
    # https://github.com/jcushman/xport/pull/3
    # The R "foreign" library

    if nbytes != 8:
        vec1 = np.zeros(len(vec), np.dtype('S8'))
        dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
        vec2 = vec1.view(dtype=dtype)
        vec2['f0'] = vec
        return vec2

    return vec


def _parse_float_vec(vec):
    """
    Parse a vector of float values representing IBM 8 byte floats into
    native 8 byte floats.
    """

    dtype = np.dtype('>u4,>u4')
    vec1 = vec.view(dtype=dtype)
    xport1 = vec1['f0']
    xport2 = vec1['f1']

    # Start by setting first half of ieee number to first half of IBM
    # number sans exponent
    ieee1 = xport1 & 0x00ffffff

    # The fraction bit to the left of the binary point in the ieee
    # format was set and the number was shifted 0, 1, 2, or 3
    # places. This will tell us how to adjust the ibm exponent to be a
    # power of 2 ieee exponent and how to shift the fraction bits to
    # restore the correct magnitude.
    shift = np.zeros(len(vec), dtype=np.uint8)
    shift[np.where(xport1 & 0x00200000)] = 1
    shift[np.where(xport1 & 0x00400000)] = 2
    shift[np.where(xport1 & 0x00800000)] = 3

    # shift the ieee number down the correct number of places then
    # set the second half of the ieee number to be the second half
    # of the ibm number shifted appropriately, ored with the bits
    # from the first half that would have been shifted in if we
    # could shift a double. All we are worried about are the low
    # order 3 bits of the first half since we're only shifting by
    # 1, 2, or 3.
    ieee1 >>= shift
    ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))

    # clear the 1 bit to the left of the binary point
    ieee1 &= 0xffefffff

    # set the exponent of the ieee number to be the actual exponent
    # plus the shift count + 1023. Or this into the first half of the
    # ieee number. The ibm exponent is excess 64 but is adjusted by 65
    # since during conversion to ibm format the exponent is
    # incremented by 1 and the fraction bits left 4 positions to the
    # right of the radix point. (had to add >> 24 because C treats &
    # 0x7f as 0x7f000000 and Python doesn't)
    ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +
               shift + 1023) << 20) | (xport1 & 0x80000000)

    ieee = np.empty((len(ieee1),), dtype='>u4,>u4')
    ieee['f0'] = ieee1
    ieee['f1'] = ieee2
    ieee = ieee.view(dtype='>f8')
    ieee = ieee.astype('f8')

    return ieee
|
||||
|
||||
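# NOTE (editor's sketch, not part of the original file): the IBM hex
# representation of 1.0 is 0x4110000000000000 (exponent byte 0x41 = excess-64
# exponent 1, fraction 1/16), so parsing those eight big-endian bytes should
# recover the native double 1.0.
_example_ibm = np.array([b"\x41\x10\x00\x00\x00\x00\x00\x00"], dtype="S8")
_example_ieee = _parse_float_vec(_example_ibm)
# _example_ieee[0] == 1.0

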
class XportReader(BaseIterator):
    __doc__ = _xport_reader_doc

    def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
                 chunksize=None):

        self._encoding = encoding
        self._lines_read = 0
        self._index = index
        self._chunksize = chunksize

        if isinstance(filepath_or_buffer, str):
            (filepath_or_buffer, encoding,
             compression, should_close) = get_filepath_or_buffer(
                filepath_or_buffer, encoding=encoding)

        if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
            self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
        else:
            # Copy to BytesIO, and ensure no encoding
            contents = filepath_or_buffer.read()
            try:
                contents = contents.encode(self._encoding)
            except UnicodeEncodeError:
                pass
            self.filepath_or_buffer = compat.BytesIO(contents)

        self._read_header()

    def close(self):
        self.filepath_or_buffer.close()

    def _get_row(self):
        return self.filepath_or_buffer.read(80).decode()

    def _read_header(self):
        self.filepath_or_buffer.seek(0)

        # read file header
        line1 = self._get_row()
        if line1 != _correct_line1:
            self.close()
            raise ValueError("Header record is not an XPORT file.")

        line2 = self._get_row()
        fif = [['prefix', 24], ['version', 8], ['OS', 8],
               ['_', 24], ['created', 16]]
        file_info = _split_line(line2, fif)
        if file_info['prefix'] != "SAS     SAS     SASLIB":
            self.close()
            raise ValueError("Header record has invalid prefix.")
        file_info['created'] = _parse_date(file_info['created'])
        self.file_info = file_info

        line3 = self._get_row()
        file_info['modified'] = _parse_date(line3[:16])

        # read member header
        header1 = self._get_row()
        header2 = self._get_row()
        headflag1 = header1.startswith(_correct_header1)
        headflag2 = (header2 == _correct_header2)
        if not (headflag1 and headflag2):
            self.close()
            raise ValueError("Member header not found")
        # usually 140, could be 135
        fieldnamelength = int(header1[-5:-2])

        # member info
        mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
               ['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
        member_info = _split_line(self._get_row(), mem)
        mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
        member_info.update(_split_line(self._get_row(), mem))
        member_info['modified'] = _parse_date(member_info['modified'])
        member_info['created'] = _parse_date(member_info['created'])
        self.member_info = member_info

        # read field names
        types = {1: 'numeric', 2: 'char'}
        fieldcount = int(self._get_row()[54:58])
        datalength = fieldnamelength * fieldcount
        # round up to nearest 80
        if datalength % 80:
            datalength += 80 - datalength % 80
        fielddata = self.filepath_or_buffer.read(datalength)
        fields = []
        obs_length = 0
        while len(fielddata) >= fieldnamelength:
            # pull data for one field
            field, fielddata = (fielddata[:fieldnamelength],
                                fielddata[fieldnamelength:])

            # rest at end gets ignored, so if field is short, pad out
            # to match struct pattern below
            field = field.ljust(140)

            fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field)
            field = dict(zip(_fieldkeys, fieldstruct))
            del field['_']
            field['ntype'] = types[field['ntype']]
            fl = field['field_length']
            if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
                self.close()
                msg = "Floating field width {0} is not between 2 and 8."
                raise TypeError(msg.format(fl))

            for k, v in field.items():
                try:
                    field[k] = v.strip()
                except AttributeError:
                    pass

            obs_length += field['field_length']
            fields += [field]

        header = self._get_row()
        if not header == _correct_obs_header:
            self.close()
            raise ValueError("Observation header not found.")

        self.fields = fields
        self.record_length = obs_length
        self.record_start = self.filepath_or_buffer.tell()

        self.nobs = self._record_count()
        self.columns = [x['name'].decode() for x in self.fields]

        # Setup the dtype.
        dtypel = [('s' + str(i), "S" + str(field['field_length']))
                  for i, field in enumerate(self.fields)]
        dtype = np.dtype(dtypel)
        self._dtype = dtype

    def __next__(self):
        return self.read(nrows=self._chunksize or 1)

    def _record_count(self):
        """
        Get number of records in file.

        This is maybe suboptimal because we have to seek to the end of
        the file.

        Side effect: returns file position to record_start.
        """

        self.filepath_or_buffer.seek(0, 2)
        total_records_length = (self.filepath_or_buffer.tell() -
                                self.record_start)

        if total_records_length % 80 != 0:
            warnings.warn("xport file may be corrupted")

        if self.record_length > 80:
            self.filepath_or_buffer.seek(self.record_start)
            return total_records_length // self.record_length

        self.filepath_or_buffer.seek(-80, 2)
        last_card = self.filepath_or_buffer.read(80)
        last_card = np.frombuffer(last_card, dtype=np.uint64)

        # 8 byte blank
        ix = np.flatnonzero(last_card == 2314885530818453536)

        if len(ix) == 0:
            tail_pad = 0
        else:
            tail_pad = 8 * len(ix)

        self.filepath_or_buffer.seek(self.record_start)

        return (total_records_length - tail_pad) // self.record_length

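    # NOTE (editor's note, not part of the original file): the magic constant
    # 2314885530818453536 used in _record_count above is 0x2020202020202020,
    # i.e. eight ASCII space characters read as a single unsigned 64-bit
    # integer; all-blank 8-byte words in the final 80-byte card are counted as
    # tail padding rather than data.
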
    def get_chunk(self, size=None):
        """
        Read lines from the Xport file and return them as a DataFrame.

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read. If None, reads whole file.

        Returns
        -------
        DataFrame
        """
        if size is None:
            size = self._chunksize
        return self.read(nrows=size)

    def _missing_double(self, vec):
        v = vec.view(dtype='u1,u1,u2,u4')
        miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
        miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
                 (v['f0'] == 0x5f) | (v['f0'] == 0x2e))
        miss &= miss1
        return miss

    @Appender(_read_method_doc)
    def read(self, nrows=None):

        if nrows is None:
            nrows = self.nobs

        read_lines = min(nrows, self.nobs - self._lines_read)
        read_len = read_lines * self.record_length
        if read_len <= 0:
            self.close()
            raise StopIteration
        raw = self.filepath_or_buffer.read(read_len)
        data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)

        df = pd.DataFrame(index=range(read_lines))
        for j, x in enumerate(self.columns):
            vec = data['s%d' % j]
            ntype = self.fields[j]['ntype']
            if ntype == "numeric":
                vec = _handle_truncated_float_vec(
                    vec, self.fields[j]['field_length'])
                miss = self._missing_double(vec)
                v = _parse_float_vec(vec)
                v[miss] = np.nan
            elif self.fields[j]['ntype'] == 'char':
                v = [y.rstrip() for y in vec]
                if compat.PY3:
                    if self._encoding is not None:
                        v = [y.decode(self._encoding) for y in v]
            df[x] = v

        if self._index is None:
            df.index = range(self._lines_read, self._lines_read + read_lines)
        else:
            df = df.set_index(self._index)

        self._lines_read += read_lines

        return df
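
def _example_read_in_chunks(path, chunksize=1000):
    # NOTE (editor's sketch, not part of the original file): iterate a
    # hypothetical XPORT file in chunks with XportReader directly; each
    # iteration delegates to read(nrows=chunksize) until the file is
    # exhausted and StopIteration is raised.
    reader = XportReader(path, chunksize=chunksize)
    chunks = [chunk for chunk in reader]
    reader.close()
    return chunks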
@@ -1,68 +0,0 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas import compat

from pandas.io.common import _stringify_path


def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
             chunksize=None, iterator=False):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : string or file-like object
        Path to the SAS file.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data. If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = ("If this is a buffer object rather "
                            "than a string name, you must specify "
                            "a format string")
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    if format.lower() == 'xport':
        from pandas.io.sas.sas_xport import XportReader
        reader = XportReader(filepath_or_buffer, index=index,
                             encoding=encoding,
                             chunksize=chunksize)
    elif format.lower() == 'sas7bdat':
        from pandas.io.sas.sas7bdat import SAS7BDATReader
        reader = SAS7BDATReader(filepath_or_buffer, index=index,
                                encoding=encoding,
                                chunksize=chunksize)
    else:
        raise ValueError('unknown SAS format')

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data
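
def _example_read_sas_usage(path="example.xpt"):
    # NOTE (editor's sketch, not part of the original file): `path` is a
    # hypothetical file name; the format is inferred from its ".xpt"
    # extension. A file-like object would instead require an explicit
    # format="xport".
    df = read_sas(path, encoding="ISO-8859-1")   # whole file as a DataFrame
    it = read_sas(path, chunksize=500)           # XportReader, not a DataFrame
    first_chunk = it.read(500)                   # first 500 observations
    it.close()
    return df, first_chunk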
File diff suppressed because it is too large
File diff suppressed because it is too large