started work on backend
@@ -0,0 +1 @@
from .sasreader import read_sas  # noqa
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
Executable
BIN
Binary file not shown.
@@ -0,0 +1,687 @@
"""
Read SAS7BDAT files.

Based on code written by Jared Hobbs:
https://bitbucket.org/jaredhobbs/sas7bdat

See also:
https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
"""

import struct

import numpy as np
import pandas as pd
from pandas import compat
from pandas.errors import EmptyDataError
from pandas.io.common import get_filepath_or_buffer, BaseIterator

import pandas.io.sas.sas_constants as const
from pandas.io.sas._sas import Parser


class _subheader_pointer(object):
    pass


class _column(object):
    pass


# SAS7BDATReader reads SAS data files stored in SAS7BDAT format.
class SAS7BDATReader(BaseIterator):
    """
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : boolean, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : boolean, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        If given, iterating over the reader returns chunks with the
        given number of rows.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, is left as raw
        bytes.
    """
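
    # A minimal usage sketch of the chunked-reading path described
    # above (the file name, chunk size, and ``process`` callable are
    # hypothetical stand-ins, not part of this module):
    #
    #     rdr = SAS7BDATReader("example.sas7bdat", chunksize=10000)
    #     for chunk in rdr:          # each chunk is a DataFrame
    #         process(chunk)
    #     rdr.close()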

    def __init__(self, path_or_buf, index=None, convert_dates=True,
                 blank_missing=True, chunksize=None, encoding=None,
                 convert_text=True, convert_header_text=True):

        self.index = index
        self.convert_dates = convert_dates
        self.blank_missing = blank_missing
        self.chunksize = chunksize
        self.encoding = encoding
        self.convert_text = convert_text
        self.convert_header_text = convert_header_text

        self.default_encoding = "latin-1"
        self.compression = ""
        self.column_names_strings = []
        self.column_names = []
        self.column_types = []
        self.column_formats = []
        self.columns = []

        self._current_page_data_subheader_pointers = []
        self._cached_page = None
        self._column_data_lengths = []
        self._column_data_offsets = []
        self._current_row_in_file_index = 0
        self._current_row_on_page_index = 0

        self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
        if isinstance(self._path_or_buf, compat.string_types):
            self._path_or_buf = open(self._path_or_buf, 'rb')
        self.handle = self._path_or_buf

        self._get_properties()
        self._parse_metadata()

    def close(self):
        try:
            self.handle.close()
        except AttributeError:
            pass

    def _get_properties(self):

        # Check magic number
        self._path_or_buf.seek(0)
        self._cached_page = self._path_or_buf.read(288)
        if self._cached_page[0:len(const.magic)] != const.magic:
            self.close()
            raise ValueError("magic number mismatch (not a SAS file?)")

        # Get alignment information
        align1, align2 = 0, 0
        buf = self._read_bytes(const.align_1_offset, const.align_1_length)
        if buf == const.u64_byte_checker_value:
            align2 = const.align_2_value
            self.U64 = True
            self._int_length = 8
            self._page_bit_offset = const.page_bit_offset_x64
            self._subheader_pointer_length = const.subheader_pointer_length_x64
        else:
            self.U64 = False
            self._page_bit_offset = const.page_bit_offset_x86
            self._subheader_pointer_length = const.subheader_pointer_length_x86
            self._int_length = 4
        buf = self._read_bytes(const.align_2_offset, const.align_2_length)
        if buf == const.align_1_checker_value:
            align1 = const.align_2_value
        total_align = align1 + align2

        # Get endianness information
        buf = self._read_bytes(const.endianness_offset,
                               const.endianness_length)
        if buf == b'\x01':
            self.byte_order = "<"
        else:
            self.byte_order = ">"

        # Get encoding information
        buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
        if buf in const.encoding_names:
            self.file_encoding = const.encoding_names[buf]
        else:
            self.file_encoding = "unknown (code=%s)" % str(buf)

        # Get platform information
        buf = self._read_bytes(const.platform_offset, const.platform_length)
        if buf == b'1':
            self.platform = "unix"
        elif buf == b'2':
            self.platform = "windows"
        else:
            self.platform = "unknown"

        buf = self._read_bytes(const.dataset_offset, const.dataset_length)
        self.name = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.name = self.name.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.file_type_offset, const.file_type_length)
        self.file_type = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.file_type = self.file_type.decode(
                self.encoding or self.default_encoding)

        # Timestamps are stored as seconds since the SAS epoch,
        # 1960-01-01.
        epoch = pd.datetime(1960, 1, 1)
        x = self._read_float(const.date_created_offset + align1,
                             const.date_created_length)
        self.date_created = epoch + pd.to_timedelta(x, unit='s')
        x = self._read_float(const.date_modified_offset + align1,
                             const.date_modified_length)
        self.date_modified = epoch + pd.to_timedelta(x, unit='s')
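
        # Worked example of the epoch arithmetic above (illustrative
        # value, not read from a file): a stored timestamp of 86400.0
        # seconds is exactly one day past the SAS epoch, so it decodes
        # to the datetime 1960-01-02 00:00:00.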

        self.header_length = self._read_int(const.header_size_offset + align1,
                                            const.header_size_length)

        # Read the rest of the header into cached_page.
        buf = self._path_or_buf.read(self.header_length - 288)
        self._cached_page += buf
        if len(self._cached_page) != self.header_length:
            self.close()
            raise ValueError("The SAS7BDAT file appears to be truncated.")

        self._page_length = self._read_int(const.page_size_offset + align1,
                                           const.page_size_length)
        self._page_count = self._read_int(const.page_count_offset + align1,
                                          const.page_count_length)

        buf = self._read_bytes(const.sas_release_offset + total_align,
                               const.sas_release_length)
        self.sas_release = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.sas_release = self.sas_release.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.sas_server_type_offset + total_align,
                               const.sas_server_type_length)
        self.server_type = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.server_type = self.server_type.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.os_version_number_offset + total_align,
                               const.os_version_number_length)
        self.os_version = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.os_version = self.os_version.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.os_name_offset + total_align,
                               const.os_name_length)
        buf = buf.rstrip(b'\x00 ')
        if len(buf) > 0:
            self.os_name = buf.decode(self.encoding or self.default_encoding)
        else:
            buf = self._read_bytes(const.os_maker_offset + total_align,
                                   const.os_maker_length)
            self.os_name = buf.rstrip(b'\x00 ')
            if self.convert_header_text:
                self.os_name = self.os_name.decode(
                    self.encoding or self.default_encoding)

    def __next__(self):
        da = self.read(nrows=self.chunksize or 1)
        if da is None:
            raise StopIteration
        return da

    # Read a single float of the given width (4 or 8).
    def _read_float(self, offset, width):
        if width not in (4, 8):
            self.close()
            raise ValueError("invalid float width")
        buf = self._read_bytes(offset, width)
        fd = "f" if width == 4 else "d"
        return struct.unpack(self.byte_order + fd, buf)[0]

    # Read a single signed integer of the given width (1, 2, 4 or 8).
    def _read_int(self, offset, width):
        if width not in (1, 2, 4, 8):
            self.close()
            raise ValueError("invalid int width")
        buf = self._read_bytes(offset, width)
        it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
        iv = struct.unpack(self.byte_order + it, buf)[0]
        return iv

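    # Example of how the two helpers above build a struct format
    # (illustrative, not called anywhere): with a little-endian file,
    # byte_order is "<", so an 8-byte float is decoded by
    # struct.unpack("<d", buf) and a 4-byte signed integer by
    # struct.unpack("<l", buf).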
    def _read_bytes(self, offset, length):
        if self._cached_page is None:
            self._path_or_buf.seek(offset)
            buf = self._path_or_buf.read(length)
            if len(buf) < length:
                self.close()
                msg = "Unable to read {:d} bytes from file position {:d}."
                raise ValueError(msg.format(length, offset))
            return buf
        else:
            if offset + length > len(self._cached_page):
                self.close()
                raise ValueError("The cached page is too small.")
            return self._cached_page[offset:offset + length]

    def _parse_metadata(self):
        done = False
        while not done:
            self._cached_page = self._path_or_buf.read(self._page_length)
            if len(self._cached_page) <= 0:
                break
            if len(self._cached_page) != self._page_length:
                self.close()
                raise ValueError(
                    "Failed to read a metadata page from the SAS file.")
            done = self._process_page_meta()

    def _process_page_meta(self):
        self._read_page_header()
        pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
        if self._current_page_type in pt:
            self._process_page_metadata()
        return ((self._current_page_type in
                 [const.page_data_type] + const.page_mix_types) or
                (self._current_page_data_subheader_pointers is not None))

    def _read_page_header(self):
        bit_offset = self._page_bit_offset
        tx = const.page_type_offset + bit_offset
        self._current_page_type = self._read_int(tx, const.page_type_length)
        tx = const.block_count_offset + bit_offset
        self._current_page_block_count = self._read_int(
            tx, const.block_count_length)
        tx = const.subheader_count_offset + bit_offset
        self._current_page_subheaders_count = (
            self._read_int(tx, const.subheader_count_length))

    def _process_page_metadata(self):
        bit_offset = self._page_bit_offset

        for i in range(self._current_page_subheaders_count):
            pointer = self._process_subheader_pointers(
                const.subheader_pointers_offset + bit_offset, i)
            if pointer.length == 0:
                continue
            if pointer.compression == const.truncated_subheader_id:
                continue
            subheader_signature = self._read_subheader_signature(
                pointer.offset)
            subheader_index = (
                self._get_subheader_index(subheader_signature,
                                          pointer.compression, pointer.ptype))
            self._process_subheader(subheader_index, pointer)

    def _get_subheader_index(self, signature, compression, ptype):
        index = const.subheader_signature_to_index.get(signature)
        if index is None:
            f1 = ((compression == const.compressed_subheader_id) or
                  (compression == 0))
            f2 = (ptype == const.compressed_subheader_type)
            if (self.compression != "") and f1 and f2:
                index = const.SASIndex.data_subheader_index
            else:
                self.close()
                raise ValueError("Unknown subheader signature")
        return index

    def _process_subheader_pointers(self, offset, subheader_pointer_index):

        subheader_pointer_length = self._subheader_pointer_length
        total_offset = (offset +
                        subheader_pointer_length * subheader_pointer_index)

        subheader_offset = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_length = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_compression = self._read_int(total_offset, 1)
        total_offset += 1

        subheader_type = self._read_int(total_offset, 1)

        x = _subheader_pointer()
        x.offset = subheader_offset
        x.length = subheader_length
        x.compression = subheader_compression
        x.ptype = subheader_type

        return x

    def _read_subheader_signature(self, offset):
        subheader_signature = self._read_bytes(offset, self._int_length)
        return subheader_signature

    def _process_subheader(self, subheader_index, pointer):
        offset = pointer.offset
        length = pointer.length

        if subheader_index == const.SASIndex.row_size_index:
            processor = self._process_rowsize_subheader
        elif subheader_index == const.SASIndex.column_size_index:
            processor = self._process_columnsize_subheader
        elif subheader_index == const.SASIndex.column_text_index:
            processor = self._process_columntext_subheader
        elif subheader_index == const.SASIndex.column_name_index:
            processor = self._process_columnname_subheader
        elif subheader_index == const.SASIndex.column_attributes_index:
            processor = self._process_columnattributes_subheader
        elif subheader_index == const.SASIndex.format_and_label_index:
            processor = self._process_format_subheader
        elif subheader_index == const.SASIndex.column_list_index:
            processor = self._process_columnlist_subheader
        elif subheader_index == const.SASIndex.subheader_counts_index:
            processor = self._process_subheader_counts
        elif subheader_index == const.SASIndex.data_subheader_index:
            self._current_page_data_subheader_pointers.append(pointer)
            return
        else:
            raise ValueError("unknown subheader index")

        processor(offset, length)

    def _process_rowsize_subheader(self, offset, length):

        int_len = self._int_length
        lcs_offset = offset
        lcp_offset = offset
        if self.U64:
            lcs_offset += 682
            lcp_offset += 706
        else:
            lcs_offset += 354
            lcp_offset += 378

        self.row_length = self._read_int(
            offset + const.row_length_offset_multiplier * int_len, int_len)
        self.row_count = self._read_int(
            offset + const.row_count_offset_multiplier * int_len, int_len)
        self.col_count_p1 = self._read_int(
            offset + const.col_count_p1_multiplier * int_len, int_len)
        self.col_count_p2 = self._read_int(
            offset + const.col_count_p2_multiplier * int_len, int_len)
        mx = const.row_count_on_mix_page_offset_multiplier * int_len
        self._mix_page_row_count = self._read_int(offset + mx, int_len)
        self._lcs = self._read_int(lcs_offset, 2)
        self._lcp = self._read_int(lcp_offset, 2)

    def _process_columnsize_subheader(self, offset, length):
        int_len = self._int_length
        offset += int_len
        self.column_count = self._read_int(offset, int_len)
        if (self.col_count_p1 + self.col_count_p2 !=
                self.column_count):
            print("Warning: column count mismatch (%d + %d != %d)" %
                  (self.col_count_p1, self.col_count_p2, self.column_count))

    # Unknown purpose
    def _process_subheader_counts(self, offset, length):
        pass

    def _process_columntext_subheader(self, offset, length):

        offset += self._int_length
        text_block_size = self._read_int(offset, const.text_block_size_length)

        buf = self._read_bytes(offset, text_block_size)
        cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
        cname = cname_raw
        if self.convert_header_text:
            cname = cname.decode(self.encoding or self.default_encoding)
        self.column_names_strings.append(cname)

        if len(self.column_names_strings) == 1:
            compression_literal = ""
            for cl in const.compression_literals:
                if cl in cname_raw:
                    compression_literal = cl
            self.compression = compression_literal
            offset -= self._int_length

            offset1 = offset + 16
            if self.U64:
                offset1 += 4

            buf = self._read_bytes(offset1, self._lcp)
            compression_literal = buf.rstrip(b"\x00")
            if compression_literal == b"":
                self._lcs = 0
                offset1 = offset + 32
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0:self._lcp]
            elif compression_literal == const.rle_compression:
                offset1 = offset + 40
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0:self._lcp]
            elif self._lcs > 0:
                self._lcp = 0
                offset1 = offset + 16
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcs)
                self.creator_proc = buf[0:self._lcp]
            if self.convert_header_text:
                if hasattr(self, "creator_proc"):
                    self.creator_proc = self.creator_proc.decode(
                        self.encoding or self.default_encoding)

    def _process_columnname_subheader(self, offset, length):
        int_len = self._int_length
        offset += int_len
        column_name_pointers_count = (length - 2 * int_len - 12) // 8
        for i in range(column_name_pointers_count):
            text_subheader = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_text_subheader_offset
            col_name_offset = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_offset_offset
            col_name_length = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_length_offset

            idx = self._read_int(
                text_subheader, const.column_name_text_subheader_length)
            col_offset = self._read_int(
                col_name_offset, const.column_name_offset_length)
            col_len = self._read_int(
                col_name_length, const.column_name_length_length)

            name_str = self.column_names_strings[idx]
            self.column_names.append(name_str[col_offset:col_offset + col_len])

    def _process_columnattributes_subheader(self, offset, length):
        int_len = self._int_length
        column_attributes_vectors_count = (
            length - 2 * int_len - 12) // (int_len + 8)
        self.column_types = np.empty(
            column_attributes_vectors_count, dtype=np.dtype('S1'))
        self._column_data_lengths = np.empty(
            column_attributes_vectors_count, dtype=np.int64)
        self._column_data_offsets = np.empty(
            column_attributes_vectors_count, dtype=np.int64)
        for i in range(column_attributes_vectors_count):
            col_data_offset = (offset + int_len +
                               const.column_data_offset_offset +
                               i * (int_len + 8))
            col_data_len = (offset + 2 * int_len +
                            const.column_data_length_offset +
                            i * (int_len + 8))
            col_types = (offset + 2 * int_len +
                         const.column_type_offset + i * (int_len + 8))

            x = self._read_int(col_data_offset, int_len)
            self._column_data_offsets[i] = x

            x = self._read_int(col_data_len, const.column_data_length_length)
            self._column_data_lengths[i] = x

            x = self._read_int(col_types, const.column_type_length)
            if x == 1:
                self.column_types[i] = b'd'
            else:
                self.column_types[i] = b's'

    def _process_columnlist_subheader(self, offset, length):
        # unknown purpose
        pass

    def _process_format_subheader(self, offset, length):
        int_len = self._int_length
        text_subheader_format = (
            offset +
            const.column_format_text_subheader_index_offset +
            3 * int_len)
        col_format_offset = (offset +
                             const.column_format_offset_offset +
                             3 * int_len)
        col_format_len = (offset +
                          const.column_format_length_offset +
                          3 * int_len)
        text_subheader_label = (
            offset +
            const.column_label_text_subheader_index_offset +
            3 * int_len)
        col_label_offset = (offset +
                            const.column_label_offset_offset +
                            3 * int_len)
        col_label_len = offset + const.column_label_length_offset + 3 * int_len

        x = self._read_int(text_subheader_format,
                           const.column_format_text_subheader_index_length)
        format_idx = min(x, len(self.column_names_strings) - 1)

        format_start = self._read_int(
            col_format_offset, const.column_format_offset_length)
        format_len = self._read_int(
            col_format_len, const.column_format_length_length)

        label_idx = self._read_int(
            text_subheader_label,
            const.column_label_text_subheader_index_length)
        label_idx = min(label_idx, len(self.column_names_strings) - 1)

        label_start = self._read_int(
            col_label_offset, const.column_label_offset_length)
        label_len = self._read_int(col_label_len,
                                   const.column_label_length_length)

        label_names = self.column_names_strings[label_idx]
        column_label = label_names[label_start: label_start + label_len]
        format_names = self.column_names_strings[format_idx]
        column_format = format_names[format_start: format_start + format_len]
        current_column_number = len(self.columns)

        col = _column()
        col.col_id = current_column_number
        col.name = self.column_names[current_column_number]
        col.label = column_label
        col.format = column_format
        col.ctype = self.column_types[current_column_number]
        col.length = self._column_data_lengths[current_column_number]

        self.column_formats.append(column_format)
        self.columns.append(col)

    def read(self, nrows=None):

        if (nrows is None) and (self.chunksize is not None):
            nrows = self.chunksize
        elif nrows is None:
            nrows = self.row_count

        if len(self.column_types) == 0:
            self.close()
            raise EmptyDataError("No columns to parse from file")

        if self._current_row_in_file_index >= self.row_count:
            return None

        m = self.row_count - self._current_row_in_file_index
        if nrows > m:
            nrows = m

        nd = (self.column_types == b'd').sum()
        ns = (self.column_types == b's').sum()

        self._string_chunk = np.empty((ns, nrows), dtype=np.object)
        self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8)

        self._current_row_in_chunk_index = 0
        p = Parser(self)
        p.read(nrows)

        rslt = self._chunk_to_dataframe()
        if self.index is not None:
            rslt = rslt.set_index(self.index)

        return rslt

    def _read_next_page(self):
        self._current_page_data_subheader_pointers = []
        self._cached_page = self._path_or_buf.read(self._page_length)
        if len(self._cached_page) <= 0:
            return True
        elif len(self._cached_page) != self._page_length:
            self.close()
            msg = ("failed to read complete page from file "
                   "(read {:d} of {:d} bytes)")
            raise ValueError(msg.format(len(self._cached_page),
                                        self._page_length))

        self._read_page_header()
        if self._current_page_type == const.page_meta_type:
            self._process_page_metadata()
        pt = [const.page_meta_type, const.page_data_type]
        pt += const.page_mix_types
        if self._current_page_type not in pt:
            return self._read_next_page()

        return False

    def _chunk_to_dataframe(self):

        n = self._current_row_in_chunk_index
        m = self._current_row_in_file_index
        ix = range(m - n, m)
        rslt = pd.DataFrame(index=ix)

        js, jb = 0, 0
        for j in range(self.column_count):

            name = self.column_names[j]

            if self.column_types[j] == b'd':
                rslt[name] = self._byte_chunk[jb, :].view(
                    dtype=self.byte_order + 'd')
                rslt[name] = np.asarray(rslt[name], dtype=np.float64)
                if self.convert_dates:
                    unit = None
                    if self.column_formats[j] in const.sas_date_formats:
                        unit = 'd'
                    elif self.column_formats[j] in const.sas_datetime_formats:
                        unit = 's'
                    if unit:
                        rslt[name] = pd.to_datetime(rslt[name], unit=unit,
                                                    origin="1960-01-01")
                jb += 1
            elif self.column_types[j] == b's':
                rslt[name] = self._string_chunk[js, :]
                if self.convert_text and (self.encoding is not None):
                    rslt[name] = rslt[name].str.decode(
                        self.encoding or self.default_encoding)
                if self.blank_missing:
                    ii = rslt[name].str.len() == 0
                    rslt.loc[ii, name] = np.nan
                js += 1
            else:
                self.close()
                raise ValueError("unknown column type %s" %
                                 self.column_types[j])

        return rslt
@@ -0,0 +1,171 @@
magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
         b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
         b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
         b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")

align_1_checker_value = b'3'
align_1_offset = 32
align_1_length = 1
align_1_value = 4
u64_byte_checker_value = b'3'
align_2_offset = 35
align_2_length = 1
align_2_value = 4
endianness_offset = 37
endianness_length = 1
platform_offset = 39
platform_length = 1
encoding_offset = 70
encoding_length = 1
dataset_offset = 92
dataset_length = 64
file_type_offset = 156
file_type_length = 8
date_created_offset = 164
date_created_length = 8
date_modified_offset = 172
date_modified_length = 8
header_size_offset = 196
header_size_length = 4
page_size_offset = 200
page_size_length = 4
page_count_offset = 204
page_count_length = 4
sas_release_offset = 216
sas_release_length = 8
sas_server_type_offset = 224
sas_server_type_length = 16
os_version_number_offset = 240
os_version_number_length = 16
os_maker_offset = 256
os_maker_length = 16
os_name_offset = 272
os_name_length = 16
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24
page_type_offset = 0
page_type_length = 2
block_count_offset = 2
block_count_length = 2
subheader_count_offset = 4
subheader_count_length = 2
page_meta_type = 0
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15
column_name_pointer_length = 8
column_name_text_subheader_offset = 0
column_name_text_subheader_length = 2
column_name_offset_offset = 2
column_name_offset_length = 2
column_name_length_offset = 4
column_name_length_length = 2
column_data_offset_offset = 8
column_data_length_offset = 8
column_data_length_length = 4
column_type_offset = 14
column_type_length = 1
column_format_text_subheader_index_offset = 22
column_format_text_subheader_index_length = 2
column_format_offset_offset = 24
column_format_offset_length = 2
column_format_length_offset = 26
column_format_length_length = 2
column_label_text_subheader_index_offset = 28
column_label_text_subheader_index_length = 2
column_label_offset_offset = 30
column_label_offset_length = 2
column_label_length_offset = 32
column_label_length_length = 2
rle_compression = b'SASYZCRL'
rdc_compression = b'SASYZCR2'

compression_literals = [rle_compression, rdc_compression]

# Incomplete list of encodings, using SAS nomenclature:
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
                  61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}


class SASIndex(object):
    row_size_index = 0
    column_size_index = 1
    subheader_counts_index = 2
    column_text_index = 3
    column_name_index = 4
    column_attributes_index = 5
    format_and_label_index = 6
    column_list_index = 7
    data_subheader_index = 8


subheader_signature_to_index = {
    b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
    b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
    b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
    b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
    b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
    b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
    b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
    b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
    b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
    b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
    b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
    b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
    b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
    b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
    b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
    b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
    b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
    b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
    b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
    b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
    b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
    b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
    b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
    b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index}


# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN",
                    "MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS",
                    "MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR",
                    "NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV",
                    "WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD",
                    "YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ",
                    "YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC",
                    "YYQRD", "YYQRP", "YYQRS", "YYQRN",
                    "YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC",
                    "MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN",
                    "YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB",
                    "MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS",
                    "MINGUO")

sas_datetime_formats = ("DATETIME", "DTWKDATX",
                        "B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX",
                        "E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX",
                        "DATEAMPM", "DTDATE", "DTMONYY", "DTMONYY", "DTWKDATX",
                        "DTYEAR", "TOD", "MDYAMPM")
@@ -0,0 +1,465 @@
"""
Read a SAS XPort format file into a Pandas DataFrame.

Based on code from Jack Cushman (github.com/jcushman/xport).

The file format is defined here:

https://support.sas.com/techsup/technote/ts140.pdf
"""

from datetime import datetime
import struct
import warnings

import numpy as np
import pandas as pd
from pandas import compat
from pandas.io.common import get_filepath_or_buffer, BaseIterator
from pandas.util._decorators import Appender

_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
                  "000000000000000000000000000000  ")
_correct_header1 = ("HEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!"
                    "000000000000000001600000000")
_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
                    "000000000000000000000000000000  ")
_correct_obs_header = ("HEADER RECORD*******OBS     HEADER RECORD!!!!!!!"
                       "000000000000000000000000000000  ")
_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label',
              'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
              'nifl', 'nifd', 'npos', '_']


_base_params_doc = """\
Parameters
----------
filepath_or_buffer : string or file-like object
    Path to SAS file or object implementing binary read method."""

_params2_doc = """\
index : identifier of index column
    Identifier of column that should be used as index of the DataFrame.
encoding : string
    Encoding for text data.
chunksize : int
    Read file `chunksize` lines at a time, returns iterator."""

_format_params_doc = """\
format : string
    File format, only `xport` is currently supported."""

_iterator_doc = """\
iterator : boolean, default False
    Return XportReader object for reading file incrementally."""


_read_sas_doc = """Read a SAS file into a DataFrame.

%(_base_params_doc)s
%(_format_params_doc)s
%(_params2_doc)s
%(_iterator_doc)s

Returns
-------
DataFrame or XportReader

Examples
--------
Read a SAS Xport file:

>>> df = pandas.read_sas('filename.XPT')

Read a Xport file in 10,000 line chunks:

>>> itr = pandas.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
...     do_something(chunk)

""" % {"_base_params_doc": _base_params_doc,
       "_format_params_doc": _format_params_doc,
       "_params2_doc": _params2_doc,
       "_iterator_doc": _iterator_doc}


_xport_reader_doc = """\
Class for reading SAS Xport files.

%(_base_params_doc)s
%(_params2_doc)s

Attributes
----------
member_info : list
    Contains information about the file
fields : list
    Contains information about the variables in the file
""" % {"_base_params_doc": _base_params_doc,
       "_params2_doc": _params2_doc}


_read_method_doc = """\
Read observations from SAS Xport file, returning as data frame.

Parameters
----------
nrows : int
    Number of rows to read from data file; if None, read whole
    file.

Returns
-------
A DataFrame.
"""


def _parse_date(datestr):
    """ Given a date in xport format, return Python date. """
    try:
        # e.g. "16FEB11:10:07:55"
        return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
    except ValueError:
        return pd.NaT
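
# Quick check of the parser above: _parse_date("16FEB11:10:07:55")
# returns datetime(2011, 2, 16, 10, 7, 55), while a malformed string
# maps to pd.NaT instead of raising.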


def _split_line(s, parts):
    """
    Parameters
    ----------
    s: string
        Fixed-length string to split
    parts: list of (name, length) pairs
        Used to break up string, name '_' will be filtered from output.

    Returns
    -------
    Dict of name:contents of string at given location.
    """
    out = {}
    start = 0
    for name, length in parts:
        out[name] = s[start:start + length].strip()
        start += length
    del out['_']
    return out
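
# For instance (hypothetical input), with
# parts = [('prefix', 3), ('_', 2), ('rest', 3)]:
#
#     _split_line("abcXXdef", parts) == {'prefix': 'abc', 'rest': 'def'}
#
# The '_' field is consumed as filler and dropped from the result.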


def _handle_truncated_float_vec(vec, nbytes):
    # This feature is not well documented, but some SAS XPORT files
    # have 2-7 byte "truncated" floats.  To read these truncated
    # floats, pad them with zeros on the right to make 8 byte floats.
    #
    # References:
    # https://github.com/jcushman/xport/pull/3
    # The R "foreign" library

    if nbytes != 8:
        vec1 = np.zeros(len(vec), np.dtype('S8'))
        dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
        vec2 = vec1.view(dtype=dtype)
        vec2['f0'] = vec
        return vec2

    return vec
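
# For example (illustrative bytes), a 2-byte truncated float b"\x41\x10"
# is padded to b"\x41\x10\x00\x00\x00\x00\x00\x00" before IBM-to-IEEE
# conversion; right-padding with zeros is safe because the dropped bytes
# are the low-order fraction bits.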


def _parse_float_vec(vec):
    """
    Parse a vector of float values representing IBM 8 byte floats into
    native 8 byte floats.
    """

    dtype = np.dtype('>u4,>u4')
    vec1 = vec.view(dtype=dtype)
    xport1 = vec1['f0']
    xport2 = vec1['f1']

    # Start by setting first half of ieee number to first half of IBM
    # number sans exponent
    ieee1 = xport1 & 0x00ffffff

    # Get the second half of the ibm number into the second half of
    # the ieee number
    ieee2 = xport2

    # The fraction bit to the left of the binary point in the ieee
    # format was set and the number was shifted 0, 1, 2, or 3
    # places. This will tell us how to adjust the ibm exponent to be a
    # power of 2 ieee exponent and how to shift the fraction bits to
    # restore the correct magnitude.
    shift = np.zeros(len(vec), dtype=np.uint8)
    shift[np.where(xport1 & 0x00200000)] = 1
    shift[np.where(xport1 & 0x00400000)] = 2
    shift[np.where(xport1 & 0x00800000)] = 3

    # shift the ieee number down the correct number of places then
    # set the second half of the ieee number to be the second half
    # of the ibm number shifted appropriately, ored with the bits
    # from the first half that would have been shifted in if we
    # could shift a double. All we are worried about are the low
    # order 3 bits of the first half since we're only shifting by
    # 1, 2, or 3.
    ieee1 >>= shift
    ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))

    # clear the 1 bit to the left of the binary point
    ieee1 &= 0xffefffff

    # set the exponent of the ieee number to be the actual exponent
    # plus the shift count + 1023. Or this into the first half of the
    # ieee number. The ibm exponent is excess 64 but is adjusted by 65
    # since during conversion to ibm format the exponent is
    # incremented by 1 and the fraction bits left 4 positions to the
    # right of the radix point.  (had to add >> 24 because C treats &
    # 0x7f as 0x7f000000 and Python doesn't)
    ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +
               shift + 1023) << 20) | (xport1 & 0x80000000)

    ieee = np.empty((len(ieee1),), dtype='>u4,>u4')
    ieee['f0'] = ieee1
    ieee['f1'] = ieee2
    ieee = ieee.view(dtype='>f8')
    ieee = ieee.astype('f8')

    return ieee
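
# A worked example of the conversion above (illustrative): the IBM
# representation of 1.0 has the big-endian bytes 0x41 0x10 0x00 ... 0x00
# (excess-64 exponent 0x41 = 65, fraction 1/16), and the code maps it to
# the IEEE bit pattern 0x3FF0000000000000, i.e. the native double 1.0.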


class XportReader(BaseIterator):
    __doc__ = _xport_reader_doc

    def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
                 chunksize=None):

        self._encoding = encoding
        self._lines_read = 0
        self._index = index
        self._chunksize = chunksize

        if isinstance(filepath_or_buffer, str):
            (filepath_or_buffer, encoding,
             compression, should_close) = get_filepath_or_buffer(
                filepath_or_buffer, encoding=encoding)

        if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
            self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
        else:
            # Copy to BytesIO, and ensure no encoding
            contents = filepath_or_buffer.read()
            try:
                contents = contents.encode(self._encoding)
            except (AttributeError, UnicodeError):
                pass
            self.filepath_or_buffer = compat.BytesIO(contents)

        self._read_header()

    def close(self):
        self.filepath_or_buffer.close()

    def _get_row(self):
        return self.filepath_or_buffer.read(80).decode()

    def _read_header(self):
        self.filepath_or_buffer.seek(0)

        # read file header
        line1 = self._get_row()
        if line1 != _correct_line1:
            self.close()
            raise ValueError("Header record is not an XPORT file.")

        line2 = self._get_row()
        fif = [['prefix', 24], ['version', 8], ['OS', 8],
               ['_', 24], ['created', 16]]
        file_info = _split_line(line2, fif)
        if file_info['prefix'] != "SAS     SAS     SASLIB":
            self.close()
            raise ValueError("Header record has invalid prefix.")
        file_info['created'] = _parse_date(file_info['created'])
        self.file_info = file_info

        line3 = self._get_row()
        file_info['modified'] = _parse_date(line3[:16])

        # read member header
        header1 = self._get_row()
        header2 = self._get_row()
        headflag1 = header1.startswith(_correct_header1)
        headflag2 = (header2 == _correct_header2)
        if not (headflag1 and headflag2):
            self.close()
            raise ValueError("Member header not found")
        # usually 140, could be 135
        fieldnamelength = int(header1[-5:-2])

        # member info
        mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
               ['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
        member_info = _split_line(self._get_row(), mem)
        mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
        member_info.update(_split_line(self._get_row(), mem))
        member_info['modified'] = _parse_date(member_info['modified'])
        member_info['created'] = _parse_date(member_info['created'])
        self.member_info = member_info

        # read field names
        types = {1: 'numeric', 2: 'char'}
        fieldcount = int(self._get_row()[54:58])
        datalength = fieldnamelength * fieldcount
        # round up to nearest 80
        if datalength % 80:
            datalength += 80 - datalength % 80
        fielddata = self.filepath_or_buffer.read(datalength)
        fields = []
        obs_length = 0
        while len(fielddata) >= fieldnamelength:
            # pull data for one field
            field, fielddata = (fielddata[:fieldnamelength],
                                fielddata[fieldnamelength:])

            # rest at end gets ignored, so if field is short, pad out
            # to match struct pattern below
            field = field.ljust(140)

            fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field)
            field = dict(zip(_fieldkeys, fieldstruct))
            del field['_']
            field['ntype'] = types[field['ntype']]
            fl = field['field_length']
            if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
                self.close()
                msg = "Floating field width {0} is not between 2 and 8."
                raise TypeError(msg.format(fl))

            for k, v in field.items():
                try:
                    field[k] = v.strip()
                except AttributeError:
                    pass

            obs_length += field['field_length']
            fields += [field]

        header = self._get_row()
        if header != _correct_obs_header:
            self.close()
            raise ValueError("Observation header not found.")

        self.fields = fields
        self.record_length = obs_length
        self.record_start = self.filepath_or_buffer.tell()

        self.nobs = self._record_count()
        self.columns = [x['name'].decode() for x in self.fields]

        # Setup the dtype.
        dtypel = []
        for i, field in enumerate(self.fields):
            dtypel.append(('s' + str(i), "S" + str(field['field_length'])))
        dtype = np.dtype(dtypel)
        self._dtype = dtype

    def __next__(self):
        return self.read(nrows=self._chunksize or 1)

    def _record_count(self):
        """
        Get number of records in file.

        This is maybe suboptimal because we have to seek to the end of
        the file.

        Side effect: restores the file position to record_start.
        """

        self.filepath_or_buffer.seek(0, 2)
        total_records_length = (self.filepath_or_buffer.tell() -
                                self.record_start)

        if total_records_length % 80 != 0:
            warnings.warn("xport file may be corrupted")

        if self.record_length > 80:
            self.filepath_or_buffer.seek(self.record_start)
            return total_records_length // self.record_length

        # 8-byte blanks (runs of eight ASCII spaces viewed as a single
        # uint64) in the last 80-byte card are tail padding.
        self.filepath_or_buffer.seek(-80, 2)
        last_card = self.filepath_or_buffer.read(80)
        last_card = np.frombuffer(last_card, dtype=np.uint64)

        ix = np.flatnonzero(last_card == 2314885530818453536)

        if len(ix) == 0:
            tail_pad = 0
        else:
            tail_pad = 8 * len(ix)

        self.filepath_or_buffer.seek(self.record_start)

        return (total_records_length - tail_pad) // self.record_length
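
    # Sanity check for the blank sentinel above: eight ASCII spaces
    # viewed as one uint64 give
    #
    #     >>> int(np.frombuffer(b" " * 8, dtype=np.uint64)[0])
    #     2314885530818453536
    #
    # i.e. 0x2020202020202020; the byte 0x20 repeats, so the value is
    # independent of endianness.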

    def get_chunk(self, size=None):
        """
        Reads lines from Xport file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read.  If None, reads whole file.

        Returns
        -------
        DataFrame
        """
        if size is None:
            size = self._chunksize
        return self.read(nrows=size)

    def _missing_double(self, vec):
        v = vec.view(dtype='u1,u1,u2,u4')
        miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
        miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
                 (v['f0'] == 0x5f) | (v['f0'] == 0x2e))
        miss &= miss1
        return miss
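
    # Example of the encoding handled above (illustrative): SAS stores
    # the special missing value ".A" as the byte 0x41 followed by seven
    # zero bytes, so _missing_double flags it, while an ordinary number
    # such as IBM 1.0 (bytes 0x41 0x10 0x00 ...) is kept.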

    @Appender(_read_method_doc)
    def read(self, nrows=None):

        if nrows is None:
            nrows = self.nobs

        read_lines = min(nrows, self.nobs - self._lines_read)
        read_len = read_lines * self.record_length
        if read_len <= 0:
            self.close()
            raise StopIteration
        raw = self.filepath_or_buffer.read(read_len)
        data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)

        df = pd.DataFrame(index=range(read_lines))
        for j, x in enumerate(self.columns):
            vec = data['s%d' % j]
            ntype = self.fields[j]['ntype']
            if ntype == "numeric":
                vec = _handle_truncated_float_vec(
                    vec, self.fields[j]['field_length'])
                miss = self._missing_double(vec)
                v = _parse_float_vec(vec)
                v[miss] = np.nan
            elif ntype == "char":
                v = [y.rstrip() for y in vec]
                if compat.PY3:
                    if self._encoding is not None:
                        v = [y.decode(self._encoding) for y in v]
            df[x] = v

        if self._index is None:
            df.index = range(self._lines_read, self._lines_read + read_lines)
        else:
            df = df.set_index(self._index)

        self._lines_read += read_lines

        return df
@@ -0,0 +1,70 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas import compat
from pandas.io.common import _stringify_path


def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
             chunksize=None, iterator=False):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : string or file-like object
        Path to the SAS file.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred.  If 'xport' or 'sas7bdat',
        uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = ("If this is a buffer object rather "
                            "than a string name, you must specify "
                            "a format string")
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    if format.lower() == 'xport':
        from pandas.io.sas.sas_xport import XportReader
        reader = XportReader(filepath_or_buffer, index=index,
                             encoding=encoding,
                             chunksize=chunksize)
    elif format.lower() == 'sas7bdat':
        from pandas.io.sas.sas7bdat import SAS7BDATReader
        reader = SAS7BDATReader(filepath_or_buffer, index=index,
                                encoding=encoding,
                                chunksize=chunksize)
    else:
        raise ValueError('unknown SAS format')

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data
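
# A minimal usage sketch (file names and the `process` callable are
# hypothetical):
#
#     df = read_sas("example.sas7bdat", encoding="latin-1")
#     for chunk in read_sas("example.xpt", chunksize=10000):
#         process(chunk)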