started work on backend

This commit is contained in:
d3m1g0d
2019-01-21 17:36:00 +01:00
parent a1a8bca34b
commit 9f9a7e4974
4032 changed files with 745079 additions and 0 deletions
@@ -0,0 +1 @@
from .sasreader import read_sas # noqa
@@ -0,0 +1,687 @@
"""
Read SAS7BDAT files
Based on code written by Jared Hobbs:
https://bitbucket.org/jaredhobbs/sas7bdat
See also:
https://github.com/BioStatMatt/sas7bdat
Partial documentation of the file format:
https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
Reference for binary data compression:
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
"""
import pandas as pd
from pandas import compat
from pandas.io.common import get_filepath_or_buffer, BaseIterator
from pandas.errors import EmptyDataError
import numpy as np
import struct
import pandas.io.sas.sas_constants as const
from pandas.io.sas._sas import Parser
class _subheader_pointer(object):
pass
class _column(object):
pass
# SAS7BDAT represents a SAS data file in SAS7BDAT format.
class SAS7BDATReader(BaseIterator):
"""
Read SAS files in SAS7BDAT format.
Parameters
----------
path_or_buf : path name or buffer
Name of SAS file or file-like object pointing to SAS file
contents.
index : column identifier, defaults to None
Column to use as index.
convert_dates : boolean, defaults to True
Attempt to convert dates to Pandas datetime values. Note that
some rarely used SAS date formats may be unsupported.
blank_missing : boolean, defaults to True
Convert empty strings to missing values (SAS uses blanks to
indicate missing character variables).
chunksize : int, defaults to None
Return SAS7BDATReader object for iterations, returns chunks
with given number of lines.
encoding : string, defaults to None
String encoding.
convert_text : bool, defaults to True
If False, text variables are left as raw bytes.
convert_header_text : bool, defaults to True
If False, header text, including column names, are left as raw
bytes.
"""
def __init__(self, path_or_buf, index=None, convert_dates=True,
blank_missing=True, chunksize=None, encoding=None,
convert_text=True, convert_header_text=True):
self.index = index
self.convert_dates = convert_dates
self.blank_missing = blank_missing
self.chunksize = chunksize
self.encoding = encoding
self.convert_text = convert_text
self.convert_header_text = convert_header_text
self.default_encoding = "latin-1"
self.compression = ""
self.column_names_strings = []
self.column_names = []
self.column_types = []
self.column_formats = []
self.columns = []
self._current_page_data_subheader_pointers = []
self._cached_page = None
self._column_data_lengths = []
self._column_data_offsets = []
self._current_row_in_file_index = 0
self._current_row_on_page_index = 0
self._current_row_in_file_index = 0
self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
if isinstance(self._path_or_buf, compat.string_types):
self._path_or_buf = open(self._path_or_buf, 'rb')
self.handle = self._path_or_buf
self._get_properties()
self._parse_metadata()
def close(self):
try:
self.handle.close()
except AttributeError:
pass
def _get_properties(self):
# Check magic number
self._path_or_buf.seek(0)
self._cached_page = self._path_or_buf.read(288)
if self._cached_page[0:len(const.magic)] != const.magic:
self.close()
raise ValueError("magic number mismatch (not a SAS file?)")
# Get alignment information
align1, align2 = 0, 0
buf = self._read_bytes(const.align_1_offset, const.align_1_length)
if buf == const.u64_byte_checker_value:
align2 = const.align_2_value
self.U64 = True
self._int_length = 8
self._page_bit_offset = const.page_bit_offset_x64
self._subheader_pointer_length = const.subheader_pointer_length_x64
else:
self.U64 = False
self._page_bit_offset = const.page_bit_offset_x86
self._subheader_pointer_length = const.subheader_pointer_length_x86
self._int_length = 4
buf = self._read_bytes(const.align_2_offset, const.align_2_length)
if buf == const.align_1_checker_value:
align1 = const.align_2_value
total_align = align1 + align2
# Get endianness information
buf = self._read_bytes(const.endianness_offset,
const.endianness_length)
if buf == b'\x01':
self.byte_order = "<"
else:
self.byte_order = ">"
# Get encoding information
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
if buf in const.encoding_names:
self.file_encoding = const.encoding_names[buf]
else:
self.file_encoding = "unknown (code=%s)" % str(buf)
# Get platform information
buf = self._read_bytes(const.platform_offset, const.platform_length)
if buf == b'1':
self.platform = "unix"
elif buf == b'2':
self.platform = "windows"
else:
self.platform = "unknown"
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
self.name = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.name = self.name.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
self.file_type = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.file_type = self.file_type.decode(
self.encoding or self.default_encoding)
# Timestamp is epoch 01/01/1960
epoch = pd.datetime(1960, 1, 1)
x = self._read_float(const.date_created_offset + align1,
const.date_created_length)
self.date_created = epoch + pd.to_timedelta(x, unit='s')
x = self._read_float(const.date_modified_offset + align1,
const.date_modified_length)
self.date_modified = epoch + pd.to_timedelta(x, unit='s')
self.header_length = self._read_int(const.header_size_offset + align1,
const.header_size_length)
# Read the rest of the header into cached_page.
buf = self._path_or_buf.read(self.header_length - 288)
self._cached_page += buf
if len(self._cached_page) != self.header_length:
self.close()
raise ValueError("The SAS7BDAT file appears to be truncated.")
self._page_length = self._read_int(const.page_size_offset + align1,
const.page_size_length)
self._page_count = self._read_int(const.page_count_offset + align1,
const.page_count_length)
buf = self._read_bytes(const.sas_release_offset + total_align,
const.sas_release_length)
self.sas_release = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.sas_release = self.sas_release.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.sas_server_type_offset + total_align,
const.sas_server_type_length)
self.server_type = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.server_type = self.server_type.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.os_version_number_offset + total_align,
const.os_version_number_length)
self.os_version = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.os_version = self.os_version.decode(
self.encoding or self.default_encoding)
buf = self._read_bytes(const.os_name_offset + total_align,
const.os_name_length)
buf = buf.rstrip(b'\x00 ')
if len(buf) > 0:
self.os_name = buf.decode(self.encoding or self.default_encoding)
else:
buf = self._read_bytes(const.os_maker_offset + total_align,
const.os_maker_length)
self.os_name = buf.rstrip(b'\x00 ')
if self.convert_header_text:
self.os_name = self.os_name.decode(
self.encoding or self.default_encoding)
def __next__(self):
da = self.read(nrows=self.chunksize or 1)
if da is None:
raise StopIteration
return da
# Read a single float of the given width (4 or 8).
def _read_float(self, offset, width):
if width not in (4, 8):
self.close()
raise ValueError("invalid float width")
buf = self._read_bytes(offset, width)
fd = "f" if width == 4 else "d"
return struct.unpack(self.byte_order + fd, buf)[0]
# Read a single signed integer of the given width (1, 2, 4 or 8).
def _read_int(self, offset, width):
if width not in (1, 2, 4, 8):
self.close()
raise ValueError("invalid int width")
buf = self._read_bytes(offset, width)
it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
iv = struct.unpack(self.byte_order + it, buf)[0]
return iv
def _read_bytes(self, offset, length):
if self._cached_page is None:
self._path_or_buf.seek(offset)
buf = self._path_or_buf.read(length)
if len(buf) < length:
self.close()
msg = "Unable to read {:d} bytes from file position {:d}."
raise ValueError(msg.format(length, offset))
return buf
else:
if offset + length > len(self._cached_page):
self.close()
raise ValueError("The cached page is too small.")
return self._cached_page[offset:offset + length]
def _parse_metadata(self):
done = False
while not done:
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
break
if len(self._cached_page) != self._page_length:
self.close()
raise ValueError(
"Failed to read a meta data page from the SAS file.")
done = self._process_page_meta()
def _process_page_meta(self):
self._read_page_header()
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
if self._current_page_type in pt:
self._process_page_metadata()
return ((self._current_page_type in [256] + const.page_mix_types) or
(self._current_page_data_subheader_pointers is not None))
def _read_page_header(self):
bit_offset = self._page_bit_offset
tx = const.page_type_offset + bit_offset
self._current_page_type = self._read_int(tx, const.page_type_length)
tx = const.block_count_offset + bit_offset
self._current_page_block_count = self._read_int(
tx, const.block_count_length)
tx = const.subheader_count_offset + bit_offset
self._current_page_subheaders_count = (
self._read_int(tx, const.subheader_count_length))
def _process_page_metadata(self):
bit_offset = self._page_bit_offset
for i in range(self._current_page_subheaders_count):
pointer = self._process_subheader_pointers(
const.subheader_pointers_offset + bit_offset, i)
if pointer.length == 0:
continue
if pointer.compression == const.truncated_subheader_id:
continue
subheader_signature = self._read_subheader_signature(
pointer.offset)
subheader_index = (
self._get_subheader_index(subheader_signature,
pointer.compression, pointer.ptype))
self._process_subheader(subheader_index, pointer)
def _get_subheader_index(self, signature, compression, ptype):
index = const.subheader_signature_to_index.get(signature)
if index is None:
f1 = ((compression == const.compressed_subheader_id) or
(compression == 0))
f2 = (ptype == const.compressed_subheader_type)
if (self.compression != "") and f1 and f2:
index = const.SASIndex.data_subheader_index
else:
self.close()
raise ValueError("Unknown subheader signature")
return index
def _process_subheader_pointers(self, offset, subheader_pointer_index):
subheader_pointer_length = self._subheader_pointer_length
total_offset = (offset +
subheader_pointer_length * subheader_pointer_index)
subheader_offset = self._read_int(total_offset, self._int_length)
total_offset += self._int_length
subheader_length = self._read_int(total_offset, self._int_length)
total_offset += self._int_length
subheader_compression = self._read_int(total_offset, 1)
total_offset += 1
subheader_type = self._read_int(total_offset, 1)
x = _subheader_pointer()
x.offset = subheader_offset
x.length = subheader_length
x.compression = subheader_compression
x.ptype = subheader_type
return x
def _read_subheader_signature(self, offset):
subheader_signature = self._read_bytes(offset, self._int_length)
return subheader_signature
def _process_subheader(self, subheader_index, pointer):
offset = pointer.offset
length = pointer.length
if subheader_index == const.SASIndex.row_size_index:
processor = self._process_rowsize_subheader
elif subheader_index == const.SASIndex.column_size_index:
processor = self._process_columnsize_subheader
elif subheader_index == const.SASIndex.column_text_index:
processor = self._process_columntext_subheader
elif subheader_index == const.SASIndex.column_name_index:
processor = self._process_columnname_subheader
elif subheader_index == const.SASIndex.column_attributes_index:
processor = self._process_columnattributes_subheader
elif subheader_index == const.SASIndex.format_and_label_index:
processor = self._process_format_subheader
elif subheader_index == const.SASIndex.column_list_index:
processor = self._process_columnlist_subheader
elif subheader_index == const.SASIndex.subheader_counts_index:
processor = self._process_subheader_counts
elif subheader_index == const.SASIndex.data_subheader_index:
self._current_page_data_subheader_pointers.append(pointer)
return
else:
raise ValueError("unknown subheader index")
processor(offset, length)
def _process_rowsize_subheader(self, offset, length):
int_len = self._int_length
lcs_offset = offset
lcp_offset = offset
if self.U64:
lcs_offset += 682
lcp_offset += 706
else:
lcs_offset += 354
lcp_offset += 378
self.row_length = self._read_int(
offset + const.row_length_offset_multiplier * int_len, int_len)
self.row_count = self._read_int(
offset + const.row_count_offset_multiplier * int_len, int_len)
self.col_count_p1 = self._read_int(
offset + const.col_count_p1_multiplier * int_len, int_len)
self.col_count_p2 = self._read_int(
offset + const.col_count_p2_multiplier * int_len, int_len)
mx = const.row_count_on_mix_page_offset_multiplier * int_len
self._mix_page_row_count = self._read_int(offset + mx, int_len)
self._lcs = self._read_int(lcs_offset, 2)
self._lcp = self._read_int(lcp_offset, 2)
def _process_columnsize_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
self.column_count = self._read_int(offset, int_len)
if (self.col_count_p1 + self.col_count_p2 !=
self.column_count):
print("Warning: column count mismatch (%d + %d != %d)\n",
self.col_count_p1, self.col_count_p2, self.column_count)
# Unknown purpose
def _process_subheader_counts(self, offset, length):
pass
def _process_columntext_subheader(self, offset, length):
offset += self._int_length
text_block_size = self._read_int(offset, const.text_block_size_length)
buf = self._read_bytes(offset, text_block_size)
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
cname = cname_raw
if self.convert_header_text:
cname = cname.decode(self.encoding or self.default_encoding)
self.column_names_strings.append(cname)
if len(self.column_names_strings) == 1:
compression_literal = ""
for cl in const.compression_literals:
if cl in cname_raw:
compression_literal = cl
self.compression = compression_literal
offset -= self._int_length
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
compression_literal = buf.rstrip(b"\x00")
if compression_literal == "":
self._lcs = 0
offset1 = offset + 32
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0:self._lcp]
elif compression_literal == const.rle_compression:
offset1 = offset + 40
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0:self._lcp]
elif self._lcs > 0:
self._lcp = 0
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcs)
self.creator_proc = buf[0:self._lcp]
if self.convert_header_text:
if hasattr(self, "creator_proc"):
self.creator_proc = self.creator_proc.decode(
self.encoding or self.default_encoding)
def _process_columnname_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
column_name_pointers_count = (length - 2 * int_len - 12) // 8
for i in range(column_name_pointers_count):
text_subheader = offset + const.column_name_pointer_length * \
(i + 1) + const.column_name_text_subheader_offset
col_name_offset = offset + const.column_name_pointer_length * \
(i + 1) + const.column_name_offset_offset
col_name_length = offset + const.column_name_pointer_length * \
(i + 1) + const.column_name_length_offset
idx = self._read_int(
text_subheader, const.column_name_text_subheader_length)
col_offset = self._read_int(
col_name_offset, const.column_name_offset_length)
col_len = self._read_int(
col_name_length, const.column_name_length_length)
name_str = self.column_names_strings[idx]
self.column_names.append(name_str[col_offset:col_offset + col_len])
def _process_columnattributes_subheader(self, offset, length):
int_len = self._int_length
column_attributes_vectors_count = (
length - 2 * int_len - 12) // (int_len + 8)
self.column_types = np.empty(
column_attributes_vectors_count, dtype=np.dtype('S1'))
self._column_data_lengths = np.empty(
column_attributes_vectors_count, dtype=np.int64)
self._column_data_offsets = np.empty(
column_attributes_vectors_count, dtype=np.int64)
for i in range(column_attributes_vectors_count):
col_data_offset = (offset + int_len +
const.column_data_offset_offset +
i * (int_len + 8))
col_data_len = (offset + 2 * int_len +
const.column_data_length_offset +
i * (int_len + 8))
col_types = (offset + 2 * int_len +
const.column_type_offset + i * (int_len + 8))
x = self._read_int(col_data_offset, int_len)
self._column_data_offsets[i] = x
x = self._read_int(col_data_len, const.column_data_length_length)
self._column_data_lengths[i] = x
x = self._read_int(col_types, const.column_type_length)
if x == 1:
self.column_types[i] = b'd'
else:
self.column_types[i] = b's'
def _process_columnlist_subheader(self, offset, length):
# unknown purpose
pass
def _process_format_subheader(self, offset, length):
int_len = self._int_length
text_subheader_format = (
offset +
const.column_format_text_subheader_index_offset +
3 * int_len)
col_format_offset = (offset +
const.column_format_offset_offset +
3 * int_len)
col_format_len = (offset +
const.column_format_length_offset +
3 * int_len)
text_subheader_label = (
offset +
const.column_label_text_subheader_index_offset +
3 * int_len)
col_label_offset = (offset +
const.column_label_offset_offset +
3 * int_len)
col_label_len = offset + const.column_label_length_offset + 3 * int_len
x = self._read_int(text_subheader_format,
const.column_format_text_subheader_index_length)
format_idx = min(x, len(self.column_names_strings) - 1)
format_start = self._read_int(
col_format_offset, const.column_format_offset_length)
format_len = self._read_int(
col_format_len, const.column_format_length_length)
label_idx = self._read_int(
text_subheader_label,
const.column_label_text_subheader_index_length)
label_idx = min(label_idx, len(self.column_names_strings) - 1)
label_start = self._read_int(
col_label_offset, const.column_label_offset_length)
label_len = self._read_int(col_label_len,
const.column_label_length_length)
label_names = self.column_names_strings[label_idx]
column_label = label_names[label_start: label_start + label_len]
format_names = self.column_names_strings[format_idx]
column_format = format_names[format_start: format_start + format_len]
current_column_number = len(self.columns)
col = _column()
col.col_id = current_column_number
col.name = self.column_names[current_column_number]
col.label = column_label
col.format = column_format
col.ctype = self.column_types[current_column_number]
col.length = self._column_data_lengths[current_column_number]
self.column_formats.append(column_format)
self.columns.append(col)
def read(self, nrows=None):
if (nrows is None) and (self.chunksize is not None):
nrows = self.chunksize
elif nrows is None:
nrows = self.row_count
if len(self.column_types) == 0:
self.close()
raise EmptyDataError("No columns to parse from file")
if self._current_row_in_file_index >= self.row_count:
return None
m = self.row_count - self._current_row_in_file_index
if nrows > m:
nrows = m
nd = (self.column_types == b'd').sum()
ns = (self.column_types == b's').sum()
self._string_chunk = np.empty((ns, nrows), dtype=np.object)
self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8)
self._current_row_in_chunk_index = 0
p = Parser(self)
p.read(nrows)
rslt = self._chunk_to_dataframe()
if self.index is not None:
rslt = rslt.set_index(self.index)
return rslt
def _read_next_page(self):
self._current_page_data_subheader_pointers = []
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
return True
elif len(self._cached_page) != self._page_length:
self.close()
msg = ("failed to read complete page from file "
"(read {:d} of {:d} bytes)")
raise ValueError(msg.format(len(self._cached_page),
self._page_length))
self._read_page_header()
if self._current_page_type == const.page_meta_type:
self._process_page_metadata()
pt = [const.page_meta_type, const.page_data_type]
pt += [const.page_mix_types]
if self._current_page_type not in pt:
return self._read_next_page()
return False
def _chunk_to_dataframe(self):
n = self._current_row_in_chunk_index
m = self._current_row_in_file_index
ix = range(m - n, m)
rslt = pd.DataFrame(index=ix)
js, jb = 0, 0
for j in range(self.column_count):
name = self.column_names[j]
if self.column_types[j] == b'd':
rslt[name] = self._byte_chunk[jb, :].view(
dtype=self.byte_order + 'd')
rslt[name] = np.asarray(rslt[name], dtype=np.float64)
if self.convert_dates:
unit = None
if self.column_formats[j] in const.sas_date_formats:
unit = 'd'
elif self.column_formats[j] in const.sas_datetime_formats:
unit = 's'
if unit:
rslt[name] = pd.to_datetime(rslt[name], unit=unit,
origin="1960-01-01")
jb += 1
elif self.column_types[j] == b's':
rslt[name] = self._string_chunk[js, :]
if self.convert_text and (self.encoding is not None):
rslt[name] = rslt[name].str.decode(
self.encoding or self.default_encoding)
if self.blank_missing:
ii = rslt[name].str.len() == 0
rslt.loc[ii, name] = np.nan
js += 1
else:
self.close()
raise ValueError("unknown column type %s" %
self.column_types[j])
return rslt
@@ -0,0 +1,171 @@
magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")
align_1_checker_value = b'3'
align_1_offset = 32
align_1_length = 1
align_1_value = 4
u64_byte_checker_value = b'3'
align_2_offset = 35
align_2_length = 1
align_2_value = 4
endianness_offset = 37
endianness_length = 1
platform_offset = 39
platform_length = 1
encoding_offset = 70
encoding_length = 1
dataset_offset = 92
dataset_length = 64
file_type_offset = 156
file_type_length = 8
date_created_offset = 164
date_created_length = 8
date_modified_offset = 172
date_modified_length = 8
header_size_offset = 196
header_size_length = 4
page_size_offset = 200
page_size_length = 4
page_count_offset = 204
page_count_length = 4
sas_release_offset = 216
sas_release_length = 8
sas_server_type_offset = 224
sas_server_type_length = 16
os_version_number_offset = 240
os_version_number_length = 16
os_maker_offset = 256
os_maker_length = 16
os_name_offset = 272
os_name_length = 16
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24
page_type_offset = 0
page_type_length = 2
block_count_offset = 2
block_count_length = 2
subheader_count_offset = 4
subheader_count_length = 2
page_meta_type = 0
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15
column_name_pointer_length = 8
column_name_text_subheader_offset = 0
column_name_text_subheader_length = 2
column_name_offset_offset = 2
column_name_offset_length = 2
column_name_length_offset = 4
column_name_length_length = 2
column_data_offset_offset = 8
column_data_length_offset = 8
column_data_length_length = 4
column_type_offset = 14
column_type_length = 1
column_format_text_subheader_index_offset = 22
column_format_text_subheader_index_length = 2
column_format_offset_offset = 24
column_format_offset_length = 2
column_format_length_offset = 26
column_format_length_length = 2
column_label_text_subheader_index_offset = 28
column_label_text_subheader_index_length = 2
column_label_offset_offset = 30
column_label_offset_length = 2
column_label_length_offset = 32
column_label_length_length = 2
rle_compression = b'SASYZCRL'
rdc_compression = b'SASYZCR2'
compression_literals = [rle_compression, rdc_compression]
# Incomplete list of encodings, using SAS nomenclature:
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}
class SASIndex(object):
row_size_index = 0
column_size_index = 1
subheader_counts_index = 2
column_text_index = 3
column_name_index = 4
column_attributes_index = 5
format_and_label_index = 6
column_list_index = 7
data_subheader_index = 8
subheader_signature_to_index = {
b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index}
# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN",
"MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS",
"MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR",
"NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV",
"WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD",
"YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ",
"YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC",
"YYQRD", "YYQRP", "YYQRS", "YYQRN",
"YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC",
"MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN",
"YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB",
"MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS",
"MINGUO")
sas_datetime_formats = ("DATETIME", "DTWKDATX",
"B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX",
"E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX",
"DATEAMPM", "DTDATE", "DTMONYY", "DTMONYY", "DTWKDATX",
"DTYEAR", "TOD", "MDYAMPM")
@@ -0,0 +1,465 @@
"""
Read a SAS XPort format file into a Pandas DataFrame.
Based on code from Jack Cushman (github.com/jcushman/xport).
The file format is defined here:
https://support.sas.com/techsup/technote/ts140.pdf
"""
from datetime import datetime
import pandas as pd
from pandas.io.common import get_filepath_or_buffer, BaseIterator
from pandas import compat
import struct
import numpy as np
from pandas.util._decorators import Appender
import warnings
_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
"000000000000000000000000000000 ")
_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!"
"000000000000000001600000000")
_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
"000000000000000000000000000000 ")
_correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
"000000000000000000000000000000 ")
_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label',
'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
'nifl', 'nifd', 'npos', '_']
_base_params_doc = """\
Parameters
----------
filepath_or_buffer : string or file-like object
Path to SAS file or object implementing binary read method."""
_params2_doc = """\
index : identifier of index column
Identifier of column that should be used as index of the DataFrame.
encoding : string
Encoding for text data.
chunksize : int
Read file `chunksize` lines at a time, returns iterator."""
_format_params_doc = """\
format : string
File format, only `xport` is currently supported."""
_iterator_doc = """\
iterator : boolean, default False
Return XportReader object for reading file incrementally."""
_read_sas_doc = """Read a SAS file into a DataFrame.
%(_base_params_doc)s
%(_format_params_doc)s
%(_params2_doc)s
%(_iterator_doc)s
Returns
-------
DataFrame or XportReader
Examples
--------
Read a SAS Xport file:
>>> df = pandas.read_sas('filename.XPT')
Read a Xport file in 10,000 line chunks:
>>> itr = pandas.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
>>> do_something(chunk)
""" % {"_base_params_doc": _base_params_doc,
"_format_params_doc": _format_params_doc,
"_params2_doc": _params2_doc,
"_iterator_doc": _iterator_doc}
_xport_reader_doc = """\
Class for reading SAS Xport files.
%(_base_params_doc)s
%(_params2_doc)s
Attributes
----------
member_info : list
Contains information about the file
fields : list
Contains information about the variables in the file
""" % {"_base_params_doc": _base_params_doc,
"_params2_doc": _params2_doc}
_read_method_doc = """\
Read observations from SAS Xport file, returning as data frame.
Parameters
----------
nrows : int
Number of rows to read from data file; if None, read whole
file.
Returns
-------
A DataFrame.
"""
def _parse_date(datestr):
""" Given a date in xport format, return Python date. """
try:
# e.g. "16FEB11:10:07:55"
return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
except ValueError:
return pd.NaT
def _split_line(s, parts):
"""
Parameters
----------
s: string
Fixed-length string to split
parts: list of (name, length) pairs
Used to break up string, name '_' will be filtered from output.
Returns
-------
Dict of name:contents of string at given location.
"""
out = {}
start = 0
for name, length in parts:
out[name] = s[start:start + length].strip()
start += length
del out['_']
return out
def _handle_truncated_float_vec(vec, nbytes):
# This feature is not well documented, but some SAS XPORT files
# have 2-7 byte "truncated" floats. To read these truncated
# floats, pad them with zeros on the right to make 8 byte floats.
#
# References:
# https://github.com/jcushman/xport/pull/3
# The R "foreign" library
if nbytes != 8:
vec1 = np.zeros(len(vec), np.dtype('S8'))
dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
vec2 = vec1.view(dtype=dtype)
vec2['f0'] = vec
return vec2
return vec
def _parse_float_vec(vec):
"""
Parse a vector of float values representing IBM 8 byte floats into
native 8 byte floats.
"""
dtype = np.dtype('>u4,>u4')
vec1 = vec.view(dtype=dtype)
xport1 = vec1['f0']
xport2 = vec1['f1']
# Start by setting first half of ieee number to first half of IBM
# number sans exponent
ieee1 = xport1 & 0x00ffffff
# Get the second half of the ibm number into the second half of
# the ieee number
ieee2 = xport2
# The fraction bit to the left of the binary point in the ieee
# format was set and the number was shifted 0, 1, 2, or 3
# places. This will tell us how to adjust the ibm exponent to be a
# power of 2 ieee exponent and how to shift the fraction bits to
# restore the correct magnitude.
shift = np.zeros(len(vec), dtype=np.uint8)
shift[np.where(xport1 & 0x00200000)] = 1
shift[np.where(xport1 & 0x00400000)] = 2
shift[np.where(xport1 & 0x00800000)] = 3
# shift the ieee number down the correct number of places then
# set the second half of the ieee number to be the second half
# of the ibm number shifted appropriately, ored with the bits
# from the first half that would have been shifted in if we
# could shift a double. All we are worried about are the low
# order 3 bits of the first half since we're only shifting by
# 1, 2, or 3.
ieee1 >>= shift
ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
# clear the 1 bit to the left of the binary point
ieee1 &= 0xffefffff
# set the exponent of the ieee number to be the actual exponent
# plus the shift count + 1023. Or this into the first half of the
# ieee number. The ibm exponent is excess 64 but is adjusted by 65
# since during conversion to ibm format the exponent is
# incremented by 1 and the fraction bits left 4 positions to the
# right of the radix point. (had to add >> 24 because C treats &
# 0x7f as 0x7f000000 and Python doesn't)
ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +
shift + 1023) << 20) | (xport1 & 0x80000000)
ieee = np.empty((len(ieee1),), dtype='>u4,>u4')
ieee['f0'] = ieee1
ieee['f1'] = ieee2
ieee = ieee.view(dtype='>f8')
ieee = ieee.astype('f8')
return ieee
class XportReader(BaseIterator):
__doc__ = _xport_reader_doc
def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
chunksize=None):
self._encoding = encoding
self._lines_read = 0
self._index = index
self._chunksize = chunksize
if isinstance(filepath_or_buffer, str):
(filepath_or_buffer, encoding,
compression, should_close) = get_filepath_or_buffer(
filepath_or_buffer, encoding=encoding)
if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
else:
# Copy to BytesIO, and ensure no encoding
contents = filepath_or_buffer.read()
try:
contents = contents.encode(self._encoding)
except:
pass
self.filepath_or_buffer = compat.BytesIO(contents)
self._read_header()
def close(self):
self.filepath_or_buffer.close()
def _get_row(self):
return self.filepath_or_buffer.read(80).decode()
def _read_header(self):
self.filepath_or_buffer.seek(0)
# read file header
line1 = self._get_row()
if line1 != _correct_line1:
self.close()
raise ValueError("Header record is not an XPORT file.")
line2 = self._get_row()
fif = [['prefix', 24], ['version', 8], ['OS', 8],
['_', 24], ['created', 16]]
file_info = _split_line(line2, fif)
if file_info['prefix'] != "SAS SAS SASLIB":
self.close()
raise ValueError("Header record has invalid prefix.")
file_info['created'] = _parse_date(file_info['created'])
self.file_info = file_info
line3 = self._get_row()
file_info['modified'] = _parse_date(line3[:16])
# read member header
header1 = self._get_row()
header2 = self._get_row()
headflag1 = header1.startswith(_correct_header1)
headflag2 = (header2 == _correct_header2)
if not (headflag1 and headflag2):
self.close()
raise ValueError("Member header not found")
# usually 140, could be 135
fieldnamelength = int(header1[-5:-2])
# member info
mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
member_info = _split_line(self._get_row(), mem)
mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
member_info.update(_split_line(self._get_row(), mem))
member_info['modified'] = _parse_date(member_info['modified'])
member_info['created'] = _parse_date(member_info['created'])
self.member_info = member_info
# read field names
types = {1: 'numeric', 2: 'char'}
fieldcount = int(self._get_row()[54:58])
datalength = fieldnamelength * fieldcount
# round up to nearest 80
if datalength % 80:
datalength += 80 - datalength % 80
fielddata = self.filepath_or_buffer.read(datalength)
fields = []
obs_length = 0
while len(fielddata) >= fieldnamelength:
# pull data for one field
field, fielddata = (fielddata[:fieldnamelength],
fielddata[fieldnamelength:])
# rest at end gets ignored, so if field is short, pad out
# to match struct pattern below
field = field.ljust(140)
fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field)
field = dict(zip(_fieldkeys, fieldstruct))
del field['_']
field['ntype'] = types[field['ntype']]
fl = field['field_length']
if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
self.close()
msg = "Floating field width {0} is not between 2 and 8."
raise TypeError(msg.format(fl))
for k, v in field.items():
try:
field[k] = v.strip()
except AttributeError:
pass
obs_length += field['field_length']
fields += [field]
header = self._get_row()
if not header == _correct_obs_header:
self.close()
raise ValueError("Observation header not found.")
self.fields = fields
self.record_length = obs_length
self.record_start = self.filepath_or_buffer.tell()
self.nobs = self._record_count()
self.columns = [x['name'].decode() for x in self.fields]
# Setup the dtype.
dtypel = []
for i, field in enumerate(self.fields):
dtypel.append(('s' + str(i), "S" + str(field['field_length'])))
dtype = np.dtype(dtypel)
self._dtype = dtype
def __next__(self):
return self.read(nrows=self._chunksize or 1)
def _record_count(self):
"""
Get number of records in file.
This is maybe suboptimal because we have to seek to the end of
the file.
Side effect: returns file position to record_start.
"""
self.filepath_or_buffer.seek(0, 2)
total_records_length = (self.filepath_or_buffer.tell() -
self.record_start)
if total_records_length % 80 != 0:
warnings.warn("xport file may be corrupted")
if self.record_length > 80:
self.filepath_or_buffer.seek(self.record_start)
return total_records_length // self.record_length
self.filepath_or_buffer.seek(-80, 2)
last_card = self.filepath_or_buffer.read(80)
last_card = np.frombuffer(last_card, dtype=np.uint64)
# 8 byte blank
ix = np.flatnonzero(last_card == 2314885530818453536)
if len(ix) == 0:
tail_pad = 0
else:
tail_pad = 8 * len(ix)
self.filepath_or_buffer.seek(self.record_start)
return (total_records_length - tail_pad) // self.record_length
def get_chunk(self, size=None):
"""
Reads lines from Xport file and returns as dataframe
Parameters
----------
size : int, defaults to None
Number of lines to read. If None, reads whole file.
Returns
-------
DataFrame
"""
if size is None:
size = self._chunksize
return self.read(nrows=size)
def _missing_double(self, vec):
v = vec.view(dtype='u1,u1,u2,u4')
miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
(v['f0'] == 0x5f) | (v['f0'] == 0x2e))
miss &= miss1
return miss
@Appender(_read_method_doc)
def read(self, nrows=None):
if nrows is None:
nrows = self.nobs
read_lines = min(nrows, self.nobs - self._lines_read)
read_len = read_lines * self.record_length
if read_len <= 0:
self.close()
raise StopIteration
raw = self.filepath_or_buffer.read(read_len)
data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
df = pd.DataFrame(index=range(read_lines))
for j, x in enumerate(self.columns):
vec = data['s%d' % j]
ntype = self.fields[j]['ntype']
if ntype == "numeric":
vec = _handle_truncated_float_vec(
vec, self.fields[j]['field_length'])
miss = self._missing_double(vec)
v = _parse_float_vec(vec)
v[miss] = np.nan
elif self.fields[j]['ntype'] == 'char':
v = [y.rstrip() for y in vec]
if compat.PY3:
if self._encoding is not None:
v = [y.decode(self._encoding) for y in v]
df[x] = v
if self._index is None:
df.index = range(self._lines_read, self._lines_read + read_lines)
else:
df = df.set_index(self._index)
self._lines_read += read_lines
return df
@@ -0,0 +1,70 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas import compat
from pandas.io.common import _stringify_path
def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
chunksize=None, iterator=False):
"""
Read SAS files stored as either XPORT or SAS7BDAT format files.
Parameters
----------
filepath_or_buffer : string or file-like object
Path to the SAS file.
format : string {'xport', 'sas7bdat'} or None
If None, file format is inferred. If 'xport' or 'sas7bdat',
uses the corresponding format.
index : identifier of index column, defaults to None
Identifier of column that should be used as index of the DataFrame.
encoding : string, default is None
Encoding for text data. If None, text data are stored as raw bytes.
chunksize : int
Read file `chunksize` lines at a time, returns iterator.
iterator : bool, defaults to False
If True, returns an iterator for reading the file incrementally.
Returns
-------
DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
or XportReader
"""
if format is None:
buffer_error_msg = ("If this is a buffer object rather "
"than a string name, you must specify "
"a format string")
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, compat.string_types):
raise ValueError(buffer_error_msg)
try:
fname = filepath_or_buffer.lower()
if fname.endswith(".xpt"):
format = "xport"
elif fname.endswith(".sas7bdat"):
format = "sas7bdat"
else:
raise ValueError("unable to infer format of SAS file")
except:
pass
if format.lower() == 'xport':
from pandas.io.sas.sas_xport import XportReader
reader = XportReader(filepath_or_buffer, index=index,
encoding=encoding,
chunksize=chunksize)
elif format.lower() == 'sas7bdat':
from pandas.io.sas.sas7bdat import SAS7BDATReader
reader = SAS7BDATReader(filepath_or_buffer, index=index,
encoding=encoding,
chunksize=chunksize)
else:
raise ValueError('unknown SAS format')
if iterator or chunksize:
return reader
data = reader.read()
reader.close()
return data