Static code analysis and corrections
@@ -0,0 +1 @@
from .sasreader import read_sas  # noqa
@@ -0,0 +1,703 @@
"""
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
"""
from datetime import datetime
import struct

import numpy as np

from pandas.errors import EmptyDataError

import pandas as pd
from pandas import compat

from pandas.io.common import BaseIterator, get_filepath_or_buffer
from pandas.io.sas._sas import Parser
import pandas.io.sas.sas_constants as const


class _subheader_pointer(object):
    pass


class _column(object):
    pass


# SAS7BDATReader represents a SAS data file in SAS7BDAT format.
class SAS7BDATReader(BaseIterator):
    """
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : boolean, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : boolean, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    """

    def __init__(self, path_or_buf, index=None, convert_dates=True,
                 blank_missing=True, chunksize=None, encoding=None,
                 convert_text=True, convert_header_text=True):

        self.index = index
        self.convert_dates = convert_dates
        self.blank_missing = blank_missing
        self.chunksize = chunksize
        self.encoding = encoding
        self.convert_text = convert_text
        self.convert_header_text = convert_header_text

        self.default_encoding = "latin-1"
        self.compression = ""
        self.column_names_strings = []
        self.column_names = []
        self.column_formats = []
        self.columns = []

        self._current_page_data_subheader_pointers = []
        self._cached_page = None
        self._column_data_lengths = []
        self._column_data_offsets = []
        self._column_types = []

        self._current_row_in_file_index = 0
        self._current_row_on_page_index = 0

        self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
        if isinstance(self._path_or_buf, compat.string_types):
            self._path_or_buf = open(self._path_or_buf, 'rb')
            self.handle = self._path_or_buf

        self._get_properties()
        self._parse_metadata()

    def column_data_lengths(self):
        """Return a numpy int64 array of the column data lengths"""
        return np.asarray(self._column_data_lengths, dtype=np.int64)

    def column_data_offsets(self):
        """Return a numpy int64 array of the column offsets"""
        return np.asarray(self._column_data_offsets, dtype=np.int64)

    def column_types(self):
        """Returns a numpy character array of the column types:
           s (string) or d (double)"""
        return np.asarray(self._column_types, dtype=np.dtype('S1'))

    def close(self):
        try:
            self.handle.close()
        except AttributeError:
            pass

    def _get_properties(self):

        # Check magic number
        self._path_or_buf.seek(0)
        self._cached_page = self._path_or_buf.read(288)
        if self._cached_page[0:len(const.magic)] != const.magic:
            self.close()
            raise ValueError("magic number mismatch (not a SAS file?)")

        # Get alignment information
        align1, align2 = 0, 0
        buf = self._read_bytes(const.align_1_offset, const.align_1_length)
        if buf == const.u64_byte_checker_value:
            align2 = const.align_2_value
            self.U64 = True
            self._int_length = 8
            self._page_bit_offset = const.page_bit_offset_x64
            self._subheader_pointer_length = const.subheader_pointer_length_x64
        else:
            self.U64 = False
            self._page_bit_offset = const.page_bit_offset_x86
            self._subheader_pointer_length = const.subheader_pointer_length_x86
            self._int_length = 4
        buf = self._read_bytes(const.align_2_offset, const.align_2_length)
        if buf == const.align_1_checker_value:
            align1 = const.align_2_value
        total_align = align1 + align2

        # Get endianness information
        buf = self._read_bytes(const.endianness_offset,
                               const.endianness_length)
        if buf == b'\x01':
            self.byte_order = "<"
        else:
            self.byte_order = ">"

        # Get encoding information
        buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
        if buf in const.encoding_names:
            self.file_encoding = const.encoding_names[buf]
        else:
            self.file_encoding = "unknown (code={name!s})".format(name=buf)

        # Get platform information
        buf = self._read_bytes(const.platform_offset, const.platform_length)
        if buf == b'1':
            self.platform = "unix"
        elif buf == b'2':
            self.platform = "windows"
        else:
            self.platform = "unknown"

        buf = self._read_bytes(const.dataset_offset, const.dataset_length)
        self.name = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.name = self.name.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.file_type_offset, const.file_type_length)
        self.file_type = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.file_type = self.file_type.decode(
                self.encoding or self.default_encoding)

        # Timestamp is epoch 01/01/1960
        epoch = datetime(1960, 1, 1)
        x = self._read_float(const.date_created_offset + align1,
                             const.date_created_length)
        self.date_created = epoch + pd.to_timedelta(x, unit='s')
        x = self._read_float(const.date_modified_offset + align1,
                             const.date_modified_length)
        self.date_modified = epoch + pd.to_timedelta(x, unit='s')

        self.header_length = self._read_int(const.header_size_offset + align1,
                                            const.header_size_length)

        # Read the rest of the header into cached_page.
        buf = self._path_or_buf.read(self.header_length - 288)
        self._cached_page += buf
        if len(self._cached_page) != self.header_length:
            self.close()
            raise ValueError("The SAS7BDAT file appears to be truncated.")

        self._page_length = self._read_int(const.page_size_offset + align1,
                                           const.page_size_length)
        self._page_count = self._read_int(const.page_count_offset + align1,
                                          const.page_count_length)

        buf = self._read_bytes(const.sas_release_offset + total_align,
                               const.sas_release_length)
        self.sas_release = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.sas_release = self.sas_release.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.sas_server_type_offset + total_align,
                               const.sas_server_type_length)
        self.server_type = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.server_type = self.server_type.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.os_version_number_offset + total_align,
                               const.os_version_number_length)
        self.os_version = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.os_version = self.os_version.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.os_name_offset + total_align,
                               const.os_name_length)
        buf = buf.rstrip(b'\x00 ')
        if len(buf) > 0:
            self.os_name = buf.decode(self.encoding or self.default_encoding)
        else:
            buf = self._read_bytes(const.os_maker_offset + total_align,
                                   const.os_maker_length)
            self.os_name = buf.rstrip(b'\x00 ')
            if self.convert_header_text:
                self.os_name = self.os_name.decode(
                    self.encoding or self.default_encoding)

    def __next__(self):
        da = self.read(nrows=self.chunksize or 1)
        if da is None:
            raise StopIteration
        return da

    # Read a single float of the given width (4 or 8).
    def _read_float(self, offset, width):
        if width not in (4, 8):
            self.close()
            raise ValueError("invalid float width")
        buf = self._read_bytes(offset, width)
        fd = "f" if width == 4 else "d"
        return struct.unpack(self.byte_order + fd, buf)[0]

    # Read a single signed integer of the given width (1, 2, 4 or 8).
    def _read_int(self, offset, width):
        if width not in (1, 2, 4, 8):
            self.close()
            raise ValueError("invalid int width")
        buf = self._read_bytes(offset, width)
        it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
        iv = struct.unpack(self.byte_order + it, buf)[0]
        return iv
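
    # Note: "b", "h", "l" and "q" are struct codes for signed 8-, 16-,
    # 32- and 64-bit integers, and the byte_order prefix ("<" or ">")
    # selects endianness.  For example (hypothetical values),
    # struct.unpack("<h", b"\x01\x00")[0] == 1 while
    # struct.unpack(">h", b"\x01\x00")[0] == 256.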

    def _read_bytes(self, offset, length):
        if self._cached_page is None:
            self._path_or_buf.seek(offset)
            buf = self._path_or_buf.read(length)
            if len(buf) < length:
                self.close()
                msg = "Unable to read {:d} bytes from file position {:d}."
                raise ValueError(msg.format(length, offset))
            return buf
        else:
            if offset + length > len(self._cached_page):
                self.close()
                raise ValueError("The cached page is too small.")
            return self._cached_page[offset:offset + length]

    def _parse_metadata(self):
        done = False
        while not done:
            self._cached_page = self._path_or_buf.read(self._page_length)
            if len(self._cached_page) <= 0:
                break
            if len(self._cached_page) != self._page_length:
                self.close()
                raise ValueError(
                    "Failed to read a meta data page from the SAS file.")
            done = self._process_page_meta()

    def _process_page_meta(self):
        self._read_page_header()
        pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
        if self._current_page_type in pt:
            self._process_page_metadata()
        is_data_page = self._current_page_type & const.page_data_type
        is_mix_page = self._current_page_type in const.page_mix_types
        return (is_data_page or is_mix_page
                or self._current_page_data_subheader_pointers != [])

    def _read_page_header(self):
        bit_offset = self._page_bit_offset
        tx = const.page_type_offset + bit_offset
        self._current_page_type = self._read_int(tx, const.page_type_length)
        tx = const.block_count_offset + bit_offset
        self._current_page_block_count = self._read_int(
            tx, const.block_count_length)
        tx = const.subheader_count_offset + bit_offset
        self._current_page_subheaders_count = (
            self._read_int(tx, const.subheader_count_length))

    def _process_page_metadata(self):
        bit_offset = self._page_bit_offset

        for i in range(self._current_page_subheaders_count):
            pointer = self._process_subheader_pointers(
                const.subheader_pointers_offset + bit_offset, i)
            if pointer.length == 0:
                continue
            if pointer.compression == const.truncated_subheader_id:
                continue
            subheader_signature = self._read_subheader_signature(
                pointer.offset)
            subheader_index = (
                self._get_subheader_index(subheader_signature,
                                          pointer.compression, pointer.ptype))
            self._process_subheader(subheader_index, pointer)

    def _get_subheader_index(self, signature, compression, ptype):
        index = const.subheader_signature_to_index.get(signature)
        if index is None:
            f1 = ((compression == const.compressed_subheader_id) or
                  (compression == 0))
            f2 = (ptype == const.compressed_subheader_type)
            if (self.compression != "") and f1 and f2:
                index = const.SASIndex.data_subheader_index
            else:
                self.close()
                raise ValueError("Unknown subheader signature")
        return index

    def _process_subheader_pointers(self, offset, subheader_pointer_index):

        subheader_pointer_length = self._subheader_pointer_length
        total_offset = (offset +
                        subheader_pointer_length * subheader_pointer_index)

        subheader_offset = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_length = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_compression = self._read_int(total_offset, 1)
        total_offset += 1

        subheader_type = self._read_int(total_offset, 1)

        x = _subheader_pointer()
        x.offset = subheader_offset
        x.length = subheader_length
        x.compression = subheader_compression
        x.ptype = subheader_type

        return x

    def _read_subheader_signature(self, offset):
        subheader_signature = self._read_bytes(offset, self._int_length)
        return subheader_signature

    def _process_subheader(self, subheader_index, pointer):
        offset = pointer.offset
        length = pointer.length

        if subheader_index == const.SASIndex.row_size_index:
            processor = self._process_rowsize_subheader
        elif subheader_index == const.SASIndex.column_size_index:
            processor = self._process_columnsize_subheader
        elif subheader_index == const.SASIndex.column_text_index:
            processor = self._process_columntext_subheader
        elif subheader_index == const.SASIndex.column_name_index:
            processor = self._process_columnname_subheader
        elif subheader_index == const.SASIndex.column_attributes_index:
            processor = self._process_columnattributes_subheader
        elif subheader_index == const.SASIndex.format_and_label_index:
            processor = self._process_format_subheader
        elif subheader_index == const.SASIndex.column_list_index:
            processor = self._process_columnlist_subheader
        elif subheader_index == const.SASIndex.subheader_counts_index:
            processor = self._process_subheader_counts
        elif subheader_index == const.SASIndex.data_subheader_index:
            self._current_page_data_subheader_pointers.append(pointer)
            return
        else:
            raise ValueError("unknown subheader index")

        processor(offset, length)

    def _process_rowsize_subheader(self, offset, length):

        int_len = self._int_length
        lcs_offset = offset
        lcp_offset = offset
        if self.U64:
            lcs_offset += 682
            lcp_offset += 706
        else:
            lcs_offset += 354
            lcp_offset += 378

        self.row_length = self._read_int(
            offset + const.row_length_offset_multiplier * int_len, int_len)
        self.row_count = self._read_int(
            offset + const.row_count_offset_multiplier * int_len, int_len)
        self.col_count_p1 = self._read_int(
            offset + const.col_count_p1_multiplier * int_len, int_len)
        self.col_count_p2 = self._read_int(
            offset + const.col_count_p2_multiplier * int_len, int_len)
        mx = const.row_count_on_mix_page_offset_multiplier * int_len
        self._mix_page_row_count = self._read_int(offset + mx, int_len)
        self._lcs = self._read_int(lcs_offset, 2)
        self._lcp = self._read_int(lcp_offset, 2)

    def _process_columnsize_subheader(self, offset, length):
        int_len = self._int_length
        offset += int_len
        self.column_count = self._read_int(offset, int_len)
        if (self.col_count_p1 + self.col_count_p2 !=
                self.column_count):
            print(
                "Warning: column count mismatch ({p1} + {p2} != "
                "{column_count})\n".format(
                    p1=self.col_count_p1, p2=self.col_count_p2,
                    column_count=self.column_count))

    # Unknown purpose
    def _process_subheader_counts(self, offset, length):
        pass

    def _process_columntext_subheader(self, offset, length):

        offset += self._int_length
        text_block_size = self._read_int(offset, const.text_block_size_length)

        buf = self._read_bytes(offset, text_block_size)
        cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
        cname = cname_raw
        if self.convert_header_text:
            cname = cname.decode(self.encoding or self.default_encoding)
        self.column_names_strings.append(cname)

        if len(self.column_names_strings) == 1:
            compression_literal = ""
            for cl in const.compression_literals:
                if cl in cname_raw:
                    compression_literal = cl
            self.compression = compression_literal
            offset -= self._int_length

            offset1 = offset + 16
            if self.U64:
                offset1 += 4

            buf = self._read_bytes(offset1, self._lcp)
            compression_literal = buf.rstrip(b"\x00")
            if compression_literal == b"":
                self._lcs = 0
                offset1 = offset + 32
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0:self._lcp]
            elif compression_literal == const.rle_compression:
                offset1 = offset + 40
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0:self._lcp]
            elif self._lcs > 0:
                self._lcp = 0
                offset1 = offset + 16
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcs)
                self.creator_proc = buf[0:self._lcp]
            if self.convert_header_text:
                if hasattr(self, "creator_proc"):
                    self.creator_proc = self.creator_proc.decode(
                        self.encoding or self.default_encoding)

    def _process_columnname_subheader(self, offset, length):
        int_len = self._int_length
        offset += int_len
        column_name_pointers_count = (length - 2 * int_len - 12) // 8
        for i in range(column_name_pointers_count):
            text_subheader = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_text_subheader_offset
            col_name_offset = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_offset_offset
            col_name_length = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_length_offset

            idx = self._read_int(
                text_subheader, const.column_name_text_subheader_length)
            col_offset = self._read_int(
                col_name_offset, const.column_name_offset_length)
            col_len = self._read_int(
                col_name_length, const.column_name_length_length)

            name_str = self.column_names_strings[idx]
            self.column_names.append(name_str[col_offset:col_offset + col_len])

    def _process_columnattributes_subheader(self, offset, length):
        int_len = self._int_length
        column_attributes_vectors_count = (
            length - 2 * int_len - 12) // (int_len + 8)
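        # Reading of the arithmetic above: in a hypothetical U64 file
        # (int_len == 8) each per-column attribute vector occupies
        # int_len + 8 == 16 bytes, after 2 * int_len + 12 bytes of
        # subheader overhead are excluded.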
        for i in range(column_attributes_vectors_count):
            col_data_offset = (offset + int_len +
                               const.column_data_offset_offset +
                               i * (int_len + 8))
            col_data_len = (offset + 2 * int_len +
                            const.column_data_length_offset +
                            i * (int_len + 8))
            col_types = (offset + 2 * int_len +
                         const.column_type_offset + i * (int_len + 8))

            x = self._read_int(col_data_offset, int_len)
            self._column_data_offsets.append(x)

            x = self._read_int(col_data_len, const.column_data_length_length)
            self._column_data_lengths.append(x)

            x = self._read_int(col_types, const.column_type_length)
            self._column_types.append(b'd' if x == 1 else b's')

    def _process_columnlist_subheader(self, offset, length):
        # unknown purpose
        pass

    def _process_format_subheader(self, offset, length):
        int_len = self._int_length
        text_subheader_format = (
            offset +
            const.column_format_text_subheader_index_offset +
            3 * int_len)
        col_format_offset = (offset +
                             const.column_format_offset_offset +
                             3 * int_len)
        col_format_len = (offset +
                          const.column_format_length_offset +
                          3 * int_len)
        text_subheader_label = (
            offset +
            const.column_label_text_subheader_index_offset +
            3 * int_len)
        col_label_offset = (offset +
                            const.column_label_offset_offset +
                            3 * int_len)
        col_label_len = offset + const.column_label_length_offset + 3 * int_len

        x = self._read_int(text_subheader_format,
                           const.column_format_text_subheader_index_length)
        format_idx = min(x, len(self.column_names_strings) - 1)

        format_start = self._read_int(
            col_format_offset, const.column_format_offset_length)
        format_len = self._read_int(
            col_format_len, const.column_format_length_length)

        label_idx = self._read_int(
            text_subheader_label,
            const.column_label_text_subheader_index_length)
        label_idx = min(label_idx, len(self.column_names_strings) - 1)

        label_start = self._read_int(
            col_label_offset, const.column_label_offset_length)
        label_len = self._read_int(col_label_len,
                                   const.column_label_length_length)

        label_names = self.column_names_strings[label_idx]
        column_label = label_names[label_start: label_start + label_len]
        format_names = self.column_names_strings[format_idx]
        column_format = format_names[format_start: format_start + format_len]
        current_column_number = len(self.columns)

        col = _column()
        col.col_id = current_column_number
        col.name = self.column_names[current_column_number]
        col.label = column_label
        col.format = column_format
        col.ctype = self._column_types[current_column_number]
        col.length = self._column_data_lengths[current_column_number]

        self.column_formats.append(column_format)
        self.columns.append(col)

    def read(self, nrows=None):

        if (nrows is None) and (self.chunksize is not None):
            nrows = self.chunksize
        elif nrows is None:
            nrows = self.row_count

        if len(self._column_types) == 0:
            self.close()
            raise EmptyDataError("No columns to parse from file")

        if self._current_row_in_file_index >= self.row_count:
            return None

        m = self.row_count - self._current_row_in_file_index
        if nrows > m:
            nrows = m

        nd = self._column_types.count(b'd')
        ns = self._column_types.count(b's')

        self._string_chunk = np.empty((ns, nrows), dtype=np.object)
        self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
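        # Numeric cells are buffered here as raw 8-byte values and
        # reinterpreted as float64 later in _chunk_to_dataframe.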

        self._current_row_in_chunk_index = 0
        p = Parser(self)
        p.read(nrows)

        rslt = self._chunk_to_dataframe()
        if self.index is not None:
            rslt = rslt.set_index(self.index)

        return rslt

    def _read_next_page(self):
        self._current_page_data_subheader_pointers = []
        self._cached_page = self._path_or_buf.read(self._page_length)
        if len(self._cached_page) <= 0:
            return True
        elif len(self._cached_page) != self._page_length:
            self.close()
            msg = ("failed to read complete page from file "
                   "(read {:d} of {:d} bytes)")
            raise ValueError(msg.format(len(self._cached_page),
                                        self._page_length))

        self._read_page_header()
        page_type = self._current_page_type
        if page_type == const.page_meta_type:
            self._process_page_metadata()

        is_data_page = page_type & const.page_data_type
        pt = [const.page_meta_type] + const.page_mix_types
        if not is_data_page and self._current_page_type not in pt:
            return self._read_next_page()

        return False

    def _chunk_to_dataframe(self):

        n = self._current_row_in_chunk_index
        m = self._current_row_in_file_index
        ix = range(m - n, m)
        rslt = pd.DataFrame(index=ix)

        js, jb = 0, 0
        for j in range(self.column_count):

            name = self.column_names[j]

            if self._column_types[j] == b'd':
                rslt[name] = self._byte_chunk[jb, :].view(
                    dtype=self.byte_order + 'd')
                rslt[name] = np.asarray(rslt[name], dtype=np.float64)
                if self.convert_dates:
                    unit = None
                    if self.column_formats[j] in const.sas_date_formats:
                        unit = 'd'
                    elif self.column_formats[j] in const.sas_datetime_formats:
                        unit = 's'
                    if unit:
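                        # e.g. a raw SAS date value of 0 maps to
                        # 1960-01-01 and 366 to 1961-01-01 (1960 is a
                        # leap year); datetime values count seconds
                        # from the same epoch.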
                        rslt[name] = pd.to_datetime(rslt[name], unit=unit,
                                                    origin="1960-01-01")
                jb += 1
            elif self._column_types[j] == b's':
                rslt[name] = self._string_chunk[js, :]
                if self.convert_text and (self.encoding is not None):
                    rslt[name] = rslt[name].str.decode(
                        self.encoding or self.default_encoding)
                if self.blank_missing:
                    ii = rslt[name].str.len() == 0
                    rslt.loc[ii, name] = np.nan
                js += 1
            else:
                self.close()
                raise ValueError("unknown column type {type}".format(
                    type=self._column_types[j]))

        return rslt
@@ -0,0 +1,171 @@
magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
         b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
         b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
         b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")

align_1_checker_value = b'3'
align_1_offset = 32
align_1_length = 1
align_1_value = 4
u64_byte_checker_value = b'3'
align_2_offset = 35
align_2_length = 1
align_2_value = 4
endianness_offset = 37
endianness_length = 1
platform_offset = 39
platform_length = 1
encoding_offset = 70
encoding_length = 1
dataset_offset = 92
dataset_length = 64
file_type_offset = 156
file_type_length = 8
date_created_offset = 164
date_created_length = 8
date_modified_offset = 172
date_modified_length = 8
header_size_offset = 196
header_size_length = 4
page_size_offset = 200
page_size_length = 4
page_count_offset = 204
page_count_length = 4
sas_release_offset = 216
sas_release_length = 8
sas_server_type_offset = 224
sas_server_type_length = 16
os_version_number_offset = 240
os_version_number_length = 16
os_maker_offset = 256
os_maker_length = 16
os_name_offset = 272
os_name_length = 16
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24
page_type_offset = 0
page_type_length = 2
block_count_offset = 2
block_count_length = 2
subheader_count_offset = 4
subheader_count_length = 2
page_meta_type = 0
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15
column_name_pointer_length = 8
column_name_text_subheader_offset = 0
column_name_text_subheader_length = 2
column_name_offset_offset = 2
column_name_offset_length = 2
column_name_length_offset = 4
column_name_length_length = 2
column_data_offset_offset = 8
column_data_length_offset = 8
column_data_length_length = 4
column_type_offset = 14
column_type_length = 1
column_format_text_subheader_index_offset = 22
column_format_text_subheader_index_length = 2
column_format_offset_offset = 24
column_format_offset_length = 2
column_format_length_offset = 26
column_format_length_length = 2
column_label_text_subheader_index_offset = 28
column_label_text_subheader_index_length = 2
column_label_offset_offset = 30
column_label_offset_length = 2
column_label_length_offset = 32
column_label_length_length = 2
rle_compression = b'SASYZCRL'
rdc_compression = b'SASYZCR2'

compression_literals = [rle_compression, rdc_compression]

# Incomplete list of encodings, using SAS nomenclature:
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
                  61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}


class SASIndex(object):
    row_size_index = 0
    column_size_index = 1
    subheader_counts_index = 2
    column_text_index = 3
    column_name_index = 4
    column_attributes_index = 5
    format_and_label_index = 6
    column_list_index = 7
    data_subheader_index = 8


subheader_signature_to_index = {
    b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
    b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
    b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
    b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
    b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
    b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
    b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
    b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
    b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
    b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
    b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
    b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
    b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
    b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
    b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
    b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
    b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
    b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
    b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
    b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
    b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
    b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
    b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
    b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index}


# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN",
                    "MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS",
                    "MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR",
                    "NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV",
                    "WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD",
                    "YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ",
                    "YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC",
                    "YYQRD", "YYQRP", "YYQRS", "YYQRN",
                    "YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC",
                    "MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN",
                    "YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB",
                    "MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS",
                    "MINGUO")

sas_datetime_formats = ("DATETIME", "DTWKDATX",
                        "B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX",
                        "E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX",
                        "DATEAMPM", "DTDATE", "DTMONYY",
                        "DTYEAR", "TOD", "MDYAMPM")
@@ -0,0 +1,464 @@
"""
Read a SAS XPort format file into a Pandas DataFrame.

Based on code from Jack Cushman (github.com/jcushman/xport).

The file format is defined here:

https://support.sas.com/techsup/technote/ts140.pdf
"""

from datetime import datetime
import struct
import warnings

import numpy as np

from pandas.util._decorators import Appender

import pandas as pd
from pandas import compat

from pandas.io.common import BaseIterator, get_filepath_or_buffer

_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
                  "000000000000000000000000000000  ")
_correct_header1 = ("HEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!"
                    "000000000000000001600000000")
_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
                    "000000000000000000000000000000  ")
_correct_obs_header = ("HEADER RECORD*******OBS     HEADER RECORD!!!!!!!"
                       "000000000000000000000000000000  ")
_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label',
              'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
              'nifl', 'nifd', 'npos', '_']


_base_params_doc = """\
Parameters
----------
filepath_or_buffer : string or file-like object
    Path to SAS file or object implementing binary read method."""

_params2_doc = """\
index : identifier of index column
    Identifier of column that should be used as index of the DataFrame.
encoding : string
    Encoding for text data.
chunksize : int
    Read file `chunksize` lines at a time, returns iterator."""

_format_params_doc = """\
format : string
    File format, only `xport` is currently supported."""

_iterator_doc = """\
iterator : boolean, default False
    Return XportReader object for reading file incrementally."""


_read_sas_doc = """Read a SAS file into a DataFrame.

%(_base_params_doc)s
%(_format_params_doc)s
%(_params2_doc)s
%(_iterator_doc)s

Returns
-------
DataFrame or XportReader

Examples
--------
Read a SAS Xport file:

>>> df = pd.read_sas('filename.XPT')

Read a Xport file in 10,000 line chunks:

>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
...     do_something(chunk)

""" % {"_base_params_doc": _base_params_doc,
       "_format_params_doc": _format_params_doc,
       "_params2_doc": _params2_doc,
       "_iterator_doc": _iterator_doc}


_xport_reader_doc = """\
Class for reading SAS Xport files.

%(_base_params_doc)s
%(_params2_doc)s

Attributes
----------
member_info : list
    Contains information about the file
fields : list
    Contains information about the variables in the file
""" % {"_base_params_doc": _base_params_doc,
       "_params2_doc": _params2_doc}


_read_method_doc = """\
Read observations from SAS Xport file, returning as data frame.

Parameters
----------
nrows : int
    Number of rows to read from data file; if None, read whole
    file.

Returns
-------
A DataFrame.
"""


def _parse_date(datestr):
    """ Given a date in xport format, return Python date. """
    try:
        # e.g. "16FEB11:10:07:55"
        return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
    except ValueError:
        return pd.NaT


def _split_line(s, parts):
    """
    Parameters
    ----------
    s: string
        Fixed-length string to split
    parts: list of (name, length) pairs
        Used to break up string, name '_' will be filtered from output.

    Returns
    -------
    Dict of name:contents of string at given location.
    """
    out = {}
    start = 0
    for name, length in parts:
        out[name] = s[start:start + length].strip()
        start += length
    del out['_']
    return out
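
# Example (hypothetical input): _split_line("AB12", [['x', 2], ['_', 2]])
# returns {'x': 'AB'}; the '_' slice is parsed and then dropped.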


def _handle_truncated_float_vec(vec, nbytes):
    # This feature is not well documented, but some SAS XPORT files
    # have 2-7 byte "truncated" floats.  To read these truncated
    # floats, pad them with zeros on the right to make 8 byte floats.
    #
    # References:
    # https://github.com/jcushman/xport/pull/3
    # The R "foreign" library

    if nbytes != 8:
        vec1 = np.zeros(len(vec), np.dtype('S8'))
        dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
        vec2 = vec1.view(dtype=dtype)
        vec2['f0'] = vec
        return vec2

    return vec
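
# Example: with nbytes == 2, a (hypothetical) raw value b"AB" is padded
# to the 8-byte field b"AB\x00\x00\x00\x00\x00\x00" via the structured
# view above.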


def _parse_float_vec(vec):
    """
    Parse a vector of float values representing IBM 8 byte floats into
    native 8 byte floats.
    """

    dtype = np.dtype('>u4,>u4')
    vec1 = vec.view(dtype=dtype)
    xport1 = vec1['f0']
    xport2 = vec1['f1']

    # Start by setting first half of ieee number to first half of IBM
    # number sans exponent
    ieee1 = xport1 & 0x00ffffff

    # The fraction bit to the left of the binary point in the ieee
    # format was set and the number was shifted 0, 1, 2, or 3
    # places.  This will tell us how to adjust the ibm exponent to be a
    # power of 2 ieee exponent and how to shift the fraction bits to
    # restore the correct magnitude.
    shift = np.zeros(len(vec), dtype=np.uint8)
    shift[np.where(xport1 & 0x00200000)] = 1
    shift[np.where(xport1 & 0x00400000)] = 2
    shift[np.where(xport1 & 0x00800000)] = 3

    # shift the ieee number down the correct number of places then
    # set the second half of the ieee number to be the second half
    # of the ibm number shifted appropriately, ored with the bits
    # from the first half that would have been shifted in if we
    # could shift a double.  All we are worried about are the low
    # order 3 bits of the first half since we're only shifting by
    # 1, 2, or 3.
    ieee1 >>= shift
    ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))

    # clear the 1 bit to the left of the binary point
    ieee1 &= 0xffefffff

    # set the exponent of the ieee number to be the actual exponent
    # plus the shift count + 1023.  Or this into the first half of the
    # ieee number.  The ibm exponent is excess 64 but is adjusted by 65
    # since during conversion to ibm format the exponent is
    # incremented by 1 and the fraction bits left 4 positions to the
    # right of the radix point.  (had to add >> 24 because C treats &
    # 0x7f as 0x7f000000 and Python doesn't)
    ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +
               shift + 1023) << 20) | (xport1 & 0x80000000)

    ieee = np.empty((len(ieee1),), dtype='>u4,>u4')
    ieee['f0'] = ieee1
    ieee['f1'] = ieee2
    ieee = ieee.view(dtype='>f8')
    ieee = ieee.astype('f8')

    return ieee


class XportReader(BaseIterator):
    __doc__ = _xport_reader_doc

    def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
                 chunksize=None):

        self._encoding = encoding
        self._lines_read = 0
        self._index = index
        self._chunksize = chunksize

        if isinstance(filepath_or_buffer, str):
            (filepath_or_buffer, encoding,
             compression, should_close) = get_filepath_or_buffer(
                filepath_or_buffer, encoding=encoding)

        if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
            self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
        else:
            # Copy to BytesIO, and ensure no encoding
            contents = filepath_or_buffer.read()
            try:
                contents = contents.encode(self._encoding)
            except UnicodeEncodeError:
                pass
            self.filepath_or_buffer = compat.BytesIO(contents)

        self._read_header()

    def close(self):
        self.filepath_or_buffer.close()

    def _get_row(self):
        return self.filepath_or_buffer.read(80).decode()

    def _read_header(self):
        self.filepath_or_buffer.seek(0)

        # read file header
        line1 = self._get_row()
        if line1 != _correct_line1:
            self.close()
            raise ValueError("Header record is not an XPORT file.")

        line2 = self._get_row()
        fif = [['prefix', 24], ['version', 8], ['OS', 8],
               ['_', 24], ['created', 16]]
        file_info = _split_line(line2, fif)
        if file_info['prefix'] != "SAS     SAS     SASLIB":
            self.close()
            raise ValueError("Header record has invalid prefix.")
        file_info['created'] = _parse_date(file_info['created'])
        self.file_info = file_info

        line3 = self._get_row()
        file_info['modified'] = _parse_date(line3[:16])

        # read member header
        header1 = self._get_row()
        header2 = self._get_row()
        headflag1 = header1.startswith(_correct_header1)
        headflag2 = (header2 == _correct_header2)
        if not (headflag1 and headflag2):
            self.close()
            raise ValueError("Member header not found")
        # usually 140, could be 135
        fieldnamelength = int(header1[-5:-2])

        # member info
        mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
               ['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
        member_info = _split_line(self._get_row(), mem)
        mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
        member_info.update(_split_line(self._get_row(), mem))
        member_info['modified'] = _parse_date(member_info['modified'])
        member_info['created'] = _parse_date(member_info['created'])
        self.member_info = member_info

        # read field names
        types = {1: 'numeric', 2: 'char'}
        fieldcount = int(self._get_row()[54:58])
        datalength = fieldnamelength * fieldcount
        # round up to nearest 80
        if datalength % 80:
            datalength += 80 - datalength % 80
        fielddata = self.filepath_or_buffer.read(datalength)
        fields = []
        obs_length = 0
        while len(fielddata) >= fieldnamelength:
            # pull data for one field
            field, fielddata = (fielddata[:fieldnamelength],
                                fielddata[fieldnamelength:])

            # rest at end gets ignored, so if field is short, pad out
            # to match struct pattern below
            field = field.ljust(140)

            fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field)
            field = dict(zip(_fieldkeys, fieldstruct))
            del field['_']
            field['ntype'] = types[field['ntype']]
            fl = field['field_length']
            if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
                self.close()
                msg = "Floating field width {0} is not between 2 and 8."
                raise TypeError(msg.format(fl))

            for k, v in field.items():
                try:
                    field[k] = v.strip()
                except AttributeError:
                    pass

            obs_length += field['field_length']
            fields += [field]

        header = self._get_row()
        if not header == _correct_obs_header:
            self.close()
            raise ValueError("Observation header not found.")

        self.fields = fields
        self.record_length = obs_length
        self.record_start = self.filepath_or_buffer.tell()

        self.nobs = self._record_count()
        self.columns = [x['name'].decode() for x in self.fields]

        # Setup the dtype.
        dtypel = [('s' + str(i), "S" + str(field['field_length']))
                  for i, field in enumerate(self.fields)]
        dtype = np.dtype(dtypel)
        self._dtype = dtype

    def __next__(self):
        return self.read(nrows=self._chunksize or 1)

    def _record_count(self):
        """
        Get number of records in file.

        This is maybe suboptimal because we have to seek to the end of
        the file.

        Side effect: returns file position to record_start.
        """

        self.filepath_or_buffer.seek(0, 2)
        total_records_length = (self.filepath_or_buffer.tell() -
                                self.record_start)

        if total_records_length % 80 != 0:
            warnings.warn("xport file may be corrupted")

        if self.record_length > 80:
            self.filepath_or_buffer.seek(self.record_start)
            return total_records_length // self.record_length

        self.filepath_or_buffer.seek(-80, 2)
        last_card = self.filepath_or_buffer.read(80)
        last_card = np.frombuffer(last_card, dtype=np.uint64)

        # 8 byte blank
        ix = np.flatnonzero(last_card == 2314885530818453536)
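        # (2314885530818453536 == 0x2020202020202020, i.e. eight ASCII
        # blanks read as a single unsigned 64-bit integer)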

        if len(ix) == 0:
            tail_pad = 0
        else:
            tail_pad = 8 * len(ix)

        self.filepath_or_buffer.seek(self.record_start)

        return (total_records_length - tail_pad) // self.record_length

    def get_chunk(self, size=None):
        """
        Reads lines from Xport file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read.  If None, reads whole file.

        Returns
        -------
        DataFrame
        """
        if size is None:
            size = self._chunksize
        return self.read(nrows=size)

    def _missing_double(self, vec):
        v = vec.view(dtype='u1,u1,u2,u4')
        miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
        miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
                 (v['f0'] == 0x5f) | (v['f0'] == 0x2e))
        miss &= miss1
        return miss
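
    # A missing IBM double is flagged by a first byte of '.' (0x2e),
    # '_' (0x5f), or 'A'-'Z' (0x41-0x5a, the SAS special missing
    # values) with every remaining byte zero, which is exactly what
    # the two masks above test.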

    @Appender(_read_method_doc)
    def read(self, nrows=None):

        if nrows is None:
            nrows = self.nobs

        read_lines = min(nrows, self.nobs - self._lines_read)
        read_len = read_lines * self.record_length
        if read_len <= 0:
            self.close()
            raise StopIteration
        raw = self.filepath_or_buffer.read(read_len)
        data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)

        df = pd.DataFrame(index=range(read_lines))
        for j, x in enumerate(self.columns):
            vec = data['s%d' % j]
            ntype = self.fields[j]['ntype']
            if ntype == "numeric":
                vec = _handle_truncated_float_vec(
                    vec, self.fields[j]['field_length'])
                miss = self._missing_double(vec)
                v = _parse_float_vec(vec)
                v[miss] = np.nan
            elif self.fields[j]['ntype'] == 'char':
                v = [y.rstrip() for y in vec]
                if compat.PY3:
                    if self._encoding is not None:
                        v = [y.decode(self._encoding) for y in v]
            df[x] = v

        if self._index is None:
            df.index = range(self._lines_read, self._lines_read + read_lines)
        else:
            df = df.set_index(self._index)

        self._lines_read += read_lines

        return df
@@ -0,0 +1,68 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas import compat

from pandas.io.common import _stringify_path


def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
             chunksize=None, iterator=False):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : string or file-like object
        Path to the SAS file.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension.  If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = ("If this is a buffer object rather "
                            "than a string name, you must specify "
                            "a format string")
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    if format.lower() == 'xport':
        from pandas.io.sas.sas_xport import XportReader
        reader = XportReader(filepath_or_buffer, index=index,
                             encoding=encoding,
                             chunksize=chunksize)
    elif format.lower() == 'sas7bdat':
        from pandas.io.sas.sas7bdat import SAS7BDATReader
        reader = SAS7BDATReader(filepath_or_buffer, index=index,
                                encoding=encoding,
                                chunksize=chunksize)
    else:
        raise ValueError('unknown SAS format')

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data
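

# Usage sketch (illustrative only; 'example.sas7bdat' and process() are
# hypothetical names, not part of this module):
#
#     from pandas.io.sas.sasreader import read_sas
#
#     df = read_sas('example.sas7bdat', encoding='latin-1')
#     for chunk in read_sas('example.sas7bdat', chunksize=10000):
#         process(chunk)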