pruned venvs
This commit is contained in:
@@ -1,26 +0,0 @@
|
||||
"""
|
||||
Module to read ARFF files, which are the standard data format for WEKA.
|
||||
|
||||
ARFF is a text file format which supports numerical, string and date values.
|
||||
The format can also represent missing data and sparse data.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The ARFF support in ``scipy.io`` provides file reading functionality only.
|
||||
For more extensive ARFF functionality, see `liac-arff
|
||||
<https://github.com/renatopp/liac-arff>`_.
|
||||
|
||||
See the `WEKA website <http://weka.wikispaces.com/ARFF>`_
|
||||
for more details about the ARFF format and available datasets.
|
||||
|
||||
"""
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from .arffread import *
|
||||
from . import arffread
|
||||
|
||||
__all__ = arffread.__all__
|
||||
|
||||
from scipy._lib._testutils import PytestTester
|
||||
test = PytestTester(__name__)
|
||||
del PytestTester
|
||||
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
@@ -1,670 +0,0 @@
|
||||
# Last Change: Mon Aug 20 08:00 PM 2007 J
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import re
|
||||
import itertools
|
||||
import datetime
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy._lib.six import next
|
||||
|
||||
"""A module to read arff files."""
|
||||
|
||||
__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
|
||||
|
||||
# An Arff file is basically two parts:
|
||||
# - header
|
||||
# - data
|
||||
#
|
||||
# A header has each of its components starting by @META where META is one of
|
||||
# the keywords (attribute or relation, for now).
|
||||
|
||||
# TODO:
|
||||
# - both integer and reals are treated as numeric -> the integer info
|
||||
# is lost!
|
||||
# - Replace ValueError by ParseError or something
|
||||
|
||||
# We can now handle the following:
|
||||
# - numeric and nominal attributes
|
||||
# - missing values for numeric attributes
|
||||
|
||||
# --- Line classification -------------------------------------------------
# Any meta line: optional leading whitespace followed by '@'.
r_meta = re.compile(r'^\s*@')
# A comment line starts with '%' in the first column.
r_comment = re.compile(r'^%')
# A line made only of whitespace (a bare newline included).
r_empty = re.compile(r'^\s+$')

# --- Header keywords (matched case-insensitively, letter by letter) ------
# A header line: '@' in the first column followed by a word.
r_headerline = re.compile(r'^@\S*')
# The @data marker, which ends the header.
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
# @relation <name>; group 1 is the relation name.
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
# @attribute <rest>; group 1 is everything after the keyword.
r_attribute = re.compile(r'^@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')

# --- Attribute declarations: group 1 is the name, group 2 the type -------
# Name enclosed in single quotes (may contain spaces).
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# Plain, unquoted name.
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
|
||||
|
||||
#-------------------------
|
||||
# Module defined exception
|
||||
#-------------------------
|
||||
|
||||
|
||||
class ArffError(IOError):
    """Base class of the exceptions defined by this module."""


class ParseArffError(ArffError):
    """Raised when the content of an ARFF file cannot be parsed."""
|
||||
|
||||
#------------------
|
||||
# Various utilities
|
||||
#------------------
|
||||
|
||||
# An attribute is defined as @attribute name value
|
||||
|
||||
|
||||
def parse_type(attrtype):
    """Given an arff attribute value (meta data), returns its type.

    The value is lower-cased and stripped, then classified by how it
    starts.

    Parameters
    ----------
    attrtype : str
        Type part of an ``@attribute`` declaration, e.g. ``'REAL'`` or
        ``'{a, b}'``.

    Returns
    -------
    tp : str
        One of ``'nominal'``, ``'numeric'``, ``'string'``,
        ``'relational'`` or ``'date'``.  ``real``, ``integer`` and
        ``numeric`` declarations are all reported as ``'numeric'``.

    Raises
    ------
    ParseArffError
        If the declaration matches none of the known types.  This includes
        an empty or blank declaration, which previously escaped as an
        ``IndexError``.
    """
    uattribute = attrtype.lower().strip()

    # startswith() is safe on an empty string, unlike indexing [0].
    if uattribute.startswith('{'):
        return 'nominal'

    # Checked in order; no prefix here is a prefix of another one.
    prefixes = (
        ('real', 'numeric'),
        ('integer', 'numeric'),
        ('numeric', 'numeric'),
        ('string', 'string'),
        ('relational', 'relational'),
        ('date', 'date'),
    )
    for prefix, kind in prefixes:
        if uattribute.startswith(prefix):
            return kind

    raise ParseArffError("unknown attribute %s" % uattribute)
|
||||
|
||||
|
||||
def get_nominal(attribute):
    """Split a nominal attribute string on commas.

    The pieces are returned as-is: surrounding whitespace and braces are
    not removed.
    """
    pieces = attribute.split(',')
    return pieces
|
||||
|
||||
|
||||
def read_data_list(ofile):
    """Read each line of the iterable and put it in a list.

    Parameters
    ----------
    ofile : iterator of str
        Iterator positioned on the first data line.

    Returns
    -------
    data : list of str
        All remaining lines, in order, unmodified.

    Raises
    ------
    ValueError
        If the first line starts with ``{`` (sparse ARFF data).
    StopIteration
        If `ofile` is already exhausted.
    """
    first = next(ofile)
    # lstrip().startswith() also copes with a blank first line, where
    # strip()[0] raised IndexError.
    if first.lstrip().startswith('{'):
        raise ValueError("This looks like a sparse ARFF: not supported yet")
    data = [first]
    data.extend(ofile)
    return data
|
||||
|
||||
|
||||
def get_ndata(ofile):
    """Consume the iterable and return the number of lines it yielded.

    Parameters
    ----------
    ofile : iterator of str
        Iterator positioned on the first data line.

    Returns
    -------
    loc : int
        Number of lines read, the first one included.  Comment and empty
        lines are counted too.

    Raises
    ------
    ValueError
        If the first line starts with ``{`` (sparse ARFF data).
    StopIteration
        If `ofile` is already exhausted.
    """
    first = next(ofile)
    # lstrip().startswith() also copes with a blank first line, where
    # strip()[0] raised IndexError.
    if first.lstrip().startswith('{'):
        raise ValueError("This looks like a sparse ARFF: not supported yet")
    return 1 + sum(1 for _ in ofile)
|
||||
|
||||
|
||||
def maxnomlen(atrv):
    """Return the length of the longest value of a nominal type definition.

    A nominal type is defined as something framed between braces ({}).

    Parameters
    ----------
    atrv : str
       Nominal type definition

    Returns
    -------
    slen : int
       length of longest component

    Examples
    --------
    maxnomlen("{floup, bouga, fl, ratata}") returns 6 (the size of
    ratata, the longest nominal value).

    >>> maxnomlen("{floup, bouga, fl, ratata}")
    6
    """
    return max(map(len, get_nom_val(atrv)))
|
||||
|
||||
|
||||
def get_nom_val(atrv):
    """Extract the possible values of a nominal type definition.

    A nominal type is defined as something framed between braces ({}).
    The text between the braces is split on commas and each piece is
    stripped of surrounding whitespace.

    Parameters
    ----------
    atrv : str
       Nominal type definition

    Returns
    -------
    poss_vals : tuple
       possible values

    Raises
    ------
    ValueError
        If `atrv` does not start with a brace-framed, non-empty list.

    Examples
    --------
    >>> get_nom_val("{floup, bouga, fl, ratata}")
    ('floup', 'bouga', 'fl', 'ratata')
    """
    found = re.match('{(.+)}', atrv)
    if not found:
        raise ValueError("This does not look like a nominal string")
    inner = found.group(1)
    return tuple(piece.strip() for piece in inner.split(','))
|
||||
|
||||
|
||||
def get_date_format(atrv):
    """Translate an ARFF date declaration into a ``strptime`` pattern.

    The declaration is expected to look like ``date <format>``, with the
    format optionally quoted.  Java ``SimpleDateFormat`` tokens are
    replaced by their C / ``strptime`` equivalents.

    Parameters
    ----------
    atrv : str
        Type part of a date attribute, e.g. ``'date "yyyy-MM-dd"'``.

    Returns
    -------
    pattern : str
        The format with recognized tokens replaced (``yyyy`` -> ``%Y``,
        ``yy`` -> ``%y``, ``MM`` -> ``%m``, ``dd`` -> ``%d``,
        ``HH`` -> ``%H``, ``mm`` -> ``%M``, ``ss`` -> ``%S``).
    datetime_unit : str
        Unit code of the last recognized token, in the order year ("Y"),
        month ("M"), day ("D"), hour ("h"), minute ("m"), second ("s").

    Raises
    ------
    ValueError
        If `atrv` is not a date declaration, if the format contains a
        time zone token (``z`` or ``Z``), or if it contains no recognized
        token at all.
    """
    r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")
    m = r_date.match(atrv)
    if not m:
        raise ValueError("Invalid or no date format")

    pattern = m.group(1).strip()
    # convert time pattern from Java's SimpleDateFormat to C's format
    datetime_unit = None
    if "yyyy" in pattern:
        pattern = pattern.replace("yyyy", "%Y")
        datetime_unit = "Y"
    elif "yy" in pattern:
        # This used to read `elif "yy":`, which is always true.  It forced
        # the unit to "Y" and made the "unsupported format" check below
        # unreachable.
        pattern = pattern.replace("yy", "%y")
        datetime_unit = "Y"

    # Coarsest to finest, so the last match leaves the finest unit.
    conversions = (
        ("MM", "%m", "M"),
        ("dd", "%d", "D"),
        ("HH", "%H", "h"),
        ("mm", "%M", "m"),
        ("ss", "%S", "s"),
    )
    for java_token, c_token, unit in conversions:
        if java_token in pattern:
            pattern = pattern.replace(java_token, c_token)
            datetime_unit = unit

    if "z" in pattern or "Z" in pattern:
        raise ValueError("Date type attributes with time zone not "
                         "supported, yet")

    if datetime_unit is None:
        raise ValueError("Invalid or unsupported date format")

    return pattern, datetime_unit
|
||||
|
||||
|
||||
def go_data(ofile):
    """Skip header.

    Returns a lazy iterator over `ofile` that drops every line before the
    first one matching ``r_datameta``; the first next() call of the
    returned iterator will be the @data line.
    """
    def before_data(line):
        return not r_datameta.match(line)

    return itertools.dropwhile(before_data, ofile)
|
||||
|
||||
|
||||
#----------------
|
||||
# Parsing header
|
||||
#----------------
|
||||
def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (eg starts by @attribute).

    Given a raw string attribute, try to get the name and type of the
    attribute. Constraints:

    * The line must start with @attribute (case insensitive, and
      space like characters before @attribute are allowed)
    * The name may be enclosed in single quotes, in which case it may
      contain spaces.
    * Declarations spread over several lines are not supported.

    Parameters
    ----------
    iterable : iterator
        Source of the following lines; exactly one item is taken from it
        once the attribute has been tokenized.
    attribute : str
        the attribute string.

    Returns
    -------
    name : str
        name of the attribute
    value : str
        value of the attribute
    next : str
        next line to be parsed

    Raises
    ------
    ValueError
        If the line is not an @attribute line, if name and type cannot be
        separated, or if the type is exactly ``'relational'``.

    Examples
    --------
    If attribute is a string defined in python as r"floupi real", will
    return floupi as name, and real as value.

    >>> iterable = iter([0] * 10) # dummy iterator
    >>> tokenize_attribute(iterable, r"@attribute floupi real")
    ('floupi', 'real', 0)

    If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
    and real as value.

    >>> tokenize_attribute(iterable, r"  @attribute 'floupi 2' real   ")
    ('floupi 2', 'real', 0)

    """
    sattr = attribute.strip()
    mattr = r_attribute.match(sattr)
    if not mattr:
        raise ValueError("First line unparsable: %s" % sattr)

    # atrv is everything after @attribute
    atrv = mattr.group(1)
    if r_comattrval.match(atrv):
        tokenize = tokenize_single_comma
    elif r_wcomattrval.match(atrv):
        tokenize = tokenize_single_wcomma
    else:
        # Not sure we should support this, as it does not seem supported by
        # weka.
        raise ValueError("multi line not supported yet")

    name, attr_type = tokenize(atrv)
    next_item = next(iterable)

    if attr_type == 'relational':
        raise ValueError("relational attributes not supported yet")
    return name, attr_type, next_item
|
||||
|
||||
|
||||
def tokenize_single_comma(val):
    """Split a declaration with a single-quoted name into (name, type).

    `val` is matched against ``r_comattrval``; both captured groups are
    returned stripped.  Raises ValueError when `val` does not match.
    """
    # XXX the caller already matched the same string once; matching it
    # again here is wasteful but keeps this helper usable on its own.
    m = r_comattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        name, attr_type = [part.strip() for part in m.group(1, 2)]
    except IndexError:
        raise ValueError("Error while tokenizing attribute")
    return name, attr_type
|
||||
|
||||
|
||||
def tokenize_single_wcomma(val):
    """Split a declaration with an unquoted name into (name, type).

    `val` is matched against ``r_wcomattrval``; both captured groups are
    returned stripped.  Raises ValueError when `val` does not match.
    """
    # XXX the caller already matched the same string once; matching it
    # again here is wasteful but keeps this helper usable on its own.
    m = r_wcomattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        name, attr_type = [part.strip() for part in m.group(1, 2)]
    except IndexError:
        raise ValueError("Error while tokenizing attribute")
    return name, attr_type
|
||||
|
||||
|
||||
def read_header(ofile):
    """Read the header of the iterable ofile.

    Lines are consumed up to and including the @data line.

    Returns
    -------
    relation : str or None
        Name given by the last @relation line seen, None if there is none.
    attributes : list of (str, str)
        ``(name, type)`` pairs, in declaration order.

    Raises
    ------
    ValueError
        If a header line is neither an @attribute nor a @relation line,
        or if an attribute cannot be tokenized.
    StopIteration
        If `ofile` runs out before an @data line is found.
    """
    line = next(ofile)

    # Pass first comments
    while r_comment.match(line):
        line = next(ofile)

    # Header is everything up to DATA attribute ?
    relation = None
    attributes = []
    while not r_datameta.match(line):
        if not r_headerline.match(line):
            # Not a header line: ignore it.
            line = next(ofile)
            continue

        if r_attribute.match(line):
            # tokenize_attribute already fetches the following line.
            name, attr_type, line = tokenize_attribute(ofile, line)
            attributes.append((name, attr_type))
            continue

        isrel = r_relation.match(line)
        if not isrel:
            raise ValueError("Error parsing line %s" % line)
        relation = isrel.group(1)
        line = next(ofile)

    return relation, attributes
|
||||
|
||||
|
||||
#--------------------
|
||||
# Parsing actual data
|
||||
#--------------------
|
||||
def safe_float(x):
    """Convert the string x to a float, mapping missing values to NaN.

    Any string containing a ``?`` is treated as a missing value.

    Parameters
    ----------
    x : str
       string to convert

    Returns
    -------
    f : float
       where float can be nan

    Examples
    --------
    >>> safe_float('1')
    1.0
    >>> safe_float('1\\n')
    1.0
    >>> safe_float('?\\n')
    nan
    """
    return np.nan if '?' in x else float(x)
|
||||
|
||||
|
||||
def safe_nominal(value, pvalue):
    """Validate a nominal value against its allowed values.

    Parameters
    ----------
    value : str
        Raw field; it is stripped before being checked.
    pvalue : container of str
        Allowed values.

    Returns
    -------
    svalue : str
        The stripped value, when it is allowed or is the missing-value
        marker ``'?'``.

    Raises
    ------
    ValueError
        If the stripped value is neither allowed nor ``'?'``.
    """
    svalue = value.strip()
    if svalue in pvalue or svalue == '?':
        return svalue
    raise ValueError("%s value not in %s" % (str(svalue), str(pvalue)))
|
||||
|
||||
|
||||
def safe_date(value, date_format, datetime_unit):
    """Convert a date field to a ``numpy.datetime64`` of the given unit.

    Parameters
    ----------
    value : str
        Raw field; surrounding whitespace, then single quotes, then
        double quotes are stripped.
    date_format : str
        ``strptime`` pattern used to parse the date.
    datetime_unit : str
        numpy datetime unit code of the result (e.g. ``'D'``).

    Returns
    -------
    date : numpy.datetime64
        The parsed date, or NaT when the field is ``'?'``.
    """
    date_str = value.strip().strip("'").strip('"')
    if date_str != '?':
        parsed = datetime.datetime.strptime(date_str, date_format)
        return np.datetime64(parsed).astype("datetime64[%s]" % datetime_unit)
    return np.datetime64('NaT', datetime_unit)
|
||||
|
||||
|
||||
class MetaData(object):
    """Small container to keep useful information on a ARFF dataset.

    Knows about attributes names and types.

    Examples
    --------
    ::

        data, meta = loadarff('iris.arff')
        # This will print the attributes names of the iris.arff dataset
        for i in meta:
            print(i)
        # This works too
        meta.names()
        # Getting attribute type
        types = meta.types()

    Notes
    -----
    Also maintains the list of attributes in order, i.e. doing for i in
    meta, where meta is an instance of MetaData, will return the
    different attribute names in the order they were defined.
    """
    def __init__(self, rel, attr):
        """Build the container from a relation name and (name, type) pairs.

        Each attribute is stored as ``(type, extra)``: `extra` is the
        tuple of allowed values for a nominal attribute, the converted
        date pattern for a date attribute, and None otherwise.
        """
        self.name = rel
        # A plain dict holds the details; declaration order is kept in a
        # separate list of names.
        self._attributes = {}
        self._attrnames = []
        for name, value in attr:
            tp = parse_type(value)
            self._attrnames.append(name)
            if tp == 'nominal':
                extra = get_nom_val(value)
            elif tp == 'date':
                extra = get_date_format(value)[0]
            else:
                extra = None
            self._attributes[name] = (tp, extra)

    def __repr__(self):
        """Return a multi-line description: dataset name, then one line
        per attribute."""
        lines = ["Dataset: %s\n" % self.name]
        for attr_name in self._attrnames:
            kind, extra = self._attributes[attr_name]
            line = "\t%s's type is %s" % (attr_name, kind)
            if extra:
                line += ", range is %s" % str(extra)
            lines.append(line + '\n')
        return "".join(lines)

    def __iter__(self):
        """Iterate over the attribute names in declaration order."""
        return iter(self._attrnames)

    def __getitem__(self, key):
        """Return the ``(type, extra)`` pair stored for attribute `key`."""
        return self._attributes[key]

    def names(self):
        """Return the list of attribute names."""
        return self._attrnames

    def types(self):
        """Return the list of attribute types."""
        return [self._attributes[attr_name][0]
                for attr_name in self._attrnames]
|
||||
|
||||
|
||||
def loadarff(f):
    """
    Read an arff file.

    The data is returned as a record array, which can be accessed much like
    a dictionary of numpy arrays.  For example, if one of the attributes is
    called 'pressure', then its first 10 data points can be accessed from the
    ``data`` record array like so: ``data['pressure'][0:10]``


    Parameters
    ----------
    f : file-like or str
       File-like object to read from, or filename to open.  Anything with
       a ``read`` attribute is used as is and left open; otherwise `f` is
       opened in text mode and closed before returning.

    Returns
    -------
    data : record array
       The data of the arff file, accessible by attribute names.
    meta : `MetaData`
       Contains information about the arff file such as name and
       type of attributes, the relation (name of the dataset), etc...

    Raises
    ------
    ParseArffError
        This is raised if the given file is not ARFF-formatted.
    NotImplementedError
        The ARFF file has an attribute which is not supported yet.

    Notes
    -----

    This function should be able to read most arff files.  It handles
    numeric, nominal and date attributes (dates without time zone).  Not
    implemented functionality include:

    * string type attributes
    * relational type attributes

    It cannot read files with sparse data ({} in the file).  However, this
    function can read files with missing data (? in the file),
    representing the data points as NaNs.

    Examples
    --------
    >>> from scipy.io import arff
    >>> from io import StringIO
    >>> content = \"\"\"
    ... @relation foo
    ... @attribute width  numeric
    ... @attribute height numeric
    ... @attribute color  {red,green,blue,yellow,black}
    ... @data
    ... 5.0,3.25,blue
    ... 4.5,3.75,green
    ... 3.0,4.00,red
    ... \"\"\"
    >>> f = StringIO(content)
    >>> data, meta = arff.loadarff(f)
    >>> data
    array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
          dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
    >>> meta
    Dataset: foo
    \twidth's type is numeric
    \theight's type is numeric
    \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')

    """
    opened_here = not hasattr(f, 'read')
    ofile = open(f, 'rt') if opened_here else f
    try:
        return _loadarff(ofile)
    finally:
        # only close what we opened
        if opened_here:
            ofile.close()
|
||||
|
||||
|
||||
def _loadarff(ofile):
    """Parse an open ARFF stream into ``(data, meta)``.

    Reads the header, builds the `MetaData`, then derives one dtype entry
    and one converter per attribute and feeds the remaining lines through
    them into ``np.fromiter``.

    Raises ParseArffError when the header cannot be parsed and
    NotImplementedError when a string attribute is declared.
    """
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg)

    # Classify every attribute once; the result is reused below.
    kinds = [parse_type(value) for _, value in attr]

    # Check whether we have a string attribute (not supported yet)
    hasstr = 'string' in kinds

    meta = MetaData(rel, attr)

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).
    acls2dtype = {'real': float, 'integer': float, 'numeric': float}

    descr = []
    convertors = []
    for (name, value), kind in zip(attr, kinds):
        if kind == 'date':
            date_format, datetime_unit = get_date_format(value)
            descr.append((name, "datetime64[%s]" % datetime_unit))
            convertors.append(partial(safe_date, date_format=date_format,
                                      datetime_unit=datetime_unit))
        elif kind == 'nominal':
            # Fixed-width byte string, wide enough for the longest value.
            n = maxnomlen(value)
            descr.append((name, 'S%d' % n))
            pvalue = get_nom_val(value)
            convertors.append(partial(safe_nominal, pvalue=pvalue))
        else:
            descr.append((name, acls2dtype[kind]))
            convertors.append(safe_float)

    ni = len(convertors)

    def generator(row_iter, delim=','):
        # TODO: this is where most of the time is spent (~80%).  Ideas:
        #   - "compile" the function, since some values never change here;
        #   - generate the row conversion function on the fly from a
        #     string and execute it instead of looping;
        #   - replacing the regexes by plain prefix checks was tried and
        #     did not seem to change anything.

        # 'compiling' the range since it does not change
        # Note: zipping the converters and row elements was tried and was
        # slightly slower.
        columns = list(range(ni))

        for raw in row_iter:
            # We do not abstract skipping comments and empty lines for
            # performance reasons.
            if r_comment.match(raw) or r_empty.match(raw):
                continue
            fields = raw.split(delim)
            yield tuple([convertors[j](fields[j]) for j in columns])

    rows = generator(ofile)
    # No error should happen here: it is a bug otherwise
    data = np.fromiter(rows, descr)
    return data, meta
|
||||
|
||||
|
||||
#-----
|
||||
# Misc
|
||||
#-----
|
||||
def basic_stats(data):
    """Return ``(min, max, mean, std)`` of a numeric array.

    Parameters
    ----------
    data : ndarray
        Values to summarize.

    Returns
    -------
    stats : tuple
        ``np.nanmin(data)``, ``np.nanmax(data)``, ``np.mean(data)`` and
        the sample standard deviation (``ddof=1``).  The standard
        deviation is NaN when `data` has a single element.

    Notes
    -----
    Minimum and maximum ignore NaNs; mean and standard deviation do not.

    The previous implementation multiplied the population standard
    deviation by ``n / (n - 1)``.  That factor is Bessel's correction for
    the *variance*; applied to the standard deviation it over-corrects,
    and it divided by zero for a single element.
    """
    if data.size > 1:
        std = np.std(data, ddof=1)
    else:
        # The sample standard deviation is undefined for one element.
        std = np.nan
    return np.nanmin(data), np.nanmax(data), np.mean(data), std
|
||||
|
||||
|
||||
def print_attribute(name, tp, data):
    """Print a one-line, comma-separated summary of an attribute.

    Parameters
    ----------
    name : str
        Attribute name.
    tp : tuple
        ``(type, extra)`` pair as stored by `MetaData`.
    data : ndarray
        Column of values; only used for numeric attributes.

    Notes
    -----
    For ``'numeric'``, ``'real'`` and ``'integer'`` types the line is
    ``name,type,min,max,mean,std``.  For any other type it is
    ``name,{v1,v2,...}`` built from ``tp[1]``.  An empty ``tp[1]`` now
    prints ``name,{}`` instead of raising IndexError.
    """
    kind = tp[0]
    if kind in ('numeric', 'real', 'integer'):
        low, high, mean, std = basic_stats(data)
        print("%s,%s,%f,%f,%f,%f" % (name, kind, low, high, mean, std))
    else:
        print(name + ",{" + ",".join(tp[1]) + "}")
|
||||
|
||||
|
||||
def test_weka(filename):
    """Load an ARFF file and print a summary of it.

    Prints the number of fields of the record dtype, the number of
    records, then one `print_attribute` line per attribute.
    """
    data, meta = loadarff(filename)
    print(len(data.dtype))
    print(data.size)
    for attr_name in meta:
        print_attribute(attr_name, meta[attr_name], data[attr_name])


# Despite its name this is a command-line helper: keep test collectors
# (nose) from picking it up.
test_weka.__test__ = False


if __name__ == '__main__':
    import sys
    filename = sys.argv[1]
    test_weka(filename)
|
||||
@@ -1,13 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
|
||||
def configuration(parent_package='io', top_path=None):
    """Return the numpy.distutils configuration of the ``arff`` package.

    The configuration registers the ``tests`` directory as a data
    directory.
    """
    from numpy.distutils.misc_util import Configuration

    cfg = Configuration('arff', parent_package, top_path)
    cfg.add_data_dir('tests')
    return cfg


if __name__ == '__main__':
    from numpy.distutils.core import setup
    setup(**configuration(top_path='').todict())
|
||||
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1,225 +0,0 @@
|
||||
% 1. Title: Iris Plants Database
|
||||
%
|
||||
% 2. Sources:
|
||||
% (a) Creator: R.A. Fisher
|
||||
% (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
|
||||
% (c) Date: July, 1988
|
||||
%
|
||||
% 3. Past Usage:
|
||||
% - Publications: too many to mention!!! Here are a few.
|
||||
% 1. Fisher,R.A. "The use of multiple measurements in taxonomic problems"
|
||||
% Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions
|
||||
% to Mathematical Statistics" (John Wiley, NY, 1950).
|
||||
% 2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
|
||||
% (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
|
||||
% 3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
|
||||
% Structure and Classification Rule for Recognition in Partially Exposed
|
||||
% Environments". IEEE Transactions on Pattern Analysis and Machine
|
||||
% Intelligence, Vol. PAMI-2, No. 1, 67-71.
|
||||
% -- Results:
|
||||
% -- very low misclassification rates (0% for the setosa class)
|
||||
% 4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE
|
||||
% Transactions on Information Theory, May 1972, 431-433.
|
||||
% -- Results:
|
||||
% -- very low misclassification rates again
|
||||
% 5. See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II
|
||||
% conceptual clustering system finds 3 classes in the data.
|
||||
%
|
||||
% 4. Relevant Information:
|
||||
% --- This is perhaps the best known database to be found in the pattern
|
||||
% recognition literature. Fisher's paper is a classic in the field
|
||||
% and is referenced frequently to this day. (See Duda & Hart, for
|
||||
% example.) The data set contains 3 classes of 50 instances each,
|
||||
% where each class refers to a type of iris plant. One class is
|
||||
% linearly separable from the other 2; the latter are NOT linearly
|
||||
% separable from each other.
|
||||
% --- Predicted attribute: class of iris plant.
|
||||
% --- This is an exceedingly simple domain.
|
||||
%
|
||||
% 5. Number of Instances: 150 (50 in each of three classes)
|
||||
%
|
||||
% 6. Number of Attributes: 4 numeric, predictive attributes and the class
|
||||
%
|
||||
% 7. Attribute Information:
|
||||
% 1. sepal length in cm
|
||||
% 2. sepal width in cm
|
||||
% 3. petal length in cm
|
||||
% 4. petal width in cm
|
||||
% 5. class:
|
||||
% -- Iris Setosa
|
||||
% -- Iris Versicolour
|
||||
% -- Iris Virginica
|
||||
%
|
||||
% 8. Missing Attribute Values: None
|
||||
%
|
||||
% Summary Statistics:
|
||||
% Min Max Mean SD Class Correlation
|
||||
% sepal length: 4.3 7.9 5.84 0.83 0.7826
|
||||
% sepal width: 2.0 4.4 3.05 0.43 -0.4194
|
||||
% petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
|
||||
% petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
|
||||
%
|
||||
% 9. Class Distribution: 33.3% for each of 3 classes.
|
||||
|
||||
@RELATION iris
|
||||
|
||||
@ATTRIBUTE sepallength REAL
|
||||
@ATTRIBUTE sepalwidth REAL
|
||||
@ATTRIBUTE petallength REAL
|
||||
@ATTRIBUTE petalwidth REAL
|
||||
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
|
||||
|
||||
@DATA
|
||||
5.1,3.5,1.4,0.2,Iris-setosa
|
||||
4.9,3.0,1.4,0.2,Iris-setosa
|
||||
4.7,3.2,1.3,0.2,Iris-setosa
|
||||
4.6,3.1,1.5,0.2,Iris-setosa
|
||||
5.0,3.6,1.4,0.2,Iris-setosa
|
||||
5.4,3.9,1.7,0.4,Iris-setosa
|
||||
4.6,3.4,1.4,0.3,Iris-setosa
|
||||
5.0,3.4,1.5,0.2,Iris-setosa
|
||||
4.4,2.9,1.4,0.2,Iris-setosa
|
||||
4.9,3.1,1.5,0.1,Iris-setosa
|
||||
5.4,3.7,1.5,0.2,Iris-setosa
|
||||
4.8,3.4,1.6,0.2,Iris-setosa
|
||||
4.8,3.0,1.4,0.1,Iris-setosa
|
||||
4.3,3.0,1.1,0.1,Iris-setosa
|
||||
5.8,4.0,1.2,0.2,Iris-setosa
|
||||
5.7,4.4,1.5,0.4,Iris-setosa
|
||||
5.4,3.9,1.3,0.4,Iris-setosa
|
||||
5.1,3.5,1.4,0.3,Iris-setosa
|
||||
5.7,3.8,1.7,0.3,Iris-setosa
|
||||
5.1,3.8,1.5,0.3,Iris-setosa
|
||||
5.4,3.4,1.7,0.2,Iris-setosa
|
||||
5.1,3.7,1.5,0.4,Iris-setosa
|
||||
4.6,3.6,1.0,0.2,Iris-setosa
|
||||
5.1,3.3,1.7,0.5,Iris-setosa
|
||||
4.8,3.4,1.9,0.2,Iris-setosa
|
||||
5.0,3.0,1.6,0.2,Iris-setosa
|
||||
5.0,3.4,1.6,0.4,Iris-setosa
|
||||
5.2,3.5,1.5,0.2,Iris-setosa
|
||||
5.2,3.4,1.4,0.2,Iris-setosa
|
||||
4.7,3.2,1.6,0.2,Iris-setosa
|
||||
4.8,3.1,1.6,0.2,Iris-setosa
|
||||
5.4,3.4,1.5,0.4,Iris-setosa
|
||||
5.2,4.1,1.5,0.1,Iris-setosa
|
||||
5.5,4.2,1.4,0.2,Iris-setosa
|
||||
4.9,3.1,1.5,0.1,Iris-setosa
|
||||
5.0,3.2,1.2,0.2,Iris-setosa
|
||||
5.5,3.5,1.3,0.2,Iris-setosa
|
||||
4.9,3.1,1.5,0.1,Iris-setosa
|
||||
4.4,3.0,1.3,0.2,Iris-setosa
|
||||
5.1,3.4,1.5,0.2,Iris-setosa
|
||||
5.0,3.5,1.3,0.3,Iris-setosa
|
||||
4.5,2.3,1.3,0.3,Iris-setosa
|
||||
4.4,3.2,1.3,0.2,Iris-setosa
|
||||
5.0,3.5,1.6,0.6,Iris-setosa
|
||||
5.1,3.8,1.9,0.4,Iris-setosa
|
||||
4.8,3.0,1.4,0.3,Iris-setosa
|
||||
5.1,3.8,1.6,0.2,Iris-setosa
|
||||
4.6,3.2,1.4,0.2,Iris-setosa
|
||||
5.3,3.7,1.5,0.2,Iris-setosa
|
||||
5.0,3.3,1.4,0.2,Iris-setosa
|
||||
7.0,3.2,4.7,1.4,Iris-versicolor
|
||||
6.4,3.2,4.5,1.5,Iris-versicolor
|
||||
6.9,3.1,4.9,1.5,Iris-versicolor
|
||||
5.5,2.3,4.0,1.3,Iris-versicolor
|
||||
6.5,2.8,4.6,1.5,Iris-versicolor
|
||||
5.7,2.8,4.5,1.3,Iris-versicolor
|
||||
6.3,3.3,4.7,1.6,Iris-versicolor
|
||||
4.9,2.4,3.3,1.0,Iris-versicolor
|
||||
6.6,2.9,4.6,1.3,Iris-versicolor
|
||||
5.2,2.7,3.9,1.4,Iris-versicolor
|
||||
5.0,2.0,3.5,1.0,Iris-versicolor
|
||||
5.9,3.0,4.2,1.5,Iris-versicolor
|
||||
6.0,2.2,4.0,1.0,Iris-versicolor
|
||||
6.1,2.9,4.7,1.4,Iris-versicolor
|
||||
5.6,2.9,3.6,1.3,Iris-versicolor
|
||||
6.7,3.1,4.4,1.4,Iris-versicolor
|
||||
5.6,3.0,4.5,1.5,Iris-versicolor
|
||||
5.8,2.7,4.1,1.0,Iris-versicolor
|
||||
6.2,2.2,4.5,1.5,Iris-versicolor
|
||||
5.6,2.5,3.9,1.1,Iris-versicolor
|
||||
5.9,3.2,4.8,1.8,Iris-versicolor
|
||||
6.1,2.8,4.0,1.3,Iris-versicolor
|
||||
6.3,2.5,4.9,1.5,Iris-versicolor
|
||||
6.1,2.8,4.7,1.2,Iris-versicolor
|
||||
6.4,2.9,4.3,1.3,Iris-versicolor
|
||||
6.6,3.0,4.4,1.4,Iris-versicolor
|
||||
6.8,2.8,4.8,1.4,Iris-versicolor
|
||||
6.7,3.0,5.0,1.7,Iris-versicolor
|
||||
6.0,2.9,4.5,1.5,Iris-versicolor
|
||||
5.7,2.6,3.5,1.0,Iris-versicolor
|
||||
5.5,2.4,3.8,1.1,Iris-versicolor
|
||||
5.5,2.4,3.7,1.0,Iris-versicolor
|
||||
5.8,2.7,3.9,1.2,Iris-versicolor
|
||||
6.0,2.7,5.1,1.6,Iris-versicolor
|
||||
5.4,3.0,4.5,1.5,Iris-versicolor
|
||||
6.0,3.4,4.5,1.6,Iris-versicolor
|
||||
6.7,3.1,4.7,1.5,Iris-versicolor
|
||||
6.3,2.3,4.4,1.3,Iris-versicolor
|
||||
5.6,3.0,4.1,1.3,Iris-versicolor
|
||||
5.5,2.5,4.0,1.3,Iris-versicolor
|
||||
5.5,2.6,4.4,1.2,Iris-versicolor
|
||||
6.1,3.0,4.6,1.4,Iris-versicolor
|
||||
5.8,2.6,4.0,1.2,Iris-versicolor
|
||||
5.0,2.3,3.3,1.0,Iris-versicolor
|
||||
5.6,2.7,4.2,1.3,Iris-versicolor
|
||||
5.7,3.0,4.2,1.2,Iris-versicolor
|
||||
5.7,2.9,4.2,1.3,Iris-versicolor
|
||||
6.2,2.9,4.3,1.3,Iris-versicolor
|
||||
5.1,2.5,3.0,1.1,Iris-versicolor
|
||||
5.7,2.8,4.1,1.3,Iris-versicolor
|
||||
6.3,3.3,6.0,2.5,Iris-virginica
|
||||
5.8,2.7,5.1,1.9,Iris-virginica
|
||||
7.1,3.0,5.9,2.1,Iris-virginica
|
||||
6.3,2.9,5.6,1.8,Iris-virginica
|
||||
6.5,3.0,5.8,2.2,Iris-virginica
|
||||
7.6,3.0,6.6,2.1,Iris-virginica
|
||||
4.9,2.5,4.5,1.7,Iris-virginica
|
||||
7.3,2.9,6.3,1.8,Iris-virginica
|
||||
6.7,2.5,5.8,1.8,Iris-virginica
|
||||
7.2,3.6,6.1,2.5,Iris-virginica
|
||||
6.5,3.2,5.1,2.0,Iris-virginica
|
||||
6.4,2.7,5.3,1.9,Iris-virginica
|
||||
6.8,3.0,5.5,2.1,Iris-virginica
|
||||
5.7,2.5,5.0,2.0,Iris-virginica
|
||||
5.8,2.8,5.1,2.4,Iris-virginica
|
||||
6.4,3.2,5.3,2.3,Iris-virginica
|
||||
6.5,3.0,5.5,1.8,Iris-virginica
|
||||
7.7,3.8,6.7,2.2,Iris-virginica
|
||||
7.7,2.6,6.9,2.3,Iris-virginica
|
||||
6.0,2.2,5.0,1.5,Iris-virginica
|
||||
6.9,3.2,5.7,2.3,Iris-virginica
|
||||
5.6,2.8,4.9,2.0,Iris-virginica
|
||||
7.7,2.8,6.7,2.0,Iris-virginica
|
||||
6.3,2.7,4.9,1.8,Iris-virginica
|
||||
6.7,3.3,5.7,2.1,Iris-virginica
|
||||
7.2,3.2,6.0,1.8,Iris-virginica
|
||||
6.2,2.8,4.8,1.8,Iris-virginica
|
||||
6.1,3.0,4.9,1.8,Iris-virginica
|
||||
6.4,2.8,5.6,2.1,Iris-virginica
|
||||
7.2,3.0,5.8,1.6,Iris-virginica
|
||||
7.4,2.8,6.1,1.9,Iris-virginica
|
||||
7.9,3.8,6.4,2.0,Iris-virginica
|
||||
6.4,2.8,5.6,2.2,Iris-virginica
|
||||
6.3,2.8,5.1,1.5,Iris-virginica
|
||||
6.1,2.6,5.6,1.4,Iris-virginica
|
||||
7.7,3.0,6.1,2.3,Iris-virginica
|
||||
6.3,3.4,5.6,2.4,Iris-virginica
|
||||
6.4,3.1,5.5,1.8,Iris-virginica
|
||||
6.0,3.0,4.8,1.8,Iris-virginica
|
||||
6.9,3.1,5.4,2.1,Iris-virginica
|
||||
6.7,3.1,5.6,2.4,Iris-virginica
|
||||
6.9,3.1,5.1,2.3,Iris-virginica
|
||||
5.8,2.7,5.1,1.9,Iris-virginica
|
||||
6.8,3.2,5.9,2.3,Iris-virginica
|
||||
6.7,3.3,5.7,2.5,Iris-virginica
|
||||
6.7,3.0,5.2,2.3,Iris-virginica
|
||||
6.3,2.5,5.0,1.9,Iris-virginica
|
||||
6.5,3.0,5.2,2.0,Iris-virginica
|
||||
6.2,3.4,5.4,2.3,Iris-virginica
|
||||
5.9,3.0,5.1,1.8,Iris-virginica
|
||||
%
|
||||
%
|
||||
%
|
||||
@@ -1,8 +0,0 @@
|
||||
% This arff file contains some missing data
|
||||
@relation missing
|
||||
@attribute yop real
|
||||
@attribute yap real
|
||||
@data
|
||||
1,5
|
||||
2,4
|
||||
?,?
|
||||
@@ -1,11 +0,0 @@
|
||||
@RELATION iris
|
||||
|
||||
@ATTRIBUTE sepallength REAL
|
||||
@ATTRIBUTE sepalwidth REAL
|
||||
@ATTRIBUTE petallength REAL
|
||||
@ATTRIBUTE petalwidth REAL
|
||||
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
|
||||
|
||||
@DATA
|
||||
|
||||
% This file has no data
|
||||
@@ -1,10 +0,0 @@
|
||||
@RELATION test1
|
||||
|
||||
@ATTRIBUTE attr0 REAL
|
||||
@ATTRIBUTE attr1 REAL
|
||||
@ATTRIBUTE attr2 REAL
|
||||
@ATTRIBUTE attr3 REAL
|
||||
@ATTRIBUTE class {class0, class1, class2, class3}
|
||||
|
||||
@DATA
|
||||
0.1, 0.2, 0.3, 0.4,class1
|
||||
@@ -1,15 +0,0 @@
|
||||
@RELATION test2
|
||||
|
||||
@ATTRIBUTE attr0 REAL
|
||||
@ATTRIBUTE attr1 real
|
||||
@ATTRIBUTE attr2 integer
|
||||
@ATTRIBUTE attr3 Integer
|
||||
@ATTRIBUTE attr4 Numeric
|
||||
@ATTRIBUTE attr5 numeric
|
||||
@ATTRIBUTE attr6 string
|
||||
@ATTRIBUTE attr7 STRING
|
||||
@ATTRIBUTE attr8 {bla}
|
||||
@ATTRIBUTE attr9 {bla, bla}
|
||||
|
||||
@DATA
|
||||
0.1, 0.2, 0.3, 0.4,class1
|
||||
@@ -1,6 +0,0 @@
|
||||
@RELATION test3
|
||||
|
||||
@ATTRIBUTE attr0 crap
|
||||
|
||||
@DATA
|
||||
0.1, 0.2, 0.3, 0.4,class1
|
||||
@@ -1,11 +0,0 @@
|
||||
@RELATION test5
|
||||
|
||||
@ATTRIBUTE attr0 REAL
|
||||
@ATTRIBUTE attr1 REAL
|
||||
@ATTRIBUTE attr2 REAL
|
||||
@ATTRIBUTE attr3 REAL
|
||||
@ATTRIBUTE class {class0, class1, class2, class3}
|
||||
@DATA
|
||||
0.1, 0.2, 0.3, 0.4,class1
|
||||
-0.1, -0.2, -0.3, -0.4,class2
|
||||
1, 2, 3, 4,class3
|
||||
@@ -1,26 +0,0 @@
|
||||
@RELATION test4
|
||||
|
||||
@ATTRIBUTE attr0 REAL
|
||||
@ATTRIBUTE attr1 REAL
|
||||
@ATTRIBUTE attr2 REAL
|
||||
@ATTRIBUTE attr3 REAL
|
||||
@ATTRIBUTE class {class0, class1, class2, class3}
|
||||
|
||||
@DATA
|
||||
|
||||
% lsdflkjhaksjdhf
|
||||
|
||||
% lsdflkjhaksjdhf
|
||||
|
||||
0.1, 0.2, 0.3, 0.4,class1
|
||||
% laksjdhf
|
||||
|
||||
% lsdflkjhaksjdhf
|
||||
-0.1, -0.2, -0.3, -0.4,class2
|
||||
|
||||
% lsdflkjhaksjdhf
|
||||
% lsdflkjhaksjdhf
|
||||
|
||||
% lsdflkjhaksjdhf
|
||||
|
||||
1, 2, 3, 4,class3
|
||||
@@ -1,12 +0,0 @@
|
||||
@RELATION test6
|
||||
|
||||
@ATTRIBUTE attr0 REAL
|
||||
@ATTRIBUTE attr1 REAL
|
||||
@ATTRIBUTE attr2 REAL
|
||||
@ATTRIBUTE attr3 REAL
|
||||
@ATTRIBUTE class {C}
|
||||
|
||||
@DATA
|
||||
0.1, 0.2, 0.3, 0.4,C
|
||||
-0.1, -0.2, -0.3, -0.4,C
|
||||
1, 2, 3, 4,C
|
||||
@@ -1,15 +0,0 @@
|
||||
@RELATION test7
|
||||
|
||||
@ATTRIBUTE attr_year DATE yyyy
|
||||
@ATTRIBUTE attr_month DATE yyyy-MM
|
||||
@ATTRIBUTE attr_date DATE yyyy-MM-dd
|
||||
@ATTRIBUTE attr_datetime_local DATE "yyyy-MM-dd HH:mm"
|
||||
@ATTRIBUTE attr_datetime_missing DATE "yyyy-MM-dd HH:mm"
|
||||
|
||||
@DATA
|
||||
1999,1999-01,1999-01-31,"1999-01-31 00:01",?
|
||||
2004,2004-12,2004-12-01,"2004-12-01 23:59","2004-12-01 23:59"
|
||||
1817,1817-04,1817-04-28,"1817-04-28 13:00",?
|
||||
2100,2100-09,2100-09-10,"2100-09-10 12:00",?
|
||||
2013,2013-11,2013-11-30,"2013-11-30 04:55","2013-11-30 04:55"
|
||||
1631,1631-10,1631-10-15,"1631-10-15 20:04","1631-10-15 20:04"
|
||||
@@ -1,12 +0,0 @@
|
||||
@RELATION test8
|
||||
|
||||
@ATTRIBUTE attr_datetime_utc DATE "yyyy-MM-dd HH:mm Z"
|
||||
@ATTRIBUTE attr_datetime_full DATE "yy-MM-dd HH:mm:ss z"
|
||||
|
||||
@DATA
|
||||
"1999-01-31 00:01 UTC","99-01-31 00:01:08 +0430"
|
||||
"2004-12-01 23:59 UTC","04-12-01 23:59:59 -0800"
|
||||
"1817-04-28 13:00 UTC","17-04-28 13:00:33 +1000"
|
||||
"2100-09-10 12:00 UTC","21-09-10 12:00:21 -0300"
|
||||
"2013-11-30 04:55 UTC","13-11-30 04:55:48 -1100"
|
||||
"1631-10-15 20:04 UTC","31-10-15 20:04:10 +0000"
|
||||
@@ -1,259 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import datetime
|
||||
import os
|
||||
import sys
|
||||
from os.path import join as pjoin
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
from io import StringIO
|
||||
else:
|
||||
from cStringIO import StringIO
|
||||
|
||||
import numpy as np
|
||||
|
||||
from numpy.testing import (assert_array_almost_equal,
|
||||
assert_array_equal, assert_equal, assert_)
|
||||
import pytest
|
||||
from pytest import raises as assert_raises
|
||||
|
||||
from scipy.io.arff.arffread import loadarff
|
||||
from scipy.io.arff.arffread import read_header, parse_type, ParseArffError
|
||||
|
||||
|
||||
# Directory that holds the ARFF fixture files read by the tests below.
data_path = pjoin(os.path.dirname(__file__), 'data')


def _fixture_path(filename):
    # Location of one fixture file inside ``data_path``.
    return pjoin(data_path, filename)


test1 = _fixture_path('test1.arff')
test2 = _fixture_path('test2.arff')
test3 = _fixture_path('test3.arff')

test4 = _fixture_path('test4.arff')
test5 = _fixture_path('test5.arff')
test6 = _fixture_path('test6.arff')
test7 = _fixture_path('test7.arff')
test8 = _fixture_path('test8.arff')
missing = _fixture_path('missing.arff')

# Reference rows compared against the loaded data by ``TestData._test``;
# only the four leading numeric fields of each row are checked there.
expect4_data = [
    (0.1, 0.2, 0.3, 0.4, 'class1'),
    (-0.1, -0.2, -0.3, -0.4, 'class2'),
    (1, 2, 3, 4, 'class3'),
]
# Attribute kinds expected from ``meta.types()``: four numeric, one nominal.
expected_types = ['numeric'] * 4 + ['nominal']

# Reference values for missing.arff: the last row is all NaN, standing in
# for the '?' placeholders of the fixture.
expect_missing_raw = np.array([[1, 5], [2, 4], [np.nan, np.nan]])
expect_missing = np.empty(3, dtype=[('yop', float), ('yap', float)])
# Transposing turns the two columns into two rows that unpack per field.
expect_missing['yop'], expect_missing['yap'] = expect_missing_raw.T
|
||||
|
||||
|
||||
class TestData(object):
    # Tests for loading the data section of ARFF files with ``loadarff``.

    def test1(self):
        # Parsing trivial file with nothing.
        self._test(test4)

    def test2(self):
        # Parsing trivial file with some comments in the data section.
        self._test(test5)

    def test3(self):
        # Parsing trivial file with nominal attribute of 1 character.
        self._test(test6)

    def _test(self, test_file):
        # Load ``test_file`` and compare it with the module-level references:
        # the four leading numeric fields of every row against
        # ``expect4_data`` and the attribute kinds against ``expected_types``.
        data, meta = loadarff(test_file)
        # Without this check a loader that silently drops rows would leave
        # the loop below with nothing to compare and the test would pass.
        assert_equal(len(data), len(expect4_data))
        for expected_row, row in zip(expect4_data, data):
            for j in range(4):
                assert_array_almost_equal(expected_row[j], row[j])
        assert_equal(meta.types(), expected_types)

    def test_filelike(self):
        # Test reading from file-like object (StringIO)
        # Context managers close the handles even when loadarff raises.
        with open(test1) as f1:
            data1, meta1 = loadarff(f1)
        with open(test1) as f2:
            data2, meta2 = loadarff(StringIO(f2.read()))
        assert_(data1 == data2)
        assert_(repr(meta1) == repr(meta2))

    @pytest.mark.skipif(sys.version_info < (3, 6),
                        reason='Passing path-like objects to IO functions requires Python >= 3.6')
    def test_path(self):
        # Test reading from `pathlib.Path` object
        # pathlib is imported locally so the module still imports on
        # interpreters where this test is skipped.
        from pathlib import Path

        with open(test1) as f1:
            data1, meta1 = loadarff(f1)

        data2, meta2 = loadarff(Path(test1))

        assert_(data1 == data2)
        assert_(repr(meta1) == repr(meta2))
|
||||
|
||||
class TestMissingData(object):
    # Checks how ``loadarff`` reports values that are missing in the file.

    def test_missing(self):
        # Each column of missing.arff must match the reference record array,
        # whose last row is NaN in both fields.
        data, _meta = loadarff(missing)
        for column in ('yop', 'yap'):
            assert_array_almost_equal(data[column], expect_missing[column])
|
||||
|
||||
|
||||
class TestNoData(object):
    # Checks loading a file whose @DATA section holds no rows.

    def test_nodata(self):
        # The file nodata.arff has no data in the @DATA section.
        # Reading it should result in an array with length 0 that still
        # carries one field per declared attribute.
        path = os.path.join(data_path, 'nodata.arff')
        data, _meta = loadarff(path)

        float_fields = ['sepallength', 'sepalwidth', 'petallength',
                        'petalwidth']
        # NOTE(review): '<f8' pins little-endian floats; presumably this
        # comparison is only valid on little-endian hosts -- confirm.
        field_specs = [(field, '<f8') for field in float_fields]
        field_specs.append(('class', 'S15'))
        expected_dtype = np.dtype(field_specs)

        assert_equal(data.dtype, expected_dtype)
        assert_equal(data.size, 0)
|
||||
|
||||
|
||||
class TestHeader(object):
    # Tests for ``read_header`` and ``parse_type`` on the fixture headers.

    @staticmethod
    def _load_header(path):
        # Return ``read_header``'s result for the file at ``path``.  The
        # context manager closes the handle even if read_header raises, which
        # the earlier open()/close() pairs did not guarantee.
        with open(path) as ofile:
            return read_header(ofile)

    def test_type_parsing(self):
        # Test parsing type of attribute from their value.
        rel, attrs = self._load_header(test2)

        expected = ['numeric', 'numeric', 'numeric', 'numeric', 'numeric',
                    'numeric', 'string', 'string', 'nominal', 'nominal']

        # A header that yields fewer attributes than expected must fail
        # instead of shortening the comparison loop.
        assert_equal(len(attrs), len(expected))
        for (name, value), expected_type in zip(attrs, expected):
            assert_(parse_type(value) == expected_type)

    def test_badtype_parsing(self):
        # Test parsing wrong type of attribute from their value.
        rel, attrs = self._load_header(test3)

        for name, value in attrs:
            assert_raises(ParseArffError, parse_type, value)

    def test_fullheader1(self):
        # Parsing trivial header with nothing.
        rel, attrs = self._load_header(test1)

        # Test relation
        assert_(rel == 'test1')

        # Test numerical attributes
        assert_(len(attrs) == 5)
        for i in range(4):
            assert_(attrs[i][0] == 'attr%d' % i)
            assert_(attrs[i][1] == 'REAL')

        # Test nominal attribute
        assert_(attrs[4][0] == 'class')
        assert_(attrs[4][1] == '{class0, class1, class2, class3}')

    def test_dateheader(self):
        # Header with DATE attributes: names and raw type strings must be
        # returned in declaration order.
        rel, attrs = self._load_header(test7)

        assert_(rel == 'test7')

        assert_(len(attrs) == 5)

        assert_(attrs[0][0] == 'attr_year')
        assert_(attrs[0][1] == 'DATE yyyy')

        assert_(attrs[1][0] == 'attr_month')
        assert_(attrs[1][1] == 'DATE yyyy-MM')

        assert_(attrs[2][0] == 'attr_date')
        assert_(attrs[2][1] == 'DATE yyyy-MM-dd')

        assert_(attrs[3][0] == 'attr_datetime_local')
        assert_(attrs[3][1] == 'DATE "yyyy-MM-dd HH:mm"')

        assert_(attrs[4][0] == 'attr_datetime_missing')
        assert_(attrs[4][1] == 'DATE "yyyy-MM-dd HH:mm"')

    def test_dateheader_unsupported(self):
        # Date formats carrying a time zone are still returned verbatim by
        # read_header; only loading the data is expected to fail (see
        # TestDateAttribute.test_datetime_timezone).
        rel, attrs = self._load_header(test8)

        assert_(rel == 'test8')

        assert_(len(attrs) == 2)
        assert_(attrs[0][0] == 'attr_datetime_utc')
        assert_(attrs[0][1] == 'DATE "yyyy-MM-dd HH:mm Z"')

        assert_(attrs[1][0] == 'attr_datetime_full')
        assert_(attrs[1][1] == 'DATE "yy-MM-dd HH:mm:ss z"')
|
||||
|
||||
|
||||
class TestDateAttribute(object):
    # Checks the values ``loadarff`` produces for DATE attributes.

    def setup_method(self):
        # Every test in this class inspects the same loaded fixture.
        self.data, self.meta = loadarff(test7)

    def _check_column(self, column, values, dtype):
        # Compare one loaded column with ``values`` converted to ``dtype``.
        expected = np.array(values, dtype=dtype)
        assert_array_equal(self.data[column], expected)

    def test_year_attribute(self):
        years = ['1999', '2004', '1817', '2100', '2013', '1631']
        self._check_column("attr_year", years, 'datetime64[Y]')

    def test_month_attribute(self):
        months = ['1999-01', '2004-12', '1817-04',
                  '2100-09', '2013-11', '1631-10']
        self._check_column("attr_month", months, 'datetime64[M]')

    def test_date_attribute(self):
        days = ['1999-01-31', '2004-12-01', '1817-04-28',
                '2100-09-10', '2013-11-30', '1631-10-15']
        self._check_column("attr_date", days, 'datetime64[D]')

    def test_datetime_local_attribute(self):
        # Expected values are built from datetime objects here, not from
        # strings, and then converted to minute resolution.
        stamps = [
            datetime.datetime(1999, 1, 31, 0, 1),
            datetime.datetime(2004, 12, 1, 23, 59),
            datetime.datetime(1817, 4, 28, 13, 0),
            datetime.datetime(2100, 9, 10, 12, 0),
            datetime.datetime(2013, 11, 30, 4, 55),
            datetime.datetime(1631, 10, 15, 20, 4),
        ]
        self._check_column("attr_datetime_local", stamps, 'datetime64[m]')

    def test_datetime_missing(self):
        # Rows holding '?' in the fixture are expected as NaT.
        stamps = ['nat', '2004-12-01T23:59', 'nat',
                  'nat', '2013-11-30T04:55', '1631-10-15T20:04']
        self._check_column("attr_datetime_missing", stamps, 'datetime64[m]')

    def test_datetime_timezone(self):
        # Loading the fixture whose date formats carry a time zone must
        # raise ValueError.
        with assert_raises(ValueError):
            loadarff(test8)
|
||||
Reference in New Issue
Block a user