Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/scipy/io/arff/_arffread.py: 23%
373 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-23 06:43 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-23 06:43 +0000
"""A module to read arff files."""
# Last Change: Mon Aug 20 08:00 PM 2007 J

# NOTE: the docstring must be the first statement of the module so that it
# actually becomes ``__doc__``; it previously sat below the imports where it
# was a no-op string expression.
import csv
import ctypes
import datetime
import re

import numpy as np

__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
14# An Arff file is basically two parts:
15# - header
16# - data
17#
# A header has each of its components starting by @META where META is one of
# the keywords (attribute or relation, for now).
21# TODO:
22# - both integer and reals are treated as numeric -> the integer info
23# is lost!
24# - Replace ValueError by ParseError or something
# We now can handle the following:
27# - numeric and nominal attributes
28# - missing values for numeric attributes
# Match any line whose first non-blank character is '@' (a header marker)
r_meta = re.compile(r'^\s*@')
# Match a comment
r_comment = re.compile(r'^%')
# Match an empty line (note: requires at least one whitespace character,
# so a zero-length string does NOT match)
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
# Case-insensitive @data / @relation / @attribute markers; group 1 of
# r_relation is the relation name, group 1 of r_attribute is everything
# after the keyword (name + type declaration)
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')

# Group 1 captures the comma-separated body of a nominal definition {...}
r_nominal = re.compile(r'{(.+)}')
# Group 1 captures the (optionally quoted) date format string
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")

# To get attributes name enclosed with ''
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
49# ------------------------
50# Module defined exception
51# ------------------------
class ArffError(OSError):
    """Base class for all errors raised while reading an ARFF file."""
    pass
class ParseArffError(ArffError):
    """Error raised when the ARFF header or data section cannot be parsed."""
    pass
62# ----------
63# Attributes
64# ----------
class Attribute:
    """Abstract base class for one ARFF attribute (a header column).

    Concrete subclasses recognize their own declaration syntax via
    ``parse_attribute`` and convert raw data tokens via ``parse_data``.
    """

    # Overridden by subclasses, e.g. 'nominal' or 'date'.
    type_name = None

    def __init__(self, name):
        self.name = name
        # Generic defaults; subclasses refine both of these.
        self.dtype = np.object_
        self.range = None

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Try to parse ``attr_string`` as this attribute type.

        Returns the parsed attribute, or None when the declaration is not
        recognized by this class.
        """
        return None

    def parse_data(self, data_str):
        """Convert one raw data token into this attribute's value type."""
        return None

    def __str__(self):
        """Summary in the form '<name>,<type_name>'."""
        return self.name + ',' + self.type_name
class NominalAttribute(Attribute):
    """ARFF nominal attribute: a closed set of admissible string values."""

    type_name = 'nominal'

    def __init__(self, name, values):
        super().__init__(name)
        self.values = values
        self.range = values
        # Fixed-width byte strings, wide enough for the longest value.
        self.dtype = (np.bytes_, max(len(i) for i in values))

    @staticmethod
    def _get_nom_val(atrv):
        """Given a string containing a nominal type, returns a tuple of the
        possible values.

        A nominal type is defined as something framed between braces ({}).

        Parameters
        ----------
        atrv : str
            Nominal type definition

        Returns
        -------
        poss_vals : tuple
            possible values

        Examples
        --------
        >>> from scipy.io.arff._arffread import NominalAttribute
        >>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
        ('floup', 'bouga', 'fl', 'ratata')
        """
        m = r_nominal.match(atrv)
        if m is None:
            raise ValueError("This does not look like a nominal string")
        # Reuse the CSV splitter so quoting inside the braces is honored.
        parsed, _ = split_data_line(m.group(1))
        return tuple(parsed)

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a NominalAttribute when ``attr_string`` looks nominal.

        For nominal attributes, the attribute string is of the form
        '{<attr_1>, <attr_2>, <attr_3>}'; anything else yields None.
        """
        if attr_string[0] != '{':
            return None
        return cls(name, cls._get_nom_val(attr_string))

    def parse_data(self, data_str):
        """Validate ``data_str``: must be a declared value or missing ('?')."""
        if data_str == '?' or data_str in self.values:
            return data_str
        raise ValueError(f"{data_str} value not in {self.values}")

    def __str__(self):
        return self.name + ",{" + ",".join(self.values) + "}"
class NumericAttribute(Attribute):
    """ARFF numeric attribute; all values parse to float64.

    Both the 'integer' and 'real' ARFF declarations map here, so the
    integer/real distinction of the format is lost.
    """

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'numeric'
        self.dtype = np.float64

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For numeric attributes, the attribute string would be like
        'numeric' or 'int' or 'real'.
        """
        attr_string = attr_string.lower().strip()

        # startswith with a tuple replaces the previous chain of
        # slice-comparisons (attr_string[:len('numeric')] == 'numeric', ...).
        if attr_string.startswith(('numeric', 'int', 'real')):
            return cls(name)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Parameters
        ----------
        data_str : str
            string to convert

        Returns
        -------
        f : float
            where float can be nan

        Examples
        --------
        >>> from scipy.io.arff._arffread import NumericAttribute
        >>> atr = NumericAttribute('atr')
        >>> atr.parse_data('1')
        1.0
        >>> atr.parse_data('1\\n')
        1.0
        >>> atr.parse_data('?\\n')
        nan
        """
        # A '?' anywhere in the token marks a missing value.
        if '?' in data_str:
            return np.nan
        else:
            return float(data_str)

    def _basic_stats(self, data):
        # Sample-statistics correction factor n/(n-1); note this raises
        # ZeroDivisionError for size-1 arrays (unchanged historical behavior).
        nbfac = data.size * 1. / (data.size - 1)
        return (np.nanmin(data), np.nanmax(data),
                np.mean(data), np.std(data) * nbfac)
class StringAttribute(Attribute):
    """ARFF string attribute (recognized by the parser, but loadarff
    rejects files containing one)."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'string'

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a StringAttribute when ``attr_string`` starts with
        'string' (case-insensitive), else None."""
        normalized = attr_string.lower().strip()
        return cls(name) if normalized.startswith('string') else None
class DateAttribute(Attribute):
    """ARFF date attribute.

    Values are parsed with a Java SimpleDateFormat-style pattern translated
    to a C strptime format, and returned as numpy.datetime64 scalars at the
    finest unit present in the pattern.
    """

    def __init__(self, name, date_format, datetime_unit):
        super().__init__(name)
        self.date_format = date_format
        self.datetime_unit = datetime_unit
        self.type_name = 'date'
        self.range = date_format
        self.dtype = np.datetime64(0, self.datetime_unit)

    @staticmethod
    def _get_date_format(atrv):
        """Translate the Java SimpleDateFormat pattern in ``atrv`` to a
        strptime-style format.

        Returns
        -------
        pattern : str
            The converted format string.
        datetime_unit : str
            The finest numpy datetime64 unit found ('Y', 'M', 'D', 'h',
            'm' or 's').

        Raises
        ------
        ValueError
            If no date declaration is found, the pattern contains no
            recognized field, or it contains a time zone (unsupported).
        """
        m = r_date.match(atrv)
        if m:
            pattern = m.group(1).strip()
            # convert time pattern from Java's SimpleDateFormat to C's format
            datetime_unit = None
            if "yyyy" in pattern:
                pattern = pattern.replace("yyyy", "%Y")
                datetime_unit = "Y"
            elif "yy" in pattern:
                # Bug fix: this condition was `elif "yy":`, which is always
                # true, so a pattern without any recognized field silently
                # got unit 'Y' instead of raising ValueError below.
                pattern = pattern.replace("yy", "%y")
                datetime_unit = "Y"
            if "MM" in pattern:
                pattern = pattern.replace("MM", "%m")
                datetime_unit = "M"
            if "dd" in pattern:
                pattern = pattern.replace("dd", "%d")
                datetime_unit = "D"
            if "HH" in pattern:
                pattern = pattern.replace("HH", "%H")
                datetime_unit = "h"
            if "mm" in pattern:
                pattern = pattern.replace("mm", "%M")
                datetime_unit = "m"
            if "ss" in pattern:
                pattern = pattern.replace("ss", "%S")
                datetime_unit = "s"
            if "z" in pattern or "Z" in pattern:
                raise ValueError("Date type attributes with time zone not "
                                 "supported, yet")

            if datetime_unit is None:
                raise ValueError("Invalid or unsupported date format")

            return pattern, datetime_unit
        else:
            raise ValueError("Invalid or no date format")

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For date attributes, the attribute string would be like
        'date <format>'.
        """
        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('date')] == 'date':
            # The original (un-lowercased) string is passed on so the
            # format's case is preserved.
            date_format, datetime_unit = cls._get_date_format(attr_string)
            return cls(name, date_format, datetime_unit)
        else:
            return None

    def parse_data(self, data_str):
        """Parse one date token; a '?' token maps to NaT."""
        date_str = data_str.strip().strip("'").strip('"')
        if date_str == '?':
            return np.datetime64('NaT', self.datetime_unit)
        else:
            dt = datetime.datetime.strptime(date_str, self.date_format)
            return np.datetime64(dt).astype(
                "datetime64[%s]" % self.datetime_unit)

    def __str__(self):
        return super().__str__() + ',' + self.date_format
class RelationalAttribute(Attribute):
    """ARFF relational attribute: a nested sequence of sub-attributes."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'relational'
        self.dtype = np.object_
        # Filled in by read_relational_attribute while parsing the header.
        self.attributes = []
        # CSV dialect sniffed from the first nested row, then reused.
        self.dialect = None

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a RelationalAttribute when ``attr_string`` starts with
        'relational' (case-insensitive), else None."""
        if attr_string.lower().strip().startswith('relational'):
            return cls(name)
        return None

    def parse_data(self, data_str):
        """Parse a relational value: an escaped, newline-separated block of
        CSV rows, one per nested record."""
        ncols = len(self.attributes)
        unescaped = data_str.encode().decode("unicode-escape")

        records = []
        for raw in unescaped.split("\n"):
            row, self.dialect = split_data_line(raw, self.dialect)
            records.append(tuple(self.attributes[i].parse_data(row[i])
                                 for i in range(ncols)))

        return np.array(records,
                        [(a.name, a.dtype) for a in self.attributes])

    def __str__(self):
        nested = '\n\t'.join(str(a) for a in self.attributes)
        return super().__str__() + '\n\t' + nested
388# -----------------
389# Various utilities
390# -----------------
def to_attribute(name, attr_string):
    """Return the first Attribute subclass that recognizes ``attr_string``.

    Raises ParseArffError when no known attribute type matches.
    """
    for attr_class in (NominalAttribute, NumericAttribute, DateAttribute,
                       StringAttribute, RelationalAttribute):
        parsed = attr_class.parse_attribute(name, attr_string)
        if parsed is not None:
            return parsed

    raise ParseArffError("unknown attribute %s" % attr_string)
def csv_sniffer_has_bug_last_field():
    """
    Checks if the bug https://bugs.python.org/issue30157 is unpatched.
    """
    # Computed once and memoized on the function object itself.
    cached = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)

    if cached is None:
        # An unpatched Sniffer mis-detects the quote character when the
        # quoted field is the last one on the line.
        sniffed = csv.Sniffer().sniff("3, 'a'")
        cached = sniffed.quotechar != "'"
        csv_sniffer_has_bug_last_field.has_bug = cached

    return cached
def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
    """
    Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.

    When the Sniffer bug is present, re-detect quote character, delimiter,
    doublequoting and initial-space handling for lines whose quoted field
    is the *last* field, and patch ``dialect`` in place.
    """
    if csv_sniffer_has_bug_last_field():
        # Reuses code from the csv module
        right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # .*?",
                      right_regex, # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(sniff_line)
            if matches:
                break

        # If it does not match the expression that was bugged, then this bug does not apply
        if restr != right_regex:
            return

        groupindex = regexp.groupindex

        # There is only one end of the string
        assert len(matches) == 1
        m = matches[0]

        # findall() returned tuples of captured groups; groupindex is
        # 1-based while tuple indices are 0-based.
        n = groupindex['quote'] - 1
        quote = m[n]

        n = groupindex['delim'] - 1
        delim = m[n]

        n = groupindex['space'] - 1
        space = bool(m[n])

        # A doubled quote character inside the field indicates
        # doublequote-style escaping.
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
            {'delim': re.escape(delim), 'quote': quote}, re.MULTILINE
        )

        doublequote = bool(dq_regexp.search(sniff_line))

        dialect.quotechar = quote
        if delim in delimiters:
            dialect.delimiter = delim
        dialect.doublequote = doublequote
        dialect.skipinitialspace = space
def split_data_line(line, dialect=None):
    """Split one ARFF data line into fields using the csv module.

    Parameters
    ----------
    line : str
        Raw line from the data section (may end with a newline).
    dialect : csv.Dialect, optional
        Previously sniffed dialect to reuse; when None, the dialect is
        sniffed from this line.

    Returns
    -------
    row : list of str
        The parsed fields.
    dialect : csv.Dialect
        The dialect used, so callers can reuse it for subsequent lines.
    """
    delimiters = ",\t"

    # This can not be done in a per reader basis, and relational fields
    # can be HUGE
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

    # Remove the line end if any.  endswith avoids the IndexError that
    # `line[-1]` raised for an empty string (reachable when a relational
    # value ends with a newline).
    if line.endswith('\n'):
        line = line[:-1]

    # Remove potential trailing whitespace
    line = line.strip()

    sniff_line = line

    # Add a delimiter if none is present, so that the csv.Sniffer
    # does not complain for a single-field CSV.
    if not any(d in line for d in delimiters):
        sniff_line += ","

    if dialect is None:
        dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
        workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
                                              dialect=dialect,
                                              delimiters=delimiters)

    row = next(csv.reader([line], dialect))

    return row, dialect
501# --------------
502# Parsing header
503# --------------
def tokenize_attribute(iterable, attribute):
    """Parse a raw header line that starts with @attribute.

    Constraints:

    * The line must start with @attribute (case insensitive; leading
      whitespace is allowed).
    * Both quoted names ('my attr') and plain names are handled.

    Parameters
    ----------
    iterable : iterator of str
        Source of header lines; advanced by one line here (and further for
        relational attributes).
    attribute : str
        The raw attribute line.

    Returns
    -------
    attribute : Attribute
        The parsed attribute object (see ``to_attribute``).
    next : str
        The next line to be parsed.
    """
    stripped = attribute.strip()
    match = r_attribute.match(stripped)
    if not match:
        raise ValueError("First line unparsable: %s" % stripped)

    # Everything after the @attribute keyword: name + type declaration.
    rest = match.group(1)
    if r_comattrval.match(rest):
        name, attr_type = tokenize_single_comma(rest)
    elif r_wcomattrval.match(rest):
        name, attr_type = tokenize_single_wcomma(rest)
    else:
        # Not sure we should support this, as it does not seem supported by
        # weka.
        raise ValueError("multi line not supported yet")
    next_item = next(iterable)

    attribute = to_attribute(name, attr_type)

    # Relational attributes span several lines, up to '@end <name>'.
    if attr_type.lower() == 'relational':
        next_item = read_relational_attribute(iterable, attribute, next_item)

    return attribute, next_item
def tokenize_single_comma(val):
    """Split a quoted-name declaration "'name' type" into (name, type).

    Raises ValueError when ``val`` does not match ``r_comattrval``.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_comattrval.match(val)
    if m:
        try:
            name = m.group(1).strip()
            # renamed from `type`, which shadowed the builtin
            attr_type = m.group(2).strip()
        except IndexError as e:
            raise ValueError("Error while tokenizing attribute") from e
    else:
        raise ValueError("Error while tokenizing single %s" % val)
    return name, attr_type
def tokenize_single_wcomma(val):
    """Split an unquoted-name declaration "name type" into (name, type).

    Raises ValueError when ``val`` does not match ``r_wcomattrval``.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_wcomattrval.match(val)
    if m:
        try:
            name = m.group(1).strip()
            # renamed from `type`, which shadowed the builtin
            attr_type = m.group(2).strip()
        except IndexError as e:
            raise ValueError("Error while tokenizing attribute") from e
    else:
        raise ValueError("Error while tokenizing single %s" % val)
    return name, attr_type
def read_relational_attribute(ofile, relational_attribute, i):
    """Read the nested attributes of a relational attribute.

    Consumes lines from ``ofile`` until '@end <name>' is seen, appending
    each nested '@attribute' to ``relational_attribute.attributes``.
    Returns the first line after the '@end' marker.
    """
    # re.escape the attribute name: a name containing regex metacharacters
    # previously corrupted the end-marker pattern (or raised re.error).
    r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
                                  re.escape(relational_attribute.name) +
                                  r'\s*$')

    while not r_end_relational.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                attr, i = tokenize_attribute(ofile, i)
                relational_attribute.attributes.append(attr)
            else:
                raise ValueError("Error parsing line %s" % i)
        else:
            i = next(ofile)

    i = next(ofile)
    return i
def read_header(ofile):
    """Read the header of the iterable ofile.

    Returns (relation_name, list_of_attributes); stops at the @data marker.
    """
    i = next(ofile)

    # Skip any leading comment lines.
    while r_comment.match(i):
        i = next(ofile)

    # Everything up to the @data marker belongs to the header.
    relation = None
    attributes = []
    while not r_datameta.match(i):
        if not r_headerline.match(i):
            # Blank/comment line inside the header: just advance.
            i = next(ofile)
            continue

        if r_attribute.match(i):
            # tokenize_attribute consumes lines itself and hands back
            # the next one to process.
            attr, i = tokenize_attribute(ofile, i)
            attributes.append(attr)
            continue

        rel_match = r_relation.match(i)
        if rel_match is None:
            raise ValueError("Error parsing line %s" % i)
        relation = rel_match.group(1)
        i = next(ofile)

    return relation, attributes
class MetaData:
    """Small container to keep useful information on a ARFF dataset.

    Knows about attributes names and types.

    Examples
    --------
    ::

        data, meta = loadarff('iris.arff')
        # This will print the attributes names of the iris.arff dataset
        for i in meta:
            print(i)
        # This works too
        meta.names()
        # Getting attribute type
        types = meta.types()

    Methods
    -------
    names
    types

    Notes
    -----
    Also maintains the list of attributes in order, i.e., doing for i in
    meta, where meta is an instance of MetaData, will return the
    different attribute names in the order they were defined.
    """

    def __init__(self, rel, attr):
        self.name = rel
        # Insertion-ordered mapping: attribute name -> Attribute object.
        self._attributes = {a.name: a for a in attr}

    def __repr__(self):
        parts = ["Dataset: %s\n" % self.name]
        for attr_name, attribute in self._attributes.items():
            entry = f"\t{attr_name}'s type is {attribute.type_name}"
            if attribute.range:
                entry += ", range is %s" % str(attribute.range)
            parts.append(entry + '\n')
        return "".join(parts)

    def __iter__(self):
        return iter(self._attributes)

    def __getitem__(self, key):
        attribute = self._attributes[key]
        return (attribute.type_name, attribute.range)

    def names(self):
        """Return the list of attribute names.

        Returns
        -------
        attrnames : list of str
            The attribute names.
        """
        return list(self._attributes)

    def types(self):
        """Return the list of attribute types.

        Returns
        -------
        attr_types : list of str
            The attribute types.
        """
        return [attribute.type_name
                for attribute in self._attributes.values()]
def loadarff(f):
    """
    Read an arff file.

    The data is returned as a record array, which can be accessed much like
    a dictionary of NumPy arrays. For example, if one of the attributes is
    called 'pressure', then its first 10 data points can be accessed from the
    ``data`` record array like so: ``data['pressure'][0:10]``

    Parameters
    ----------
    f : file-like or str
        File-like object to read from, or filename to open.

    Returns
    -------
    data : record array
        The data of the arff file, accessible by attribute names.
    meta : `MetaData`
        Contains information about the arff file such as name and
        type of attributes, the relation (name of the dataset), etc.

    Raises
    ------
    ParseArffError
        This is raised if the given file is not ARFF-formatted.
    NotImplementedError
        The ARFF file has an attribute which is not supported yet.

    Notes
    -----
    This function should be able to read most arff files. Not
    implemented functionality include:

    * string type attributes

    It can read files with numeric, nominal and date attributes. It cannot
    read files with sparse data ({} in the file). However, this function can
    read files with missing data (? in the file), representing the data
    points as NaNs.

    Examples
    --------
    >>> from scipy.io import arff
    >>> from io import StringIO
    >>> content = \"\"\"
    ... @relation foo
    ... @attribute width numeric
    ... @attribute height numeric
    ... @attribute color {red,green,blue,yellow,black}
    ... @data
    ... 5.0,3.25,blue
    ... 4.5,3.75,green
    ... 3.0,4.00,red
    ... \"\"\"
    >>> f = StringIO(content)
    >>> data, meta = arff.loadarff(f)
    >>> data
    array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
          dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
    >>> meta
    Dataset: foo
    \twidth's type is numeric
    \theight's type is numeric
    \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')

    """
    # Accept either an already-open file-like object or a path to open.
    if hasattr(f, 'read'):
        ofile = f
    else:
        ofile = open(f)
    try:
        return _loadarff(ofile)
    finally:
        if ofile is not f:  # only close what we opened
            ofile.close()
def _loadarff(ofile):
    """Parse an open ARFF file object into ``(data, MetaData)``."""
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg) from e

    # Check whether we have a string attribute (not supported yet)
    hasstr = False
    for a in attr:
        if isinstance(a, StringAttribute):
            hasstr = True

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(attr)

    def generator(row_iter, delim=','):
        # TODO: this is where we are spending time (~80%). I think things
        # could be made more efficiently:
        #   - We could for example "compile" the function, because some values
        #     do not change here.
        #   - The function to convert a line to dtyped values could also be
        #     generated on the fly from a string and be executed instead of
        #     looping.
        #   - The regex are overkill: for comments, checking that a line starts
        #     by % should be enough and faster, and for empty lines, same thing
        #     --> this does not seem to change anything.

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # The csv dialect sniffed from the first data row is reused for
        # all subsequent rows.
        dialect = None
        for raw in row_iter:
            # We do not abstract skipping comments and empty lines for
            # performance reasons.
            if r_comment.match(raw) or r_empty.match(raw):
                continue

            row, dialect = split_data_line(raw, dialect)

            yield tuple([attr[i].parse_data(row[i]) for i in elems])

    a = list(generator(ofile))
    # No error should happen here: it is a bug otherwise
    data = np.array(a, [(a.name, a.dtype) for a in attr])
    return data, meta
876# ----
877# Misc
878# ----
def basic_stats(data):
    """Return (min, max, mean, std * n/(n-1)) for ``data``.

    min/max ignore NaNs; the std is scaled by n/(n-1). Raises
    ZeroDivisionError for size-1 arrays (historical behavior).
    """
    correction = data.size * 1. / (data.size - 1)
    return (np.nanmin(data), np.nanmax(data),
            np.mean(data), np.std(data) * correction)
def print_attribute(name, tp, data):
    """Print a one-line summary of attribute ``name``.

    Numeric attributes get min/max/mean/std computed from ``data``; any
    other type is printed via ``str(tp)``.
    """
    # locals renamed: `type`, `min`, `max` previously shadowed builtins
    type_name = tp.type_name
    if type_name in ('numeric', 'real', 'integer'):
        vmin, vmax, mean, std = basic_stats(data)
        print(f"{name},{type_name},{vmin:f},{vmax:f},{mean:f},{std:f}")
    else:
        print(str(tp))
def test_weka(filename):
    """Load ``filename`` and print a per-attribute summary (debug helper)."""
    data, meta = loadarff(filename)
    print(len(data.dtype))
    print(data.size)
    for attr_name in meta:
        print_attribute(attr_name, meta[attr_name], data[attr_name])
# make sure nose does not find this as a test
test_weka.__test__ = False


if __name__ == '__main__':
    # CLI usage: python _arffread.py <arff-file> — prints a dataset summary.
    import sys
    filename = sys.argv[1]
    test_weka(filename)