Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/scipy/io/arff/_arffread.py: 23%
373 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-03-22 06:44 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-03-22 06:44 +0000
1# Last Change: Mon Aug 20 08:00 PM 2007 J
2import re
3import datetime
5import numpy as np
7import csv
8import ctypes
10"""A module to read arff files."""
12__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
14# An Arff file is basically two parts:
15# - header
16# - data
17#
18# A header has each of its components starting by @META where META is one of
19# the keyword (attribute of relation, for now).
21# TODO:
22# - both integer and reals are treated as numeric -> the integer info
23# is lost!
24# - Replace ValueError by ParseError or something
26# We know can handle the following:
27# - numeric and nominal attributes
28# - missing values for numeric attributes
# Regexes used to tokenize the ARFF header and data sections.

# Match any meta line (first non-blank character is '@')
r_meta = re.compile(r'^\s*@')
# Match a comment
r_comment = re.compile(r'^%')
# Match an empty line
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
# Case-insensitive @data / @relation / @attribute markers
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')

# Nominal attribute values are framed between braces, e.g. {a,b,c}
r_nominal = re.compile(r'{(.+)}')
# Date attribute: 'date <fmt>', format optionally quoted
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")

# To get attributes name enclosed with ''
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
49# ------------------------
50# Module defined exception
51# ------------------------
class ArffError(OSError):
    """Base error raised for problems encountered while reading an ARFF file."""
    pass
class ParseArffError(ArffError):
    """Error raised when the ARFF header or data cannot be parsed."""
    pass
62# ----------
63# Attributes
64# ----------
class Attribute:
    """Base class for ARFF attribute types.

    Subclasses override ``parse_attribute`` to recognise their own
    declaration syntax and ``parse_data`` to convert a single raw field.
    """

    # Overridden by subclasses with the ARFF type name (e.g. 'numeric').
    type_name = None

    def __init__(self, name):
        self.name = name
        self.range = None
        self.dtype = np.object_

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how.

        Returns the parsed attribute, or None when the declaration is not
        of this type.
        """
        return None

    def parse_data(self, data_str):
        """
        Parse a single raw data field of this type.
        """
        return None

    def __str__(self):
        """
        Return a short 'name,type' description of the attribute.
        """
        return self.name + ',' + self.type_name
class NominalAttribute(Attribute):

    type_name = 'nominal'

    def __init__(self, name, values):
        """
        Parameters
        ----------
        name : str
            Attribute name.
        values : tuple of str
            The allowed nominal values.
        """
        super().__init__(name)
        self.values = values
        self.range = values
        # Fixed-width byte-string dtype wide enough for the longest value.
        self.dtype = (np.bytes_, max(len(i) for i in values))

    @staticmethod
    def _get_nom_val(atrv):
        """Given a string containing a nominal type, returns a tuple of the
        possible values.

        A nominal type is defined as something framed between braces ({}).

        Parameters
        ----------
        atrv : str
            Nominal type definition

        Returns
        -------
        poss_vals : tuple
            possible values

        Examples
        --------
        >>> from scipy.io.arff._arffread import NominalAttribute
        >>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
        ('floup', 'bouga', 'fl', 'ratata')
        """
        m = r_nominal.match(atrv)
        if not m:
            raise ValueError("This does not look like a nominal string")
        # Reuse the CSV splitter so quoted values containing commas work.
        attrs, _ = split_data_line(m.group(1))
        return tuple(attrs)

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For nominal attributes, the attribute string would be like '{<attr_1>,
        <attr2>, <attr_3>}'.
        """
        if attr_string[0] == '{':
            values = cls._get_nom_val(attr_string)
            return cls(name, values)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Raises
        ------
        ValueError
            If ``data_str`` is neither a declared value nor the missing
            marker ``'?'``.
        """
        if data_str in self.values or data_str == '?':
            return data_str
        raise ValueError(f"{data_str} value not in {self.values}")

    def __str__(self):
        # 'name,{v1,v2,...}' -- join replaces the original manual
        # concatenation loop (and no longer breaks on an empty tuple).
        return self.name + ",{" + ",".join(self.values) + "}"
class NumericAttribute(Attribute):
    """Numeric (real/integer) ARFF attribute; values parse to float64."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'numeric'
        self.dtype = np.float64

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For numeric attributes, the attribute string would be like
        'numeric' or 'int' or 'real'.
        """
        attr_string = attr_string.lower().strip()

        # str.startswith accepts a tuple of prefixes, which is clearer
        # than the original three slice comparisons.
        if attr_string.startswith(('numeric', 'int', 'real')):
            return cls(name)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Parameters
        ----------
        data_str : str
            string to convert

        Returns
        -------
        f : float
            where float can be nan

        Examples
        --------
        >>> from scipy.io.arff._arffread import NumericAttribute
        >>> atr = NumericAttribute('atr')
        >>> atr.parse_data('1')
        1.0
        >>> atr.parse_data('1\\n')
        1.0
        >>> atr.parse_data('?\\n')
        nan
        """
        # '?' anywhere in the field marks a missing value.
        if '?' in data_str:
            return np.nan
        else:
            return float(data_str)

    def _basic_stats(self, data):
        # n/(n-1) bias-correction factor applied to the std below.
        nbfac = data.size * 1. / (data.size - 1)
        return (np.nanmin(data), np.nanmax(data),
                np.mean(data), np.std(data) * nbfac)
class StringAttribute(Attribute):
    """String ARFF attribute (declaration only; loading raises elsewhere)."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'string'

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For string attributes, the attribute string would be like
        'string'.
        """
        attr_string = attr_string.lower().strip()

        # startswith replaces the original slice comparison.
        if attr_string.startswith('string'):
            return cls(name)
        else:
            return None
class DateAttribute(Attribute):

    def __init__(self, name, date_format, datetime_unit):
        """
        Parameters
        ----------
        name : str
            Attribute name.
        date_format : str
            strptime-style format (converted from the ARFF declaration).
        datetime_unit : str
            NumPy datetime64 unit code: 'Y', 'M', 'D', 'h', 'm' or 's'.
        """
        super().__init__(name)
        self.date_format = date_format
        self.datetime_unit = datetime_unit
        self.type_name = 'date'
        self.range = date_format
        self.dtype = np.datetime64(0, self.datetime_unit)

    @staticmethod
    def _get_date_format(atrv):
        """Convert a Java SimpleDateFormat pattern to a strptime format.

        Returns
        -------
        pattern : str
            The converted format string.
        datetime_unit : str
            The finest datetime64 unit present in the pattern.

        Raises
        ------
        ValueError
            If the pattern contains a time zone, contains none of the
            supported tokens, or is not a date declaration at all.
        """
        m = r_date.match(atrv)
        if m:
            pattern = m.group(1).strip()
            # convert time pattern from Java's SimpleDateFormat to C's format
            datetime_unit = None
            if "yyyy" in pattern:
                pattern = pattern.replace("yyyy", "%Y")
                datetime_unit = "Y"
            elif "yy" in pattern:
                # Bug fix: this branch used to read `elif "yy":`, which is
                # always truthy, so patterns without any year token were
                # silently given datetime_unit = "Y" and the "Invalid or
                # unsupported date format" rejection below was unreachable.
                pattern = pattern.replace("yy", "%y")
                datetime_unit = "Y"
            if "MM" in pattern:
                pattern = pattern.replace("MM", "%m")
                datetime_unit = "M"
            if "dd" in pattern:
                pattern = pattern.replace("dd", "%d")
                datetime_unit = "D"
            if "HH" in pattern:
                pattern = pattern.replace("HH", "%H")
                datetime_unit = "h"
            if "mm" in pattern:
                pattern = pattern.replace("mm", "%M")
                datetime_unit = "m"
            if "ss" in pattern:
                pattern = pattern.replace("ss", "%S")
                datetime_unit = "s"
            if "z" in pattern or "Z" in pattern:
                raise ValueError("Date type attributes with time zone not "
                                 "supported, yet")

            if datetime_unit is None:
                raise ValueError("Invalid or unsupported date format")

            return pattern, datetime_unit
        else:
            raise ValueError("Invalid or no date format")

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For date attributes, the attribute string would be like
        'date <format>'.
        """
        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('date')] == 'date':
            date_format, datetime_unit = cls._get_date_format(attr_string)
            return cls(name, date_format, datetime_unit)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.  '?' yields NaT; anything else is
        parsed with the stored strptime format.
        """
        date_str = data_str.strip().strip("'").strip('"')
        if date_str == '?':
            return np.datetime64('NaT', self.datetime_unit)
        else:
            dt = datetime.datetime.strptime(date_str, self.date_format)
            return np.datetime64(dt).astype(
                "datetime64[%s]" % self.datetime_unit)

    def __str__(self):
        return super().__str__() + ',' + self.date_format
class RelationalAttribute(Attribute):
    """Relational ARFF attribute: a nested table of sub-attributes.

    Each data field holds an escaped, newline-separated mini-CSV that is
    parsed with the sub-attributes collected during header parsing.
    """

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'relational'
        self.dtype = np.object_
        # Nested Attribute instances, filled in by read_relational_attribute.
        self.attributes = []
        # CSV dialect sniffed from the first row, then reused for the rest.
        self.dialect = None

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For relational attributes, the attribute string would be like
        'relational'.
        """
        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('relational')] == 'relational':
            return cls(name)
        else:
            return None

    def parse_data(self, data_str):
        # Copy-pasted (same shape as the generator in _loadarff).
        elems = list(range(len(self.attributes)))

        # The field stores newlines as the two-character sequence '\n';
        # unicode-escape decoding restores real line breaks.
        escaped_string = data_str.encode().decode("unicode-escape")

        row_tuples = []

        for raw in escaped_string.split("\n"):
            row, self.dialect = split_data_line(raw, self.dialect)

            row_tuples.append(tuple(
                [self.attributes[i].parse_data(row[i]) for i in elems]))

        # Structured array: one field per nested attribute.
        return np.array(row_tuples,
                        [(a.name, a.dtype) for a in self.attributes])

    def __str__(self):
        return (super().__str__() + '\n\t' +
                '\n\t'.join(str(a) for a in self.attributes))
387# -----------------
388# Various utilities
389# -----------------
def to_attribute(name, attr_string):
    """Build the appropriate Attribute subclass for a declaration string.

    Each known attribute class is given a chance to parse the string;
    the first one that recognises it wins.

    Raises
    ------
    ParseArffError
        If no attribute class recognises ``attr_string``.
    """
    candidate_classes = (NominalAttribute, NumericAttribute, DateAttribute,
                         StringAttribute, RelationalAttribute)

    parsed = next(
        (attr
         for attr in (klass.parse_attribute(name, attr_string)
                      for klass in candidate_classes)
         if attr is not None),
        None)

    if parsed is None:
        raise ParseArffError("unknown attribute %s" % attr_string)
    return parsed
def csv_sniffer_has_bug_last_field():
    """
    Checks if the bug https://bugs.python.org/issue30157 is unpatched.

    The result is computed once and cached as an attribute on the
    function object itself.
    """
    cached = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)
    if cached is not None:
        return cached

    # The buggy Sniffer mis-detects the quote character of a trailing
    # quoted field; a patched Python reports "'" for this sample.
    dialect = csv.Sniffer().sniff("3, 'a'")
    cached = dialect.quotechar != "'"
    csv_sniffer_has_bug_last_field.has_bug = cached
    return cached
def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
    """
    Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.

    The buggy csv.Sniffer mis-detects the quoting when the quoted field is
    the last one on the line.  This re-runs the sniffer's own regex cascade
    and, only when the known-buggy "last field" expression is the one that
    matched, overwrites quotechar/delimiter/doublequote/skipinitialspace on
    *dialect* in place.
    """
    if csv_sniffer_has_bug_last_field():
        # Reuses code from the csv module
        right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'  # noqa: E501

        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",  # noqa: E501
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',  # ".*?",  # noqa: E501
                      right_regex,  # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):  # ".*?" (no delim, no space)  # noqa: E501
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(sniff_line)
            if matches:
                break

        # If it does not match the expression that was bugged,
        # then this bug does not apply
        if restr != right_regex:
            return

        groupindex = regexp.groupindex

        # There is only one end of the string
        assert len(matches) == 1
        m = matches[0]

        # findall returned tuples of group values; groupindex is 1-based.
        n = groupindex['quote'] - 1
        quote = m[n]

        n = groupindex['delim'] - 1
        delim = m[n]

        n = groupindex['space'] - 1
        space = bool(m[n])

        dq_regexp = re.compile(
            rf"(({re.escape(delim)})|^)\W*{quote}[^{re.escape(delim)}\n]*{quote}[^{re.escape(delim)}\n]*{quote}\W*(({re.escape(delim)})|$)", re.MULTILINE  # noqa: E501
        )

        # doublequote: a doubled quote inside the field escapes a quote.
        doublequote = bool(dq_regexp.search(sniff_line))

        dialect.quotechar = quote
        if delim in delimiters:
            dialect.delimiter = delim
        dialect.doublequote = doublequote
        dialect.skipinitialspace = space
def split_data_line(line, dialect=None):
    """Split one raw ARFF data line into fields using the csv module.

    Parameters
    ----------
    line : str
        Raw line (possibly ending in a newline).
    dialect : csv.Dialect, optional
        Previously sniffed dialect to reuse; sniffed from *line* when None.

    Returns
    -------
    row : list of str
        The parsed fields.
    dialect : csv.Dialect
        The dialect used, so callers can reuse it for subsequent lines.
    """
    delimiters = ",\t"

    # This can not be done in a per reader basis, and relational fields
    # can be HUGE
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

    # Remove the line end if any.  Guarded with endswith(): the original
    # `line[-1]` raised IndexError on an empty string (e.g. the trailing
    # piece produced by RelationalAttribute splitting a field on '\n').
    if line.endswith('\n'):
        line = line[:-1]

    # Remove potential trailing whitespace
    line = line.strip()

    sniff_line = line

    # Add a delimiter if none is present, so that the csv.Sniffer
    # does not complain for a single-field CSV.
    if not any(d in line for d in delimiters):
        sniff_line += ","

    if dialect is None:
        dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
        workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
                                              dialect=dialect,
                                              delimiters=delimiters)

    row = next(csv.reader([line], dialect))

    return row, dialect
500# --------------
501# Parsing header
502# --------------
def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (e.g., starts by @attribute).

    Given a raw string attribute, try to get the name and type of the
    attribute. Constraints:

    * The first line must start with @attribute (case insensitive, and
      space like characters before @attribute are allowed)
    * Works also if the attribute is spread on multilines.
    * Works if empty lines or comments are in between

    Parameters
    ----------
    iterable : iterator of str
        The remaining header lines; advanced past the declaration.
    attribute : str
        the attribute line.

    Returns
    -------
    attribute : Attribute
        the parsed attribute object (built by ``to_attribute``).
    next_item : str
        next line to be parsed

    Examples
    --------
    For the line r"@attribute floupi real", the parsed attribute is named
    'floupi' with type 'real'; for r"@attribute 'floupi 2' real" the
    quoted name 'floupi 2' is used.
    """
    sattr = attribute.strip()
    mattr = r_attribute.match(sattr)
    if mattr:
        # atrv is everything after @attribute
        atrv = mattr.group(1)
        if r_comattrval.match(atrv):
            # Quoted attribute name: 'name with spaces' <type>
            name, type = tokenize_single_comma(atrv)
            next_item = next(iterable)
        elif r_wcomattrval.match(atrv):
            # Plain attribute name: name <type>
            name, type = tokenize_single_wcomma(atrv)
            next_item = next(iterable)
        else:
            # Not sure we should support this, as it does not seem supported by
            # weka.
            raise ValueError("multi line not supported yet")
    else:
        raise ValueError("First line unparsable: %s" % sattr)

    attribute = to_attribute(name, type)

    if type.lower() == 'relational':
        # Consume the nested declarations up to the '@end <name>' marker.
        next_item = read_relational_attribute(iterable, attribute, next_item)
    #    raise ValueError("relational attributes not supported yet")

    return attribute, next_item
def tokenize_single_comma(val):
    """Split a "'name' type" declaration (quoted name) into (name, type).

    Note: the caller has already matched ``val`` against the same regex;
    matching twice is redundant but keeps the helpers simple.
    """
    match = r_comattrval.match(val)
    if not match:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        name = match.group(1).strip()
        type = match.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e
    return name, type
def tokenize_single_wcomma(val):
    """Split a "name type" declaration (unquoted name) into (name, type).

    Note: the caller has already matched ``val`` against the same regex;
    matching twice is redundant but keeps the helpers simple.
    """
    match = r_wcomattrval.match(val)
    if not match:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        name = match.group(1).strip()
        type = match.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e
    return name, type
def read_relational_attribute(ofile, relational_attribute, i):
    """Read the nested attributes of a relational attribute.

    Consumes lines from *ofile* until the matching '@end <name>' marker,
    appending each parsed nested attribute to
    ``relational_attribute.attributes``.

    Parameters
    ----------
    ofile : iterator of str
        The header line iterator.
    relational_attribute : RelationalAttribute
        The attribute being filled in.
    i : str
        The current (already read) line.

    Returns
    -------
    i : str
        The first line after the '@end' marker.
    """
    # NOTE(review): the attribute name is interpolated without re.escape,
    # so names containing regex metacharacters would misbehave -- confirm.
    r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
                                  relational_attribute.name + r'\s*$')

    while not r_end_relational.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                attr, i = tokenize_attribute(ofile, i)
                relational_attribute.attributes.append(attr)
            else:
                raise ValueError("Error parsing line %s" % i)
        else:
            # Comments and blank lines inside the block are skipped.
            i = next(ofile)

    i = next(ofile)
    return i
def read_header(ofile):
    """Read the header of the iterable ofile.

    Parameters
    ----------
    ofile : iterator of str
        Lines of the ARFF file.

    Returns
    -------
    relation : str or None
        The @relation name, if one was declared.
    attributes : list of Attribute
        Parsed @attribute declarations, in order.
    """
    i = next(ofile)

    # Pass first comments
    while r_comment.match(i):
        i = next(ofile)

    # Header is everything up to DATA attribute ?
    relation = None
    attributes = []
    while not r_datameta.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute advances the iterator itself and
                # returns the next unprocessed line.
                attr, i = tokenize_attribute(ofile, i)
                attributes.append(attr)
            else:
                isrel = r_relation.match(i)
                if isrel:
                    relation = isrel.group(1)
                else:
                    raise ValueError("Error parsing line %s" % i)
                i = next(ofile)
        else:
            # Non-header line (comment/blank) inside the header: skip it.
            i = next(ofile)

    return relation, attributes
class MetaData:
    """Small container to keep useful information on a ARFF dataset.

    Knows about attributes names and types.

    Examples
    --------
    ::

        data, meta = loadarff('iris.arff')
        # This will print the attributes names of the iris.arff dataset
        for i in meta:
            print(i)
        # This works too
        meta.names()
        # Getting attribute type
        types = meta.types()

    Methods
    -------
    names
    types

    Notes
    -----
    Also maintains the list of attributes in order, i.e., doing for i in
    meta, where meta is an instance of MetaData, will return the
    different attribute names in the order they were defined.
    """
    def __init__(self, rel, attr):
        self.name = rel
        # dicts preserve insertion order, so attribute order is kept.
        self._attributes = {a.name: a for a in attr}

    def __repr__(self):
        msg = "Dataset: %s\n" % self.name
        # Iterate items() directly instead of re-looking-up each key.
        for name, attribute in self._attributes.items():
            msg += f"\t{name}'s type is {attribute.type_name}"
            if attribute.range:
                msg += ", range is %s" % str(attribute.range)
            msg += '\n'
        return msg

    def __iter__(self):
        return iter(self._attributes)

    def __getitem__(self, key):
        attr = self._attributes[key]

        return (attr.type_name, attr.range)

    def names(self):
        """Return the list of attribute names.

        Returns
        -------
        attrnames : list of str
            The attribute names.
        """
        return list(self._attributes)

    def types(self):
        """Return the list of attribute types.

        Returns
        -------
        attr_types : list of str
            The attribute types.
        """
        # Iterate the values directly instead of key lookups.
        return [attribute.type_name for attribute in self._attributes.values()]
def loadarff(f):
    """
    Read an arff file.

    The data is returned as a record array, which can be accessed much like
    a dictionary of NumPy arrays. For example, if one of the attributes is
    called 'pressure', then its first 10 data points can be accessed from the
    ``data`` record array like so: ``data['pressure'][0:10]``


    Parameters
    ----------
    f : file-like or str
        File-like object to read from, or filename to open.

    Returns
    -------
    data : record array
        The data of the arff file, accessible by attribute names.
    meta : `MetaData`
        Contains information about the arff file such as name and
        type of attributes, the relation (name of the dataset), etc.

    Raises
    ------
    ParseArffError
        This is raised if the given file is not ARFF-formatted.
    NotImplementedError
        The ARFF file has an attribute which is not supported yet.

    Notes
    -----

    This function should be able to read most arff files. Not
    implemented functionality include:

    * string type attributes

    It can read files with numeric, nominal and date attributes (date
    values are parsed with ``datetime.strptime``; time zones are not
    supported). It cannot read files with sparse data ({} in the file).
    However, this function can read files with missing data (? in the
    file), representing the data points as NaNs.

    Examples
    --------
    >>> from scipy.io import arff
    >>> from io import StringIO
    >>> content = \"\"\"
    ... @relation foo
    ... @attribute width  numeric
    ... @attribute height numeric
    ... @attribute color  {red,green,blue,yellow,black}
    ... @data
    ... 5.0,3.25,blue
    ... 4.5,3.75,green
    ... 3.0,4.00,red
    ... \"\"\"
    >>> f = StringIO(content)
    >>> data, meta = arff.loadarff(f)
    >>> data
    array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
          dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
    >>> meta
    Dataset: foo
    \twidth's type is numeric
    \theight's type is numeric
    \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')

    """
    if hasattr(f, 'read'):
        ofile = f
    else:
        ofile = open(f)
    try:
        return _loadarff(ofile)
    finally:
        if ofile is not f:  # only close what we opened
            ofile.close()
def _loadarff(ofile):
    """Parse an already-opened ARFF file into (record array, MetaData)."""
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg) from e

    # Check whether we have a string attribute (not supported yet)
    hasstr = False
    for a in attr:
        if isinstance(a, StringAttribute):
            hasstr = True

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of converters to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(attr)

    def generator(row_iter, delim=','):
        # TODO: this is where we are spending time (~80%). I think things
        # could be made more efficiently:
        #   - We could for example "compile" the function, because some values
        #   do not change here.
        #   - The function to convert a line to dtyped values could also be
        #   generated on the fly from a string and be executed instead of
        #   looping.
        #   - The regex are overkill: for comments, checking that a line starts
        #   by % should be enough and faster, and for empty lines, same thing
        #   --> this does not seem to change anything.

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # The sniffed dialect is reused across rows for consistency.
        dialect = None
        for raw in row_iter:
            # We do not abstract skipping comments and empty lines for
            # performance reasons.
            if r_comment.match(raw) or r_empty.match(raw):
                continue

            row, dialect = split_data_line(raw, dialect)

            yield tuple([attr[i].parse_data(row[i]) for i in elems])

    a = list(generator(ofile))
    # No error should happen here: it is a bug otherwise
    # (note: the comprehension variable `a` shadows the list above, which
    # is safe here but easy to misread)
    data = np.array(a, [(a.name, a.dtype) for a in attr])
    return data, meta
875# ----
876# Misc
877# ----
def basic_stats(data):
    """Return (min, max, mean, std * n/(n-1)) of *data*.

    The extrema ignore NaNs; the standard deviation is scaled by the
    n/(n-1) correction factor.
    """
    correction = data.size / (data.size - 1)
    minimum = np.nanmin(data)
    maximum = np.nanmax(data)
    return minimum, maximum, np.mean(data), np.std(data) * correction
def print_attribute(name, tp, data):
    """Print a one-line summary of attribute *name*.

    Numeric attributes get min/max/mean/std statistics computed from
    *data*; any other type is printed via its str() representation.
    """
    # Renamed locals: the originals shadowed the builtins `type`, `min`
    # and `max`.
    attr_type = tp.type_name
    if attr_type in ('numeric', 'real', 'integer'):
        vmin, vmax, mean, std = basic_stats(data)
        print(f"{name},{attr_type},{vmin:f},{vmax:f},{mean:f},{std:f}")
    else:
        print(str(tp))
def test_weka(filename):
    """Load *filename* and print a summary line for every attribute.

    Manual sanity check against Weka's output; not an automated test.
    """
    data, meta = loadarff(filename)
    print(len(data.dtype))
    print(data.size)
    for i in meta:
        # NOTE(review): meta[i] returns a (type_name, range) tuple, but
        # print_attribute reads tp.type_name -- this looks broken; confirm
        # before relying on this helper.
        print_attribute(i, meta[i], data[i])


# make sure nose does not find this as a test
test_weka.__test__ = False
if __name__ == '__main__':
    import sys

    # CLI usage: python _arffread.py <file.arff>
    test_weka(sys.argv[1])