Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/scipy/io/arff/_arffread.py: 23%

373 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-23 06:43 +0000

1# Last Change: Mon Aug 20 08:00 PM 2007 J 

2import re 

3import datetime 

4 

5import numpy as np 

6 

7import csv 

8import ctypes 

9 

10"""A module to read arff files.""" 

11 

12__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError'] 

13 

14# An Arff file is basically two parts: 

15# - header 

16# - data 

17# 

18# A header has each of its components starting by @META where META is one of 

19# the keyword (attribute of relation, for now). 

20 

21# TODO: 

22# - both integer and reals are treated as numeric -> the integer info 

23# is lost! 

24# - Replace ValueError by ParseError or something 

25 

# We now can handle the following:

27# - numeric and nominal attributes 

28# - missing values for numeric attributes 

29 

# Module-level compiled regexes: compiled once and reused for every line of
# every file read.

# Match any meta line (a line whose first non-blank character is '@')
r_meta = re.compile(r'^\s*@')
# Match a comment
r_comment = re.compile(r'^%')
# Match an empty line
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
# Case-insensitive keyword matchers for @data / @relation / @attribute
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')

# Nominal attribute values are framed between braces: {a, b, c}
r_nominal = re.compile(r'{(.+)}')
# Date attribute: 'date <format>' where the format may be quoted
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")

# To get attributes name enclosed with ''
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")

48 

49# ------------------------ 

50# Module defined exception 

51# ------------------------ 

52 

53 

class ArffError(OSError):
    """Base error raised when reading or parsing an ARFF file fails."""
    pass

56 

57 

class ParseArffError(ArffError):
    """Error raised when the file content is not valid ARFF."""
    pass

60 

61 

62# ---------- 

63# Attributes 

64# ---------- 

class Attribute:
    """Base class for ARFF attribute types.

    Subclasses override ``parse_attribute`` to recognize their own
    declaration syntax and ``parse_data`` to convert raw field strings.
    """

    # Human-readable type label; overridden by each subclass.
    type_name = None

    def __init__(self, name):
        self.name = name
        # Value range shown to users (None when not applicable).
        self.range = None
        # numpy dtype used for this attribute's column.
        self.dtype = np.object_

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.
        """
        return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.
        """
        return None

    def __str__(self):
        """
        Return a short "name,type" description of this attribute.
        """
        return self.name + ',' + self.type_name

93 

94 

class NominalAttribute(Attribute):
    """Attribute taking its values from a fixed set, e.g. ``{red,green}``."""

    type_name = 'nominal'

    def __init__(self, name, values):
        super().__init__(name)
        self.values = values
        self.range = values
        # Fixed-width byte-string dtype, wide enough for the longest value.
        self.dtype = (np.bytes_, max(len(i) for i in values))

    @staticmethod
    def _get_nom_val(atrv):
        """Given a string containing a nominal type, returns a tuple of the
        possible values.

        A nominal type is defined as something framed between braces ({}).

        Parameters
        ----------
        atrv : str
            Nominal type definition

        Returns
        -------
        poss_vals : tuple
            possible values

        Examples
        --------
        >>> from scipy.io.arff._arffread import NominalAttribute
        >>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
        ('floup', 'bouga', 'fl', 'ratata')
        """
        m = r_nominal.match(atrv)
        if m:
            attrs, _ = split_data_line(m.group(1))
            return tuple(attrs)
        else:
            raise ValueError("This does not look like a nominal string")

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For nominal attributes, the attribute string would be like '{<attr_1>,
        <attr2>, <attr_3>}'.
        """
        if attr_string[0] == '{':
            values = cls._get_nom_val(attr_string)
            return cls(name, values)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        The value must be one of the declared nominal values, or the
        missing-value marker '?'.
        """
        if data_str in self.values:
            return data_str
        elif data_str == '?':
            return data_str
        else:
            raise ValueError("{} value not in {}".format(str(data_str),
                                                         str(self.values)))

    def __str__(self):
        # str.join instead of the previous quadratic += concatenation loop.
        return self.name + ",{" + ",".join(self.values) + "}"

169 

170 

class NumericAttribute(Attribute):
    """Attribute holding floating point data ('numeric', 'int' or 'real')."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'numeric'
        self.dtype = np.float64

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For numeric attributes, the attribute string would be like
        'numeric' or 'int' or 'real'.
        """
        attr_string = attr_string.lower().strip()

        # str.startswith accepts a tuple: one call instead of three slice
        # comparisons (behavior is identical).
        if attr_string.startswith(('numeric', 'int', 'real')):
            return cls(name)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Parameters
        ----------
        data_str : str
            string to convert

        Returns
        -------
        f : float
            where float can be nan

        Examples
        --------
        >>> from scipy.io.arff._arffread import NumericAttribute
        >>> atr = NumericAttribute('atr')
        >>> atr.parse_data('1')
        1.0
        >>> atr.parse_data('1\\n')
        1.0
        >>> atr.parse_data('?\\n')
        nan
        """
        # A '?' anywhere in the field marks a missing value.
        if '?' in data_str:
            return np.nan
        else:
            return float(data_str)

    def _basic_stats(self, data):
        # min/max ignore NaNs; std carries the n/(n-1) correction factor.
        nbfac = data.size * 1. / (data.size - 1)
        return (np.nanmin(data), np.nanmax(data),
                np.mean(data), np.std(data) * nbfac)

231 

232 

class StringAttribute(Attribute):
    """Attribute holding free-form text ('string')."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'string'

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For string attributes, the attribute string would be like
        'string'.
        """
        normalized = attr_string.lower().strip()

        if normalized[:len('string')] == 'string':
            return cls(name)
        return None

255 

256 

class DateAttribute(Attribute):
    """Attribute holding datetime values declared with a Java
    SimpleDateFormat-style pattern ('date <format>')."""

    def __init__(self, name, date_format, datetime_unit):
        super().__init__(name)
        # strptime-compatible format string.
        self.date_format = date_format
        # numpy datetime64 unit code of the finest component in the format.
        self.datetime_unit = datetime_unit
        self.type_name = 'date'
        self.range = date_format
        self.dtype = np.datetime64(0, self.datetime_unit)

    @staticmethod
    def _get_date_format(atrv):
        """Convert a Java SimpleDateFormat pattern into a C (strptime)
        pattern and determine the finest datetime64 unit present.

        Raises
        ------
        ValueError
            If *atrv* is not a date declaration, uses a time zone, or
            contains no recognized date/time component.
        """
        m = r_date.match(atrv)
        if m:
            pattern = m.group(1).strip()
            # convert time pattern from Java's SimpleDateFormat to C's format
            # Later (finer-grained) components overwrite datetime_unit, so it
            # ends up holding the finest unit appearing in the pattern.
            datetime_unit = None
            if "yyyy" in pattern:
                pattern = pattern.replace("yyyy", "%Y")
                datetime_unit = "Y"
            elif "yy" in pattern:
                # BUG FIX: this branch previously tested the constant "yy"
                # (always truthy), which silently accepted patterns with no
                # recognized component instead of reaching the error below.
                pattern = pattern.replace("yy", "%y")
                datetime_unit = "Y"
            if "MM" in pattern:
                pattern = pattern.replace("MM", "%m")
                datetime_unit = "M"
            if "dd" in pattern:
                pattern = pattern.replace("dd", "%d")
                datetime_unit = "D"
            if "HH" in pattern:
                pattern = pattern.replace("HH", "%H")
                datetime_unit = "h"
            if "mm" in pattern:
                pattern = pattern.replace("mm", "%M")
                datetime_unit = "m"
            if "ss" in pattern:
                pattern = pattern.replace("ss", "%S")
                datetime_unit = "s"
            if "z" in pattern or "Z" in pattern:
                raise ValueError("Date type attributes with time zone not "
                                 "supported, yet")

            if datetime_unit is None:
                raise ValueError("Invalid or unsupported date format")

            return pattern, datetime_unit
        else:
            raise ValueError("Invalid or no date format")

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For date attributes, the attribute string would be like
        'date <format>'.
        """
        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('date')] == 'date':
            # Pass the original string: %m vs %M etc. is case-sensitive.
            date_format, datetime_unit = cls._get_date_format(attr_string)
            return cls(name, date_format, datetime_unit)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.
        """
        date_str = data_str.strip().strip("'").strip('"')
        if date_str == '?':
            # Missing value -> not-a-time in this attribute's unit.
            return np.datetime64('NaT', self.datetime_unit)
        else:
            dt = datetime.datetime.strptime(date_str, self.date_format)
            return np.datetime64(dt).astype(
                "datetime64[%s]" % self.datetime_unit)

    def __str__(self):
        return super().__str__() + ',' + self.date_format

338 

339 

class RelationalAttribute(Attribute):
    """Container attribute whose values are themselves rows of nested
    attributes (ARFF 'relational' type)."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'relational'
        self.dtype = np.object_
        # Nested attributes, filled in by read_relational_attribute().
        self.attributes = []
        # csv dialect sniffed on the first nested row, reused afterwards.
        self.dialect = None

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For relational attributes, the attribute string is simply
        'relational'.
        """
        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('relational')] == 'relational':
            return cls(name)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse one relational value: an escaped, newline-separated block of
        rows, each parsed with the nested attributes.
        """
        # Mirrors the row loop of _loadarff's generator.
        elems = list(range(len(self.attributes)))

        # The nested rows arrive escaped inside a single field; undo that.
        escaped_string = data_str.encode().decode("unicode-escape")

        row_tuples = []

        for raw in escaped_string.split("\n"):
            row, self.dialect = split_data_line(raw, self.dialect)

            row_tuples.append(tuple(
                [self.attributes[i].parse_data(row[i]) for i in elems]))

        # Structured array typed by the nested attributes' dtypes.
        return np.array(row_tuples,
                        [(a.name, a.dtype) for a in self.attributes])

    def __str__(self):
        return (super().__str__() + '\n\t' +
                '\n\t'.join(str(a) for a in self.attributes))

386 

387 

388# ----------------- 

389# Various utilities 

390# ----------------- 

def to_attribute(name, attr_string):
    """Build the Attribute subclass instance matching *attr_string*.

    Each known attribute class is offered the declaration in turn; the
    first one that recognizes it wins.

    Raises
    ------
    ParseArffError
        If no attribute class recognizes the declaration.
    """
    candidate_classes = (NominalAttribute, NumericAttribute, DateAttribute,
                         StringAttribute, RelationalAttribute)

    for candidate in candidate_classes:
        parsed = candidate.parse_attribute(name, attr_string)
        if parsed is not None:
            return parsed

    raise ParseArffError("unknown attribute %s" % attr_string)

401 

402 

def csv_sniffer_has_bug_last_field():
    """
    Checks if the bug https://bugs.python.org/issue30157 is unpatched.

    The result is computed once and memoized on the function object.
    """
    cached = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)
    if cached is not None:
        return cached

    # Sniff a line whose last field is quoted; a buggy Sniffer fails to
    # pick up the single quote as the quote character.
    sniffed = csv.Sniffer().sniff("3, 'a'")
    result = sniffed.quotechar != "'"
    csv_sniffer_has_bug_last_field.has_bug = result
    return result

417 

418 

def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
    """
    Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.

    Re-runs the sniffing regexes from the csv module on *sniff_line* and,
    when the line matched the buggy "quoted last field" expression, patches
    *dialect* in place (quotechar, delimiter, doublequote, skipinitialspace).
    """
    if csv_sniffer_has_bug_last_field():
        # Reuses code from the csv module
        right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'

        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # .*?",
                      right_regex, # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(sniff_line)
            if matches:
                break

        # If it does not match the expression that was bugged, then this bug does not apply
        if restr != right_regex:
            return

        groupindex = regexp.groupindex

        # There is only one end of the string
        assert len(matches) == 1
        m = matches[0]

        # findall() returns tuples of group values; pick fields out by the
        # named groups' 1-based positions.
        n = groupindex['quote'] - 1
        quote = m[n]

        n = groupindex['delim'] - 1
        delim = m[n]

        n = groupindex['space'] - 1
        space = bool(m[n])

        # Detect doubled quote characters used as escapes ("doublequote").
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
            {'delim': re.escape(delim), 'quote': quote}, re.MULTILINE
        )

        doublequote = bool(dq_regexp.search(sniff_line))

        dialect.quotechar = quote
        if delim in delimiters:
            dialect.delimiter = delim
        dialect.doublequote = doublequote
        dialect.skipinitialspace = space

467 

468 

def split_data_line(line, dialect=None):
    """Split one ARFF data line into its fields using the csv module.

    Parameters
    ----------
    line : str
        Raw data line; a trailing newline is allowed.
    dialect : csv.Dialect, optional
        Previously sniffed dialect to reuse. When None, the dialect is
        sniffed from *line* and returned so callers can cache it.

    Returns
    -------
    row : list of str
        The parsed fields.
    dialect : csv.Dialect
        The dialect actually used, for reuse on subsequent lines.
    """
    delimiters = ",\t"

    # This can not be done in a per reader basis, and relational fields
    # can be HUGE
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

    # Remove the line end if any. Guard against an empty string, which
    # previously raised IndexError on line[-1].
    if line and line[-1] == '\n':
        line = line[:-1]

    # Remove potential trailing whitespace
    line = line.strip()

    sniff_line = line

    # Add a delimiter if none is present, so that the csv.Sniffer
    # does not complain for a single-field CSV.
    if not any(d in line for d in delimiters):
        sniff_line += ","

    if dialect is None:
        dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
        workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
                                              dialect=dialect,
                                              delimiters=delimiters)

    row = next(csv.reader([line], dialect))

    return row, dialect

499 

500 

501# -------------- 

502# Parsing header 

503# -------------- 

def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (e.g., starts by @attribute).

    Given a raw string attribute, try to get the name and type of the
    attribute. Constraints:

    * The first line must start with @attribute (case insensitive, and
      space like characters before @attribute are allowed)
    * Works also if the attribute is spread on multilines.
    * Works if empty lines or comments are in between

    Parameters
    ----------
    iterable : iterator of str
        the remaining header lines; consumed as needed.
    attribute : str
        the attribute line to parse.

    Returns
    -------
    attribute : Attribute
        the parsed attribute object.
    next_item : str
        next line to be parsed.

    Examples
    --------
    For the line r"@attribute floupi real", the parsed attribute has name
    'floupi' and numeric type. For r" @attribute 'floupi 2' real   ", the
    quoted name 'floupi 2' (which contains a space) is used.
    """
    sattr = attribute.strip()
    mattr = r_attribute.match(sattr)
    if mattr:
        # atrv is everything after @attribute
        atrv = mattr.group(1)
        if r_comattrval.match(atrv):
            # Quoted attribute name: may contain spaces.
            name, type = tokenize_single_comma(atrv)
            next_item = next(iterable)
        elif r_wcomattrval.match(atrv):
            # Unquoted attribute name: a single whitespace-free token.
            name, type = tokenize_single_wcomma(atrv)
            next_item = next(iterable)
        else:
            # Not sure we should support this, as it does not seem supported by
            # weka.
            raise ValueError("multi line not supported yet")
    else:
        raise ValueError("First line unparsable: %s" % sattr)

    attribute = to_attribute(name, type)

    if type.lower() == 'relational':
        # Consume the nested declarations up to the '@end <name>' marker.
        next_item = read_relational_attribute(iterable, attribute, next_item)

    return attribute, next_item

571 

572 

def tokenize_single_comma(val):
    """Split a quoted attribute declaration into (name, type).

    *val* looks like ``'some name' <type>``; the quoted name may contain
    spaces.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_comattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        return m.group(1).strip(), m.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e

586 

587 

def tokenize_single_wcomma(val):
    """Split an unquoted attribute declaration into (name, type).

    *val* looks like ``some_name <type>``; the name contains no spaces.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_wcomattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        return m.group(1).strip(), m.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e

601 

602 

def read_relational_attribute(ofile, relational_attribute, i):
    """Read the nested attributes of a relational attribute.

    Consumes lines from *ofile*, starting at line *i*, until the matching
    '@end <name>' marker; each nested '@attribute' is appended to
    ``relational_attribute.attributes``. Returns the first line after the
    '@end' marker.
    """
    # NOTE(review): the attribute name is interpolated unescaped; a name
    # containing regex metacharacters would misbehave — consider re.escape.
    r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
                                  relational_attribute.name + r'\s*$')

    while not r_end_relational.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines and returns the next one.
                attr, i = tokenize_attribute(ofile, i)
                relational_attribute.attributes.append(attr)
            else:
                raise ValueError("Error parsing line %s" % i)
        else:
            # Skip comments / blank lines inside the relational block.
            i = next(ofile)

    # Skip the '@end <name>' line itself.
    i = next(ofile)
    return i

623 

624 

def read_header(ofile):
    """Read the header of the iterable ofile.

    Consumes *ofile* up to and including the '@data' line.

    Returns
    -------
    relation : str or None
        Name captured from the '@relation' line, if present.
    attributes : list of Attribute
        The parsed '@attribute' declarations, in order of appearance.
    """
    i = next(ofile)

    # Pass first comments
    while r_comment.match(i):
        i = next(ofile)

    # Header is everything up to DATA attribute ?
    relation = None
    attributes = []
    while not r_datameta.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines and returns the next one.
                attr, i = tokenize_attribute(ofile, i)
                attributes.append(attr)
            else:
                isrel = r_relation.match(i)
                if isrel:
                    relation = isrel.group(1)
                else:
                    raise ValueError("Error parsing line %s" % i)
                i = next(ofile)
        else:
            # Skip comments and blank lines between declarations.
            i = next(ofile)

    return relation, attributes

654 

655 

class MetaData:
    """Small container to keep useful information on a ARFF dataset.

    Knows about attributes names and types.

    Examples
    --------
    ::

        data, meta = loadarff('iris.arff')
        # This will print the attributes names of the iris.arff dataset
        for i in meta:
            print(i)
        # This works too
        meta.names()
        # Getting attribute type
        types = meta.types()

    Methods
    -------
    names
    types

    Notes
    -----
    Also maintains the list of attributes in order, i.e., doing for i in
    meta, where meta is an instance of MetaData, will return the
    different attribute names in the order they were defined.
    """
    def __init__(self, rel, attr):
        self.name = rel
        # Insertion-ordered mapping: attribute name -> attribute object.
        self._attributes = {a.name: a for a in attr}

    def __repr__(self):
        pieces = ["Dataset: %s\n" % self.name]
        for attr_name, attribute in self._attributes.items():
            description = f"\t{attr_name}'s type is {attribute.type_name}"
            if attribute.range:
                description += ", range is %s" % str(attribute.range)
            pieces.append(description + '\n')
        return "".join(pieces)

    def __iter__(self):
        return iter(self._attributes)

    def __getitem__(self, key):
        attribute = self._attributes[key]
        return (attribute.type_name, attribute.range)

    def names(self):
        """Return the list of attribute names.

        Returns
        -------
        attrnames : list of str
            The attribute names.
        """
        return list(self._attributes)

    def types(self):
        """Return the list of attribute types.

        Returns
        -------
        attr_types : list of str
            The attribute types.
        """
        return [attribute.type_name
                for attribute in self._attributes.values()]

728 

729 

def loadarff(f):
    """
    Read an arff file.

    The data is returned as a record array, which can be accessed much like
    a dictionary of NumPy arrays. For example, if one of the attributes is
    called 'pressure', then its first 10 data points can be accessed from the
    ``data`` record array like so: ``data['pressure'][0:10]``


    Parameters
    ----------
    f : file-like or str
        File-like object to read from, or filename to open.

    Returns
    -------
    data : record array
        The data of the arff file, accessible by attribute names.
    meta : `MetaData`
        Contains information about the arff file such as name and
        type of attributes, the relation (name of the dataset), etc.

    Raises
    ------
    ParseArffError
        This is raised if the given file is not ARFF-formatted.
    NotImplementedError
        The ARFF file has an attribute which is not supported yet.

    Notes
    -----

    This function should be able to read most arff files. Not
    implemented functionality include:

    * date type attributes
    * string type attributes

    It can read files with numeric and nominal attributes. It cannot read
    files with sparse data ({} in the file). However, this function can
    read files with missing data (? in the file), representing the data
    points as NaNs.

    Examples
    --------
    >>> from scipy.io import arff
    >>> from io import StringIO
    >>> content = \"\"\"
    ... @relation foo
    ... @attribute width  numeric
    ... @attribute height numeric
    ... @attribute color  {red,green,blue,yellow,black}
    ... @data
    ... 5.0,3.25,blue
    ... 4.5,3.75,green
    ... 3.0,4.00,red
    ... \"\"\"
    >>> f = StringIO(content)
    >>> data, meta = arff.loadarff(f)
    >>> data
    array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
          dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
    >>> meta
    Dataset: foo
    \twidth's type is numeric
    \theight's type is numeric
    \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')

    """
    # Accept either an already-open file-like object or a path to open here.
    opened_here = not hasattr(f, 'read')
    ofile = open(f) if opened_here else f
    try:
        return _loadarff(ofile)
    finally:
        if opened_here:  # only close what we opened
            ofile.close()

809 

810 

def _loadarff(ofile):
    """Parse an open ARFF file-like object into (structured array, MetaData)."""
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg) from e

    # Check whether we have a string attribute (not supported yet)
    hasstr = False
    for a in attr:
        if isinstance(a, StringAttribute):
            hasstr = True

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(attr)

    def generator(row_iter, delim=','):
        # TODO: this is where we are spending time (~80%). I think things
        # could be made more efficiently:
        # - We could for example "compile" the function, because some values
        # do not change here.
        # - The function to convert a line to dtyped values could also be
        # generated on the fly from a string and be executed instead of
        # looping.
        # - The regex are overkill: for comments, checking that a line starts
        # by % should be enough and faster, and for empty lines, same thing
        # --> this does not seem to change anything.

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # Sniffed csv dialect is cached after the first data row and
        # reused for the rest of the file.
        dialect = None
        for raw in row_iter:
            # We do not abstract skipping comments and empty lines for
            # performance reasons.
            if r_comment.match(raw) or r_empty.match(raw):
                continue

            row, dialect = split_data_line(raw, dialect)

            yield tuple([attr[i].parse_data(row[i]) for i in elems])

    a = list(generator(ofile))
    # No error should happen here: it is a bug otherwise
    data = np.array(a, [(a.name, a.dtype) for a in attr])
    return data, meta

874 

875 

876# ---- 

877# Misc 

878# ---- 

def basic_stats(data):
    """Return (min, max, mean, std) of *data*; min/max ignore NaNs and the
    standard deviation carries the n/(n-1) correction factor."""
    correction = data.size / (data.size - 1)
    return (np.nanmin(data), np.nanmax(data), np.mean(data),
            np.std(data) * correction)

882 

883 

def print_attribute(name, tp, data):
    """Print a one-line summary of attribute *name*: basic statistics for
    numeric data, the attribute's string form otherwise."""
    kind = tp.type_name
    if kind in ('numeric', 'real', 'integer'):
        lo, hi, mean, std = basic_stats(data)
        print(f"{name},{kind},{lo:f},{hi:f},{mean:f},{std:f}")
    else:
        print(str(tp))

891 

892 

def test_weka(filename):
    """Load *filename* and print a short summary of every attribute."""
    data, meta = loadarff(filename)
    print(len(data.dtype))
    print(data.size)
    for attr_name in meta:
        print_attribute(attr_name, meta[attr_name], data[attr_name])


# make sure nose does not find this as a test
test_weka.__test__ = False

903 

904 

if __name__ == '__main__':
    import sys

    # CLI usage: python _arffread.py <file.arff>
    test_weka(sys.argv[1])