Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/scipy/io/arff/_arffread.py: 23%

373 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-02-14 06:37 +0000

1# Last Change: Mon Aug 20 08:00 PM 2007 J 

2import re 

3import datetime 

4 

5import numpy as np 

6 

7import csv 

8import ctypes 

9 

"""A module to read arff files."""

__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']

# An Arff file is basically two parts:
#  - header
#  - data
#
# A header has each of its components starting by @META where META is one of
# the keyword (attribute of relation, for now).

# TODO:
#   - both integer and reals are treated as numeric -> the integer info
#    is lost!
#   - Replace ValueError by ParseError or something

# We now can handle the following:
#   - numeric and nominal attributes
#   - missing values for numeric attributes

# Matches any meta line (a line whose first non-blank character is '@')
r_meta = re.compile(r'^\s*@')
# Match a comment
r_comment = re.compile(r'^%')
# Match an empty line
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
# Case-insensitive markers for the @data, @relation and @attribute keywords.
# r_relation captures the relation name; r_attribute captures everything
# after the keyword (name + type declaration).
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')

# Nominal value set framed between braces; date declaration with an
# optionally quoted format string
r_nominal = re.compile(r'{(.+)}')
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")

# To get attributes name enclosed with ''
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")

48 

49# ------------------------ 

50# Module defined exception 

51# ------------------------ 

52 

53 

class ArffError(OSError):
    """Base exception for any problem encountered reading an ARFF file."""
    pass

56 

57 

class ParseArffError(ArffError):
    """Raised when the file content is not valid ARFF."""
    pass

60 

61 

62# ---------- 

63# Attributes 

64# ---------- 

class Attribute:
    """Base class for ARFF attribute types.

    Subclasses override :meth:`parse_attribute` to recognise their own
    ``@attribute`` declaration and :meth:`parse_data` to convert one raw
    data field into a value of the appropriate type.
    """

    type_name = None

    def __init__(self, name):
        self.name = name
        self.range = None
        self.dtype = np.object_

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a parsed attribute if this class understands the
        declaration *attr_string*, otherwise None.  The base class never
        matches anything."""
        return None

    def parse_data(self, data_str):
        """Convert one raw field; the base class parses nothing."""
        return None

    def __str__(self):
        """Return a 'name,type' summary of the attribute."""
        return ','.join((self.name, self.type_name))

93 

94 

class NominalAttribute(Attribute):
    """Attribute whose values come from a fixed, enumerated set."""

    type_name = 'nominal'

    def __init__(self, name, values):
        super().__init__(name)
        self.values = values
        self.range = values
        # Fixed-width byte strings, wide enough for the longest value.
        self.dtype = (np.bytes_, max(len(v) for v in values))

    @staticmethod
    def _get_nom_val(atrv):
        """Given a string containing a nominal type, returns a tuple of the
        possible values.

        A nominal type is defined as something framed between braces ({}).

        Parameters
        ----------
        atrv : str
            Nominal type definition

        Returns
        -------
        poss_vals : tuple
            possible values

        Examples
        --------
        >>> from scipy.io.arff._arffread import NominalAttribute
        >>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
        ('floup', 'bouga', 'fl', 'ratata')
        """
        match = r_nominal.match(atrv)
        if not match:
            raise ValueError("This does not look like a nominal string")
        parsed, _ = split_data_line(match.group(1))
        return tuple(parsed)

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a NominalAttribute when *attr_string* is a braced value
        set like '{a, b, c}', otherwise None."""
        if attr_string[0] == '{':
            return cls(name, cls._get_nom_val(attr_string))
        return None

    def parse_data(self, data_str):
        """Return *data_str* unchanged when it is a declared value or the
        missing-value marker '?'; raise ValueError otherwise."""
        if data_str == '?' or data_str in self.values:
            return data_str
        raise ValueError(f"{str(data_str)} value not in {str(self.values)}")

    def __str__(self):
        return self.name + ",{" + ",".join(self.values) + "}"

168 

169 

class NumericAttribute(Attribute):
    """Real-valued attribute ('numeric', 'int' or 'real' in the header)."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'numeric'
        self.dtype = np.float64

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a NumericAttribute when *attr_string* declares a numeric
        type ('numeric', 'int...' or 'real...'), otherwise None."""
        spec = attr_string.lower().strip()
        if spec.startswith(('numeric', 'int', 'real')):
            return cls(name)
        return None

    def parse_data(self, data_str):
        """Convert one field to float; a missing value ('?') maps to nan.

        Parameters
        ----------
        data_str : str
            string to convert

        Returns
        -------
        f : float
            where float can be nan

        Examples
        --------
        >>> from scipy.io.arff._arffread import NumericAttribute
        >>> atr = NumericAttribute('atr')
        >>> atr.parse_data('1')
        1.0
        >>> atr.parse_data('?\\n')
        nan
        """
        return np.nan if '?' in data_str else float(data_str)

    def _basic_stats(self, data):
        # Scale the std by n/(n-1), mirroring the module-level basic_stats
        # helper (note: the factor is applied to std, not variance).
        factor = data.size * 1. / (data.size - 1)
        return (np.nanmin(data), np.nanmax(data),
                np.mean(data), np.std(data) * factor)

230 

231 

class StringAttribute(Attribute):
    """Free-text attribute ('string' in the header).

    Recognised during header parsing but loading such files is rejected
    later with NotImplementedError.
    """

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'string'

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a StringAttribute when *attr_string* declares a string
        type, otherwise None."""
        if attr_string.lower().strip().startswith('string'):
            return cls(name)
        return None

254 

255 

class DateAttribute(Attribute):
    """Date attribute declared with a Java SimpleDateFormat-style pattern.

    Parameters
    ----------
    name : str
        Attribute name.
    date_format : str
        C (strftime) pattern converted from the header's Java-style pattern.
    datetime_unit : str
        numpy datetime64 unit code of the finest unit present in the
        pattern ('Y', 'M', 'D', 'h', 'm' or 's').
    """

    def __init__(self, name, date_format, datetime_unit):
        super().__init__(name)
        self.date_format = date_format
        self.datetime_unit = datetime_unit
        self.type_name = 'date'
        self.range = date_format
        self.dtype = np.datetime64(0, self.datetime_unit)

    @staticmethod
    def _get_date_format(atrv):
        """Convert the Java SimpleDateFormat pattern in *atrv* to a C
        strftime pattern.

        Returns
        -------
        pattern : str
            strftime-style format string.
        datetime_unit : str
            datetime64 unit code for the finest token found.

        Raises
        ------
        ValueError
            If no date format is present, the pattern contains a time
            zone, or no recognised date/time token is found.
        """
        m = r_date.match(atrv)
        if not m:
            raise ValueError("Invalid or no date format")

        pattern = m.group(1).strip()
        # convert time pattern from Java's SimpleDateFormat to C's format
        datetime_unit = None
        if "yyyy" in pattern:
            pattern = pattern.replace("yyyy", "%Y")
            datetime_unit = "Y"
        elif "yy" in pattern:
            # BUG FIX: this was `elif "yy":`, which is always truthy, so any
            # pattern without a recognised token was silently accepted with
            # unit 'Y' and the "Invalid or unsupported" check below was
            # unreachable.
            pattern = pattern.replace("yy", "%y")
            datetime_unit = "Y"
        if "MM" in pattern:
            pattern = pattern.replace("MM", "%m")
            datetime_unit = "M"
        if "dd" in pattern:
            pattern = pattern.replace("dd", "%d")
            datetime_unit = "D"
        if "HH" in pattern:
            pattern = pattern.replace("HH", "%H")
            datetime_unit = "h"
        if "mm" in pattern:
            pattern = pattern.replace("mm", "%M")
            datetime_unit = "m"
        if "ss" in pattern:
            pattern = pattern.replace("ss", "%S")
            datetime_unit = "s"
        if "z" in pattern or "Z" in pattern:
            raise ValueError("Date type attributes with time zone not "
                             "supported, yet")

        if datetime_unit is None:
            raise ValueError("Invalid or unsupported date format")

        return pattern, datetime_unit

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a DateAttribute when *attr_string* declares a date type
        ('date <format>'), otherwise None."""
        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('date')] == 'date':
            date_format, datetime_unit = cls._get_date_format(attr_string)
            return cls(name, date_format, datetime_unit)
        else:
            return None

    def parse_data(self, data_str):
        """Parse one date field; the missing marker '?' maps to NaT."""
        date_str = data_str.strip().strip("'").strip('"')
        if date_str == '?':
            return np.datetime64('NaT', self.datetime_unit)
        else:
            dt = datetime.datetime.strptime(date_str, self.date_format)
            return np.datetime64(dt).astype(
                "datetime64[%s]" % self.datetime_unit)

    def __str__(self):
        return super().__str__() + ',' + self.date_format

337 

338 

class RelationalAttribute(Attribute):
    """Attribute holding nested records (ARFF 'relational' type)."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'relational'
        self.dtype = np.object_
        # Nested Attribute objects, appended by the header parser.
        self.attributes = []
        # CSV dialect reused across the nested rows of one value.
        self.dialect = None

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a RelationalAttribute when *attr_string* starts with
        'relational' (case-insensitive), otherwise None."""
        if attr_string.lower().strip()[:len('relational')] == 'relational':
            return cls(name)
        return None

    def parse_data(self, data_str):
        """Parse one escaped multi-line field into a structured array with
        one entry per nested row."""
        indices = list(range(len(self.attributes)))

        # The nested rows arrive as a single escaped string; undo the
        # escaping before splitting into lines.
        unescaped = data_str.encode().decode("unicode-escape")

        rows = []
        for raw in unescaped.split("\n"):
            fields, self.dialect = split_data_line(raw, self.dialect)
            rows.append(tuple(self.attributes[k].parse_data(fields[k])
                              for k in indices))

        return np.array(rows, [(a.name, a.dtype) for a in self.attributes])

    def __str__(self):
        nested = '\n\t'.join(str(a) for a in self.attributes)
        return super().__str__() + '\n\t' + nested

385 

386 

387# ----------------- 

388# Various utilities 

389# ----------------- 

def to_attribute(name, attr_string):
    """Build the appropriate Attribute subclass for one @attribute line.

    Each known attribute class is tried in priority order; the first one
    that recognises *attr_string* wins.

    Raises
    ------
    ParseArffError
        If no class recognises the declaration.
    """
    for klass in (NominalAttribute, NumericAttribute, DateAttribute,
                  StringAttribute, RelationalAttribute):
        parsed = klass.parse_attribute(name, attr_string)
        if parsed is not None:
            return parsed

    raise ParseArffError("unknown attribute %s" % attr_string)

400 

401 

def csv_sniffer_has_bug_last_field():
    """
    Checks if the bug https://bugs.python.org/issue30157 is unpatched.
    """
    # Cache the answer on the function object so the sniff runs only once
    # per process.
    cached = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)
    if cached is None:
        # A buggy Sniffer fails to detect the single quote here.
        sniffed = csv.Sniffer().sniff("3, 'a'")
        cached = sniffed.quotechar != "'"
        csv_sniffer_has_bug_last_field.has_bug = cached
    return cached

416 

417 

def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
    """
    Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.

    Re-runs the csv.Sniffer's quote-detection regexes on *sniff_line* and,
    when the line matched only the buggy "quoted field at end of line"
    expression, patches *dialect* in place with the correctly detected
    quotechar/delimiter/doublequote/skipinitialspace.
    """
    if csv_sniffer_has_bug_last_field():
        # Reuses code from the csv module
        # The expression affected by the bug: a quoted field terminated by
        # end-of-line instead of another delimiter.
        right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'  # noqa: E501

        # Same regex cascade (and order) as csv.Sniffer._guess_quote_and_delimiter.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?", # noqa: E501
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',  # .*?", # noqa: E501
                      right_regex,  # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):  # ".*?" (no delim, no space) # noqa: E501
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(sniff_line)
            if matches:
                break

        # If it does not match the expression that was bugged,
        # then this bug does not apply
        if restr != right_regex:
            return

        groupindex = regexp.groupindex

        # There is only one end of the string
        assert len(matches) == 1
        m = matches[0]

        # findall returns plain tuples; map named groups back to positions.
        n = groupindex['quote'] - 1
        quote = m[n]

        n = groupindex['delim'] - 1
        delim = m[n]

        n = groupindex['space'] - 1
        space = bool(m[n])

        # Detect doubled quote characters ("" inside a quoted field).
        dq_regexp = re.compile(
            rf"(({re.escape(delim)})|^)\W*{quote}[^{re.escape(delim)}\n]*{quote}[^{re.escape(delim)}\n]*{quote}\W*(({re.escape(delim)})|$)", re.MULTILINE  # noqa: E501
        )

        doublequote = bool(dq_regexp.search(sniff_line))

        # Patch the sniffed dialect in place with the corrected values.
        dialect.quotechar = quote
        if delim in delimiters:
            dialect.delimiter = delim
        dialect.doublequote = doublequote
        dialect.skipinitialspace = space

466 

467 

def split_data_line(line, dialect=None):
    """Split one raw data line into CSV fields.

    Parameters
    ----------
    line : str
        Raw line from the file (may end with a newline).
    dialect : csv.Dialect or str, optional
        Previously sniffed dialect; when None, it is sniffed from *line*.

    Returns
    -------
    row : list of str
        The parsed fields.
    dialect : csv.Dialect or str
        The dialect used, so callers can reuse it for subsequent lines.
    """
    delimiters = ",\t"

    # This can not be done in a per reader basis, and relational fields
    # can be HUGE
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

    # Remove the line end if any.  BUG FIX: the original indexed line[-1],
    # which raises IndexError on an empty string.
    if line.endswith('\n'):
        line = line[:-1]

    # Remove potential trailing whitespace
    line = line.strip()

    sniff_line = line

    # Add a delimiter if none is present, so that the csv.Sniffer
    # does not complain for a single-field CSV.
    if not any(d in line for d in delimiters):
        sniff_line += ","

    if dialect is None:
        dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
        workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
                                              dialect=dialect,
                                              delimiters=delimiters)

    row = next(csv.reader([line], dialect))

    return row, dialect

498 

499 

500# -------------- 

501# Parsing header 

502# -------------- 

def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (e.g., starts by @attribute).

    Given a raw string attribute, try to get the name and type of the
    attribute. Constraints:

    * The first line must start with @attribute (case insensitive, and
      space like characters before @attribute are allowed)
    * Works also if the attribute is spread on multilines.
    * Works if empty lines or comments are in between

    Parameters
    ----------
    iterable : iterator of str
        Line iterator for the file; advanced past the attribute line.
    attribute : str
        the attribute string.

    Returns
    -------
    attribute : Attribute
        the parsed attribute object
    next : str
        next line to be parsed

    Examples
    --------
    If attribute is a string defined in python as r"floupi real", will
    return floupi as name, and real as value.

    >>> from scipy.io.arff._arffread import tokenize_attribute
    >>> iterable = iter([0] * 10) # dummy iterator
    >>> tokenize_attribute(iterable, r"@attribute floupi real")
    ('floupi', 'real', 0)

    If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
    and real as value.

    >>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real   ")
    ('floupi 2', 'real', 0)

    """
    sattr = attribute.strip()
    mattr = r_attribute.match(sattr)
    if mattr:
        # atrv is everything after @attribute
        atrv = mattr.group(1)
        # Quoted names ('name with spaces') and plain names need
        # different tokenizers.
        if r_comattrval.match(atrv):
            name, type = tokenize_single_comma(atrv)
            next_item = next(iterable)
        elif r_wcomattrval.match(atrv):
            name, type = tokenize_single_wcomma(atrv)
            next_item = next(iterable)
        else:
            # Not sure we should support this, as it does not seem supported by
            # weka.
            raise ValueError("multi line not supported yet")
    else:
        raise ValueError("First line unparsable: %s" % sattr)

    attribute = to_attribute(name, type)

    # Relational attributes own a nested block of @attribute lines that
    # must be consumed before returning.
    if type.lower() == 'relational':
        next_item = read_relational_attribute(iterable, attribute, next_item)
        # raise ValueError("relational attributes not supported yet")

    return attribute, next_item

570 

571 

def tokenize_single_comma(val):
    """Split a declaration whose attribute name is quoted with ''.

    Returns (name, type) with surrounding whitespace stripped.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_comattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        return m.group(1).strip(), m.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e

585 

586 

def tokenize_single_wcomma(val):
    """Split a declaration with an unquoted attribute name.

    Returns (name, type) with surrounding whitespace stripped.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_wcomattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        return m.group(1).strip(), m.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e

600 

601 

def read_relational_attribute(ofile, relational_attribute, i):
    """Read the nested attributes of a relational attribute.

    Consumes lines from *ofile* until the matching '@end <name>' marker,
    appending each nested @attribute to relational_attribute.attributes.
    Returns the first line after the @end marker.
    """
    # Terminator: '@end <attribute name>' (case-insensitive keyword).
    r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
                                  relational_attribute.name + r'\s*$')

    while not r_end_relational.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines itself and hands back
                # the next unparsed line.
                attr, i = tokenize_attribute(ofile, i)
                relational_attribute.attributes.append(attr)
            else:
                raise ValueError("Error parsing line %s" % i)
        else:
            # Comment/blank line: skip it.
            i = next(ofile)

    # Skip the @end line itself.
    i = next(ofile)
    return i

622 

623 

def read_header(ofile):
    """Read the header of the iterable ofile.

    Consumes lines up to and including the @data marker.

    Parameters
    ----------
    ofile : iterator of str
        Line iterator over the file.

    Returns
    -------
    relation : str or None
        Name from the @relation line, if present.
    attributes : list of Attribute
        Parsed @attribute declarations, in file order.
    """
    i = next(ofile)

    # Pass first comments
    while r_comment.match(i):
        i = next(ofile)

    # Header is everything up to DATA attribute ?
    relation = None
    attributes = []
    while not r_datameta.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines itself and returns the
                # next unparsed line.
                attr, i = tokenize_attribute(ofile, i)
                attributes.append(attr)
            else:
                isrel = r_relation.match(i)
                if isrel:
                    relation = isrel.group(1)
                else:
                    raise ValueError("Error parsing line %s" % i)
                i = next(ofile)
        else:
            # Comment/blank line inside the header: skip it.
            i = next(ofile)

    return relation, attributes

653 

654 

class MetaData:
    """Small container to keep useful information on a ARFF dataset.

    Knows about attributes names and types.

    Examples
    --------
    ::

        data, meta = loadarff('iris.arff')
        # This will print the attributes names of the iris.arff dataset
        for i in meta:
            print(i)
        # This works too
        meta.names()
        # Getting attribute type
        types = meta.types()

    Methods
    -------
    names
    types

    Notes
    -----
    Also maintains the list of attributes in order, i.e., doing for i in
    meta, where meta is an instance of MetaData, will return the
    different attribute names in the order they were defined.
    """
    def __init__(self, rel, attr):
        self.name = rel
        self._attributes = {a.name: a for a in attr}

    def __repr__(self):
        lines = ["Dataset: %s" % self.name]
        for attr_name, attribute in self._attributes.items():
            descr = f"\t{attr_name}'s type is {attribute.type_name}"
            if attribute.range:
                descr += ", range is %s" % str(attribute.range)
            lines.append(descr)
        return "\n".join(lines) + "\n"

    def __iter__(self):
        return iter(self._attributes)

    def __getitem__(self, key):
        attribute = self._attributes[key]
        return attribute.type_name, attribute.range

    def names(self):
        """Return the list of attribute names.

        Returns
        -------
        attrnames : list of str
            The attribute names.
        """
        return list(self._attributes)

    def types(self):
        """Return the list of attribute types.

        Returns
        -------
        attr_types : list of str
            The attribute types.
        """
        return [attribute.type_name
                for attribute in self._attributes.values()]

727 

728 

def loadarff(f):
    """
    Read an arff file.

    The data is returned as a record array, which can be accessed much like
    a dictionary of NumPy arrays. For example, if one of the attributes is
    called 'pressure', then its first 10 data points can be accessed from the
    ``data`` record array like so: ``data['pressure'][0:10]``


    Parameters
    ----------
    f : file-like or str
        File-like object to read from, or filename to open.

    Returns
    -------
    data : record array
        The data of the arff file, accessible by attribute names.
    meta : `MetaData`
        Contains information about the arff file such as name and
        type of attributes, the relation (name of the dataset), etc.

    Raises
    ------
    ParseArffError
        This is raised if the given file is not ARFF-formatted.
    NotImplementedError
        The ARFF file has an attribute which is not supported yet.

    Notes
    -----

    This function should be able to read most arff files. Not
    implemented functionality include:

    * date type attributes
    * string type attributes

    It can read files with numeric and nominal attributes. It cannot read
    files with sparse data ({} in the file). However, this function can
    read files with missing data (? in the file), representing the data
    points as NaNs.

    Examples
    --------
    >>> from scipy.io import arff
    >>> from io import StringIO
    >>> content = \"\"\"
    ... @relation foo
    ... @attribute width  numeric
    ... @attribute height numeric
    ... @attribute color  {red,green,blue,yellow,black}
    ... @data
    ... 5.0,3.25,blue
    ... 4.5,3.75,green
    ... 3.0,4.00,red
    ... \"\"\"
    >>> f = StringIO(content)
    >>> data, meta = arff.loadarff(f)
    >>> data
    array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
          dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
    >>> meta
    Dataset: foo
    \twidth's type is numeric
    \theight's type is numeric
    \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')

    """
    if hasattr(f, 'read'):
        # Already a file-like object: the caller keeps ownership, so we
        # must not close it.
        return _loadarff(f)
    # A filename was given: open it ourselves and guarantee it is closed.
    with open(f) as ofile:
        return _loadarff(ofile)

808 

809 

def _loadarff(ofile):
    """Parse an open ARFF file object into (data, meta); see loadarff."""
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg) from e

    # Check whether we have a string attribute (not supported yet)
    hasstr = False
    for a in attr:
        if isinstance(a, StringAttribute):
            hasstr = True

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of converters to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(attr)

    def generator(row_iter, delim=','):
        # Yield one parsed tuple per (non-comment, non-empty) data line.
        # TODO: this is where we are spending time (~80%). I think things
        # could be made more efficiently:
        #   - We could for example "compile" the function, because some values
        #   do not change here.
        #   - The function to convert a line to dtyped values could also be
        #   generated on the fly from a string and be executed instead of
        #   looping.
        #   - The regex are overkill: for comments, checking that a line starts
        #   by % should be enough and faster, and for empty lines, same thing
        #   --> this does not seem to change anything.

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # The dialect sniffed from the first data line is reused for all
        # subsequent lines.
        dialect = None
        for raw in row_iter:
            # We do not abstract skipping comments and empty lines for
            # performance reasons.
            if r_comment.match(raw) or r_empty.match(raw):
                continue

            row, dialect = split_data_line(raw, dialect)

            yield tuple([attr[i].parse_data(row[i]) for i in elems])

    a = list(generator(ofile))
    # No error should happen here: it is a bug otherwise
    data = np.array(a, [(a.name, a.dtype) for a in attr])
    return data, meta

873 

874 

875# ---- 

876# Misc 

877# ---- 

def basic_stats(data):
    """Return (min, max, mean, scaled std) of *data*.

    min/max ignore NaNs; std is scaled by n/(n-1).
    """
    correction = data.size * 1. / (data.size - 1)
    return (np.nanmin(data), np.nanmax(data),
            np.mean(data), np.std(data) * correction)

881 

882 

def print_attribute(name, tp, data):
    """Print a one-line summary of one attribute (stats when numeric)."""
    kind = tp.type_name
    if kind in ('numeric', 'real', 'integer'):
        lo, hi, mean, std = basic_stats(data)
        print(f"{name},{kind},{lo:f},{hi:f},{mean:f},{std:f}")
    else:
        print(str(tp))

890 

891 

def test_weka(filename):
    """Load *filename* and print a per-attribute summary."""
    data, meta = loadarff(filename)
    print(len(data.dtype))
    print(data.size)
    for attr_name in meta:
        print_attribute(attr_name, meta[attr_name], data[attr_name])


# make sure nose does not find this as a test
test_weka.__test__ = False

902 

903 

if __name__ == '__main__':
    # CLI entry point: summarize the ARFF file given as the first argument.
    import sys
    filename = sys.argv[1]
    test_weka(filename)