Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/scipy/io/arff/_arffread.py: 23%

373 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-23 06:43 +0000

1# Last Change: Mon Aug 20 08:00 PM 2007 J 

2import re 

3import datetime 

4 

5import numpy as np 

6 

7import csv 

8import ctypes 

9 

10"""A module to read arff files.""" 

11 

12__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError'] 

13 

14# An Arff file is basically two parts: 

15# - header 

16# - data 

17# 

18# A header has each of its components starting by @META where META is one of 

19# the keyword (attribute of relation, for now). 

20 

21# TODO: 

22# - both integer and reals are treated as numeric -> the integer info 

23# is lost! 

24# - Replace ValueError by ParseError or something 

25 

# We now can handle the following:

27# - numeric and nominal attributes 

28# - missing values for numeric attributes 

29 

# Module-level compiled regexes: compiled once and reused for every line of
# every file read.

# Match any meta line (a line whose first non-blank character is '@')
r_meta = re.compile(r'^\s*@')
# Match a comment
r_comment = re.compile(r'^%')
# Match an empty line
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
# Case-insensitive keyword matchers for @data / @relation / @attribute
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')

# Nominal attribute values are framed between braces: {a, b, c}
r_nominal = re.compile(r'{(.+)}')
# Date attribute: 'date <format>' where the format may be quoted
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")

# To get attributes name enclosed with ''
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")

48 

49# ------------------------ 

50# Module defined exception 

51# ------------------------ 

52 

53 

class ArffError(OSError):
    """Base error raised when reading or parsing an ARFF file fails."""
    pass

56 

57 

class ParseArffError(ArffError):
    """Error raised when the file content is not valid ARFF."""
    pass

60 

61 

62# ---------- 

63# Attributes 

64# ---------- 

class Attribute:
    """Base class for ARFF attribute types.

    Subclasses override ``parse_attribute`` to recognize their own
    declaration syntax and ``parse_data`` to convert raw field strings.
    """

    # Human-readable type label; overridden by each subclass.
    type_name = None

    def __init__(self, name):
        self.name = name
        # Value range shown to users (None when not applicable).
        self.range = None
        # numpy dtype used for this attribute's column.
        self.dtype = np.object_

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.
        """
        return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.
        """
        return None

    def __str__(self):
        """
        Return a short "name,type" description of this attribute.
        """
        return self.name + ',' + self.type_name

93 

94 

class NominalAttribute(Attribute):
    """Attribute taking its values from a fixed set, e.g. ``{red,green}``."""

    type_name = 'nominal'

    def __init__(self, name, values):
        super().__init__(name)
        self.values = values
        self.range = values
        # Fixed-width byte-string dtype, wide enough for the longest value.
        self.dtype = (np.bytes_, max(len(i) for i in values))

    @staticmethod
    def _get_nom_val(atrv):
        """Given a string containing a nominal type, returns a tuple of the
        possible values.

        A nominal type is defined as something framed between braces ({}).

        Parameters
        ----------
        atrv : str
            Nominal type definition

        Returns
        -------
        poss_vals : tuple
            possible values

        Examples
        --------
        >>> from scipy.io.arff._arffread import NominalAttribute
        >>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
        ('floup', 'bouga', 'fl', 'ratata')
        """
        m = r_nominal.match(atrv)
        if m:
            attrs, _ = split_data_line(m.group(1))
            return tuple(attrs)
        else:
            raise ValueError("This does not look like a nominal string")

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For nominal attributes, the attribute string would be like '{<attr_1>,
        <attr2>, <attr_3>}'.
        """
        if attr_string[0] == '{':
            values = cls._get_nom_val(attr_string)
            return cls(name, values)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        The value must be one of the declared nominal values, or the
        missing-value marker '?'.
        """
        if data_str in self.values:
            return data_str
        elif data_str == '?':
            return data_str
        else:
            raise ValueError("{} value not in {}".format(str(data_str),
                                                         str(self.values)))

    def __str__(self):
        # str.join instead of the previous quadratic += concatenation loop.
        return self.name + ",{" + ",".join(self.values) + "}"

169 

170 

class NumericAttribute(Attribute):
    """Attribute holding floating point data ('numeric', 'int' or 'real')."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'numeric'
        self.dtype = np.float64

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For numeric attributes, the attribute string would be like
        'numeric' or 'int' or 'real'.
        """
        attr_string = attr_string.lower().strip()

        # str.startswith accepts a tuple: one call instead of three slice
        # comparisons (behavior is identical).
        if attr_string.startswith(('numeric', 'int', 'real')):
            return cls(name)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.

        Parameters
        ----------
        data_str : str
            string to convert

        Returns
        -------
        f : float
            where float can be nan

        Examples
        --------
        >>> from scipy.io.arff._arffread import NumericAttribute
        >>> atr = NumericAttribute('atr')
        >>> atr.parse_data('1')
        1.0
        >>> atr.parse_data('1\\n')
        1.0
        >>> atr.parse_data('?\\n')
        nan
        """
        # A '?' anywhere in the field marks a missing value.
        if '?' in data_str:
            return np.nan
        else:
            return float(data_str)

    def _basic_stats(self, data):
        # min/max ignore NaNs; std carries the n/(n-1) correction factor.
        nbfac = data.size * 1. / (data.size - 1)
        return (np.nanmin(data), np.nanmax(data),
                np.mean(data), np.std(data) * nbfac)

231 

232 

class StringAttribute(Attribute):
    """Attribute holding free-form text ('string')."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'string'

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For string attributes, the attribute string would be like
        'string'.
        """
        normalized = attr_string.lower().strip()

        if normalized[:len('string')] == 'string':
            return cls(name)
        return None

255 

256 

class DateAttribute(Attribute):
    """Attribute holding datetime values declared with a Java
    SimpleDateFormat-style pattern ('date <format>')."""

    def __init__(self, name, date_format, datetime_unit):
        super().__init__(name)
        # strptime-compatible format string.
        self.date_format = date_format
        # numpy datetime64 unit code of the finest component in the format.
        self.datetime_unit = datetime_unit
        self.type_name = 'date'
        self.range = date_format
        self.dtype = np.datetime64(0, self.datetime_unit)

    @staticmethod
    def _get_date_format(atrv):
        """Convert a Java SimpleDateFormat pattern into a C (strptime)
        pattern and determine the finest datetime64 unit present.

        Raises
        ------
        ValueError
            If *atrv* is not a date declaration, uses a time zone, or
            contains no recognized date/time component.
        """
        m = r_date.match(atrv)
        if m:
            pattern = m.group(1).strip()
            # convert time pattern from Java's SimpleDateFormat to C's format
            # Later (finer-grained) components overwrite datetime_unit, so it
            # ends up holding the finest unit appearing in the pattern.
            datetime_unit = None
            if "yyyy" in pattern:
                pattern = pattern.replace("yyyy", "%Y")
                datetime_unit = "Y"
            elif "yy" in pattern:
                # BUG FIX: this branch previously tested the constant "yy"
                # (always truthy), which silently accepted patterns with no
                # recognized component instead of reaching the error below.
                pattern = pattern.replace("yy", "%y")
                datetime_unit = "Y"
            if "MM" in pattern:
                pattern = pattern.replace("MM", "%m")
                datetime_unit = "M"
            if "dd" in pattern:
                pattern = pattern.replace("dd", "%d")
                datetime_unit = "D"
            if "HH" in pattern:
                pattern = pattern.replace("HH", "%H")
                datetime_unit = "h"
            if "mm" in pattern:
                pattern = pattern.replace("mm", "%M")
                datetime_unit = "m"
            if "ss" in pattern:
                pattern = pattern.replace("ss", "%S")
                datetime_unit = "s"
            if "z" in pattern or "Z" in pattern:
                raise ValueError("Date type attributes with time zone not "
                                 "supported, yet")

            if datetime_unit is None:
                raise ValueError("Invalid or unsupported date format")

            return pattern, datetime_unit
        else:
            raise ValueError("Invalid or no date format")

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For date attributes, the attribute string would be like
        'date <format>'.
        """
        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('date')] == 'date':
            # Pass the original string: %m vs %M etc. is case-sensitive.
            date_format, datetime_unit = cls._get_date_format(attr_string)
            return cls(name, date_format, datetime_unit)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.
        """
        date_str = data_str.strip().strip("'").strip('"')
        if date_str == '?':
            # Missing value -> not-a-time in this attribute's unit.
            return np.datetime64('NaT', self.datetime_unit)
        else:
            dt = datetime.datetime.strptime(date_str, self.date_format)
            return np.datetime64(dt).astype(
                "datetime64[%s]" % self.datetime_unit)

    def __str__(self):
        return super().__str__() + ',' + self.date_format

338 

339 

class RelationalAttribute(Attribute):
    """Container attribute whose values are themselves rows of nested
    attributes (ARFF 'relational' type)."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'relational'
        self.dtype = np.object_
        # Nested attributes, filled in by read_relational_attribute().
        self.attributes = []
        # csv dialect sniffed on the first nested row, reused afterwards.
        self.dialect = None

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.

        For relational attributes, the attribute string is simply
        'relational'.
        """
        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('relational')] == 'relational':
            return cls(name)
        else:
            return None

    def parse_data(self, data_str):
        """
        Parse one relational value: an escaped, newline-separated block of
        rows, each parsed with the nested attributes.
        """
        # Mirrors the row loop of _loadarff's generator.
        elems = list(range(len(self.attributes)))

        # The nested rows arrive escaped inside a single field; undo that.
        escaped_string = data_str.encode().decode("unicode-escape")

        row_tuples = []

        for raw in escaped_string.split("\n"):
            row, self.dialect = split_data_line(raw, self.dialect)

            row_tuples.append(tuple(
                [self.attributes[i].parse_data(row[i]) for i in elems]))

        # Structured array typed by the nested attributes' dtypes.
        return np.array(row_tuples,
                        [(a.name, a.dtype) for a in self.attributes])

    def __str__(self):
        return (super().__str__() + '\n\t' +
                '\n\t'.join(str(a) for a in self.attributes))

386 

387 

388# ----------------- 

389# Various utilities 

390# ----------------- 

def to_attribute(name, attr_string):
    """Build the Attribute subclass instance matching *attr_string*.

    Each known attribute class is offered the declaration in turn; the
    first one that recognizes it wins.

    Raises
    ------
    ParseArffError
        If no attribute class recognizes the declaration.
    """
    candidate_classes = (NominalAttribute, NumericAttribute, DateAttribute,
                         StringAttribute, RelationalAttribute)

    for candidate in candidate_classes:
        parsed = candidate.parse_attribute(name, attr_string)
        if parsed is not None:
            return parsed

    raise ParseArffError("unknown attribute %s" % attr_string)

401 

402 

def csv_sniffer_has_bug_last_field():
    """
    Checks if the bug https://bugs.python.org/issue30157 is unpatched.

    The result is computed once and memoized on the function object.
    """
    cached = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)
    if cached is not None:
        return cached

    # Sniff a line whose last field is quoted; a buggy Sniffer fails to
    # pick up the single quote as the quote character.
    sniffed = csv.Sniffer().sniff("3, 'a'")
    result = sniffed.quotechar != "'"
    csv_sniffer_has_bug_last_field.has_bug = result
    return result

417 

418 

def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
    """
    Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.

    Re-runs the sniffing regexes from the csv module on *sniff_line* and,
    when the line matched the buggy "quoted last field" expression, patches
    *dialect* in place (quotechar, delimiter, doublequote, skipinitialspace).
    """
    if csv_sniffer_has_bug_last_field():
        # Reuses code from the csv module
        right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'

        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # .*?",
                      right_regex, # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(sniff_line)
            if matches:
                break

        # If it does not match the expression that was bugged, then this bug does not apply
        if restr != right_regex:
            return

        groupindex = regexp.groupindex

        # There is only one end of the string
        assert len(matches) == 1
        m = matches[0]

        # findall() returns tuples of group values; pick fields out by the
        # named groups' 1-based positions.
        n = groupindex['quote'] - 1
        quote = m[n]

        n = groupindex['delim'] - 1
        delim = m[n]

        n = groupindex['space'] - 1
        space = bool(m[n])

        # Detect doubled quote characters used as escapes ("doublequote").
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
            {'delim': re.escape(delim), 'quote': quote}, re.MULTILINE
        )

        doublequote = bool(dq_regexp.search(sniff_line))

        dialect.quotechar = quote
        if delim in delimiters:
            dialect.delimiter = delim
        dialect.doublequote = doublequote
        dialect.skipinitialspace = space

467 

468 

def split_data_line(line, dialect=None):
    """Split one ARFF data line into its fields using the csv module.

    Parameters
    ----------
    line : str
        Raw data line; a trailing newline is allowed.
    dialect : csv.Dialect, optional
        Previously sniffed dialect to reuse. When None, the dialect is
        sniffed from *line* and returned so callers can cache it.

    Returns
    -------
    row : list of str
        The parsed fields.
    dialect : csv.Dialect
        The dialect actually used, for reuse on subsequent lines.
    """
    delimiters = ",\t"

    # This can not be done in a per reader basis, and relational fields
    # can be HUGE
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

    # Remove the line end if any. Guard against an empty string, which
    # previously raised IndexError on line[-1].
    if line and line[-1] == '\n':
        line = line[:-1]

    # Remove potential trailing whitespace
    line = line.strip()

    sniff_line = line

    # Add a delimiter if none is present, so that the csv.Sniffer
    # does not complain for a single-field CSV.
    if not any(d in line for d in delimiters):
        sniff_line += ","

    if dialect is None:
        dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
        workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
                                              dialect=dialect,
                                              delimiters=delimiters)

    row = next(csv.reader([line], dialect))

    return row, dialect

499 

500 

501# -------------- 

502# Parsing header 

503# -------------- 

def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (e.g., starts by @attribute).

    Given a raw string attribute, try to get the name and type of the
    attribute. Constraints:

    * The first line must start with @attribute (case insensitive, and
      space like characters before @attribute are allowed)
    * Works also if the attribute is spread on multilines.
    * Works if empty lines or comments are in between

    Parameters
    ----------
    iterable : iterator of str
        the remaining header lines; consumed as needed.
    attribute : str
        the attribute line to parse.

    Returns
    -------
    attribute : Attribute
        the parsed attribute object.
    next_item : str
        next line to be parsed.

    Examples
    --------
    For the line r"@attribute floupi real", the parsed attribute has name
    'floupi' and numeric type. For r" @attribute 'floupi 2' real   ", the
    quoted name 'floupi 2' (which contains a space) is used.
    """
    sattr = attribute.strip()
    mattr = r_attribute.match(sattr)
    if mattr:
        # atrv is everything after @attribute
        atrv = mattr.group(1)
        if r_comattrval.match(atrv):
            # Quoted attribute name: may contain spaces.
            name, type = tokenize_single_comma(atrv)
            next_item = next(iterable)
        elif r_wcomattrval.match(atrv):
            # Unquoted attribute name: a single whitespace-free token.
            name, type = tokenize_single_wcomma(atrv)
            next_item = next(iterable)
        else:
            # Not sure we should support this, as it does not seem supported by
            # weka.
            raise ValueError("multi line not supported yet")
    else:
        raise ValueError("First line unparsable: %s" % sattr)

    attribute = to_attribute(name, type)

    if type.lower() == 'relational':
        # Consume the nested declarations up to the '@end <name>' marker.
        next_item = read_relational_attribute(iterable, attribute, next_item)

    return attribute, next_item

571 

572 

def tokenize_single_comma(val):
    """Split a quoted attribute declaration into (name, type).

    *val* looks like ``'some name' <type>``; the quoted name may contain
    spaces.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_comattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        return m.group(1).strip(), m.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e

586 

587 

def tokenize_single_wcomma(val):
    """Split an unquoted attribute declaration into (name, type).

    *val* looks like ``some_name <type>``; the name contains no spaces.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_wcomattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        return m.group(1).strip(), m.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e

601 

602 

def read_relational_attribute(ofile, relational_attribute, i):
    """Read the nested attributes of a relational attribute.

    Consumes lines from *ofile*, starting at line *i*, until the matching
    '@end <name>' marker; each nested '@attribute' is appended to
    ``relational_attribute.attributes``. Returns the first line after the
    '@end' marker.
    """
    # NOTE(review): the attribute name is interpolated unescaped; a name
    # containing regex metacharacters would misbehave — consider re.escape.
    r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
                                  relational_attribute.name + r'\s*$')

    while not r_end_relational.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines and returns the next one.
                attr, i = tokenize_attribute(ofile, i)
                relational_attribute.attributes.append(attr)
            else:
                raise ValueError("Error parsing line %s" % i)
        else:
            # Skip comments / blank lines inside the relational block.
            i = next(ofile)

    # Skip the '@end <name>' line itself.
    i = next(ofile)
    return i

623 

624 

def read_header(ofile):
    """Read the header of the iterable ofile.

    Consumes *ofile* up to and including the '@data' line.

    Returns
    -------
    relation : str or None
        Name captured from the '@relation' line, if present.
    attributes : list of Attribute
        The parsed '@attribute' declarations, in order of appearance.
    """
    i = next(ofile)

    # Pass first comments
    while r_comment.match(i):
        i = next(ofile)

    # Header is everything up to DATA attribute ?
    relation = None
    attributes = []
    while not r_datameta.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines and returns the next one.
                attr, i = tokenize_attribute(ofile, i)
                attributes.append(attr)
            else:
                isrel = r_relation.match(i)
                if isrel:
                    relation = isrel.group(1)
                else:
                    raise ValueError("Error parsing line %s" % i)
                i = next(ofile)
        else:
            # Skip comments and blank lines between declarations.
            i = next(ofile)

    return relation, attributes

654 

655 

class MetaData:
    """Small container to keep useful information on a ARFF dataset.

    Knows about attributes names and types.

    Examples
    --------
    ::

        data, meta = loadarff('iris.arff')
        # This will print the attributes names of the iris.arff dataset
        for i in meta:
            print(i)
        # This works too
        meta.names()
        # Getting attribute type
        types = meta.types()

    Methods
    -------
    names
    types

    Notes
    -----
    Also maintains the list of attributes in order, i.e., doing for i in
    meta, where meta is an instance of MetaData, will return the
    different attribute names in the order they were defined.
    """
    def __init__(self, rel, attr):
        self.name = rel
        # Insertion-ordered mapping: attribute name -> attribute object.
        self._attributes = {a.name: a for a in attr}

    def __repr__(self):
        pieces = ["Dataset: %s\n" % self.name]
        for attr_name, attribute in self._attributes.items():
            description = f"\t{attr_name}'s type is {attribute.type_name}"
            if attribute.range:
                description += ", range is %s" % str(attribute.range)
            pieces.append(description + '\n')
        return "".join(pieces)

    def __iter__(self):
        return iter(self._attributes)

    def __getitem__(self, key):
        attribute = self._attributes[key]
        return (attribute.type_name, attribute.range)

    def names(self):
        """Return the list of attribute names.

        Returns
        -------
        attrnames : list of str
            The attribute names.
        """
        return list(self._attributes)

    def types(self):
        """Return the list of attribute types.

        Returns
        -------
        attr_types : list of str
            The attribute types.
        """
        return [attribute.type_name
                for attribute in self._attributes.values()]

728 

729 

def loadarff(f):
    """
    Read an arff file.

    The data is returned as a record array, which can be accessed much like
    a dictionary of NumPy arrays. For example, if one of the attributes is
    called 'pressure', then its first 10 data points can be accessed from the
    ``data`` record array like so: ``data['pressure'][0:10]``


    Parameters
    ----------
    f : file-like or str
        File-like object to read from, or filename to open.

    Returns
    -------
    data : record array
        The data of the arff file, accessible by attribute names.
    meta : `MetaData`
        Contains information about the arff file such as name and
        type of attributes, the relation (name of the dataset), etc.

    Raises
    ------
    ParseArffError
        This is raised if the given file is not ARFF-formatted.
    NotImplementedError
        The ARFF file has an attribute which is not supported yet.

    Notes
    -----

    This function should be able to read most arff files. Not
    implemented functionality include:

    * date type attributes
    * string type attributes

    It can read files with numeric and nominal attributes. It cannot read
    files with sparse data ({} in the file). However, this function can
    read files with missing data (? in the file), representing the data
    points as NaNs.

    Examples
    --------
    >>> from scipy.io import arff
    >>> from io import StringIO
    >>> content = \"\"\"
    ... @relation foo
    ... @attribute width  numeric
    ... @attribute height numeric
    ... @attribute color  {red,green,blue,yellow,black}
    ... @data
    ... 5.0,3.25,blue
    ... 4.5,3.75,green
    ... 3.0,4.00,red
    ... \"\"\"
    >>> f = StringIO(content)
    >>> data, meta = arff.loadarff(f)
    >>> data
    array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
          dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
    >>> meta
    Dataset: foo
    \twidth's type is numeric
    \theight's type is numeric
    \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')

    """
    # Accept either an already-open file-like object or a path to open here.
    opened_here = not hasattr(f, 'read')
    ofile = open(f) if opened_here else f
    try:
        return _loadarff(ofile)
    finally:
        if opened_here:  # only close what we opened
            ofile.close()

809 

810 

def _loadarff(ofile):
    """Parse an open ARFF file-like object into (structured array, MetaData)."""
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg) from e

    # Check whether we have a string attribute (not supported yet)
    hasstr = False
    for a in attr:
        if isinstance(a, StringAttribute):
            hasstr = True

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(attr)

    def generator(row_iter, delim=','):
        # TODO: this is where we are spending time (~80%). I think things
        # could be made more efficiently:
        # - We could for example "compile" the function, because some values
        # do not change here.
        # - The function to convert a line to dtyped values could also be
        # generated on the fly from a string and be executed instead of
        # looping.
        # - The regex are overkill: for comments, checking that a line starts
        # by % should be enough and faster, and for empty lines, same thing
        # --> this does not seem to change anything.

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # Sniffed csv dialect is cached after the first data row and
        # reused for the rest of the file.
        dialect = None
        for raw in row_iter:
            # We do not abstract skipping comments and empty lines for
            # performance reasons.
            if r_comment.match(raw) or r_empty.match(raw):
                continue

            row, dialect = split_data_line(raw, dialect)

            yield tuple([attr[i].parse_data(row[i]) for i in elems])

    a = list(generator(ofile))
    # No error should happen here: it is a bug otherwise
    data = np.array(a, [(a.name, a.dtype) for a in attr])
    return data, meta

874 

875 

876# ---- 

877# Misc 

878# ---- 

def basic_stats(data):
    """Return (min, max, mean, std) of *data*; min/max ignore NaNs and the
    standard deviation carries the n/(n-1) correction factor."""
    correction = data.size / (data.size - 1)
    return (np.nanmin(data), np.nanmax(data), np.mean(data),
            np.std(data) * correction)

882 

883 

def print_attribute(name, tp, data):
    """Print a one-line summary of attribute *name*: basic statistics for
    numeric data, the attribute's string form otherwise."""
    kind = tp.type_name
    if kind in ('numeric', 'real', 'integer'):
        lo, hi, mean, std = basic_stats(data)
        print(f"{name},{kind},{lo:f},{hi:f},{mean:f},{std:f}")
    else:
        print(str(tp))

891 

892 

def test_weka(filename):
    """Load *filename* and print a short summary of every attribute."""
    data, meta = loadarff(filename)
    print(len(data.dtype))
    print(data.size)
    for attr_name in meta:
        print_attribute(attr_name, meta[attr_name], data[attr_name])


# make sure nose does not find this as a test
test_weka.__test__ = False

903 

904 

if __name__ == '__main__':
    import sys

    # CLI usage: python _arffread.py <file.arff>
    test_weka(sys.argv[1])