Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/scipy/io/arff/_arffread.py: 23%

373 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-02-14 06:37 +0000

1# Last Change: Mon Aug 20 08:00 PM 2007 J 

2import re 

3import datetime 

4 

5import numpy as np 

6 

7import csv 

8import ctypes 

9 

"""A module to read arff files."""

__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']

# An Arff file is basically two parts:
#  - header
#  - data
#
# A header has each of its components starting by @META where META is one of
# the keyword (attribute of relation, for now).

# TODO:
#   - both integer and reals are treated as numeric -> the integer info
#    is lost!
#   - Replace ValueError by ParseError or something

# We now can handle the following:
#   - numeric and nominal attributes
#   - missing values for numeric attributes

# Matches any meta line (a line whose first non-blank character is '@')
r_meta = re.compile(r'^\s*@')
# Match a comment
r_comment = re.compile(r'^%')
# Match an empty line
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
# Case-insensitive markers for the @data, @relation and @attribute keywords.
# r_relation captures the relation name; r_attribute captures everything
# after the keyword (name + type declaration).
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')

# Nominal value set framed between braces; date declaration with an
# optionally quoted format string
r_nominal = re.compile(r'{(.+)}')
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")

# To get attributes name enclosed with ''
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")

48 

49# ------------------------ 

50# Module defined exception 

51# ------------------------ 

52 

53 

class ArffError(OSError):
    """Base exception for any problem encountered reading an ARFF file."""
    pass

56 

57 

class ParseArffError(ArffError):
    """Raised when the file content is not valid ARFF."""
    pass

60 

61 

62# ---------- 

63# Attributes 

64# ---------- 

class Attribute:
    """Base class for ARFF attribute types.

    Subclasses override :meth:`parse_attribute` to recognise their own
    ``@attribute`` declaration and :meth:`parse_data` to convert one raw
    data field into a value of the appropriate type.
    """

    type_name = None

    def __init__(self, name):
        self.name = name
        self.range = None
        self.dtype = np.object_

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a parsed attribute if this class understands the
        declaration *attr_string*, otherwise None.  The base class never
        matches anything."""
        return None

    def parse_data(self, data_str):
        """Convert one raw field; the base class parses nothing."""
        return None

    def __str__(self):
        """Return a 'name,type' summary of the attribute."""
        return ','.join((self.name, self.type_name))

93 

94 

class NominalAttribute(Attribute):
    """Attribute whose values come from a fixed, enumerated set."""

    type_name = 'nominal'

    def __init__(self, name, values):
        super().__init__(name)
        self.values = values
        self.range = values
        # Fixed-width byte strings, wide enough for the longest value.
        self.dtype = (np.bytes_, max(len(v) for v in values))

    @staticmethod
    def _get_nom_val(atrv):
        """Given a string containing a nominal type, returns a tuple of the
        possible values.

        A nominal type is defined as something framed between braces ({}).

        Parameters
        ----------
        atrv : str
            Nominal type definition

        Returns
        -------
        poss_vals : tuple
            possible values

        Examples
        --------
        >>> from scipy.io.arff._arffread import NominalAttribute
        >>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
        ('floup', 'bouga', 'fl', 'ratata')
        """
        match = r_nominal.match(atrv)
        if not match:
            raise ValueError("This does not look like a nominal string")
        parsed, _ = split_data_line(match.group(1))
        return tuple(parsed)

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a NominalAttribute when *attr_string* is a braced value
        set like '{a, b, c}', otherwise None."""
        if attr_string[0] == '{':
            return cls(name, cls._get_nom_val(attr_string))
        return None

    def parse_data(self, data_str):
        """Return *data_str* unchanged when it is a declared value or the
        missing-value marker '?'; raise ValueError otherwise."""
        if data_str == '?' or data_str in self.values:
            return data_str
        raise ValueError(f"{str(data_str)} value not in {str(self.values)}")

    def __str__(self):
        return self.name + ",{" + ",".join(self.values) + "}"

168 

169 

class NumericAttribute(Attribute):
    """Real-valued attribute ('numeric', 'int' or 'real' in the header)."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'numeric'
        self.dtype = np.float64

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a NumericAttribute when *attr_string* declares a numeric
        type ('numeric', 'int...' or 'real...'), otherwise None."""
        spec = attr_string.lower().strip()
        if spec.startswith(('numeric', 'int', 'real')):
            return cls(name)
        return None

    def parse_data(self, data_str):
        """Convert one field to float; a missing value ('?') maps to nan.

        Parameters
        ----------
        data_str : str
            string to convert

        Returns
        -------
        f : float
            where float can be nan

        Examples
        --------
        >>> from scipy.io.arff._arffread import NumericAttribute
        >>> atr = NumericAttribute('atr')
        >>> atr.parse_data('1')
        1.0
        >>> atr.parse_data('?\\n')
        nan
        """
        return np.nan if '?' in data_str else float(data_str)

    def _basic_stats(self, data):
        # Scale the std by n/(n-1), mirroring the module-level basic_stats
        # helper (note: the factor is applied to std, not variance).
        factor = data.size * 1. / (data.size - 1)
        return (np.nanmin(data), np.nanmax(data),
                np.mean(data), np.std(data) * factor)

230 

231 

class StringAttribute(Attribute):
    """Free-text attribute ('string' in the header).

    Recognised during header parsing but loading such files is rejected
    later with NotImplementedError.
    """

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'string'

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a StringAttribute when *attr_string* declares a string
        type, otherwise None."""
        if attr_string.lower().strip().startswith('string'):
            return cls(name)
        return None

254 

255 

class DateAttribute(Attribute):
    """Date attribute declared with a Java SimpleDateFormat-style pattern.

    Parameters
    ----------
    name : str
        Attribute name.
    date_format : str
        C (strftime) pattern converted from the header's Java-style pattern.
    datetime_unit : str
        numpy datetime64 unit code of the finest unit present in the
        pattern ('Y', 'M', 'D', 'h', 'm' or 's').
    """

    def __init__(self, name, date_format, datetime_unit):
        super().__init__(name)
        self.date_format = date_format
        self.datetime_unit = datetime_unit
        self.type_name = 'date'
        self.range = date_format
        self.dtype = np.datetime64(0, self.datetime_unit)

    @staticmethod
    def _get_date_format(atrv):
        """Convert the Java SimpleDateFormat pattern in *atrv* to a C
        strftime pattern.

        Returns
        -------
        pattern : str
            strftime-style format string.
        datetime_unit : str
            datetime64 unit code for the finest token found.

        Raises
        ------
        ValueError
            If no date format is present, the pattern contains a time
            zone, or no recognised date/time token is found.
        """
        m = r_date.match(atrv)
        if not m:
            raise ValueError("Invalid or no date format")

        pattern = m.group(1).strip()
        # convert time pattern from Java's SimpleDateFormat to C's format
        datetime_unit = None
        if "yyyy" in pattern:
            pattern = pattern.replace("yyyy", "%Y")
            datetime_unit = "Y"
        elif "yy" in pattern:
            # BUG FIX: this was `elif "yy":`, which is always truthy, so any
            # pattern without a recognised token was silently accepted with
            # unit 'Y' and the "Invalid or unsupported" check below was
            # unreachable.
            pattern = pattern.replace("yy", "%y")
            datetime_unit = "Y"
        if "MM" in pattern:
            pattern = pattern.replace("MM", "%m")
            datetime_unit = "M"
        if "dd" in pattern:
            pattern = pattern.replace("dd", "%d")
            datetime_unit = "D"
        if "HH" in pattern:
            pattern = pattern.replace("HH", "%H")
            datetime_unit = "h"
        if "mm" in pattern:
            pattern = pattern.replace("mm", "%M")
            datetime_unit = "m"
        if "ss" in pattern:
            pattern = pattern.replace("ss", "%S")
            datetime_unit = "s"
        if "z" in pattern or "Z" in pattern:
            raise ValueError("Date type attributes with time zone not "
                             "supported, yet")

        if datetime_unit is None:
            raise ValueError("Invalid or unsupported date format")

        return pattern, datetime_unit

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a DateAttribute when *attr_string* declares a date type
        ('date <format>'), otherwise None."""
        attr_string_lower = attr_string.lower().strip()

        if attr_string_lower[:len('date')] == 'date':
            date_format, datetime_unit = cls._get_date_format(attr_string)
            return cls(name, date_format, datetime_unit)
        else:
            return None

    def parse_data(self, data_str):
        """Parse one date field; the missing marker '?' maps to NaT."""
        date_str = data_str.strip().strip("'").strip('"')
        if date_str == '?':
            return np.datetime64('NaT', self.datetime_unit)
        else:
            dt = datetime.datetime.strptime(date_str, self.date_format)
            return np.datetime64(dt).astype(
                "datetime64[%s]" % self.datetime_unit)

    def __str__(self):
        return super().__str__() + ',' + self.date_format

337 

338 

class RelationalAttribute(Attribute):
    """Attribute holding nested records (ARFF 'relational' type)."""

    def __init__(self, name):
        super().__init__(name)
        self.type_name = 'relational'
        self.dtype = np.object_
        # Nested Attribute objects, appended by the header parser.
        self.attributes = []
        # CSV dialect reused across the nested rows of one value.
        self.dialect = None

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """Return a RelationalAttribute when *attr_string* starts with
        'relational' (case-insensitive), otherwise None."""
        if attr_string.lower().strip()[:len('relational')] == 'relational':
            return cls(name)
        return None

    def parse_data(self, data_str):
        """Parse one escaped multi-line field into a structured array with
        one entry per nested row."""
        indices = list(range(len(self.attributes)))

        # The nested rows arrive as a single escaped string; undo the
        # escaping before splitting into lines.
        unescaped = data_str.encode().decode("unicode-escape")

        rows = []
        for raw in unescaped.split("\n"):
            fields, self.dialect = split_data_line(raw, self.dialect)
            rows.append(tuple(self.attributes[k].parse_data(fields[k])
                              for k in indices))

        return np.array(rows, [(a.name, a.dtype) for a in self.attributes])

    def __str__(self):
        nested = '\n\t'.join(str(a) for a in self.attributes)
        return super().__str__() + '\n\t' + nested

385 

386 

387# ----------------- 

388# Various utilities 

389# ----------------- 

def to_attribute(name, attr_string):
    """Build the appropriate Attribute subclass for one @attribute line.

    Each known attribute class is tried in priority order; the first one
    that recognises *attr_string* wins.

    Raises
    ------
    ParseArffError
        If no class recognises the declaration.
    """
    for klass in (NominalAttribute, NumericAttribute, DateAttribute,
                  StringAttribute, RelationalAttribute):
        parsed = klass.parse_attribute(name, attr_string)
        if parsed is not None:
            return parsed

    raise ParseArffError("unknown attribute %s" % attr_string)

400 

401 

def csv_sniffer_has_bug_last_field():
    """
    Checks if the bug https://bugs.python.org/issue30157 is unpatched.
    """
    # Cache the answer on the function object so the sniff runs only once
    # per process.
    cached = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)
    if cached is None:
        # A buggy Sniffer fails to detect the single quote here.
        sniffed = csv.Sniffer().sniff("3, 'a'")
        cached = sniffed.quotechar != "'"
        csv_sniffer_has_bug_last_field.has_bug = cached
    return cached

416 

417 

def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
    """
    Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.

    Re-runs the csv.Sniffer's quote-detection regexes on *sniff_line* and,
    when the line matched only the buggy "quoted field at end of line"
    expression, patches *dialect* in place with the correctly detected
    quotechar/delimiter/doublequote/skipinitialspace.
    """
    if csv_sniffer_has_bug_last_field():
        # Reuses code from the csv module
        # The expression affected by the bug: a quoted field terminated by
        # end-of-line instead of another delimiter.
        right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'  # noqa: E501

        # Same regex cascade (and order) as csv.Sniffer._guess_quote_and_delimiter.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?", # noqa: E501
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',  # .*?", # noqa: E501
                      right_regex,  # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):  # ".*?" (no delim, no space) # noqa: E501
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(sniff_line)
            if matches:
                break

        # If it does not match the expression that was bugged,
        # then this bug does not apply
        if restr != right_regex:
            return

        groupindex = regexp.groupindex

        # There is only one end of the string
        assert len(matches) == 1
        m = matches[0]

        # findall returns plain tuples; map named groups back to positions.
        n = groupindex['quote'] - 1
        quote = m[n]

        n = groupindex['delim'] - 1
        delim = m[n]

        n = groupindex['space'] - 1
        space = bool(m[n])

        # Detect doubled quote characters ("" inside a quoted field).
        dq_regexp = re.compile(
            rf"(({re.escape(delim)})|^)\W*{quote}[^{re.escape(delim)}\n]*{quote}[^{re.escape(delim)}\n]*{quote}\W*(({re.escape(delim)})|$)", re.MULTILINE  # noqa: E501
        )

        doublequote = bool(dq_regexp.search(sniff_line))

        # Patch the sniffed dialect in place with the corrected values.
        dialect.quotechar = quote
        if delim in delimiters:
            dialect.delimiter = delim
        dialect.doublequote = doublequote
        dialect.skipinitialspace = space

466 

467 

def split_data_line(line, dialect=None):
    """Split one raw data line into CSV fields.

    Parameters
    ----------
    line : str
        Raw line from the file (may end with a newline).
    dialect : csv.Dialect or str, optional
        Previously sniffed dialect; when None, it is sniffed from *line*.

    Returns
    -------
    row : list of str
        The parsed fields.
    dialect : csv.Dialect or str
        The dialect used, so callers can reuse it for subsequent lines.
    """
    delimiters = ",\t"

    # This can not be done in a per reader basis, and relational fields
    # can be HUGE
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

    # Remove the line end if any.  BUG FIX: the original indexed line[-1],
    # which raises IndexError on an empty string.
    if line.endswith('\n'):
        line = line[:-1]

    # Remove potential trailing whitespace
    line = line.strip()

    sniff_line = line

    # Add a delimiter if none is present, so that the csv.Sniffer
    # does not complain for a single-field CSV.
    if not any(d in line for d in delimiters):
        sniff_line += ","

    if dialect is None:
        dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
        workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
                                              dialect=dialect,
                                              delimiters=delimiters)

    row = next(csv.reader([line], dialect))

    return row, dialect

498 

499 

500# -------------- 

501# Parsing header 

502# -------------- 

def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (e.g., starts by @attribute).

    Given a raw string attribute, try to get the name and type of the
    attribute. Constraints:

    * The first line must start with @attribute (case insensitive, and
      space like characters before @attribute are allowed)
    * Works also if the attribute is spread on multilines.
    * Works if empty lines or comments are in between

    Parameters
    ----------
    iterable : iterator of str
        Line iterator for the file; advanced past the attribute line.
    attribute : str
        the attribute string.

    Returns
    -------
    attribute : Attribute
        the parsed attribute object
    next : str
        next line to be parsed

    Examples
    --------
    If attribute is a string defined in python as r"floupi real", will
    return floupi as name, and real as value.

    >>> from scipy.io.arff._arffread import tokenize_attribute
    >>> iterable = iter([0] * 10) # dummy iterator
    >>> tokenize_attribute(iterable, r"@attribute floupi real")
    ('floupi', 'real', 0)

    If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
    and real as value.

    >>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real   ")
    ('floupi 2', 'real', 0)

    """
    sattr = attribute.strip()
    mattr = r_attribute.match(sattr)
    if mattr:
        # atrv is everything after @attribute
        atrv = mattr.group(1)
        # Quoted names ('name with spaces') and plain names need
        # different tokenizers.
        if r_comattrval.match(atrv):
            name, type = tokenize_single_comma(atrv)
            next_item = next(iterable)
        elif r_wcomattrval.match(atrv):
            name, type = tokenize_single_wcomma(atrv)
            next_item = next(iterable)
        else:
            # Not sure we should support this, as it does not seem supported by
            # weka.
            raise ValueError("multi line not supported yet")
    else:
        raise ValueError("First line unparsable: %s" % sattr)

    attribute = to_attribute(name, type)

    # Relational attributes own a nested block of @attribute lines that
    # must be consumed before returning.
    if type.lower() == 'relational':
        next_item = read_relational_attribute(iterable, attribute, next_item)
        # raise ValueError("relational attributes not supported yet")

    return attribute, next_item

570 

571 

def tokenize_single_comma(val):
    """Split a declaration whose attribute name is quoted with ''.

    Returns (name, type) with surrounding whitespace stripped.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_comattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        return m.group(1).strip(), m.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e

585 

586 

def tokenize_single_wcomma(val):
    """Split a declaration with an unquoted attribute name.

    Returns (name, type) with surrounding whitespace stripped.
    """
    # XXX we match twice the same string (here and at the caller level). It is
    # stupid, but it is easier for now...
    m = r_wcomattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    try:
        return m.group(1).strip(), m.group(2).strip()
    except IndexError as e:
        raise ValueError("Error while tokenizing attribute") from e

600 

601 

def read_relational_attribute(ofile, relational_attribute, i):
    """Read the nested attributes of a relational attribute.

    Consumes lines from *ofile* until the matching '@end <name>' marker,
    appending each nested @attribute to relational_attribute.attributes.
    Returns the first line after the @end marker.
    """
    # Terminator: '@end <attribute name>' (case-insensitive keyword).
    r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
                                  relational_attribute.name + r'\s*$')

    while not r_end_relational.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines itself and hands back
                # the next unparsed line.
                attr, i = tokenize_attribute(ofile, i)
                relational_attribute.attributes.append(attr)
            else:
                raise ValueError("Error parsing line %s" % i)
        else:
            # Comment/blank line: skip it.
            i = next(ofile)

    # Skip the @end line itself.
    i = next(ofile)
    return i

622 

623 

def read_header(ofile):
    """Read the header of the iterable ofile.

    Consumes lines up to and including the @data marker.

    Parameters
    ----------
    ofile : iterator of str
        Line iterator over the file.

    Returns
    -------
    relation : str or None
        Name from the @relation line, if present.
    attributes : list of Attribute
        Parsed @attribute declarations, in file order.
    """
    i = next(ofile)

    # Pass first comments
    while r_comment.match(i):
        i = next(ofile)

    # Header is everything up to DATA attribute ?
    relation = None
    attributes = []
    while not r_datameta.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines itself and returns the
                # next unparsed line.
                attr, i = tokenize_attribute(ofile, i)
                attributes.append(attr)
            else:
                isrel = r_relation.match(i)
                if isrel:
                    relation = isrel.group(1)
                else:
                    raise ValueError("Error parsing line %s" % i)
                i = next(ofile)
        else:
            # Comment/blank line inside the header: skip it.
            i = next(ofile)

    return relation, attributes

653 

654 

class MetaData:
    """Small container to keep useful information on a ARFF dataset.

    Knows about attributes names and types.

    Examples
    --------
    ::

        data, meta = loadarff('iris.arff')
        # This will print the attributes names of the iris.arff dataset
        for i in meta:
            print(i)
        # This works too
        meta.names()
        # Getting attribute type
        types = meta.types()

    Methods
    -------
    names
    types

    Notes
    -----
    Also maintains the list of attributes in order, i.e., doing for i in
    meta, where meta is an instance of MetaData, will return the
    different attribute names in the order they were defined.
    """
    def __init__(self, rel, attr):
        self.name = rel
        self._attributes = {a.name: a for a in attr}

    def __repr__(self):
        lines = ["Dataset: %s" % self.name]
        for attr_name, attribute in self._attributes.items():
            descr = f"\t{attr_name}'s type is {attribute.type_name}"
            if attribute.range:
                descr += ", range is %s" % str(attribute.range)
            lines.append(descr)
        return "\n".join(lines) + "\n"

    def __iter__(self):
        return iter(self._attributes)

    def __getitem__(self, key):
        attribute = self._attributes[key]
        return attribute.type_name, attribute.range

    def names(self):
        """Return the list of attribute names.

        Returns
        -------
        attrnames : list of str
            The attribute names.
        """
        return list(self._attributes)

    def types(self):
        """Return the list of attribute types.

        Returns
        -------
        attr_types : list of str
            The attribute types.
        """
        return [attribute.type_name
                for attribute in self._attributes.values()]

727 

728 

def loadarff(f):
    """
    Read an arff file.

    The data is returned as a record array, which can be accessed much like
    a dictionary of NumPy arrays. For example, if one of the attributes is
    called 'pressure', then its first 10 data points can be accessed from the
    ``data`` record array like so: ``data['pressure'][0:10]``


    Parameters
    ----------
    f : file-like or str
        File-like object to read from, or filename to open.

    Returns
    -------
    data : record array
        The data of the arff file, accessible by attribute names.
    meta : `MetaData`
        Contains information about the arff file such as name and
        type of attributes, the relation (name of the dataset), etc.

    Raises
    ------
    ParseArffError
        This is raised if the given file is not ARFF-formatted.
    NotImplementedError
        The ARFF file has an attribute which is not supported yet.

    Notes
    -----

    This function should be able to read most arff files. Not
    implemented functionality include:

    * date type attributes
    * string type attributes

    It can read files with numeric and nominal attributes. It cannot read
    files with sparse data ({} in the file). However, this function can
    read files with missing data (? in the file), representing the data
    points as NaNs.

    Examples
    --------
    >>> from scipy.io import arff
    >>> from io import StringIO
    >>> content = \"\"\"
    ... @relation foo
    ... @attribute width  numeric
    ... @attribute height numeric
    ... @attribute color  {red,green,blue,yellow,black}
    ... @data
    ... 5.0,3.25,blue
    ... 4.5,3.75,green
    ... 3.0,4.00,red
    ... \"\"\"
    >>> f = StringIO(content)
    >>> data, meta = arff.loadarff(f)
    >>> data
    array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
          dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
    >>> meta
    Dataset: foo
    \twidth's type is numeric
    \theight's type is numeric
    \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')

    """
    if hasattr(f, 'read'):
        # Already a file-like object: the caller keeps ownership, so we
        # must not close it.
        return _loadarff(f)
    # A filename was given: open it ourselves and guarantee it is closed.
    with open(f) as ofile:
        return _loadarff(ofile)

808 

809 

def _loadarff(ofile):
    """Parse an open ARFF file object into (data, meta); see loadarff."""
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg) from e

    # Check whether we have a string attribute (not supported yet)
    hasstr = False
    for a in attr:
        if isinstance(a, StringAttribute):
            hasstr = True

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of converters to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(attr)

    def generator(row_iter, delim=','):
        # Yield one parsed tuple per (non-comment, non-empty) data line.
        # TODO: this is where we are spending time (~80%). I think things
        # could be made more efficiently:
        #   - We could for example "compile" the function, because some values
        #   do not change here.
        #   - The function to convert a line to dtyped values could also be
        #   generated on the fly from a string and be executed instead of
        #   looping.
        #   - The regex are overkill: for comments, checking that a line starts
        #   by % should be enough and faster, and for empty lines, same thing
        #   --> this does not seem to change anything.

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # The dialect sniffed from the first data line is reused for all
        # subsequent lines.
        dialect = None
        for raw in row_iter:
            # We do not abstract skipping comments and empty lines for
            # performance reasons.
            if r_comment.match(raw) or r_empty.match(raw):
                continue

            row, dialect = split_data_line(raw, dialect)

            yield tuple([attr[i].parse_data(row[i]) for i in elems])

    a = list(generator(ofile))
    # No error should happen here: it is a bug otherwise
    data = np.array(a, [(a.name, a.dtype) for a in attr])
    return data, meta

873 

874 

875# ---- 

876# Misc 

877# ---- 

def basic_stats(data):
    """Return (min, max, mean, scaled std) of *data*.

    min/max ignore NaNs; std is scaled by n/(n-1).
    """
    correction = data.size * 1. / (data.size - 1)
    return (np.nanmin(data), np.nanmax(data),
            np.mean(data), np.std(data) * correction)

881 

882 

def print_attribute(name, tp, data):
    """Print a one-line summary of one attribute (stats when numeric)."""
    kind = tp.type_name
    if kind in ('numeric', 'real', 'integer'):
        lo, hi, mean, std = basic_stats(data)
        print(f"{name},{kind},{lo:f},{hi:f},{mean:f},{std:f}")
    else:
        print(str(tp))

890 

891 

def test_weka(filename):
    """Load *filename* and print a per-attribute summary."""
    data, meta = loadarff(filename)
    print(len(data.dtype))
    print(data.size)
    for attr_name in meta:
        print_attribute(attr_name, meta[attr_name], data[attr_name])


# make sure nose does not find this as a test
test_weka.__test__ = False

902 

903 

if __name__ == '__main__':
    # CLI entry point: summarize the ARFF file given as the first argument.
    import sys
    filename = sys.argv[1]
    test_weka(filename)