Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/difflib.py: 10%

1"""

2Module difflib -- helpers for computing deltas between objects.

4Function get_close_matches(word, possibilities, n=3, cutoff=0.6):

5 Use SequenceMatcher to return list of the best "good enough" matches.

7Function context_diff(a, b):

8 For two lists of strings, return a delta in context diff format.

10Function ndiff(a, b):

11 Return a delta: the difference between `a` and `b` (lists of strings).

13Function restore(delta, which):

14 Return one of the two sequences that generated an ndiff delta.

16Function unified_diff(a, b):

17 For two lists of strings, return a delta in unified diff format.

19Class SequenceMatcher:

20 A flexible class for comparing pairs of sequences of any type.

22Class Differ:

23 For producing human-readable deltas from sequences of lines of text.

25Class HtmlDiff:

26 For producing HTML side by side comparison with change highlights.

27"""

29__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',

30 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',

31 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']

33from heapq import nlargest as _nlargest

34from collections import namedtuple as _namedtuple

35from types import GenericAlias

37Match = _namedtuple('Match', 'a b size')

39def _calculate_ratio(matches, length):

40 if length:

41 return 2.0 * matches / length

42 return 1.0

44class SequenceMatcher:

46 """

47 SequenceMatcher is a flexible class for comparing pairs of sequences of

48 any type, so long as the sequence elements are hashable. The basic

49 algorithm predates, and is a little fancier than, an algorithm

50 published in the late 1980's by Ratcliff and Obershelp under the

51 hyperbolic name "gestalt pattern matching". The basic idea is to find

52 the longest contiguous matching subsequence that contains no "junk"

53 elements (R-O doesn't address junk). The same idea is then applied

54 recursively to the pieces of the sequences to the left and to the right

55 of the matching subsequence. This does not yield minimal edit

56 sequences, but does tend to yield matches that "look right" to people.

58 SequenceMatcher tries to compute a "human-friendly diff" between two

59 sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the

60 longest *contiguous* & junk-free matching subsequence. That's what

61 catches peoples' eyes. The Windows(tm) windiff has another interesting

62 notion, pairing up elements that appear uniquely in each sequence.

63 That, and the method here, appear to yield more intuitive difference

64 reports than does diff. This method appears to be the least vulnerable

65 to synching up on blocks of "junk lines", though (like blank lines in

66 ordinary text files, or maybe "<P>" lines in HTML files). That may be

67 because this is the only method of the 3 that has a *concept* of

68 "junk" <wink>.

70 Example, comparing two strings, and considering blanks to be "junk":

72 >>> s = SequenceMatcher(lambda x: x == " ",

73 ... "private Thread currentThread;",

74 ... "private volatile Thread currentThread;")

75 >>>

77 .ratio() returns a float in [0, 1], measuring the "similarity" of the

78 sequences. As a rule of thumb, a .ratio() value over 0.6 means the

79 sequences are close matches:

81 >>> print(round(s.ratio(), 3))

82 0.866

83 >>>

85 If you're only interested in where the sequences match,

86 .get_matching_blocks() is handy:

88 >>> for block in s.get_matching_blocks():

89 ... print("a[%d] and b[%d] match for %d elements" % block)

90 a[0] and b[0] match for 8 elements

91 a[8] and b[17] match for 21 elements

92 a[29] and b[38] match for 0 elements

94 Note that the last tuple returned by .get_matching_blocks() is always a

95 dummy, (len(a), len(b), 0), and this is the only case in which the last

96 tuple element (number of elements matched) is 0.

98 If you want to know how to change the first sequence into the second,

99 use .get_opcodes():

100

101 >>> for opcode in s.get_opcodes():

102 ... print("%6s a[%d:%d] b[%d:%d]" % opcode)

103 equal a[0:8] b[0:8]

104 insert a[8:8] b[8:17]

105 equal a[8:29] b[17:38]

106

107 See the Differ class for a fancy human-friendly file differencer, which

108 uses SequenceMatcher both to compare sequences of lines, and to compare

109 sequences of characters within similar (near-matching) lines.

110

111 See also function get_close_matches() in this module, which shows how

112 simple code building on SequenceMatcher can be used to do useful work.

113

114 Timing: Basic R-O is cubic time worst case and quadratic time expected

115 case. SequenceMatcher is quadratic time for the worst case and has

116 expected-case behavior dependent in a complicated way on how many

117 elements the sequences have in common; best case time is linear.

118

119 Methods:

120

121 __init__(isjunk=None, a='', b='')

122 Construct a SequenceMatcher.

123

124 set_seqs(a, b)

125 Set the two sequences to be compared.

126

127 set_seq1(a)

128 Set the first sequence to be compared.

129

130 set_seq2(b)

131 Set the second sequence to be compared.

132

133 find_longest_match(alo=0, ahi=None, blo=0, bhi=None)

134 Find longest matching block in a[alo:ahi] and b[blo:bhi].

135

136 get_matching_blocks()

137 Return list of triples describing matching subsequences.

138

139 get_opcodes()

140 Return list of 5-tuples describing how to turn a into b.

141

142 ratio()

143 Return a measure of the sequences' similarity (float in [0,1]).

144

145 quick_ratio()

146 Return an upper bound on .ratio() relatively quickly.

147

148 real_quick_ratio()

149 Return an upper bound on ratio() very quickly.

150 """

151

152 def __init__(self, isjunk=None, a='', b='', autojunk=True):

153 """Construct a SequenceMatcher.

154

155 Optional arg isjunk is None (the default), or a one-argument

156 function that takes a sequence element and returns true iff the

157 element is junk. None is equivalent to passing "lambda x: 0", i.e.

158 no elements are considered to be junk. For example, pass

159 lambda x: x in " \\t"

160 if you're comparing lines as sequences of characters, and don't

161 want to synch up on blanks or hard tabs.

162

163 Optional arg a is the first of two sequences to be compared. By

164 default, an empty string. The elements of a must be hashable. See

165 also .set_seqs() and .set_seq1().

166

167 Optional arg b is the second of two sequences to be compared. By

168 default, an empty string. The elements of b must be hashable. See

169 also .set_seqs() and .set_seq2().

170

171 Optional arg autojunk should be set to False to disable the

172 "automatic junk heuristic" that treats popular elements as junk

173 (see module documentation for more information).

174 """

175

176 # Members:

177 # a

178 # first sequence

179 # b

180 # second sequence; differences are computed as "what do

181 # we need to do to 'a' to change it into 'b'?"

182 # b2j

183 # for x in b, b2j[x] is a list of the indices (into b)

184 # at which x appears; junk and popular elements do not appear

185 # fullbcount

186 # for x in b, fullbcount[x] == the number of times x

187 # appears in b; only materialized if really needed (used

188 # only for computing quick_ratio())

189 # matching_blocks

190 # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];

191 # ascending & non-overlapping in i and in j; terminated by

192 # a dummy (len(a), len(b), 0) sentinel

193 # opcodes

194 # a list of (tag, i1, i2, j1, j2) tuples, where tag is

195 # one of

196 # 'replace' a[i1:i2] should be replaced by b[j1:j2]

197 # 'delete' a[i1:i2] should be deleted

198 # 'insert' b[j1:j2] should be inserted

199 # 'equal' a[i1:i2] == b[j1:j2]

200 # isjunk

201 # a user-supplied function taking a sequence element and

202 # returning true iff the element is "junk" -- this has

203 # subtle but helpful effects on the algorithm, which I'll

204 # get around to writing up someday <0.9 wink>.

205 # DON'T USE! Only __chain_b uses this. Use "in self.bjunk".

206 # bjunk

207 # the items in b for which isjunk is True.

208 # bpopular

209 # nonjunk items in b treated as junk by the heuristic (if used).

210

211 self.isjunk = isjunk

212 self.a = self.b = None

213 self.autojunk = autojunk

214 self.set_seqs(a, b)

215

216 def set_seqs(self, a, b):

217 """Set the two sequences to be compared.

218

219 >>> s = SequenceMatcher()

220 >>> s.set_seqs("abcd", "bcde")

221 >>> s.ratio()

222 0.75

223 """

224

225 self.set_seq1(a)

226 self.set_seq2(b)

227

228 def set_seq1(self, a):

229 """Set the first sequence to be compared.

230

231 The second sequence to be compared is not changed.

232

233 >>> s = SequenceMatcher(None, "abcd", "bcde")

234 >>> s.ratio()

235 0.75

236 >>> s.set_seq1("bcde")

237 >>> s.ratio()

238 1.0

239 >>>

240

241 SequenceMatcher computes and caches detailed information about the

242 second sequence, so if you want to compare one sequence S against

243 many sequences, use .set_seq2(S) once and call .set_seq1(x)

244 repeatedly for each of the other sequences.

245

246 See also set_seqs() and set_seq2().

247 """

248

249 if a is self.a:

250 return

251 self.a = a

252 self.matching_blocks = self.opcodes = None

253

254 def set_seq2(self, b):

255 """Set the second sequence to be compared.

256

257 The first sequence to be compared is not changed.

258

259 >>> s = SequenceMatcher(None, "abcd", "bcde")

260 >>> s.ratio()

261 0.75

262 >>> s.set_seq2("abcd")

263 >>> s.ratio()

264 1.0

265 >>>

266

267 SequenceMatcher computes and caches detailed information about the

268 second sequence, so if you want to compare one sequence S against

269 many sequences, use .set_seq2(S) once and call .set_seq1(x)

270 repeatedly for each of the other sequences.

271

272 See also set_seqs() and set_seq1().

273 """

274

275 if b is self.b:

276 return

277 self.b = b

278 self.matching_blocks = self.opcodes = None

279 self.fullbcount = None

280 self.__chain_b()

281

282 # For each element x in b, set b2j[x] to a list of the indices in

283 # b where x appears; the indices are in increasing order; note that

284 # the number of times x appears in b is len(b2j[x]) ...

285 # when self.isjunk is defined, junk elements don't show up in this

286 # map at all, which stops the central find_longest_match method

287 # from starting any matching block at a junk element ...

288 # b2j also does not contain entries for "popular" elements, meaning

289 # elements that account for more than 1 + 1% of the total elements, and

290 # when the sequence is reasonably large (>= 200 elements); this can

291 # be viewed as an adaptive notion of semi-junk, and yields an enormous

292 # speedup when, e.g., comparing program files with hundreds of

293 # instances of "return NULL;" ...

294 # note that this is only called when b changes; so for cross-product

295 # kinds of matches, it's best to call set_seq2 once, then set_seq1

296 # repeatedly

297

298 def __chain_b(self):

299 # Because isjunk is a user-defined (not C) function, and we test

300 # for junk a LOT, it's important to minimize the number of calls.

301 # Before the tricks described here, __chain_b was by far the most

302 # time-consuming routine in the whole module! If anyone sees

303 # Jim Roskind, thank him again for profile.py -- I never would

304 # have guessed that.

305 # The first trick is to build b2j ignoring the possibility

306 # of junk. I.e., we don't call isjunk at all yet. Throwing

307 # out the junk later is much cheaper than building b2j "right"

308 # from the start.

309 b = self.b

310 self.b2j = b2j = {}

311

312 for i, elt in enumerate(b):

313 indices = b2j.setdefault(elt, [])

314 indices.append(i)

315

316 # Purge junk elements

317 self.bjunk = junk = set()

318 isjunk = self.isjunk

319 if isjunk:

320 for elt in b2j.keys():

321 if isjunk(elt):

322 junk.add(elt)

323 for elt in junk: # separate loop avoids separate list of keys

324 del b2j[elt]

325

326 # Purge popular elements that are not junk

327 self.bpopular = popular = set()

328 n = len(b)

329 if self.autojunk and n >= 200:

330 ntest = n // 100 + 1

331 for elt, idxs in b2j.items():

332 if len(idxs) > ntest:

333 popular.add(elt)

334 for elt in popular: # ditto; as fast for 1% deletion

335 del b2j[elt]

336

337 def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):

338 """Find longest matching block in a[alo:ahi] and b[blo:bhi].

339

340 By default it will find the longest match in the entirety of a and b.

341

342 If isjunk is not defined:

343

344 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where

345 alo <= i <= i+k <= ahi

346 blo <= j <= j+k <= bhi

347 and for all (i',j',k') meeting those conditions,

348 k >= k'

349 i <= i'

350 and if i == i', j <= j'

351

352 In other words, of all maximal matching blocks, return one that

353 starts earliest in a, and of all those maximal matching blocks that

354 start earliest in a, return the one that starts earliest in b.

355

356 >>> s = SequenceMatcher(None, " abcd", "abcd abcd")

357 >>> s.find_longest_match(0, 5, 0, 9)

358 Match(a=0, b=4, size=5)

359

360 If isjunk is defined, first the longest matching block is

361 determined as above, but with the additional restriction that no

362 junk element appears in the block. Then that block is extended as

363 far as possible by matching (only) junk elements on both sides. So

364 the resulting block never matches on junk except as identical junk

365 happens to be adjacent to an "interesting" match.

366

367 Here's the same example as before, but considering blanks to be

368 junk. That prevents " abcd" from matching the " abcd" at the tail

369 end of the second sequence directly. Instead only the "abcd" can

370 match, and matches the leftmost "abcd" in the second sequence:

371

372 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")

373 >>> s.find_longest_match(0, 5, 0, 9)

374 Match(a=1, b=0, size=4)

375

376 If no blocks match, return (alo, blo, 0).

377

378 >>> s = SequenceMatcher(None, "ab", "c")

379 >>> s.find_longest_match(0, 2, 0, 1)

380 Match(a=0, b=0, size=0)

381 """

382

383 # CAUTION: stripping common prefix or suffix would be incorrect.

384 # E.g.,

385 # ab

386 # acab

387 # Longest matching block is "ab", but if common prefix is

388 # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so

389 # strip, so ends up claiming that ab is changed to acab by

390 # inserting "ca" in the middle. That's minimal but unintuitive:

391 # "it's obvious" that someone inserted "ac" at the front.

392 # Windiff ends up at the same place as diff, but by pairing up

393 # the unique 'b's and then matching the first two 'a's.

394

395 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__

396 if ahi is None:

397 ahi = len(a)

398 if bhi is None:

399 bhi = len(b)

400 besti, bestj, bestsize = alo, blo, 0

401 # find longest junk-free match

402 # during an iteration of the loop, j2len[j] = length of longest

403 # junk-free match ending with a[i-1] and b[j]

404 j2len = {}

405 nothing = []

406 for i in range(alo, ahi):

407 # look at all instances of a[i] in b; note that because

408 # b2j has no junk keys, the loop is skipped if a[i] is junk

409 j2lenget = j2len.get

410 newj2len = {}

411 for j in b2j.get(a[i], nothing):

412 # a[i] matches b[j]

413 if j < blo:

414 continue

415 if j >= bhi:

416 break

417 k = newj2len[j] = j2lenget(j-1, 0) + 1

418 if k > bestsize:

419 besti, bestj, bestsize = i-k+1, j-k+1, k

420 j2len = newj2len

421

422 # Extend the best by non-junk elements on each end. In particular,

423 # "popular" non-junk elements aren't in b2j, which greatly speeds

424 # the inner loop above, but also means "the best" match so far

425 # doesn't contain any junk *or* popular non-junk elements.

426 while besti > alo and bestj > blo and \

427 not isbjunk(b[bestj-1]) and \

428 a[besti-1] == b[bestj-1]:

429 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

430 while besti+bestsize < ahi and bestj+bestsize < bhi and \

431 not isbjunk(b[bestj+bestsize]) and \

432 a[besti+bestsize] == b[bestj+bestsize]:

433 bestsize += 1

434

435 # Now that we have a wholly interesting match (albeit possibly

436 # empty!), we may as well suck up the matching junk on each

437 # side of it too. Can't think of a good reason not to, and it

438 # saves post-processing the (possibly considerable) expense of

439 # figuring out what to do with it. In the case of an empty

440 # interesting match, this is clearly the right thing to do,

441 # because no other kind of match is possible in the regions.

442 while besti > alo and bestj > blo and \

443 isbjunk(b[bestj-1]) and \

444 a[besti-1] == b[bestj-1]:

445 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

446 while besti+bestsize < ahi and bestj+bestsize < bhi and \

447 isbjunk(b[bestj+bestsize]) and \

448 a[besti+bestsize] == b[bestj+bestsize]:

449 bestsize = bestsize + 1

450

451 return Match(besti, bestj, bestsize)

452

453 def get_matching_blocks(self):

454 """Return list of triples describing matching subsequences.

455

456 Each triple is of the form (i, j, n), and means that

457 a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in

458 i and in j. New in Python 2.5, it's also guaranteed that if

459 (i, j, n) and (i', j', n') are adjacent triples in the list, and

460 the second is not the last triple in the list, then i+n != i' or

461 j+n != j'. IOW, adjacent triples never describe adjacent equal

462 blocks.

463

464 The last triple is a dummy, (len(a), len(b), 0), and is the only

465 triple with n==0.

466

467 >>> s = SequenceMatcher(None, "abxcd", "abcd")

468 >>> list(s.get_matching_blocks())

469 [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]

470 """

471

472 if self.matching_blocks is not None:

473 return self.matching_blocks

474 la, lb = len(self.a), len(self.b)

475

476 # This is most naturally expressed as a recursive algorithm, but

477 # at least one user bumped into extreme use cases that exceeded

478 # the recursion limit on their box. So, now we maintain a list

479 # ('queue`) of blocks we still need to look at, and append partial

480 # results to `matching_blocks` in a loop; the matches are sorted

481 # at the end.

482 queue = [(0, la, 0, lb)]

483 matching_blocks = []

484 while queue:

485 alo, ahi, blo, bhi = queue.pop()

486 i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)

487 # a[alo:i] vs b[blo:j] unknown

488 # a[i:i+k] same as b[j:j+k]

489 # a[i+k:ahi] vs b[j+k:bhi] unknown

490 if k: # if k is 0, there was no matching block

491 matching_blocks.append(x)

492 if alo < i and blo < j:

493 queue.append((alo, i, blo, j))

494 if i+k < ahi and j+k < bhi:

495 queue.append((i+k, ahi, j+k, bhi))

496 matching_blocks.sort()

497

498 # It's possible that we have adjacent equal blocks in the

499 # matching_blocks list now. Starting with 2.5, this code was added

500 # to collapse them.

501 i1 = j1 = k1 = 0

502 non_adjacent = []

503 for i2, j2, k2 in matching_blocks:

504 # Is this block adjacent to i1, j1, k1?

505 if i1 + k1 == i2 and j1 + k1 == j2:

506 # Yes, so collapse them -- this just increases the length of

507 # the first block by the length of the second, and the first

508 # block so lengthened remains the block to compare against.

509 k1 += k2

510 else:

511 # Not adjacent. Remember the first block (k1==0 means it's

512 # the dummy we started with), and make the second block the

513 # new block to compare against.

514 if k1:

515 non_adjacent.append((i1, j1, k1))

516 i1, j1, k1 = i2, j2, k2

517 if k1:

518 non_adjacent.append((i1, j1, k1))

519

520 non_adjacent.append( (la, lb, 0) )

521 self.matching_blocks = list(map(Match._make, non_adjacent))

522 return self.matching_blocks

523

524 def get_opcodes(self):

525 """Return list of 5-tuples describing how to turn a into b.

526

527 Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple

528 has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the

529 tuple preceding it, and likewise for j1 == the previous j2.

530

531 The tags are strings, with these meanings:

532

533 'replace': a[i1:i2] should be replaced by b[j1:j2]

534 'delete': a[i1:i2] should be deleted.

535 Note that j1==j2 in this case.

536 'insert': b[j1:j2] should be inserted at a[i1:i1].

537 Note that i1==i2 in this case.

538 'equal': a[i1:i2] == b[j1:j2]

539

540 >>> a = "qabxcd"

541 >>> b = "abycdf"

542 >>> s = SequenceMatcher(None, a, b)

543 >>> for tag, i1, i2, j1, j2 in s.get_opcodes():

544 ... print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %

545 ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))

546 delete a[0:1] (q) b[0:0] ()

547 equal a[1:3] (ab) b[0:2] (ab)

548 replace a[3:4] (x) b[2:3] (y)

549 equal a[4:6] (cd) b[3:5] (cd)

550 insert a[6:6] () b[5:6] (f)

551 """

552

553 if self.opcodes is not None:

554 return self.opcodes

555 i = j = 0

556 self.opcodes = answer = []

557 for ai, bj, size in self.get_matching_blocks():

558 # invariant: we've pumped out correct diffs to change

559 # a[:i] into b[:j], and the next matching block is

560 # a[ai:ai+size] == b[bj:bj+size]. So we need to pump

561 # out a diff to change a[i:ai] into b[j:bj], pump out

562 # the matching block, and move (i,j) beyond the match

563 tag = ''

564 if i < ai and j < bj:

565 tag = 'replace'

566 elif i < ai:

567 tag = 'delete'

568 elif j < bj:

569 tag = 'insert'

570 if tag:

571 answer.append( (tag, i, ai, j, bj) )

572 i, j = ai+size, bj+size

573 # the list of matching blocks is terminated by a

574 # sentinel with size 0

575 if size:

576 answer.append( ('equal', ai, i, bj, j) )

577 return answer

578

579 def get_grouped_opcodes(self, n=3):

580 """ Isolate change clusters by eliminating ranges with no changes.

581

582 Return a generator of groups with up to n lines of context.

583 Each group is in the same format as returned by get_opcodes().

584

585 >>> from pprint import pprint

586 >>> a = list(map(str, range(1,40)))

587 >>> b = a[:]

588 >>> b[8:8] = ['i'] # Make an insertion

589 >>> b[20] += 'x' # Make a replacement

590 >>> b[23:28] = [] # Make a deletion

591 >>> b[30] += 'y' # Make another replacement

592 >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))

593 [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],

594 [('equal', 16, 19, 17, 20),

595 ('replace', 19, 20, 20, 21),

596 ('equal', 20, 22, 21, 23),

597 ('delete', 22, 27, 23, 23),

598 ('equal', 27, 30, 23, 26)],

599 [('equal', 31, 34, 27, 30),

600 ('replace', 34, 35, 30, 31),

601 ('equal', 35, 38, 31, 34)]]

602 """

603

604 codes = self.get_opcodes()

605 if not codes:

606 codes = [("equal", 0, 1, 0, 1)]

607 # Fixup leading and trailing groups if they show no changes.

608 if codes[0][0] == 'equal':

609 tag, i1, i2, j1, j2 = codes[0]

610 codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2

611 if codes[-1][0] == 'equal':

612 tag, i1, i2, j1, j2 = codes[-1]

613 codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)

614

615 nn = n + n

616 group = []

617 for tag, i1, i2, j1, j2 in codes:

618 # End the current group and start a new one whenever

619 # there is a large range with no changes.

620 if tag == 'equal' and i2-i1 > nn:

621 group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))

622 yield group

623 group = []

624 i1, j1 = max(i1, i2-n), max(j1, j2-n)

625 group.append((tag, i1, i2, j1 ,j2))

626 if group and not (len(group)==1 and group[0][0] == 'equal'):

627 yield group

628

629 def ratio(self):

630 """Return a measure of the sequences' similarity (float in [0,1]).

631

632 Where T is the total number of elements in both sequences, and

633 M is the number of matches, this is 2.0*M / T.

634 Note that this is 1 if the sequences are identical, and 0 if

635 they have nothing in common.

636

637 .ratio() is expensive to compute if you haven't already computed

638 .get_matching_blocks() or .get_opcodes(), in which case you may

639 want to try .quick_ratio() or .real_quick_ratio() first to get an

640 upper bound.

641

642 >>> s = SequenceMatcher(None, "abcd", "bcde")

643 >>> s.ratio()

644 0.75

645 >>> s.quick_ratio()

646 0.75

647 >>> s.real_quick_ratio()

648 1.0

649 """

650

651 matches = sum(triple[-1] for triple in self.get_matching_blocks())

652 return _calculate_ratio(matches, len(self.a) + len(self.b))

653

654 def quick_ratio(self):

655 """Return an upper bound on ratio() relatively quickly.

656

657 This isn't defined beyond that it is an upper bound on .ratio(), and

658 is faster to compute.

659 """

660

661 # viewing a and b as multisets, set matches to the cardinality

662 # of their intersection; this counts the number of matches

663 # without regard to order, so is clearly an upper bound

664 if self.fullbcount is None:

665 self.fullbcount = fullbcount = {}

666 for elt in self.b:

667 fullbcount[elt] = fullbcount.get(elt, 0) + 1

668 fullbcount = self.fullbcount

669 # avail[x] is the number of times x appears in 'b' less the

670 # number of times we've seen it in 'a' so far ... kinda

671 avail = {}

672 availhas, matches = avail.__contains__, 0

673 for elt in self.a:

674 if availhas(elt):

675 numb = avail[elt]

676 else:

677 numb = fullbcount.get(elt, 0)

678 avail[elt] = numb - 1

679 if numb > 0:

680 matches = matches + 1

681 return _calculate_ratio(matches, len(self.a) + len(self.b))

682

683 def real_quick_ratio(self):

684 """Return an upper bound on ratio() very quickly.

685

686 This isn't defined beyond that it is an upper bound on .ratio(), and

687 is faster to compute than either .ratio() or .quick_ratio().

688 """

689

690 la, lb = len(self.a), len(self.b)

691 # can't have more matches than the number of elements in the

692 # shorter sequence

693 return _calculate_ratio(min(la, lb), la + lb)

694

695 __class_getitem__ = classmethod(GenericAlias)

696

697

698def get_close_matches(word, possibilities, n=3, cutoff=0.6):

699 """Use SequenceMatcher to return list of the best "good enough" matches.

700

701 word is a sequence for which close matches are desired (typically a

702 string).

703

704 possibilities is a list of sequences against which to match word

705 (typically a list of strings).

706

707 Optional arg n (default 3) is the maximum number of close matches to

708 return. n must be > 0.

709

710 Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities

711 that don't score at least that similar to word are ignored.

712

713 The best (no more than n) matches among the possibilities are returned

714 in a list, sorted by similarity score, most similar first.

715

716 >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])

717 ['apple', 'ape']

718 >>> import keyword as _keyword

719 >>> get_close_matches("wheel", _keyword.kwlist)

720 ['while']

721 >>> get_close_matches("Apple", _keyword.kwlist)

722 []

723 >>> get_close_matches("accept", _keyword.kwlist)

724 ['except']

725 """

726

727 if not n > 0:

728 raise ValueError("n must be > 0: %r" % (n,))

729 if not 0.0 <= cutoff <= 1.0:

730 raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

731 result = []

732 s = SequenceMatcher()

733 s.set_seq2(word)

734 for x in possibilities:

735 s.set_seq1(x)

736 if s.real_quick_ratio() >= cutoff and \

737 s.quick_ratio() >= cutoff and \

738 s.ratio() >= cutoff:

739 result.append((s.ratio(), x))

740

741 # Move the best scorers to head of list

742 result = _nlargest(n, result)

743 # Strip scores for the best n matches

744 return [x for score, x in result]

745

746

747def _keep_original_ws(s, tag_s):

748 """Replace whitespace with the original whitespace characters in `s`"""

749 return ''.join(

750 c if tag_c == " " and c.isspace() else tag_c

751 for c, tag_c in zip(s, tag_s)

752 )

753

754

755

756class Differ:

757 r"""

758 Differ is a class for comparing sequences of lines of text, and

759 producing human-readable differences or deltas. Differ uses

760 SequenceMatcher both to compare sequences of lines, and to compare

761 sequences of characters within similar (near-matching) lines.

762

763 Each line of a Differ delta begins with a two-letter code:

764

765 '- ' line unique to sequence 1

766 '+ ' line unique to sequence 2

767 ' ' line common to both sequences

768 '? ' line not present in either input sequence

769

770 Lines beginning with '? ' attempt to guide the eye to intraline

771 differences, and were not present in either input sequence. These lines

772 can be confusing if the sequences contain tab characters.

773

774 Note that Differ makes no claim to produce a *minimal* diff. To the

775 contrary, minimal diffs are often counter-intuitive, because they synch

776 up anywhere possible, sometimes accidental matches 100 pages apart.

777 Restricting synch points to contiguous matches preserves some notion of

778 locality, at the occasional cost of producing a longer diff.

779

780 Example: Comparing two texts.

781

782 First we set up the texts, sequences of individual single-line strings

783 ending with newlines (such sequences can also be obtained from the

784 `readlines()` method of file-like objects):

785

786 >>> text1 = ''' 1. Beautiful is better than ugly.

787 ... 2. Explicit is better than implicit.

788 ... 3. Simple is better than complex.

789 ... 4. Complex is better than complicated.

790 ... '''.splitlines(keepends=True)

791 >>> len(text1)

792 4

793 >>> text1[0][-1]

794 '\n'

795 >>> text2 = ''' 1. Beautiful is better than ugly.

796 ... 3. Simple is better than complex.

797 ... 4. Complicated is better than complex.

798 ... 5. Flat is better than nested.

799 ... '''.splitlines(keepends=True)

800

801 Next we instantiate a Differ object:

802

803 >>> d = Differ()

804

805 Note that when instantiating a Differ object we may pass functions to

806 filter out line and character 'junk'. See Differ.__init__ for details.

807

808 Finally, we compare the two:

809

810 >>> result = list(d.compare(text1, text2))

811

812 'result' is a list of strings, so let's pretty-print it:

813

814 >>> from pprint import pprint as _pprint

815 >>> _pprint(result)

816 [' 1. Beautiful is better than ugly.\n',

817 '- 2. Explicit is better than implicit.\n',

818 '- 3. Simple is better than complex.\n',

819 '+ 3. Simple is better than complex.\n',

820 '? ++\n',

821 '- 4. Complex is better than complicated.\n',

822 '? ^ ---- ^\n',

823 '+ 4. Complicated is better than complex.\n',

824 '? ++++ ^ ^\n',

825 '+ 5. Flat is better than nested.\n']

826

827 As a single multi-line string it looks like this:

828

829 >>> print(''.join(result), end="")

830 1. Beautiful is better than ugly.

831 - 2. Explicit is better than implicit.

832 - 3. Simple is better than complex.

833 + 3. Simple is better than complex.

834 ? ++

835 - 4. Complex is better than complicated.

836 ? ^ ---- ^

837 + 4. Complicated is better than complex.

838 ? ++++ ^ ^

839 + 5. Flat is better than nested.

840

841 Methods:

842

843 __init__(linejunk=None, charjunk=None)

844 Construct a text differencer, with optional filters.

845

846 compare(a, b)

847 Compare two sequences of lines; generate the resulting delta.

848 """

849

850 def __init__(self, linejunk=None, charjunk=None):

851 """

852 Construct a text differencer, with optional filters.

853

854 The two optional keyword parameters are for filter functions:

855

856 - `linejunk`: A function that should accept a single string argument,

857 and return true iff the string is junk. The module-level function

858 `IS_LINE_JUNK` may be used to filter out lines without visible

859 characters, except for at most one splat ('#'). It is recommended

860 to leave linejunk None; the underlying SequenceMatcher class has

861 an adaptive notion of "noise" lines that's better than any static

862 definition the author has ever been able to craft.

863

864 - `charjunk`: A function that should accept a string of length 1. The

865 module-level function `IS_CHARACTER_JUNK` may be used to filter out

866 whitespace characters (a blank or tab; **note**: bad idea to include

867 newline in this!). Use of IS_CHARACTER_JUNK is recommended.

868 """

869

870 self.linejunk = linejunk

871 self.charjunk = charjunk

872

873 def compare(self, a, b):

874 r"""

875 Compare two sequences of lines; generate the resulting delta.

876

877 Each sequence must contain individual single-line strings ending with

878 newlines. Such sequences can be obtained from the `readlines()` method

879 of file-like objects. The delta generated also consists of newline-

880 terminated strings, ready to be printed as-is via the writeline()

881 method of a file-like object.

882

883 Example:

884

885 >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),

886 ... 'ore\ntree\nemu\n'.splitlines(True))),

887 ... end="")

888 - one

889 ? ^

890 + ore

891 ? ^

892 - two

893 - three

894 ? -

895 + tree

896 + emu

897 """

898

899 cruncher = SequenceMatcher(self.linejunk, a, b)

900 for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():

901 if tag == 'replace':

902 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)

903 elif tag == 'delete':

904 g = self._dump('-', a, alo, ahi)

905 elif tag == 'insert':

906 g = self._dump('+', b, blo, bhi)

907 elif tag == 'equal':

908 g = self._dump(' ', a, alo, ahi)

909 else:

910 raise ValueError('unknown tag %r' % (tag,))

911

912 yield from g

913

914 def _dump(self, tag, x, lo, hi):

915 """Generate comparison results for a same-tagged range."""

916 for i in range(lo, hi):

917 yield '%s %s' % (tag, x[i])

918

919 def _plain_replace(self, a, alo, ahi, b, blo, bhi):

920 assert alo < ahi and blo < bhi

921 # dump the shorter block first -- reduces the burden on short-term

922 # memory if the blocks are of very different sizes

923 if bhi - blo < ahi - alo:

924 first = self._dump('+', b, blo, bhi)

925 second = self._dump('-', a, alo, ahi)

926 else:

927 first = self._dump('-', a, alo, ahi)

928 second = self._dump('+', b, blo, bhi)

929

930 for g in first, second:

931 yield from g

932

933 def _fancy_replace(self, a, alo, ahi, b, blo, bhi):

934 r"""

935 When replacing one block of lines with another, search the blocks

936 for *similar* lines; the best-matching pair (if any) is used as a

937 synch point, and intraline difference marking is done on the

938 similar pair. Lots of work, but often worth it.

939

940 Example:

941

942 >>> d = Differ()

943 >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,

944 ... ['abcdefGhijkl\n'], 0, 1)

945 >>> print(''.join(results), end="")

946 - abcDefghiJkl

947 ? ^ ^ ^

948 + abcdefGhijkl

949 ? ^ ^ ^

950 """

951

952 # don't synch up unless the lines have a similarity score of at

953 # least cutoff; best_ratio tracks the best score seen so far

954 best_ratio, cutoff = 0.74, 0.75

955 cruncher = SequenceMatcher(self.charjunk)

956 eqi, eqj = None, None # 1st indices of equal lines (if any)

957

958 # search for the pair that matches best without being identical

959 # (identical lines must be junk lines, & we don't want to synch up

960 # on junk -- unless we have to)

961 for j in range(blo, bhi):

962 bj = b[j]

963 cruncher.set_seq2(bj)

964 for i in range(alo, ahi):

965 ai = a[i]

966 if ai == bj:

967 if eqi is None:

968 eqi, eqj = i, j

969 continue

970 cruncher.set_seq1(ai)

971 # computing similarity is expensive, so use the quick

972 # upper bounds first -- have seen this speed up messy

973 # compares by a factor of 3.

974 # note that ratio() is only expensive to compute the first

975 # time it's called on a sequence pair; the expensive part

976 # of the computation is cached by cruncher

977 if cruncher.real_quick_ratio() > best_ratio and \

978 cruncher.quick_ratio() > best_ratio and \

979 cruncher.ratio() > best_ratio:

980 best_ratio, best_i, best_j = cruncher.ratio(), i, j

981 if best_ratio < cutoff:

982 # no non-identical "pretty close" pair

983 if eqi is None:

984 # no identical pair either -- treat it as a straight replace

985 yield from self._plain_replace(a, alo, ahi, b, blo, bhi)

986 return

987 # no close pair, but an identical pair -- synch up on that

988 best_i, best_j, best_ratio = eqi, eqj, 1.0

989 else:

990 # there's a close pair, so forget the identical pair (if any)

991 eqi = None

992

993 # a[best_i] very similar to b[best_j]; eqi is None iff they're not

994 # identical

995

996 # pump out diffs from before the synch point

997 yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)

998

999 # do intraline marking on the synch pair

1000 aelt, belt = a[best_i], b[best_j]

1001 if eqi is None:

1002 # pump out a '-', '?', '+', '?' quad for the synched lines

1003 atags = btags = ""

1004 cruncher.set_seqs(aelt, belt)

1005 for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():

1006 la, lb = ai2 - ai1, bj2 - bj1

1007 if tag == 'replace':

1008 atags += '^' * la

1009 btags += '^' * lb

1010 elif tag == 'delete':

1011 atags += '-' * la

1012 elif tag == 'insert':

1013 btags += '+' * lb

1014 elif tag == 'equal':

1015 atags += ' ' * la

1016 btags += ' ' * lb

1017 else:

1018 raise ValueError('unknown tag %r' % (tag,))

1019 yield from self._qformat(aelt, belt, atags, btags)

1020 else:

1021 # the synch pair is identical

1022 yield ' ' + aelt

1023

1024 # pump out diffs from after the synch point

1025 yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

1026

1027 def _fancy_helper(self, a, alo, ahi, b, blo, bhi):

1028 g = []

1029 if alo < ahi:

1030 if blo < bhi:

1031 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)

1032 else:

1033 g = self._dump('-', a, alo, ahi)

1034 elif blo < bhi:

1035 g = self._dump('+', b, blo, bhi)

1036

1037 yield from g

1038

1039 def _qformat(self, aline, bline, atags, btags):

1040 r"""

1041 Format "?" output and deal with tabs.

1042

1043 Example:

1044

1045 >>> d = Differ()

1046 >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',

1047 ... ' ^ ^ ^ ', ' ^ ^ ^ ')

1048 >>> for line in results: print(repr(line))

1049 ...

1050 '- \tabcDefghiJkl\n'

1051 '? \t ^ ^ ^\n'

1052 '+ \tabcdefGhijkl\n'

1053 '? \t ^ ^ ^\n'

1054 """

1055 atags = _keep_original_ws(aline, atags).rstrip()

1056 btags = _keep_original_ws(bline, btags).rstrip()

1057

1058 yield "- " + aline

1059 if atags:

1060 yield f"? {atags}\n"

1061

1062 yield "+ " + bline

1063 if btags:

1064 yield f"? {btags}\n"

1065

1066# With respect to junk, an earlier version of ndiff simply refused to

1067# *start* a match with a junk element. The result was cases like this:

1068# before: private Thread currentThread;

1069# after: private volatile Thread currentThread;

1070# If you consider whitespace to be junk, the longest contiguous match

1071# not starting with junk is "e Thread currentThread". So ndiff reported

1072# that "e volatil" was inserted between the 't' and the 'e' in "private".

1073# While an accurate view, to people that's absurd. The current version

1074# looks for matching blocks that are entirely junk-free, then extends the

1075# longest one of those as far as possible but only with matching junk.

1076# So now "currentThread" is matched, then extended to suck up the

1077# preceding blank; then "private" is matched, and extended to suck up the

1078# following blank; then "Thread" is matched; and finally ndiff reports

1079# that "volatile " was inserted before "Thread". The only quibble

1080# remaining is that perhaps it was really the case that " volatile"

1081# was inserted after "private". I can live with that <wink>.

1082

1083import re

1084

1085def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match):

1086 r"""

1087 Return True for ignorable line: iff `line` is blank or contains a single '#'.

1088

1089 Examples:

1090

1091 >>> IS_LINE_JUNK('\n')

1092 True

1093 >>> IS_LINE_JUNK(' # \n')

1094 True

1095 >>> IS_LINE_JUNK('hello\n')

1096 False

1097 """

1098

1099 return pat(line) is not None

1100

1101def IS_CHARACTER_JUNK(ch, ws=" \t"):

1102 r"""

1103 Return True for ignorable character: iff `ch` is a space or tab.

1104

1105 Examples:

1106

1107 >>> IS_CHARACTER_JUNK(' ')

1108 True

1109 >>> IS_CHARACTER_JUNK('\t')

1110 True

1111 >>> IS_CHARACTER_JUNK('\n')

1112 False

1113 >>> IS_CHARACTER_JUNK('x')

1114 False

1115 """

1116

1117 return ch in ws

1118

1119

1120########################################################################

1121### Unified Diff

1122########################################################################

1123

1124def _format_range_unified(start, stop):

1125 'Convert range to the "ed" format'

1126 # Per the diff spec at http://www.unix.org/single_unix_specification/

1127 beginning = start + 1 # lines start numbering with one

1128 length = stop - start

1129 if length == 1:

1130 return '{}'.format(beginning)

1131 if not length:

1132 beginning -= 1 # empty ranges begin at line just before the range

1133 return '{},{}'.format(beginning, length)

1134

1135def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',

1136 tofiledate='', n=3, lineterm='\n'):

1137 r"""

1138 Compare two sequences of lines; generate the delta as a unified diff.

1139

1140 Unified diffs are a compact way of showing line changes and a few

1141 lines of context. The number of context lines is set by 'n' which

1142 defaults to three.

1143

1144 By default, the diff control lines (those with ---, +++, or @@) are

1145 created with a trailing newline. This is helpful so that inputs

1146 created from file.readlines() result in diffs that are suitable for

1147 file.writelines() since both the inputs and outputs have trailing

1148 newlines.

1149

1150 For inputs that do not have trailing newlines, set the lineterm

1151 argument to "" so that the output will be uniformly newline free.

1152

1153 The unidiff format normally has a header for filenames and modification

1154 times. Any or all of these may be specified using strings for

1155 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.

1156 The modification times are normally expressed in the ISO 8601 format.

1157

1158 Example:

1159

1160 >>> for line in unified_diff('one two three four'.split(),

1161 ... 'zero one tree four'.split(), 'Original', 'Current',

1162 ... '2005-01-26 23:30:50', '2010-04-02 10:20:52',

1163 ... lineterm=''):

1164 ... print(line) # doctest: +NORMALIZE_WHITESPACE

1165 --- Original 2005-01-26 23:30:50

1166 +++ Current 2010-04-02 10:20:52

1167 @@ -1,4 +1,4 @@

1168 +zero

1169 one

1170 -two

1171 -three

1172 +tree

1173 four

1174 """

1175

1176 _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)

1177 started = False

1178 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):

1179 if not started:

1180 started = True

1181 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''

1182 todate = '\t{}'.format(tofiledate) if tofiledate else ''

1183 yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)

1184 yield '+++ {}{}{}'.format(tofile, todate, lineterm)

1185

1186 first, last = group[0], group[-1]

1187 file1_range = _format_range_unified(first[1], last[2])

1188 file2_range = _format_range_unified(first[3], last[4])

1189 yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)

1190

1191 for tag, i1, i2, j1, j2 in group:

1192 if tag == 'equal':

1193 for line in a[i1:i2]:

1194 yield ' ' + line

1195 continue

1196 if tag in {'replace', 'delete'}:

1197 for line in a[i1:i2]:

1198 yield '-' + line

1199 if tag in {'replace', 'insert'}:

1200 for line in b[j1:j2]:

1201 yield '+' + line

1202

1203

1204########################################################################

1205### Context Diff

1206########################################################################

1207

1208def _format_range_context(start, stop):

1209 'Convert range to the "ed" format'

1210 # Per the diff spec at http://www.unix.org/single_unix_specification/

1211 beginning = start + 1 # lines start numbering with one

1212 length = stop - start

1213 if not length:

1214 beginning -= 1 # empty ranges begin at line just before the range

1215 if length <= 1:

1216 return '{}'.format(beginning)

1217 return '{},{}'.format(beginning, beginning + length - 1)

1218

1219# See http://www.unix.org/single_unix_specification/

1220def context_diff(a, b, fromfile='', tofile='',

1221 fromfiledate='', tofiledate='', n=3, lineterm='\n'):

1222 r"""

1223 Compare two sequences of lines; generate the delta as a context diff.

1224

1225 Context diffs are a compact way of showing line changes and a few

1226 lines of context. The number of context lines is set by 'n' which

1227 defaults to three.

1228

1229 By default, the diff control lines (those with *** or ---) are

1230 created with a trailing newline. This is helpful so that inputs

1231 created from file.readlines() result in diffs that are suitable for

1232 file.writelines() since both the inputs and outputs have trailing

1233 newlines.

1234

1235 For inputs that do not have trailing newlines, set the lineterm

1236 argument to "" so that the output will be uniformly newline free.

1237

1238 The context diff format normally has a header for filenames and

1239 modification times. Any or all of these may be specified using

1240 strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.

1241 The modification times are normally expressed in the ISO 8601 format.

1242 If not specified, the strings default to blanks.

1243

1244 Example:

1245

1246 >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),

1247 ... 'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),

1248 ... end="")

1249 *** Original

1250 --- Current

1251 ***************

1252 *** 1,4 ****

1253 one

1254 ! two

1255 ! three

1256 four

1257 --- 1,4 ----

1258 + zero

1259 one

1260 ! tree

1261 four

1262 """

1263

1264 _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)

1265 prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ')

1266 started = False

1267 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):

1268 if not started:

1269 started = True

1270 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''

1271 todate = '\t{}'.format(tofiledate) if tofiledate else ''

1272 yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)

1273 yield '--- {}{}{}'.format(tofile, todate, lineterm)

1274

1275 first, last = group[0], group[-1]

1276 yield '***************' + lineterm

1277

1278 file1_range = _format_range_context(first[1], last[2])

1279 yield '*** {} ****{}'.format(file1_range, lineterm)

1280

1281 if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group):

1282 for tag, i1, i2, _, _ in group:

1283 if tag != 'insert':

1284 for line in a[i1:i2]:

1285 yield prefix[tag] + line

1286

1287 file2_range = _format_range_context(first[3], last[4])

1288 yield '--- {} ----{}'.format(file2_range, lineterm)

1289

1290 if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group):

1291 for tag, _, _, j1, j2 in group:

1292 if tag != 'delete':

1293 for line in b[j1:j2]:

1294 yield prefix[tag] + line

1295

1296def _check_types(a, b, *args):

1297 # Checking types is weird, but the alternative is garbled output when

1298 # someone passes mixed bytes and str to {unified,context}_diff(). E.g.

1299 # without this check, passing filenames as bytes results in output like

1300 # --- b'oldfile.txt'

1301 # +++ b'newfile.txt'

1302 # because of how str.format() incorporates bytes objects.

1303 if a and not isinstance(a[0], str):

1304 raise TypeError('lines to compare must be str, not %s (%r)' %

1305 (type(a[0]).__name__, a[0]))

1306 if b and not isinstance(b[0], str):

1307 raise TypeError('lines to compare must be str, not %s (%r)' %

1308 (type(b[0]).__name__, b[0]))

1309 for arg in args:

1310 if not isinstance(arg, str):

1311 raise TypeError('all arguments must be str, not: %r' % (arg,))

1312

1313def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',

1314 fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):

1315 r"""

1316 Compare `a` and `b`, two sequences of lines represented as bytes rather

1317 than str. This is a wrapper for `dfunc`, which is typically either

1318 unified_diff() or context_diff(). Inputs are losslessly converted to

1319 strings so that `dfunc` only has to worry about strings, and encoded

1320 back to bytes on return. This is necessary to compare files with

1321 unknown or inconsistent encoding. All other inputs (except `n`) must be

1322 bytes rather than str.

1323 """

1324 def decode(s):

1325 try:

1326 return s.decode('ascii', 'surrogateescape')

1327 except AttributeError as err:

1328 msg = ('all arguments must be bytes, not %s (%r)' %

1329 (type(s).__name__, s))

1330 raise TypeError(msg) from err

1331 a = list(map(decode, a))

1332 b = list(map(decode, b))

1333 fromfile = decode(fromfile)

1334 tofile = decode(tofile)

1335 fromfiledate = decode(fromfiledate)

1336 tofiledate = decode(tofiledate)

1337 lineterm = decode(lineterm)

1338

1339 lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)

1340 for line in lines:

1341 yield line.encode('ascii', 'surrogateescape')

1342

1343def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):

1344 r"""

1345 Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

1346

1347 Optional keyword parameters `linejunk` and `charjunk` are for filter

1348 functions, or can be None:

1349

1350 - linejunk: A function that should accept a single string argument and

1351 return true iff the string is junk. The default is None, and is

1352 recommended; the underlying SequenceMatcher class has an adaptive

1353 notion of "noise" lines.

1354

1355 - charjunk: A function that accepts a character (string of length

1356 1), and returns true iff the character is junk. The default is

1357 the module-level function IS_CHARACTER_JUNK, which filters out

1358 whitespace characters (a blank or tab; note: it's a bad idea to

1359 include newline in this!).

1360

1361 Tools/scripts/ndiff.py is a command-line front-end to this function.

1362

1363 Example:

1364

1365 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),

1366 ... 'ore\ntree\nemu\n'.splitlines(keepends=True))

1367 >>> print(''.join(diff), end="")

1368 - one

1369 ? ^

1370 + ore

1371 ? ^

1372 - two

1373 - three

1374 ? -

1375 + tree

1376 + emu

1377 """

1378 return Differ(linejunk, charjunk).compare(a, b)

1379

1380def _mdiff(fromlines, tolines, context=None, linejunk=None,

1381 charjunk=IS_CHARACTER_JUNK):

1382 r"""Returns generator yielding marked up from/to side by side differences.

1383

1384 Arguments:

1385 fromlines -- list of text lines to compared to tolines

1386 tolines -- list of text lines to be compared to fromlines

1387 context -- number of context lines to display on each side of difference,

1388 if None, all from/to text lines will be generated.

1389 linejunk -- passed on to ndiff (see ndiff documentation)

1390 charjunk -- passed on to ndiff (see ndiff documentation)

1391

1392 This function returns an iterator which returns a tuple:

1393 (from line tuple, to line tuple, boolean flag)

1394

1395 from/to line tuple -- (line num, line text)

1396 line num -- integer or None (to indicate a context separation)

1397 line text -- original line text with following markers inserted:

1398 '\0+' -- marks start of added text

1399 '\0-' -- marks start of deleted text

1400 '\0^' -- marks start of changed text

1401 '\1' -- marks end of added/deleted/changed text

1402

1403 boolean flag -- None indicates context separation, True indicates

1404 either "from" or "to" line contains a change, otherwise False.

1405

1406 This function/iterator was originally developed to generate side by side

1407 file difference for making HTML pages (see HtmlDiff class for example

1408 usage).

1409

1410 Note, this function utilizes the ndiff function to generate the side by

1411 side difference markup. Optional ndiff arguments may be passed to this

1412 function and they in turn will be passed to ndiff.

1413 """

1414 import re

1415

1416 # regular expression for finding intraline change indices

1417 change_re = re.compile(r'(\++|\-+|\^+)')

1418

1419 # create the difference iterator to generate the differences

1420 diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)

1421

1422 def _make_line(lines, format_key, side, num_lines=[0,0]):

1423 """Returns line of text with user's change markup and line formatting.

1424

1425 lines -- list of lines from the ndiff generator to produce a line of

1426 text from. When producing the line of text to return, the

1427 lines used are removed from this list.

1428 format_key -- '+' return first line in list with "add" markup around

1429 the entire line.

1430 '-' return first line in list with "delete" markup around

1431 the entire line.

1432 '?' return first line in list with add/delete/change

1433 intraline markup (indices obtained from second line)

1434 None return first line in list with no markup

1435 side -- indice into the num_lines list (0=from,1=to)

1436 num_lines -- from/to current line number. This is NOT intended to be a

1437 passed parameter. It is present as a keyword argument to

1438 maintain memory of the current line numbers between calls

1439 of this function.

1440

1441 Note, this function is purposefully not defined at the module scope so

1442 that data it needs from its parent function (within whose context it

1443 is defined) does not need to be of module scope.

1444 """

1445 num_lines[side] += 1

1446 # Handle case where no user markup is to be added, just return line of

1447 # text with user's line format to allow for usage of the line number.

1448 if format_key is None:

1449 return (num_lines[side],lines.pop(0)[2:])

1450 # Handle case of intraline changes

1451 if format_key == '?':

1452 text, markers = lines.pop(0), lines.pop(0)

1453 # find intraline changes (store change type and indices in tuples)

1454 sub_info = []

1455 def record_sub_info(match_object,sub_info=sub_info):

1456 sub_info.append([match_object.group(1)[0],match_object.span()])

1457 return match_object.group(1)

1458 change_re.sub(record_sub_info,markers)

1459 # process each tuple inserting our special marks that won't be

1460 # noticed by an xml/html escaper.

1461 for key,(begin,end) in reversed(sub_info):

1462 text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]

1463 text = text[2:]

1464 # Handle case of add/delete entire line

1465 else:

1466 text = lines.pop(0)[2:]

1467 # if line of text is just a newline, insert a space so there is

1468 # something for the user to highlight and see.

1469 if not text:

1470 text = ' '

1471 # insert marks that won't be noticed by an xml/html escaper.

1472 text = '\0' + format_key + text + '\1'

1473 # Return line of text, first allow user's line formatter to do its

1474 # thing (such as adding the line number) then replace the special

1475 # marks with what the user's change markup.

1476 return (num_lines[side],text)

1477

1478 def _line_iterator():

1479 """Yields from/to lines of text with a change indication.

1480

1481 This function is an iterator. It itself pulls lines from a

1482 differencing iterator, processes them and yields them. When it can

1483 it yields both a "from" and a "to" line, otherwise it will yield one

1484 or the other. In addition to yielding the lines of from/to text, a

1485 boolean flag is yielded to indicate if the text line(s) have

1486 differences in them.

1487

1488 Note, this function is purposefully not defined at the module scope so

1489 that data it needs from its parent function (within whose context it

1490 is defined) does not need to be of module scope.

1491 """

1492 lines = []

1493 num_blanks_pending, num_blanks_to_yield = 0, 0

1494 while True:

1495 # Load up next 4 lines so we can look ahead, create strings which

1496 # are a concatenation of the first character of each of the 4 lines

1497 # so we can do some very readable comparisons.

1498 while len(lines) < 4:

1499 lines.append(next(diff_lines_iterator, 'X'))

1500 s = ''.join([line[0] for line in lines])

1501 if s.startswith('X'):

1502 # When no more lines, pump out any remaining blank lines so the

1503 # corresponding add/delete lines get a matching blank line so

1504 # all line pairs get yielded at the next level.

1505 num_blanks_to_yield = num_blanks_pending

1506 elif s.startswith('-?+?'):

1507 # simple intraline change

1508 yield _make_line(lines,'?',0), _make_line(lines,'?',1), True

1509 continue

1510 elif s.startswith('--++'):

1511 # in delete block, add block coming: we do NOT want to get

1512 # caught up on blank lines yet, just process the delete line

1513 num_blanks_pending -= 1

1514 yield _make_line(lines,'-',0), None, True

1515 continue

1516 elif s.startswith(('--?+', '--+', '- ')):

1517 # in delete block and see an intraline change or unchanged line

1518 # coming: yield the delete line and then blanks

1519 from_line,to_line = _make_line(lines,'-',0), None

1520 num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0

1521 elif s.startswith('-+?'):

1522 # intraline change

1523 yield _make_line(lines,None,0), _make_line(lines,'?',1), True

1524 continue

1525 elif s.startswith('-?+'):

1526 # intraline change

1527 yield _make_line(lines,'?',0), _make_line(lines,None,1), True

1528 continue

1529 elif s.startswith('-'):

1530 # delete FROM line

1531 num_blanks_pending -= 1

1532 yield _make_line(lines,'-',0), None, True

1533 continue

1534 elif s.startswith('+--'):

1535 # in add block, delete block coming: we do NOT want to get

1536 # caught up on blank lines yet, just process the add line

1537 num_blanks_pending += 1

1538 yield None, _make_line(lines,'+',1), True

1539 continue

1540 elif s.startswith(('+ ', '+-')):

1541 # will be leaving an add block: yield blanks then add line

1542 from_line, to_line = None, _make_line(lines,'+',1)

1543 num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0

1544 elif s.startswith('+'):

1545 # inside an add block, yield the add line

1546 num_blanks_pending += 1

1547 yield None, _make_line(lines,'+',1), True

1548 continue

1549 elif s.startswith(' '):

1550 # unchanged text, yield it to both sides

1551 yield _make_line(lines[:],None,0),_make_line(lines,None,1),False

1552 continue

1553 # Catch up on the blank lines so when we yield the next from/to

1554 # pair, they are lined up.

1555 while(num_blanks_to_yield < 0):

1556 num_blanks_to_yield += 1

1557 yield None,('','\n'),True

1558 while(num_blanks_to_yield > 0):

1559 num_blanks_to_yield -= 1

1560 yield ('','\n'),None,True

1561 if s.startswith('X'):

1562 return

1563 else:

1564 yield from_line,to_line,True

1565

1566 def _line_pair_iterator():

1567 """Yields from/to lines of text with a change indication.

1568

1569 This function is an iterator. It itself pulls lines from the line

1570 iterator. Its difference from that iterator is that this function

1571 always yields a pair of from/to text lines (with the change

1572 indication). If necessary it will collect single from/to lines

1573 until it has a matching pair from/to pair to yield.

1574

1575 Note, this function is purposefully not defined at the module scope so

1576 that data it needs from its parent function (within whose context it

1577 is defined) does not need to be of module scope.

1578 """

1579 line_iterator = _line_iterator()

1580 fromlines,tolines=[],[]

1581 while True:

1582 # Collecting lines of text until we have a from/to pair

1583 while (len(fromlines)==0 or len(tolines)==0):

1584 try:

1585 from_line, to_line, found_diff = next(line_iterator)

1586 except StopIteration:

1587 return

1588 if from_line is not None:

1589 fromlines.append((from_line,found_diff))

1590 if to_line is not None:

1591 tolines.append((to_line,found_diff))

1592 # Once we have a pair, remove them from the collection and yield it

1593 from_line, fromDiff = fromlines.pop(0)

1594 to_line, to_diff = tolines.pop(0)

1595 yield (from_line,to_line,fromDiff or to_diff)

1596

1597 # Handle case where user does not want context differencing, just yield

1598 # them up without doing anything else with them.

1599 line_pair_iterator = _line_pair_iterator()

1600 if context is None:

1601 yield from line_pair_iterator

1602 # Handle case where user wants context differencing. We must do some

1603 # storage of lines until we know for sure that they are to be yielded.

1604 else:

1605 context += 1

1606 lines_to_write = 0

1607 while True:

1608 # Store lines up until we find a difference, note use of a

1609 # circular queue because we only need to keep around what

1610 # we need for context.

1611 index, contextLines = 0, [None]*(context)

1612 found_diff = False

1613 while(found_diff is False):

1614 try:

1615 from_line, to_line, found_diff = next(line_pair_iterator)

1616 except StopIteration:

1617 return

1618 i = index % context

1619 contextLines[i] = (from_line, to_line, found_diff)

1620 index += 1

1621 # Yield lines that we have collected so far, but first yield

1622 # the user's separator.

1623 if index > context:

1624 yield None, None, None

1625 lines_to_write = context

1626 else:

1627 lines_to_write = index

1628 index = 0

1629 while(lines_to_write):

1630 i = index % context

1631 index += 1

1632 yield contextLines[i]

1633 lines_to_write -= 1

1634 # Now yield the context lines after the change

1635 lines_to_write = context-1

1636 try:

1637 while(lines_to_write):

1638 from_line, to_line, found_diff = next(line_pair_iterator)

1639 # If another change within the context, extend the context

1640 if found_diff:

1641 lines_to_write = context-1

1642 else:

1643 lines_to_write -= 1

1644 yield from_line, to_line, found_diff

1645 except StopIteration:

1646 # Catch exception from next() and return normally

1647 return

1648

1649

1650_file_template = """

1651<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"

1652 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

1653

1654<html>

1655

1656<head>

1657 <meta http-equiv="Content-Type"

1658 content="text/html; charset=%(charset)s" />

1659 <title></title>

1660 <style type="text/css">%(styles)s

1661 </style>

1662</head>

1663

1664<body>

1665 %(table)s%(legend)s

1666</body>

1667

1668</html>"""

1669

1670_styles = """

1671 table.diff {font-family:Courier; border:medium;}

1672 .diff_header {background-color:#e0e0e0}

1673 td.diff_header {text-align:right}

1674 .diff_next {background-color:#c0c0c0}

1675 .diff_add {background-color:#aaffaa}

1676 .diff_chg {background-color:#ffff77}

1677 .diff_sub {background-color:#ffaaaa}"""

1678

1679_table_template = """

1680 <table class="diff" id="difflib_chg_%(prefix)s_top"

1681 cellspacing="0" cellpadding="0" rules="groups" >

1682 <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>

1683 <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>

1684 %(header_row)s

1685 <tbody>

1686%(data_rows)s </tbody>

1687 </table>"""

1688

1689_legend = """

1690 <table class="diff" summary="Legends">

1691 <tr> <th colspan="2"> Legends </th> </tr>

1692 <tr> <td> <table border="" summary="Colors">

1693 <tr><th> Colors </th> </tr>

1694 <tr><td class="diff_add"> Added </td></tr>

1695 <tr><td class="diff_chg">Changed</td> </tr>

1696 <tr><td class="diff_sub">Deleted</td> </tr>

1697 </table></td>

1698 <td> <table border="" summary="Links">

1699 <tr><th colspan="2"> Links </th> </tr>

1700 <tr><td>(f)irst change</td> </tr>

1701 <tr><td>(n)ext change</td> </tr>

1702 <tr><td>(t)op</td> </tr>

1703 </table></td> </tr>

1704 </table>"""

1705

1706class HtmlDiff(object):

1707 """For producing HTML side by side comparison with change highlights.

1708

1709 This class can be used to create an HTML table (or a complete HTML file

1710 containing the table) showing a side by side, line by line comparison

1711 of text with inter-line and intra-line change highlights. The table can

1712 be generated in either full or contextual difference mode.

1713

1714 The following methods are provided for HTML generation:

1715

1716 make_table -- generates HTML for a single side by side table

1717 make_file -- generates complete HTML file with a single side by side table

1718

1719 See tools/scripts/diff.py for an example usage of this class.

1720 """

1721

1722 _file_template = _file_template

1723 _styles = _styles

1724 _table_template = _table_template

1725 _legend = _legend

1726 _default_prefix = 0

1727

1728 def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,

1729 charjunk=IS_CHARACTER_JUNK):

1730 """HtmlDiff instance initializer

1731

1732 Arguments:

1733 tabsize -- tab stop spacing, defaults to 8.

1734 wrapcolumn -- column number where lines are broken and wrapped,

1735 defaults to None where lines are not wrapped.

1736 linejunk,charjunk -- keyword arguments passed into ndiff() (used by

1737 HtmlDiff() to generate the side by side HTML differences). See

1738 ndiff() documentation for argument default values and descriptions.

1739 """

1740 self._tabsize = tabsize

1741 self._wrapcolumn = wrapcolumn

1742 self._linejunk = linejunk

1743 self._charjunk = charjunk

1744

1745 def make_file(self, fromlines, tolines, fromdesc='', todesc='',

1746 context=False, numlines=5, *, charset='utf-8'):

1747 """Returns HTML file of side by side comparison with change highlights

1748

1749 Arguments:

1750 fromlines -- list of "from" lines

1751 tolines -- list of "to" lines

1752 fromdesc -- "from" file column header string

1753 todesc -- "to" file column header string

1754 context -- set to True for contextual differences (defaults to False

1755 which shows full differences).

1756 numlines -- number of context lines. When context is set True,

1757 controls number of lines displayed before and after the change.

1758 When context is False, controls the number of lines to place

1759 the "next" link anchors before the next change (so click of

1760 "next" link jumps to just before the change).

1761 charset -- charset of the HTML document

1762 """

1763

1764 return (self._file_template % dict(

1765 styles=self._styles,

1766 legend=self._legend,

1767 table=self.make_table(fromlines, tolines, fromdesc, todesc,

1768 context=context, numlines=numlines),

1769 charset=charset

1770 )).encode(charset, 'xmlcharrefreplace').decode(charset)

1771

1772 def _tab_newline_replace(self,fromlines,tolines):

1773 """Returns from/to line lists with tabs expanded and newlines removed.

1774

1775 Instead of tab characters being replaced by the number of spaces

1776 needed to fill in to the next tab stop, this function will fill

1777 the space with tab characters. This is done so that the difference

1778 algorithms can identify changes in a file when tabs are replaced by

1779 spaces and vice versa. At the end of the HTML generation, the tab

1780 characters will be replaced with a nonbreakable space.

1781 """

1782 def expand_tabs(line):

1783 # hide real spaces

1784 line = line.replace(' ','\0')

1785 # expand tabs into spaces

1786 line = line.expandtabs(self._tabsize)

1787 # replace spaces from expanded tabs back into tab characters

1788 # (we'll replace them with markup after we do differencing)

1789 line = line.replace(' ','\t')

1790 return line.replace('\0',' ').rstrip('\n')

1791 fromlines = [expand_tabs(line) for line in fromlines]

1792 tolines = [expand_tabs(line) for line in tolines]

1793 return fromlines,tolines

1794

1795 def _split_line(self,data_list,line_num,text):

1796 """Builds list of text lines by splitting text lines at wrap point

1797

1798 This function will determine if the input text line needs to be

1799 wrapped (split) into separate lines. If so, the first wrap point

1800 will be determined and the first line appended to the output

1801 text line list. This function is used recursively to handle

1802 the second part of the split line to further split it.

1803 """

1804 # if blank line or context separator, just add it to the output list

1805 if not line_num:

1806 data_list.append((line_num,text))

1807 return

1808

1809 # if line text doesn't need wrapping, just add it to the output list

1810 size = len(text)

1811 max = self._wrapcolumn

1812 if (size <= max) or ((size -(text.count('\0')*3)) <= max):

1813 data_list.append((line_num,text))

1814 return

1815

1816 # scan text looking for the wrap point, keeping track if the wrap

1817 # point is inside markers

1818 i = 0

1819 n = 0

1820 mark = ''

1821 while n < max and i < size:

1822 if text[i] == '\0':

1823 i += 1

1824 mark = text[i]

1825 i += 1

1826 elif text[i] == '\1':

1827 i += 1

1828 mark = ''

1829 else:

1830 i += 1

1831 n += 1

1832

1833 # wrap point is inside text, break it up into separate lines

1834 line1 = text[:i]

1835 line2 = text[i:]

1836

1837 # if wrap point is inside markers, place end marker at end of first

1838 # line and start marker at beginning of second line because each

1839 # line will have its own table tag markup around it.

1840 if mark:

1841 line1 = line1 + '\1'

1842 line2 = '\0' + mark + line2

1843

1844 # tack on first line onto the output list

1845 data_list.append((line_num,line1))

1846

1847 # use this routine again to wrap the remaining text

1848 self._split_line(data_list,'>',line2)

1849

1850 def _line_wrapper(self,diffs):

1851 """Returns iterator that splits (wraps) mdiff text lines"""

1852

1853 # pull from/to data and flags from mdiff iterator

1854 for fromdata,todata,flag in diffs:

1855 # check for context separators and pass them through

1856 if flag is None:

1857 yield fromdata,todata,flag

1858 continue

1859 (fromline,fromtext),(toline,totext) = fromdata,todata

1860 # for each from/to line split it at the wrap column to form

1861 # list of text lines.

1862 fromlist,tolist = [],[]

1863 self._split_line(fromlist,fromline,fromtext)

1864 self._split_line(tolist,toline,totext)

1865 # yield from/to line in pairs inserting blank lines as

1866 # necessary when one side has more wrapped lines

1867 while fromlist or tolist:

1868 if fromlist:

1869 fromdata = fromlist.pop(0)

1870 else:

1871 fromdata = ('',' ')

1872 if tolist:

1873 todata = tolist.pop(0)

1874 else:

1875 todata = ('',' ')

1876 yield fromdata,todata,flag

1877

1878 def _collect_lines(self,diffs):

1879 """Collects mdiff output into separate lists

1880

1881 Before storing the mdiff from/to data into a list, it is converted

1882 into a single line of text with HTML markup.

1883 """

1884

1885 fromlist,tolist,flaglist = [],[],[]

1886 # pull from/to data and flags from mdiff style iterator

1887 for fromdata,todata,flag in diffs:

1888 try:

1889 # store HTML markup of the lines into the lists

1890 fromlist.append(self._format_line(0,flag,*fromdata))

1891 tolist.append(self._format_line(1,flag,*todata))

1892 except TypeError:

1893 # exceptions occur for lines where context separators go

1894 fromlist.append(None)

1895 tolist.append(None)

1896 flaglist.append(flag)

1897 return fromlist,tolist,flaglist

1898

1899 def _format_line(self,side,flag,linenum,text):

1900 """Returns HTML markup of "from" / "to" text lines

1901

1902 side -- 0 or 1 indicating "from" or "to" text

1903 flag -- indicates if difference on line

1904 linenum -- line number (used for line number column)

1905 text -- line text to be marked up

1906 """

1907 try:

1908 linenum = '%d' % linenum

1909 id = ' id="%s%s"' % (self._prefix[side],linenum)

1910 except TypeError:

1911 # handle blank lines where linenum is '>' or ''

1912 id = ''

1913 # replace those things that would get confused with HTML symbols

1914 text=text.replace("&","&").replace(">",">").replace("<","<")

1915

1916 # make space non-breakable so they don't get compressed or line wrapped

1917 text = text.replace(' ',' ').rstrip()

1918

1919 return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \

1920 % (id,linenum,text)

1921

1922 def _make_prefix(self):

1923 """Create unique anchor prefixes"""

1924

1925 # Generate a unique anchor prefix so multiple tables

1926 # can exist on the same HTML page without conflicts.

1927 fromprefix = "from%d_" % HtmlDiff._default_prefix

1928 toprefix = "to%d_" % HtmlDiff._default_prefix

1929 HtmlDiff._default_prefix += 1

1930 # store prefixes so line format method has access

1931 self._prefix = [fromprefix,toprefix]

1932

1933 def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):

1934 """Makes list of "next" links"""

1935

1936 # all anchor names will be generated using the unique "to" prefix

1937 toprefix = self._prefix[1]

1938

1939 # process change flags, generating middle column of next anchors/links

1940 next_id = ['']*len(flaglist)

1941 next_href = ['']*len(flaglist)

1942 num_chg, in_change = 0, False

1943 last = 0

1944 for i,flag in enumerate(flaglist):

1945 if flag:

1946 if not in_change:

1947 in_change = True

1948 last = i

1949 # at the beginning of a change, drop an anchor a few lines

1950 # (the context lines) before the change for the previous

1951 # link

1952 i = max([0,i-numlines])

1953 next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)

1954 # at the beginning of a change, drop a link to the next

1955 # change

1956 num_chg += 1

1957 next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (

1958 toprefix,num_chg)

1959 else:

1960 in_change = False

1961 # check for cases where there is no content to avoid exceptions

1962 if not flaglist:

1963 flaglist = [False]

1964 next_id = ['']

1965 next_href = ['']

1966 last = 0

1967 if context:

1968 fromlist = ['<td></td><td> No Differences Found </td>']

1969 tolist = fromlist

1970 else:

1971 fromlist = tolist = ['<td></td><td> Empty File </td>']

1972 # if not a change on first line, drop a link

1973 if not flaglist[0]:

1974 next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix

1975 # redo the last link to link to the top

1976 next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)

1977

1978 return fromlist,tolist,flaglist,next_href,next_id

1979

1980 def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,

1981 numlines=5):

1982 """Returns HTML table of side by side comparison with change highlights

1983

1984 Arguments:

1985 fromlines -- list of "from" lines

1986 tolines -- list of "to" lines

1987 fromdesc -- "from" file column header string

1988 todesc -- "to" file column header string

1989 context -- set to True for contextual differences (defaults to False

1990 which shows full differences).

1991 numlines -- number of context lines. When context is set True,

1992 controls number of lines displayed before and after the change.

1993 When context is False, controls the number of lines to place

1994 the "next" link anchors before the next change (so click of

1995 "next" link jumps to just before the change).

1996 """

1997

1998 # make unique anchor prefixes so that multiple tables may exist

1999 # on the same page without conflict.

2000 self._make_prefix()

2001

2002 # change tabs to spaces before it gets more difficult after we insert

2003 # markup

2004 fromlines,tolines = self._tab_newline_replace(fromlines,tolines)

2005

2006 # create diffs iterator which generates side by side from/to data

2007 if context:

2008 context_lines = numlines

2009 else:

2010 context_lines = None

2011 diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,

2012 charjunk=self._charjunk)

2013

2014 # set up iterator to wrap lines that exceed desired width

2015 if self._wrapcolumn:

2016 diffs = self._line_wrapper(diffs)

2017

2018 # collect up from/to lines and flags into lists (also format the lines)

2019 fromlist,tolist,flaglist = self._collect_lines(diffs)

2020

2021 # process change flags, generating middle column of next anchors/links

2022 fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(

2023 fromlist,tolist,flaglist,context,numlines)

2024

2025 s = []

2026 fmt = ' <tr><td class="diff_next"%s>%s</td>%s' + \

2027 '<td class="diff_next">%s</td>%s</tr>\n'

2028 for i in range(len(flaglist)):

2029 if flaglist[i] is None:

2030 # mdiff yields None on separator lines skip the bogus ones

2031 # generated for the first line

2032 if i > 0:

2033 s.append(' </tbody> \n <tbody>\n')

2034 else:

2035 s.append( fmt % (next_id[i],next_href[i],fromlist[i],

2036 next_href[i],tolist[i]))

2037 if fromdesc or todesc:

2038 header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (

2039 '<th class="diff_next"><br /></th>',

2040 '<th colspan="2" class="diff_header">%s</th>' % fromdesc,

2041 '<th class="diff_next"><br /></th>',

2042 '<th colspan="2" class="diff_header">%s</th>' % todesc)

2043 else:

2044 header_row = ''

2045

2046 table = self._table_template % dict(

2047 data_rows=''.join(s),

2048 header_row=header_row,

2049 prefix=self._prefix[1])

2050

2051 return table.replace('\0+','<span class="diff_add">'). \

2052 replace('\0-','<span class="diff_sub">'). \

2053 replace('\0^','<span class="diff_chg">'). \

2054 replace('\1','</span>'). \

2055 replace('\t',' ')

2056

2057del re

2058

2059def restore(delta, which):

2060 r"""

2061 Generate one of the two sequences that generated a delta.

2062

2063 Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract

2064 lines originating from file 1 or 2 (parameter `which`), stripping off line

2065 prefixes.

2066

2067 Examples:

2068

2069 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),

2070 ... 'ore\ntree\nemu\n'.splitlines(keepends=True))

2071 >>> diff = list(diff)

2072 >>> print(''.join(restore(diff, 1)), end="")

2073 one

2074 two

2075 three

2076 >>> print(''.join(restore(diff, 2)), end="")

2077 ore

2078 tree

2079 emu

2080 """

2081 try:

2082 tag = {1: "- ", 2: "+ "}[int(which)]

2083 except KeyError:

2084 raise ValueError('unknown delta choice (must be 1 or 2): %r'

2085 % which) from None

2086 prefixes = (" ", tag)

2087 for line in delta:

2088 if line[:2] in prefixes:

2089 yield line[2:]

2090

2091def _test():

2092 import doctest, difflib

2093 return doctest.testmod(difflib)

2094

2095if __name__ == "__main__":

2096 _test()