Coverage Report

Created: 2025-07-18 06:28

/src/bzip2/blocksort.c
Line
Count
Source (jump to first uncovered line)
1
2
/*-------------------------------------------------------------*/
3
/*--- Block sorting machinery                               ---*/
4
/*---                                           blocksort.c ---*/
5
/*-------------------------------------------------------------*/
6
7
/* ------------------------------------------------------------------
8
   This file is part of bzip2/libbzip2, a program and library for
9
   lossless, block-sorting data compression.
10
11
   bzip2/libbzip2 version 1.0.8 of 13 July 2019
12
   Copyright (C) 1996-2019 Julian Seward <jseward@acm.org>
13
14
   Please read the WARNING, DISCLAIMER and PATENTS sections in the 
15
   README file.
16
17
   This program is released under the terms of the license contained
18
   in the file LICENSE.
19
   ------------------------------------------------------------------ */
20
21
22
#include "bzlib_private.h"
23
24
/*---------------------------------------------*/
25
/*--- Fallback O(N log(N)^2) sorting        ---*/
26
/*--- algorithm, for repetitive blocks      ---*/
27
/*---------------------------------------------*/
28
29
/*---------------------------------------------*/
30
static 
31
__inline__
32
void fallbackSimpleSort ( UInt32* fmap, 
33
                          UInt32* eclass, 
34
                          Int32   lo, 
35
                          Int32   hi )
36
201M
{
37
201M
   Int32 i, j, tmp;
38
201M
   UInt32 ec_tmp;
39
40
201M
   if (lo == hi) return;
41
42
197M
   if (hi - lo > 3) {
43
129M
      for ( i = hi-4; i >= lo; i-- ) {
44
96.8M
         tmp = fmap[i];
45
96.8M
         ec_tmp = eclass[tmp];
46
136M
         for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
47
39.2M
            fmap[j-4] = fmap[j];
48
96.8M
         fmap[j-4] = tmp;
49
96.8M
      }
50
32.2M
   }
51
52
611M
   for ( i = hi-1; i >= lo; i-- ) {
53
413M
      tmp = fmap[i];
54
413M
      ec_tmp = eclass[tmp];
55
603M
      for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
56
189M
         fmap[j-1] = fmap[j];
57
413M
      fmap[j-1] = tmp;
58
413M
   }
59
197M
}
60
61
62
/*---------------------------------------------*/
63
#define fswap(zz1, zz2) \
64
3.86G
   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
65
66
76.6M
#define fvswap(zzp1, zzp2, zzn)       \
67
76.6M
{                                     \
68
76.6M
   Int32 yyp1 = (zzp1);               \
69
76.6M
   Int32 yyp2 = (zzp2);               \
70
76.6M
   Int32 yyn  = (zzn);                \
71
713M
   while (yyn > 0) {                  \
72
637M
      fswap(fmap[yyp1], fmap[yyp2]);  \
73
637M
      yyp1++; yyp2++; yyn--;          \
74
637M
   }                                  \
75
76.6M
}
76
77
78
76.6M
#define fmin(a,b) ((a) < (b)) ? (a) : (b)
79
80
240M
#define fpush(lz,hz) { stackLo[sp] = lz; \
81
240M
                       stackHi[sp] = hz; \
82
240M
                       sp++; }
83
84
240M
#define fpop(lz,hz) { sp--;              \
85
240M
                      lz = stackLo[sp];  \
86
240M
                      hz = stackHi[sp]; }
87
88
240M
#define FALLBACK_QSORT_SMALL_THRESH 10
89
#define FALLBACK_QSORT_STACK_SIZE   100
90
91
92
static
93
void fallbackQSort3 ( UInt32* fmap, 
94
                      UInt32* eclass,
95
                      Int32   loSt, 
96
                      Int32   hiSt )
97
163M
{
98
163M
   Int32 unLo, unHi, ltLo, gtHi, n, m;
99
163M
   Int32 sp, lo, hi;
100
163M
   UInt32 med, r, r3;
101
163M
   Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
102
163M
   Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
103
104
163M
   r = 0;
105
106
163M
   sp = 0;
107
163M
   fpush ( loSt, hiSt );
108
109
404M
   while (sp > 0) {
110
111
240M
      AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
112
113
240M
      fpop ( lo, hi );
114
240M
      if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
115
201M
         fallbackSimpleSort ( fmap, eclass, lo, hi );
116
201M
         continue;
117
201M
      }
118
119
      /* Random partitioning.  Median of 3 sometimes fails to
120
         avoid bad cases.  Median of 9 seems to help but 
121
         looks rather expensive.  This too seems to work but
122
         is cheaper.  Guidance for the magic constants 
123
         7621 and 32768 is taken from Sedgewick's algorithms
124
         book, chapter 35.
125
      */
126
39.1M
      r = ((r * 7621) + 1) % 32768;
127
39.1M
      r3 = r % 3;
128
39.1M
      if (r3 == 0) med = eclass[fmap[lo]]; else
129
32.6M
      if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
130
13.3M
                   med = eclass[fmap[hi]];
131
132
39.1M
      unLo = ltLo = lo;
133
39.1M
      unHi = gtHi = hi;
134
135
398M
      while (1) {
136
2.88G
         while (1) {
137
2.88G
            if (unLo > unHi) break;
138
2.86G
            n = (Int32)eclass[fmap[unLo]] - (Int32)med;
139
2.86G
            if (n == 0) { 
140
1.35G
               fswap(fmap[unLo], fmap[ltLo]); 
141
1.35G
               ltLo++; unLo++; 
142
1.35G
               continue; 
143
1.51G
            };
144
1.51G
            if (n > 0) break;
145
1.13G
            unLo++;
146
1.13G
         }
147
3.05G
         while (1) {
148
3.05G
            if (unLo > unHi) break;
149
3.01G
            n = (Int32)eclass[fmap[unHi]] - (Int32)med;
150
3.01G
            if (n == 0) { 
151
1.51G
               fswap(fmap[unHi], fmap[gtHi]); 
152
1.51G
               gtHi--; unHi--; 
153
1.51G
               continue; 
154
1.51G
            };
155
1.49G
            if (n < 0) break;
156
1.13G
            unHi--;
157
1.13G
         }
158
398M
         if (unLo > unHi) break;
159
359M
         fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
160
359M
      }
161
162
39.1M
      AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
163
164
39.1M
      if (gtHi < ltLo) continue;
165
166
38.3M
      n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
167
38.3M
      m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
168
169
38.3M
      n = lo + unLo - ltLo - 1;
170
38.3M
      m = hi - (gtHi - unHi) + 1;
171
172
38.3M
      if (n - lo > hi - m) {
173
17.6M
         fpush ( lo, n );
174
17.6M
         fpush ( m, hi );
175
20.7M
      } else {
176
20.7M
         fpush ( m, hi );
177
20.7M
         fpush ( lo, n );
178
20.7M
      }
179
38.3M
   }
180
163M
}
181
182
#undef fmin
183
#undef fpush
184
#undef fpop
185
#undef fswap
186
#undef fvswap
187
#undef FALLBACK_QSORT_SMALL_THRESH
188
#undef FALLBACK_QSORT_STACK_SIZE
189
190
191
/*---------------------------------------------*/
192
/* Pre:
193
      nblock > 0
194
      eclass exists for [0 .. nblock-1]
195
      ((UChar*)eclass) [0 .. nblock-1] holds block
196
      ptr exists for [0 .. nblock-1]
197
198
   Post:
199
      ((UChar*)eclass) [0 .. nblock-1] holds block
200
      All other areas of eclass destroyed
201
      fmap [0 .. nblock-1] holds sorted order
202
      bhtab [ 0 .. 2+(nblock/32) ] destroyed
203
*/
204
205
423M
#define       SET_BH(zz)  bhtab[(zz) >> 5] |= ((UInt32)1 << ((zz) & 31))
206
104k
#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~((UInt32)1 << ((zz) & 31))
207
7.60G
#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & ((UInt32)1 << ((zz) & 31)))
208
143M
#define      WORD_BH(zz)  bhtab[(zz) >> 5]
209
920M
#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
210
211
static
212
void fallbackSort ( UInt32* fmap, 
213
                    UInt32* eclass, 
214
                    UInt32* bhtab,
215
                    Int32   nblock,
216
                    Int32   verb )
217
3.26k
{
218
3.26k
   Int32 ftab[257];
219
3.26k
   Int32 ftabCopy[256];
220
3.26k
   Int32 H, i, j, k, l, r, cc, cc1;
221
3.26k
   Int32 nNotDone;
222
3.26k
   Int32 nBhtab;
223
3.26k
   UChar* eclass8 = (UChar*)eclass;
224
225
   /*--
226
      Initial 1-char radix sort to generate
227
      initial fmap and initial BH bits.
228
   --*/
229
3.26k
   if (verb >= 4)
230
0
      VPrintf0 ( "        bucket sorting ...\n" );
231
841k
   for (i = 0; i < 257;    i++) ftab[i] = 0;
232
259M
   for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
233
838k
   for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
234
838k
   for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
235
236
259M
   for (i = 0; i < nblock; i++) {
237
259M
      j = eclass8[i];
238
259M
      k = ftab[j] - 1;
239
259M
      ftab[j] = k;
240
259M
      fmap[k] = i;
241
259M
   }
242
243
3.26k
   nBhtab = 2 + (nblock / 32);
244
8.11M
   for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
245
838k
   for (i = 0; i < 256; i++) SET_BH(ftab[i]);
246
247
   /*--
248
      Inductively refine the buckets.  Kind-of an
249
      "exponential radix sort" (!), inspired by the
250
      Manber-Myers suffix array construction algorithm.
251
   --*/
252
253
   /*-- set sentinel bits for block-end detection --*/
254
107k
   for (i = 0; i < 32; i++) { 
255
104k
      SET_BH(nblock + 2*i);
256
104k
      CLEAR_BH(nblock + 2*i + 1);
257
104k
   }
258
259
   /*-- the log(N) loop --*/
260
3.26k
   H = 1;
261
27.5k
   while (1) {
262
263
27.5k
      if (verb >= 4) 
264
0
         VPrintf1 ( "        depth %6d has ", H );
265
266
27.5k
      j = 0;
267
4.58G
      for (i = 0; i < nblock; i++) {
268
4.58G
         if (ISSET_BH(i)) j = i;
269
4.58G
         k = fmap[i] - H; if (k < 0) k += nblock;
270
4.58G
         eclass[k] = j;
271
4.58G
      }
272
273
27.5k
      nNotDone = 0;
274
27.5k
      r = -1;
275
163M
      while (1) {
276
277
   /*-- find the next non-singleton bucket --*/
278
163M
         k = r + 1;
279
715M
         while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
280
163M
         if (ISSET_BH(k)) {
281
39.8M
            while (WORD_BH(k) == 0xffffffff) k += 32;
282
167M
            while (ISSET_BH(k)) k++;
283
21.9M
         }
284
163M
         l = k - 1;
285
163M
         if (l >= nblock) break;
286
498M
         while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
287
163M
         if (!ISSET_BH(k)) {
288
103M
            while (WORD_BH(k) == 0x00000000) k += 32;
289
100M
            while (!ISSET_BH(k)) k++;
290
13.2M
         }
291
163M
         r = k - 1;
292
163M
         if (r >= nblock) break;
293
294
         /*-- now [l, r] bracket current bucket --*/
295
163M
         if (r > l) {
296
163M
            nNotDone += (r - l + 1);
297
163M
            fallbackQSort3 ( fmap, eclass, l, r );
298
299
            /*-- scan bucket and generate header bits-- */
300
163M
            cc = -1;
301
3.63G
            for (i = l; i <= r; i++) {
302
3.47G
               cc1 = eclass[fmap[i]];
303
3.47G
               if (cc != cc1) { SET_BH(i); cc = cc1; };
304
3.47G
            }
305
163M
         }
306
163M
      }
307
308
27.5k
      if (verb >= 4) 
309
0
         VPrintf1 ( "%6d unresolved strings\n", nNotDone );
310
311
27.5k
      H *= 2;
312
27.5k
      if (H > nblock || nNotDone == 0) break;
313
27.5k
   }
314
315
   /*-- 
316
      Reconstruct the original block in
317
      eclass8 [0 .. nblock-1], since the
318
      previous phase destroyed it.
319
   --*/
320
3.26k
   if (verb >= 4)
321
0
      VPrintf0 ( "        reconstructing block ...\n" );
322
3.26k
   j = 0;
323
259M
   for (i = 0; i < nblock; i++) {
324
260M
      while (ftabCopy[j] == 0) j++;
325
259M
      ftabCopy[j]--;
326
259M
      eclass8[fmap[i]] = (UChar)j;
327
259M
   }
328
3.26k
   AssertH ( j < 256, 1005 );
329
3.26k
}
330
331
#undef       SET_BH
332
#undef     CLEAR_BH
333
#undef     ISSET_BH
334
#undef      WORD_BH
335
#undef UNALIGNED_BH
336
337
338
/*---------------------------------------------*/
339
/*--- The main, O(N^2 log(N)) sorting       ---*/
340
/*--- algorithm.  Faster for "normal"       ---*/
341
/*--- non-repetitive blocks.                ---*/
342
/*---------------------------------------------*/
343
344
/*---------------------------------------------*/
345
static
346
__inline__
347
Bool mainGtU ( UInt32  i1, 
348
               UInt32  i2,
349
               UChar*  block, 
350
               UInt16* quadrant,
351
               UInt32  nblock,
352
               Int32*  budget )
353
246M
{
354
246M
   Int32  k;
355
246M
   UChar  c1, c2;
356
246M
   UInt16 s1, s2;
357
358
246M
   AssertD ( i1 != i2, "mainGtU" );
359
   /* 1 */
360
246M
   c1 = block[i1]; c2 = block[i2];
361
246M
   if (c1 != c2) return (c1 > c2);
362
98.6M
   i1++; i2++;
363
   /* 2 */
364
98.6M
   c1 = block[i1]; c2 = block[i2];
365
98.6M
   if (c1 != c2) return (c1 > c2);
366
88.5M
   i1++; i2++;
367
   /* 3 */
368
88.5M
   c1 = block[i1]; c2 = block[i2];
369
88.5M
   if (c1 != c2) return (c1 > c2);
370
84.7M
   i1++; i2++;
371
   /* 4 */
372
84.7M
   c1 = block[i1]; c2 = block[i2];
373
84.7M
   if (c1 != c2) return (c1 > c2);
374
81.7M
   i1++; i2++;
375
   /* 5 */
376
81.7M
   c1 = block[i1]; c2 = block[i2];
377
81.7M
   if (c1 != c2) return (c1 > c2);
378
79.4M
   i1++; i2++;
379
   /* 6 */
380
79.4M
   c1 = block[i1]; c2 = block[i2];
381
79.4M
   if (c1 != c2) return (c1 > c2);
382
77.8M
   i1++; i2++;
383
   /* 7 */
384
77.8M
   c1 = block[i1]; c2 = block[i2];
385
77.8M
   if (c1 != c2) return (c1 > c2);
386
76.4M
   i1++; i2++;
387
   /* 8 */
388
76.4M
   c1 = block[i1]; c2 = block[i2];
389
76.4M
   if (c1 != c2) return (c1 > c2);
390
75.1M
   i1++; i2++;
391
   /* 9 */
392
75.1M
   c1 = block[i1]; c2 = block[i2];
393
75.1M
   if (c1 != c2) return (c1 > c2);
394
74.2M
   i1++; i2++;
395
   /* 10 */
396
74.2M
   c1 = block[i1]; c2 = block[i2];
397
74.2M
   if (c1 != c2) return (c1 > c2);
398
73.3M
   i1++; i2++;
399
   /* 11 */
400
73.3M
   c1 = block[i1]; c2 = block[i2];
401
73.3M
   if (c1 != c2) return (c1 > c2);
402
72.8M
   i1++; i2++;
403
   /* 12 */
404
72.8M
   c1 = block[i1]; c2 = block[i2];
405
72.8M
   if (c1 != c2) return (c1 > c2);
406
72.1M
   i1++; i2++;
407
408
72.1M
   k = nblock + 8;
409
410
2.56G
   do {
411
      /* 1 */
412
2.56G
      c1 = block[i1]; c2 = block[i2];
413
2.56G
      if (c1 != c2) return (c1 > c2);
414
2.56G
      s1 = quadrant[i1]; s2 = quadrant[i2];
415
2.56G
      if (s1 != s2) return (s1 > s2);
416
2.54G
      i1++; i2++;
417
      /* 2 */
418
2.54G
      c1 = block[i1]; c2 = block[i2];
419
2.54G
      if (c1 != c2) return (c1 > c2);
420
2.54G
      s1 = quadrant[i1]; s2 = quadrant[i2];
421
2.54G
      if (s1 != s2) return (s1 > s2);
422
2.53G
      i1++; i2++;
423
      /* 3 */
424
2.53G
      c1 = block[i1]; c2 = block[i2];
425
2.53G
      if (c1 != c2) return (c1 > c2);
426
2.52G
      s1 = quadrant[i1]; s2 = quadrant[i2];
427
2.52G
      if (s1 != s2) return (s1 > s2);
428
2.52G
      i1++; i2++;
429
      /* 4 */
430
2.52G
      c1 = block[i1]; c2 = block[i2];
431
2.52G
      if (c1 != c2) return (c1 > c2);
432
2.51G
      s1 = quadrant[i1]; s2 = quadrant[i2];
433
2.51G
      if (s1 != s2) return (s1 > s2);
434
2.51G
      i1++; i2++;
435
      /* 5 */
436
2.51G
      c1 = block[i1]; c2 = block[i2];
437
2.51G
      if (c1 != c2) return (c1 > c2);
438
2.51G
      s1 = quadrant[i1]; s2 = quadrant[i2];
439
2.51G
      if (s1 != s2) return (s1 > s2);
440
2.50G
      i1++; i2++;
441
      /* 6 */
442
2.50G
      c1 = block[i1]; c2 = block[i2];
443
2.50G
      if (c1 != c2) return (c1 > c2);
444
2.50G
      s1 = quadrant[i1]; s2 = quadrant[i2];
445
2.50G
      if (s1 != s2) return (s1 > s2);
446
2.50G
      i1++; i2++;
447
      /* 7 */
448
2.50G
      c1 = block[i1]; c2 = block[i2];
449
2.50G
      if (c1 != c2) return (c1 > c2);
450
2.49G
      s1 = quadrant[i1]; s2 = quadrant[i2];
451
2.49G
      if (s1 != s2) return (s1 > s2);
452
2.49G
      i1++; i2++;
453
      /* 8 */
454
2.49G
      c1 = block[i1]; c2 = block[i2];
455
2.49G
      if (c1 != c2) return (c1 > c2);
456
2.49G
      s1 = quadrant[i1]; s2 = quadrant[i2];
457
2.49G
      if (s1 != s2) return (s1 > s2);
458
2.49G
      i1++; i2++;
459
460
2.49G
      if (i1 >= nblock) i1 -= nblock;
461
2.49G
      if (i2 >= nblock) i2 -= nblock;
462
463
2.49G
      k -= 8;
464
2.49G
      (*budget)--;
465
2.49G
   }
466
2.49G
      while (k >= 0);
467
468
478
   return False;
469
72.1M
}
470
471
472
/*---------------------------------------------*/
473
/*--
474
   Knuth's increments seem to work better
475
   than Incerpi-Sedgewick here.  Possibly
476
   because the number of elems to sort is
477
   usually small, typically <= 20.
478
--*/
479
static
480
Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
481
                   9841, 29524, 88573, 265720,
482
                   797161, 2391484 };
483
484
static
485
void mainSimpleSort ( UInt32* ptr,
486
                      UChar*  block,
487
                      UInt16* quadrant,
488
                      Int32   nblock,
489
                      Int32   lo, 
490
                      Int32   hi, 
491
                      Int32   d,
492
                      Int32*  budget )
493
10.3M
{
494
10.3M
   Int32 i, j, h, bigN, hp;
495
10.3M
   UInt32 v;
496
497
10.3M
   bigN = hi - lo + 1;
498
10.3M
   if (bigN < 2) return;
499
500
9.94M
   hp = 0;
501
27.0M
   while (incs[hp] < bigN) hp++;
502
9.94M
   hp--;
503
504
27.0M
   for (; hp >= 0; hp--) {
505
17.0M
      h = incs[hp];
506
507
17.0M
      i = lo + h;
508
57.0M
      while (True) {
509
510
         /*-- copy 1 --*/
511
57.0M
         if (i > hi) break;
512
52.2M
         v = ptr[i];
513
52.2M
         j = i;
514
85.4M
         while ( mainGtU ( 
515
85.4M
                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
516
85.4M
                 ) ) {
517
45.3M
            ptr[j] = ptr[j-h];
518
45.3M
            j = j - h;
519
45.3M
            if (j <= (lo + h - 1)) break;
520
45.3M
         }
521
52.2M
         ptr[j] = v;
522
52.2M
         i++;
523
524
         /*-- copy 2 --*/
525
52.2M
         if (i > hi) break;
526
45.2M
         v = ptr[i];
527
45.2M
         j = i;
528
82.2M
         while ( mainGtU ( 
529
82.2M
                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
530
82.2M
                 ) ) {
531
45.1M
            ptr[j] = ptr[j-h];
532
45.1M
            j = j - h;
533
45.1M
            if (j <= (lo + h - 1)) break;
534
45.1M
         }
535
45.2M
         ptr[j] = v;
536
45.2M
         i++;
537
538
         /*-- copy 3 --*/
539
45.2M
         if (i > hi) break;
540
39.9M
         v = ptr[i];
541
39.9M
         j = i;
542
78.5M
         while ( mainGtU ( 
543
78.5M
                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
544
78.5M
                 ) ) {
545
44.9M
            ptr[j] = ptr[j-h];
546
44.9M
            j = j - h;
547
44.9M
            if (j <= (lo + h - 1)) break;
548
44.9M
         }
549
39.9M
         ptr[j] = v;
550
39.9M
         i++;
551
552
39.9M
         if (*budget < 0) return;
553
39.9M
      }
554
17.0M
   }
555
9.94M
}
556
557
558
/*---------------------------------------------*/
559
/*--
560
   The following is an implementation of
561
   an elegant 3-way quicksort for strings,
562
   described in a paper "Fast Algorithms for
563
   Sorting and Searching Strings", by Robert
564
   Sedgewick and Jon L. Bentley.
565
--*/
566
567
#define mswap(zz1, zz2) \
568
339M
   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
569
570
1.86M
#define mvswap(zzp1, zzp2, zzn)       \
571
1.86M
{                                     \
572
1.86M
   Int32 yyp1 = (zzp1);               \
573
1.86M
   Int32 yyp2 = (zzp2);               \
574
1.86M
   Int32 yyn  = (zzn);                \
575
15.9M
   while (yyn > 0) {                  \
576
14.0M
      mswap(ptr[yyp1], ptr[yyp2]);    \
577
14.0M
      yyp1++; yyp2++; yyn--;          \
578
14.0M
   }                                  \
579
1.86M
}
580
581
static 
582
__inline__
583
UChar mmed3 ( UChar a, UChar b, UChar c )
584
1.31M
{
585
1.31M
   UChar t;
586
1.31M
   if (a > b) { t = a; a = b; b = t; };
587
1.31M
   if (b > c) { 
588
505k
      b = c;
589
505k
      if (a > b) b = a;
590
505k
   }
591
1.31M
   return b;
592
1.31M
}
593
594
1.86M
#define mmin(a,b) ((a) < (b)) ? (a) : (b)
595
596
11.6M
#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
597
11.6M
                          stackHi[sp] = hz; \
598
11.6M
                          stackD [sp] = dz; \
599
11.6M
                          sp++; }
600
601
11.6M
#define mpop(lz,hz,dz) { sp--;             \
602
11.6M
                         lz = stackLo[sp]; \
603
11.6M
                         hz = stackHi[sp]; \
604
11.6M
                         dz = stackD [sp]; }
605
606
607
5.60M
#define mnextsize(az) (nextHi[az]-nextLo[az])
608
609
#define mnextswap(az,bz)                                        \
610
1.01M
   { Int32 tz;                                                  \
611
1.01M
     tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
612
1.01M
     tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
613
1.01M
     tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
614
615
616
23.3M
#define MAIN_QSORT_SMALL_THRESH 20
617
1.35M
#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
618
#define MAIN_QSORT_STACK_SIZE 100
619
620
static
621
void mainQSort3 ( UInt32* ptr,
622
                  UChar*  block,
623
                  UInt16* quadrant,
624
                  Int32   nblock,
625
                  Int32   loSt, 
626
                  Int32   hiSt, 
627
                  Int32   dSt,
628
                  Int32*  budget )
629
8.51M
{
630
8.51M
   Int32 unLo, unHi, ltLo, gtHi, n, m, med;
631
8.51M
   Int32 sp, lo, hi, d;
632
633
8.51M
   Int32 stackLo[MAIN_QSORT_STACK_SIZE];
634
8.51M
   Int32 stackHi[MAIN_QSORT_STACK_SIZE];
635
8.51M
   Int32 stackD [MAIN_QSORT_STACK_SIZE];
636
637
8.51M
   Int32 nextLo[3];
638
8.51M
   Int32 nextHi[3];
639
8.51M
   Int32 nextD [3];
640
641
8.51M
   sp = 0;
642
8.51M
   mpush ( loSt, hiSt, dSt );
643
644
20.2M
   while (sp > 0) {
645
646
11.6M
      AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
647
648
11.6M
      mpop ( lo, hi, d );
649
11.6M
      if (hi - lo < MAIN_QSORT_SMALL_THRESH || 
650
11.6M
          d > MAIN_QSORT_DEPTH_THRESH) {
651
10.3M
         mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
652
10.3M
         if (*budget < 0) return;
653
10.3M
         continue;
654
10.3M
      }
655
656
1.31M
      med = (Int32) 
657
1.31M
            mmed3 ( block[ptr[ lo         ]+d],
658
1.31M
                    block[ptr[ hi         ]+d],
659
1.31M
                    block[ptr[ (lo+hi)>>1 ]+d] );
660
661
1.31M
      unLo = ltLo = lo;
662
1.31M
      unHi = gtHi = hi;
663
664
9.63M
      while (True) {
665
210M
         while (True) {
666
210M
            if (unLo > unHi) break;
667
209M
            n = ((Int32)block[ptr[unLo]+d]) - med;
668
209M
            if (n == 0) { 
669
178M
               mswap(ptr[unLo], ptr[ltLo]); 
670
178M
               ltLo++; unLo++; continue; 
671
178M
            };
672
31.2M
            if (n >  0) break;
673
22.4M
            unLo++;
674
22.4M
         }
675
169M
         while (True) {
676
169M
            if (unLo > unHi) break;
677
167M
            n = ((Int32)block[ptr[unHi]+d]) - med;
678
167M
            if (n == 0) { 
679
139M
               mswap(ptr[unHi], ptr[gtHi]); 
680
139M
               gtHi--; unHi--; continue; 
681
139M
            };
682
28.5M
            if (n <  0) break;
683
20.2M
            unHi--;
684
20.2M
         }
685
9.63M
         if (unLo > unHi) break;
686
8.31M
         mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
687
8.31M
      }
688
689
1.31M
      AssertD ( unHi == unLo-1, "mainQSort3(2)" );
690
691
1.31M
      if (gtHi < ltLo) {
692
380k
         mpush(lo, hi, d+1 );
693
380k
         continue;
694
380k
      }
695
696
934k
      n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
697
934k
      m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
698
699
934k
      n = lo + unLo - ltLo - 1;
700
934k
      m = hi - (gtHi - unHi) + 1;
701
702
934k
      nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
703
934k
      nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
704
934k
      nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
705
706
934k
      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
707
934k
      if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
708
934k
      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
709
710
934k
      AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
711
934k
      AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
712
713
934k
      mpush (nextLo[0], nextHi[0], nextD[0]);
714
934k
      mpush (nextLo[1], nextHi[1], nextD[1]);
715
934k
      mpush (nextLo[2], nextHi[2], nextD[2]);
716
934k
   }
717
8.51M
}
718
719
#undef mswap
720
#undef mvswap
721
#undef mpush
722
#undef mpop
723
#undef mmin
724
#undef mnextsize
725
#undef mnextswap
726
#undef MAIN_QSORT_SMALL_THRESH
727
#undef MAIN_QSORT_DEPTH_THRESH
728
#undef MAIN_QSORT_STACK_SIZE
729
730
731
/*---------------------------------------------*/
732
/* Pre:
733
      nblock > N_OVERSHOOT
734
      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
735
      ((UChar*)block32) [0 .. nblock-1] holds block
736
      ptr exists for [0 .. nblock-1]
737
738
   Post:
739
      ((UChar*)block32) [0 .. nblock-1] holds block
740
      All other areas of block32 destroyed
741
      ftab [0 .. 65536 ] destroyed
742
      ptr [0 .. nblock-1] holds sorted order
743
      if (*budget < 0), sorting was abandoned
744
*/
745
746
5.30M
#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
747
609M
#define SETMASK (1 << 21)
748
305M
#define CLEARMASK (~(SETMASK))
749
750
static
751
void mainSort ( UInt32* ptr, 
752
                UChar*  block,
753
                UInt16* quadrant, 
754
                UInt32* ftab,
755
                Int32   nblock,
756
                Int32   verb,
757
                Int32*  budget )
758
1.57k
{
759
1.57k
   Int32  i, j, k, ss, sb;
760
1.57k
   Int32  runningOrder[256];
761
1.57k
   Bool   bigDone[256];
762
1.57k
   Int32  copyStart[256];
763
1.57k
   Int32  copyEnd  [256];
764
1.57k
   UChar  c1;
765
1.57k
   Int32  numQSorted;
766
1.57k
   UInt16 s;
767
1.57k
   if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
768
769
   /*-- set up the 2-byte frequency table --*/
770
103M
   for (i = 65536; i >= 0; i--) ftab[i] = 0;
771
772
1.57k
   j = block[0] << 8;
773
1.57k
   i = nblock-1;
774
100M
   for (; i >= 3; i -= 4) {
775
100M
      quadrant[i] = 0;
776
100M
      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
777
100M
      ftab[j]++;
778
100M
      quadrant[i-1] = 0;
779
100M
      j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
780
100M
      ftab[j]++;
781
100M
      quadrant[i-2] = 0;
782
100M
      j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
783
100M
      ftab[j]++;
784
100M
      quadrant[i-3] = 0;
785
100M
      j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
786
100M
      ftab[j]++;
787
100M
   }
788
3.39k
   for (; i >= 0; i--) {
789
1.82k
      quadrant[i] = 0;
790
1.82k
      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
791
1.82k
      ftab[j]++;
792
1.82k
   }
793
794
   /*-- (emphasises close relationship of block & quadrant) --*/
795
55.0k
   for (i = 0; i < BZ_N_OVERSHOOT; i++) {
796
53.4k
      block   [nblock+i] = block[i];
797
53.4k
      quadrant[nblock+i] = 0;
798
53.4k
   }
799
800
1.57k
   if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
801
802
   /*-- Complete the initial radix sort --*/
803
103M
   for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
804
805
1.57k
   s = block[0] << 8;
806
1.57k
   i = nblock-1;
807
100M
   for (; i >= 3; i -= 4) {
808
100M
      s = (s >> 8) | (block[i] << 8);
809
100M
      j = ftab[s] -1;
810
100M
      ftab[s] = j;
811
100M
      ptr[j] = i;
812
100M
      s = (s >> 8) | (block[i-1] << 8);
813
100M
      j = ftab[s] -1;
814
100M
      ftab[s] = j;
815
100M
      ptr[j] = i-1;
816
100M
      s = (s >> 8) | (block[i-2] << 8);
817
100M
      j = ftab[s] -1;
818
100M
      ftab[s] = j;
819
100M
      ptr[j] = i-2;
820
100M
      s = (s >> 8) | (block[i-3] << 8);
821
100M
      j = ftab[s] -1;
822
100M
      ftab[s] = j;
823
100M
      ptr[j] = i-3;
824
100M
   }
825
3.39k
   for (; i >= 0; i--) {
826
1.82k
      s = (s >> 8) | (block[i] << 8);
827
1.82k
      j = ftab[s] -1;
828
1.82k
      ftab[s] = j;
829
1.82k
      ptr[j] = i;
830
1.82k
   }
831
832
   /*--
833
      Now ftab contains the first loc of every small bucket.
834
      Calculate the running order, from smallest to largest
835
      big bucket.
836
   --*/
837
404k
   for (i = 0; i <= 255; i++) {
838
402k
      bigDone     [i] = False;
839
402k
      runningOrder[i] = i;
840
402k
   }
841
842
1.57k
   {
843
1.57k
      Int32 vv;
844
1.57k
      Int32 h = 1;
845
7.86k
      do h = 3 * h + 1; while (h <= 256);
846
7.86k
      do {
847
7.86k
         h = h / 3;
848
1.73M
         for (i = h; i <= 255; i++) {
849
1.73M
            vv = runningOrder[i];
850
1.73M
            j = i;
851
2.65M
            while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
852
1.02M
               runningOrder[j] = runningOrder[j-h];
853
1.02M
               j = j - h;
854
1.02M
               if (j <= (h - 1)) goto zero;
855
1.02M
            }
856
1.73M
            zero:
857
1.73M
            runningOrder[j] = vv;
858
1.73M
         }
859
7.86k
      } while (h != 1);
860
1.57k
   }
861
862
   /*--
863
      The main sorting loop.
864
   --*/
865
866
1.57k
   numQSorted = 0;
867
868
397k
   for (i = 0; i <= 255; i++) {
869
870
      /*--
871
         Process big buckets, starting with the least full.
872
         Basically this is a 3-step process in which we call
873
         mainQSort3 to sort the small buckets [ss, j], but
874
         also make a big effort to avoid the calls if we can.
875
      --*/
876
396k
      ss = runningOrder[i];
877
878
      /*--
879
         Step 1:
880
         Complete the big bucket [ss] by quicksorting
881
         any unsorted small buckets [ss, j], for j != ss.  
882
         Hopefully previous pointer-scanning phases have already
883
         completed many of the small buckets [ss, j], so
884
         we don't have to sort them at all.
885
      --*/
886
101M
      for (j = 0; j <= 255; j++) {
887
101M
         if (j != ss) {
888
100M
            sb = (ss << 8) + j;
889
100M
            if ( ! (ftab[sb] & SETMASK) ) {
890
50.7M
               Int32 lo = ftab[sb]   & CLEARMASK;
891
50.7M
               Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
892
50.7M
               if (hi > lo) {
893
8.51M
                  if (verb >= 4)
894
0
                     VPrintf4 ( "        qsort [0x%x, 0x%x]   "
895
8.51M
                                "done %d   this %d\n",
896
8.51M
                                ss, j, numQSorted, hi - lo + 1 );
897
8.51M
                  mainQSort3 ( 
898
8.51M
                     ptr, block, quadrant, nblock, 
899
8.51M
                     lo, hi, BZ_N_RADIX, budget 
900
8.51M
                  );   
901
8.51M
                  numQSorted += (hi - lo + 1);
902
8.51M
                  if (*budget < 0) return;
903
8.51M
               }
904
50.7M
            }
905
100M
            ftab[sb] |= SETMASK;
906
100M
         }
907
101M
      }
908
909
395k
      AssertH ( !bigDone[ss], 1006 );
910
911
      /*--
912
         Step 2:
913
         Now scan this big bucket [ss] so as to synthesise the
914
         sorted order for small buckets [t, ss] for all t,
915
         including, magically, the bucket [ss,ss] too.
916
         This will avoid doing Real Work in subsequent Step 1's.
917
      --*/
918
395k
      {
919
101M
         for (j = 0; j <= 255; j++) {
920
101M
            copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
921
101M
            copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
922
101M
         }
923
82.5M
         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
924
82.1M
            k = ptr[j]-1; if (k < 0) k += nblock;
925
82.1M
            c1 = block[k];
926
82.1M
            if (!bigDone[c1])
927
45.5M
               ptr[ copyStart[c1]++ ] = k;
928
82.1M
         }
929
86.1M
         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
930
85.7M
            k = ptr[j]-1; if (k < 0) k += nblock;
931
85.7M
            c1 = block[k];
932
85.7M
            if (!bigDone[c1]) 
933
44.1M
               ptr[ copyEnd[c1]-- ] = k;
934
85.7M
         }
935
395k
      }
936
937
395k
      AssertH ( (copyStart[ss]-1 == copyEnd[ss])
938
395k
                || 
939
                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
940
                   Necessity for this case is demonstrated by compressing 
941
                   a sequence of approximately 48.5 million of character 
942
                   251; 1.0.0/1.0.1 will then die here. */
943
395k
                (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
944
395k
                1007 )
945
946
101M
      for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
947
948
      /*--
949
         Step 3:
950
         The [ss] big bucket is now done.  Record this fact,
951
         and update the quadrant descriptors.  Remember to
952
         update quadrants in the overshoot area too, if
953
         necessary.  The "if (i < 255)" test merely skips
954
         this updating for the last bucket processed, since
955
         updating for the last bucket is pointless.
956
957
         The quadrant array provides a way to incrementally
958
         cache sort orderings, as they appear, so as to 
959
         make subsequent comparisons in fullGtU() complete
960
         faster.  For repetitive blocks this makes a big
961
         difference (but not big enough to be able to avoid
962
         the fallback sorting mechanism, exponential radix sort).
963
964
         The precise meaning is: at all times:
965
966
            for 0 <= i < nblock and 0 <= j <= nblock
967
968
            if block[i] != block[j], 
969
970
               then the relative values of quadrant[i] and 
971
                    quadrant[j] are meaningless.
972
973
               else {
974
                  if quadrant[i] < quadrant[j]
975
                     then the string starting at i lexicographically
976
                     precedes the string starting at j
977
978
                  else if quadrant[i] > quadrant[j]
979
                     then the string starting at j lexicographically
980
                     precedes the string starting at i
981
982
                  else
983
                     the relative ordering of the strings starting
984
                     at i and j has not yet been determined.
985
               }
986
      --*/
987
395k
      bigDone[ss] = True;
988
989
395k
      if (i < 255) {
990
394k
         Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
991
394k
         Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
992
394k
         Int32 shifts   = 0;
993
994
394k
         while ((bbSize >> shifts) > 65534) shifts++;
995
996
157M
         for (j = bbSize-1; j >= 0; j--) {
997
156M
            Int32 a2update     = ptr[bbStart + j];
998
156M
            UInt16 qVal        = (UInt16)(j >> shifts);
999
156M
            quadrant[a2update] = qVal;
1000
156M
            if (a2update < BZ_N_OVERSHOOT)
1001
25.3k
               quadrant[a2update + nblock] = qVal;
1002
156M
         }
1003
394k
         AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1004
394k
      }
1005
1006
395k
   }
1007
1008
666
   if (verb >= 4)
1009
0
      VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1010
666
                 nblock, numQSorted, nblock - numQSorted );
1011
666
}
1012
1013
#undef BIGFREQ
1014
#undef SETMASK
1015
#undef CLEARMASK
1016
1017
1018
/*---------------------------------------------*/
1019
/* Pre:
1020
      nblock > 0
1021
      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1022
      ((UChar*)arr2)  [0 .. nblock-1] holds block
1023
      arr1 exists for [0 .. nblock-1]
1024
1025
   Post:
1026
      ((UChar*)arr2) [0 .. nblock-1] holds block
1027
      All other areas of block destroyed
1028
      ftab [ 0 .. 65536 ] destroyed
1029
      arr1 [0 .. nblock-1] holds sorted order
1030
*/
1031
void BZ2_blockSort ( EState* s )
1032
3.92k
{
1033
3.92k
   UInt32* ptr    = s->ptr; 
1034
3.92k
   UChar*  block  = s->block;
1035
3.92k
   UInt32* ftab   = s->ftab;
1036
3.92k
   Int32   nblock = s->nblock;
1037
3.92k
   Int32   verb   = s->verbosity;
1038
3.92k
   Int32   wfact  = s->workFactor;
1039
3.92k
   UInt16* quadrant;
1040
3.92k
   Int32   budget;
1041
3.92k
   Int32   budgetInit;
1042
3.92k
   Int32   i;
1043
1044
3.92k
   if (nblock < 10000) {
1045
2.35k
      fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1046
2.35k
   } else {
1047
      /* Calculate the location for quadrant, remembering to get
1048
         the alignment right.  Assumes that &(block[0]) is at least
1049
         2-byte aligned -- this should be ok since block is really
1050
         the first section of arr2.
1051
      */
1052
1.57k
      i = nblock+BZ_N_OVERSHOOT;
1053
1.57k
      if (i & 1) i++;
1054
1.57k
      quadrant = (UInt16*)(&(block[i]));
1055
1056
      /* (wfact-1) / 3 puts the default-factor-30
1057
         transition point at very roughly the same place as 
1058
         with v0.1 and v0.9.0.  
1059
         Not that it particularly matters any more, since the
1060
         resulting compressed stream is now the same regardless
1061
         of whether or not we use the main sort or fallback sort.
1062
      */
1063
1.57k
      if (wfact < 1  ) wfact = 1;
1064
1.57k
      if (wfact > 100) wfact = 100;
1065
1.57k
      budgetInit = nblock * ((wfact-1) / 3);
1066
1.57k
      budget = budgetInit;
1067
1068
1.57k
      mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1069
1.57k
      if (verb >= 3) 
1070
0
         VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1071
1.57k
                    budgetInit - budget,
1072
1.57k
                    nblock, 
1073
1.57k
                    (float)(budgetInit - budget) /
1074
1.57k
                    (float)(nblock==0 ? 1 : nblock) ); 
1075
1.57k
      if (budget < 0) {
1076
906
         if (verb >= 2) 
1077
0
            VPrintf0 ( "    too repetitive; using fallback"
1078
906
                       " sorting algorithm\n" );
1079
906
         fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1080
906
      }
1081
1.57k
   }
1082
1083
3.92k
   s->origPtr = -1;
1084
222M
   for (i = 0; i < s->nblock; i++)
1085
222M
      if (ptr[i] == 0)
1086
3.92k
         { s->origPtr = i; break; };
1087
1088
3.92k
   AssertH( s->origPtr != -1, 1003 );
1089
3.92k
}
1090
1091
1092
/*-------------------------------------------------------------*/
1093
/*--- end                                       blocksort.c ---*/
1094
/*-------------------------------------------------------------*/