Coverage Report

Created: 2026-03-30 06:30

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/moddable/xs/sources/xsum.c
Line
Count
Source
1
/* FUNCTIONS FOR EXACT SUMMATION. */
2
3
/* Copyright 2015, 2018, 2021, 2024 Radford M. Neal
4
5
   Permission is hereby granted, free of charge, to any person obtaining
6
   a copy of this software and associated documentation files (the
7
   "Software"), to deal in the Software without restriction, including
8
   without limitation the rights to use, copy, modify, merge, publish,
9
   distribute, sublicense, and/or sell copies of the Software, and to
10
   permit persons to whom the Software is furnished to do so, subject to
11
   the following conditions:
12
13
   The above copyright notice and this permission notice shall be
14
   included in all copies or substantial portions of the Software.
15
16
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
*/
24
25
26
#include <stdio.h>
27
#include <string.h>
28
#include <math.h>
29
#include "xsum.h"
30
31
/* ---------------------- IMPLEMENTATION ASSUMPTIONS ----------------------- */
32
33
/* This code makes the following assumptions:
34
35
     o The 'double' type is an IEEE-754 standard 64-bit floating-point value.
36
37
     o The 'int64_t' and 'uint64_t' types exist, for 64-bit signed and
38
       unsigned integers.
39
40
     o The 'endianness' of 'double' and 64-bit integers is consistent
41
       between these types - that is, looking at the bits of a 'double'
42
       value as an 64-bit integer will have the expected result.
43
44
     o Right shifts of a signed operand produce the results expected for
45
       a two's complement representation.
46
47
     o Rounding should be done in the "round to nearest, ties to even" mode.
48
*/
49
50
51
/* --------------------------- CONFIGURATION ------------------------------- */
52
53
54
/* IMPLEMENTATION OPTIONS.  Can be set to either 0 or 1, whichever seems
55
   to be fastest. */
56
57
#define USE_SIMD 1          /* Use SIMD intrinsics (SSE2/AVX) if available?   */
58
59
#define USE_MEMSET_SMALL 1  /* Use memset rather than a loop (for small mem)? */
60
#define USE_MEMSET_LARGE 1  /* Use memset rather than a loop (for large mem)? */
61
#define USE_USED_LARGE 1    /* Use the used flags in a large accumulator? */
62
63
#define OPT_SMALL 0         /* Class of manual optimization for operations on */
64
                            /*   small accumulator: 0 (none), 1, 2, 3 (SIMD)  */
65
#define OPT_CARRY 1         /* Use manually optimized carry propagation?      */
66
67
#define OPT_LARGE_SUM 1     /* Should manually optimized routines be used for */
68
#define OPT_LARGE_SQNORM 1  /*   operations using the large accumulator?      */
69
#define OPT_LARGE_DOT 1
70
71
#define OPT_SIMPLE_SUM 1    /* Should manually optimized routines be used for */
72
#define OPT_SIMPLE_SQNORM 1 /*   operations done with simple FP arithmetic?   */
73
#define OPT_SIMPLE_DOT 1
74
75
#define OPT_KAHAN_SUM 0     /* Use manually optimized routine for Kahan sum?  */
76
77
#define INLINE_SMALL 1      /* Inline more of the small accumulator routines? */
78
                            /*   (Not currently used)                         */
79
#define INLINE_LARGE 1      /* Inline more of the large accumulator routines? */
80
81
82
/* INCLUDE INTEL INTRINSICS IF USED AND AVAILABLE. */
83
84
#if USE_SIMD && __SSE2__
85
# include <immintrin.h>
86
#endif
87
88
89
/* COPY A 64-BIT QUANTITY - DOUBLE TO 64-BIT INT OR VICE VERSA.  The
90
   arguments are destination and source variables (not values). */
91
92
0
#define COPY64(dst,src) memcpy(&(dst),&(src),sizeof(double))
93
94
95
/* OPTIONAL INCLUSION OF PBINARY MODULE.  Used for debug output. */
96
97
#ifdef PBINARY
98
# include "pbinary.h"
99
#else
100
0
# define pbinary_int64(x,y) do {} while(0)
101
0
# define pbinary_double(x) do {} while(0)
102
#endif
103
104
105
/* SET UP DEBUG FLAG.  It's a variable if debugging is enabled, and a
106
   constant if disabled (so that no code will be generated then). */
107
108
int xsum_debug = 0;
109
110
#ifndef DEBUG
111
0
# define xsum_debug 0
112
#endif
113
114
115
/* SET UP INLINE / NOINLINE MACROS. */
116
117
#if __GNUC__
118
# define INLINE inline __attribute__ ((always_inline))
119
#ifndef NOINLINE
120
# define NOINLINE __attribute__ ((noinline))
121
#endif
122
#else
123
# define INLINE inline
124
# define NOINLINE
125
#endif
126
127
128
/* ------------------------ INTERNAL ROUTINES ------------------------------- */
129
130
131
/* ADD AN INF OR NAN TO A SMALL ACCUMULATOR.  This only changes the flags,
132
   not the chunks in the accumulator, which retains the sum of the finite
133
   terms (which is perhaps sometimes useful to access, though no function
134
   to do so is defined at present).  A NaN with larger payload (seen as a
135
   52-bit unsigned integer) takes precedence, with the sign of the NaN always
136
   being positive.  This ensures that the order of summing NaN values doesn't
137
   matter. */
138
139
static NOINLINE void xsum_small_add_inf_nan
140
                       (xsum_small_accumulator *restrict sacc, xsum_int ivalue)
141
0
{
142
0
  xsum_int mantissa;
143
0
  double fltv;
144
145
0
  mantissa = ivalue & XSUM_MANTISSA_MASK;
146
147
0
  if (mantissa == 0) /* Inf */
148
0
  { if (sacc->Inf == 0)
149
0
    { /* no previous Inf */
150
0
      sacc->Inf = ivalue;
151
0
    }
152
0
    else if (sacc->Inf != ivalue)
153
0
    { /* previous Inf was opposite sign */
154
0
      COPY64 (fltv, ivalue);
155
0
      fltv = fltv - fltv;  /* result will be a NaN */
156
0
      COPY64 (sacc->Inf, fltv);
157
0
    }
158
0
  }
159
0
  else /* NaN */
160
0
  { /* Choose the NaN with the bigger payload and clear its sign.  Using <=
161
       ensures that we will choose the first NaN over the previous zero. */
162
0
    if ((sacc->NaN & XSUM_MANTISSA_MASK) <= mantissa)
163
0
    { sacc->NaN = ivalue & ~XSUM_SIGN_MASK;
164
0
    }
165
0
  }
166
0
}
167
168
169
/* PROPAGATE CARRIES TO NEXT CHUNK IN A SMALL ACCUMULATOR.  Needs to
170
   be called often enough that accumulated carries don't overflow out
171
   the top, as indicated by sacc->adds_until_propagate.  Returns the
172
   index of the uppermost non-zero chunk (0 if number is zero).
173
174
   After carry propagation, the uppermost non-zero chunk will indicate
175
   the sign of the number, and will not be -1 (all 1s).  It will be in
176
   the range -2^XSUM_LOW_MANTISSA_BITS to 2^XSUM_LOW_MANTISSA_BITS - 1.
177
   Lower chunks will be non-negative, and in the range from 0 up to
178
   2^XSUM_LOW_MANTISSA_BITS - 1. */
179
180
static NOINLINE int xsum_carry_propagate (xsum_small_accumulator *restrict sacc)
181
0
{
182
0
  int i, u, uix;
183
184
0
  if (xsum_debug) c_printf("\nCARRY PROPAGATING IN SMALL ACCUMULATOR\n");
185
186
  /* Set u to the index of the uppermost non-zero (for now) chunk, or
187
     return with value 0 if there is none. */
188
189
0
# if OPT_CARRY
190
191
0
  { u = XSUM_SCHUNKS-1;
192
0
    switch (XSUM_SCHUNKS & 0x3)   /* get u to be a multiple of 4 minus one  */
193
0
    {
194
0
      case 3: if (sacc->chunk[u] != 0)
195
0
              { goto found2;
196
0
              }
197
0
              u -= 1;                            /* XSUM_SCHUNKS is a */
198
0
            mxFallThrough;
199
0
      case 2: if (sacc->chunk[u] != 0)           /* constant, so the  */
200
0
              { goto found2;                     /* compiler will do  */
201
0
              }                                  /* simple code here  */
202
0
              u -= 1;
203
0
            mxFallThrough;
204
0
      case 1: if (sacc->chunk[u] != 0)
205
0
              { goto found2;
206
0
              }
207
0
              u -= 1;
208
0
            mxFallThrough;
209
0
      case 0: ;
210
0
    }
211
212
0
    do  /* here, u should be a multiple of 4 minus one, and at least 3 */
213
0
    {
214
#     if USE_SIMD && __AVX__
215
      { __m256i ch;
216
        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+u-3));
217
        if (!_mm256_testz_si256(ch,ch))
218
        { goto found;
219
        }
220
        u -= 4;
221
        if (u < 0)  /* never actually happens, because value of XSUM_SCHUNKS */
222
        { break;    /*   is such that u < 0 occurs at end of do loop instead */
223
        }         
224
        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+u-3));
225
        if (!_mm256_testz_si256(ch,ch))
226
        { goto found;
227
        }
228
        u -= 4;
229
      }
230
#     else
231
0
      { if (sacc->chunk[u] | sacc->chunk[u-1]
232
0
          | sacc->chunk[u-2] | sacc->chunk[u-3])
233
0
        { goto found;
234
0
        }
235
0
        u -= 4;
236
0
      }
237
0
#     endif
238
239
0
    } while (u >= 0);
240
241
0
    if (xsum_debug) c_printf ("number is zero (1)\n");
242
0
    uix = 0;
243
0
    goto done;
244
245
0
  found:
246
0
    if (sacc->chunk[u] != 0)
247
0
    { goto found2;
248
0
    }
249
0
    u -= 1;
250
0
    if (sacc->chunk[u] != 0)
251
0
    { goto found2;
252
0
    }
253
0
    u -= 1;
254
0
    if (sacc->chunk[u] != 0)
255
0
    { goto found2;
256
0
    }
257
0
    u -= 1;  
258
259
0
   found2: ;
260
0
  }
261
262
# else  /* Non-optimized search for uppermost non-zero chunk */
263
264
  { for (u = XSUM_SCHUNKS-1; sacc->chunk[u] == 0; u--)
265
    { if (u == 0)
266
      { if (xsum_debug) c_printf ("number is zero (1)\n");
267
        uix = 0;
268
        goto done;
269
      }
270
    }
271
  }
272
273
# endif
274
275
  /* At this point, sacc->chunk[u] must be non-zero */
276
277
0
  if (xsum_debug) c_printf("u: %d, sacc->chunk[u]: %lld",u,sacc->chunk[u]);
278
279
  /* Carry propagate, starting at the low-order chunks.  Note that the
280
     loop limit of u may be increased inside the loop. */
281
282
0
  i = 0;     /* set to the index of the next non-zero chunk, from bottom */
283
284
0
# if OPT_CARRY
285
0
  {
286
    /* Quickly skip over unused low-order chunks.  Done here at the start
287
       on the theory that there are often many unused low-order chunks,
288
       justifying some overhead to begin, but later stretches of unused
289
       chunks may not be as large. */
290
291
0
    int e = u-3;  /* go only to 3 before so won't access beyond chunk array */
292
293
0
    do
294
0
    {
295
#     if USE_SIMD && __AVX__
296
      { __m256i ch;
297
        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+i));
298
        if (!_mm256_testz_si256(ch,ch))
299
        { break;
300
        }
301
        i += 4;
302
        if (i >= e)
303
        { break;
304
        }
305
        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+i));
306
        if (!_mm256_testz_si256(ch,ch))
307
        { break;
308
        }
309
      }
310
#     else
311
0
      { if (sacc->chunk[i] | sacc->chunk[i+1]
312
0
          | sacc->chunk[i+2] | sacc->chunk[i+3])
313
0
        { break;
314
0
        }
315
0
      }
316
0
#     endif
317
318
0
      i += 4;
319
320
0
    } while (i <= e);
321
0
  }
322
0
# endif
323
324
0
  uix = -1;  /* indicates that a non-zero chunk has not been found yet */
325
326
0
  do
327
0
  { xsum_schunk c;       /* Set to the chunk at index i (next non-zero one) */
328
0
    xsum_schunk clow;    /* Low-order bits of c */
329
0
    xsum_schunk chigh;   /* High-order bits of c */
330
331
    /* Find the next non-zero chunk, setting i to its index, or break out
332
       of loop if there is none.  Note that the chunk at index u is not
333
       necessarily non-zero - it was initially, but u or the chunk at u
334
       may have changed. */
335
336
0
#   if OPT_CARRY
337
0
    { 
338
0
      c = sacc->chunk[i];
339
0
      if (c != 0)
340
0
      { goto nonzero;
341
0
      }
342
0
      i += 1;
343
0
      if (i > u)
344
0
      { break;  /* reaching here is only possible when u == i initially, */
345
0
      }         /*   with the last add to a chunk having changed it to 0 */
346
347
0
      for (;;)
348
0
      { c = sacc->chunk[i];
349
0
        if (c != 0)
350
0
        { goto nonzero;
351
0
        }
352
0
        i += 1;
353
0
        c = sacc->chunk[i];
354
0
        if (c != 0)
355
0
        { goto nonzero;
356
0
        }
357
0
        i += 1;
358
0
        c = sacc->chunk[i];
359
0
        if (c != 0)
360
0
        { goto nonzero;
361
0
        }
362
0
        i += 1;
363
0
        c = sacc->chunk[i];
364
0
        if (c != 0)
365
0
        { goto nonzero;
366
0
        }
367
0
        i += 1;
368
0
      }
369
0
    }
370
#   else
371
    { 
372
      do
373
      { c = sacc->chunk[i];
374
        if (c != 0)
375
        { goto nonzero;
376
        }
377
        i += 1;
378
      } while (i <= u);
379
380
      break;
381
    }
382
#   endif
383
384
    /* Propagate possible carry from this chunk to next chunk up. */
385
386
0
  nonzero:
387
0
    chigh = c >> XSUM_LOW_MANTISSA_BITS;
388
0
    if (chigh == 0)
389
0
    { uix = i;
390
0
      i += 1;
391
0
      continue;  /* no need to change this chunk */
392
0
    }
393
394
0
    if (u == i)
395
0
    { if (chigh == -1)
396
0
      { uix = i;
397
0
        break;   /* don't propagate -1 into the region of all zeros above */
398
0
      }
399
0
      u = i+1;   /* we will change chunk[u+1], so we'll need to look at it */
400
0
    }
401
402
0
    clow = c & XSUM_LOW_MANTISSA_MASK;
403
0
    if (clow != 0)
404
0
    { uix = i;
405
0
    }
406
407
    /* We now change chunk[i] and add to chunk[i+1]. Note that i+1 should be
408
       in range (no bigger than XSUM_CHUNKS-1) if summing memory, since
409
       the number of chunks is big enough to hold any sum, and we do not
410
       store redundant chunks with values 0 or -1 above previously non-zero
411
       chunks.  But other add operations might cause overflow, in which
412
       case we produce a NaN with all 1s as payload.  (We can't reliably produce
413
       an Inf of the right sign.) */
414
415
0
    sacc->chunk[i] = clow;
416
0
    if (i+1 >= XSUM_SCHUNKS)
417
0
    { xsum_small_add_inf_nan (sacc,
418
0
        ((xsum_int)XSUM_EXP_MASK << XSUM_MANTISSA_BITS) | XSUM_MANTISSA_MASK);
419
0
      u = i;
420
0
    }
421
0
    else
422
0
    { sacc->chunk[i+1] += chigh;  /* note: this could make this chunk be zero */
423
0
    }
424
425
0
    i += 1;
426
427
0
  } while (i <= u);
428
429
0
  if (xsum_debug) c_printf ("  uix: %d  new u: %d\n", uix,u);
430
431
  /* Check again for the number being zero, since carry propagation might
432
     have created zero from something that initially looked non-zero. */
433
434
0
  if (uix < 0)
435
0
  { if (xsum_debug) c_printf ("number is zero (2)\n");
436
0
    uix = 0;
437
0
    goto done;
438
0
  }
439
440
  /* While the uppermost chunk is negative, with value -1, combine it with
441
     the chunk below (if there is one) to produce the same number but with
442
     one fewer non-zero chunks. */
443
444
0
  while (sacc->chunk[uix] == -1 && uix > 0)
445
0
  { /* Left shift of a negative number is undefined according to the standard,
446
       so do a multiply - it's all presumably constant-folded by the compiler.*/
447
0
    sacc->chunk[uix-1] += ((xsum_schunk) -1)
448
0
                             * (((xsum_schunk) 1) << XSUM_LOW_MANTISSA_BITS);
449
0
    sacc->chunk[uix] = 0;
450
0
    uix -= 1;
451
0
  }
452
453
  /* We can now add one less than the total allowed terms before the
454
     next carry propagate. */
455
456
0
done:
457
0
  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS-1;
458
459
  /* Return index of uppermost non-zero chunk. */
460
461
0
  return uix;
462
0
}
463
464
465
/* INITIALIZE LARGE ACCUMULATOR CHUNKS.  Sets all counts to -1. */
466
467
static void xsum_large_init_chunks (xsum_large_accumulator *restrict lacc)
468
0
{
469
0
# if USE_MEMSET_LARGE
470
0
  {
471
    /* Since in two's complement representation, -1 consists of all 1 bits,
472
       we can initialize 16-bit values to -1 by initializing their component
473
       bytes to 0xff. */
474
475
0
    memset (lacc->count, 0xff, XSUM_LCHUNKS * sizeof *lacc->count);
476
0
  }
477
# else
478
  { xsum_lcount *p;
479
    int n;
480
    p = lacc->count;
481
    n = XSUM_LCHUNKS;
482
    do { *p++ = -1; n -= 1; } while (n > 0);
483
  }
484
# endif
485
486
0
# if USE_USED_LARGE
487
0
#   if USE_MEMSET_SMALL
488
0
    { memset(lacc->chunks_used, 0, XSUM_LCHUNKS/64 * sizeof *lacc->chunks_used);
489
0
    }
490
#   elif USE_SIMD && __AVX__ && XSUM_LCHUNKS/64==64
491
    { xsum_used *ch = lacc->chunks_used;
492
      __m256i z = _mm256_setzero_si256();
493
      _mm256_storeu_si256 ((__m256i *)(ch+0), z);
494
      _mm256_storeu_si256 ((__m256i *)(ch+4), z);
495
      _mm256_storeu_si256 ((__m256i *)(ch+8), z);
496
      _mm256_storeu_si256 ((__m256i *)(ch+12), z);
497
      _mm256_storeu_si256 ((__m256i *)(ch+16), z);
498
      _mm256_storeu_si256 ((__m256i *)(ch+20), z);
499
      _mm256_storeu_si256 ((__m256i *)(ch+24), z);
500
      _mm256_storeu_si256 ((__m256i *)(ch+28), z);
501
      _mm256_storeu_si256 ((__m256i *)(ch+32), z);
502
      _mm256_storeu_si256 ((__m256i *)(ch+36), z);
503
      _mm256_storeu_si256 ((__m256i *)(ch+40), z);
504
      _mm256_storeu_si256 ((__m256i *)(ch+44), z);
505
      _mm256_storeu_si256 ((__m256i *)(ch+48), z);
506
      _mm256_storeu_si256 ((__m256i *)(ch+52), z);
507
      _mm256_storeu_si256 ((__m256i *)(ch+56), z);
508
      _mm256_storeu_si256 ((__m256i *)(ch+60), z);
509
    }
510
#   else
511
    { xsum_lchunk *p;
512
      int n;
513
      p = lacc->chunks_used;
514
      n = XSUM_LCHUNKS/64;
515
      do { *p++ = 0; n -= 1; } while (n > 0);
516
    }
517
#   endif
518
0
    lacc->used_used = 0;
519
0
# endif
520
0
}
521
522
523
/* ADD CHUNK FROM A LARGE ACCUMULATOR TO THE SMALL ACCUMULATOR WITHIN IT.
524
   The large accumulator chunk to add is indexed by ix.  This chunk will
525
   be cleared to zero and its count reset after it has been added to the
526
   small accumulator (except no add is done for a new chunk being initialized).
527
   This procedure should not be called for the special chunks corresponding to
528
   Inf or NaN, whose counts should always remain at -1. */
529
530
#if INLINE_LARGE
531
  INLINE
532
#endif
533
static void xsum_add_lchunk_to_small (xsum_large_accumulator *restrict lacc,
534
                                      xsum_expint ix)
535
0
{
536
0
  xsum_expint exp, low_exp, high_exp;
537
0
  xsum_uint low_chunk, mid_chunk, high_chunk;
538
0
  xsum_lchunk chunk;
539
540
0
  const xsum_expint count = lacc->count[ix];
541
542
  /* Add to the small accumulator only if the count is not -1, which
543
     indicates a chunk that contains nothing yet. */
544
545
0
  if (count >= 0)
546
0
  {
547
    /* Propagate carries in the small accumulator if necessary. */
548
549
0
    if (lacc->sacc.adds_until_propagate == 0)
550
0
    { (void) xsum_carry_propagate(&lacc->sacc);
551
0
    }
552
553
    /* Get the chunk we will add.  Note that this chunk is the integer sum
554
       of entire 64-bit floating-point representations, with sign, exponent,
555
       and mantissa, but we want only the sum of the mantissas. */
556
557
0
    chunk = lacc->chunk[ix];
558
559
0
    if (xsum_debug)
560
0
    { c_printf(
561
0
        "\nADDING CHUNK %d TO SMALL ACCUMULATOR (COUNT %d, CHUNK %016llx)\n",
562
0
        (int) ix, (int) count, (long long) chunk);
563
0
    }
564
565
    /* If we added the maximum number of values to 'chunk', the sum of
566
       the sign and exponent parts (all the same, equal to the index) will
567
       have overflowed out the top, leaving only the sum of the mantissas.
568
       If the count of how many more terms we could have summed is greater
569
       than zero, we therefore add this count times the index (shifted to
570
       the position of the sign and exponent) to get the unwanted bits to
571
       overflow out the top. */
572
573
0
    if (count > 0)
574
0
    { chunk += (xsum_lchunk)(count*ix) << XSUM_MANTISSA_BITS;
575
0
    }
576
577
    /* Find the exponent for this chunk from the low bits of the index,
578
       and split it into low and high parts, for accessing the small
579
       accumulator.  Noting that for denormalized numbers where the
580
       exponent part is zero, the actual exponent is 1 (before subtracting
581
       the bias), not zero. */
582
583
0
    exp = ix & XSUM_EXP_MASK;
584
0
    if (exp == 0)
585
0
    { low_exp = 1;
586
0
      high_exp = 0;
587
0
    }
588
0
    else
589
0
    { low_exp = exp & XSUM_LOW_EXP_MASK;
590
0
      high_exp = exp >> XSUM_LOW_EXP_BITS;
591
0
    }
592
593
    /* Split the mantissa into three parts, for three consecutive chunks in
594
       the small accumulator.  Except for denormalized numbers, add in the sum
595
       of all the implicit 1 bits that are above the actual mantissa bits. */
596
597
0
    low_chunk = (chunk << low_exp) & XSUM_LOW_MANTISSA_MASK;
598
0
    mid_chunk = chunk >> (XSUM_LOW_MANTISSA_BITS - low_exp);
599
0
    if (exp != 0) /* normalized */
600
0
    { mid_chunk += (xsum_lchunk)((1 << XSUM_LCOUNT_BITS) - count)
601
0
         << (XSUM_MANTISSA_BITS - XSUM_LOW_MANTISSA_BITS + low_exp);
602
0
    }
603
0
    high_chunk = mid_chunk >> XSUM_LOW_MANTISSA_BITS;
604
0
    mid_chunk &= XSUM_LOW_MANTISSA_MASK;
605
606
0
    if (xsum_debug)
607
0
    { c_printf("chunk div: low "); pbinary_int64(low_chunk,64); c_printf("\n");
608
0
      c_printf("           mid "); pbinary_int64(mid_chunk,64); c_printf("\n");
609
0
      c_printf("          high "); pbinary_int64(high_chunk,64); c_printf("\n");
610
0
    }
611
612
    /* Add or subtract the three parts of the mantissa from three small
613
       accumulator chunks, according to the sign that is part of the index. */
614
615
0
    if (xsum_debug)
616
0
    { c_printf("Small chunks %d, %d, %d before add or subtract:\n",
617
0
              (int)high_exp, (int)high_exp+1, (int)high_exp+2);
618
0
      pbinary_int64 (lacc->sacc.chunk[high_exp], 64); c_printf("\n");
619
0
      pbinary_int64 (lacc->sacc.chunk[high_exp+1], 64); c_printf("\n");
620
0
      pbinary_int64 (lacc->sacc.chunk[high_exp+2], 64); c_printf("\n");
621
0
    }
622
623
0
    if (ix & (1 << XSUM_EXP_BITS))
624
0
    { lacc->sacc.chunk[high_exp] -= low_chunk;
625
0
      lacc->sacc.chunk[high_exp+1] -= mid_chunk;
626
0
      lacc->sacc.chunk[high_exp+2] -= high_chunk;
627
0
    }
628
0
    else
629
0
    { lacc->sacc.chunk[high_exp] += low_chunk;
630
0
      lacc->sacc.chunk[high_exp+1] += mid_chunk;
631
0
      lacc->sacc.chunk[high_exp+2] += high_chunk;
632
0
    }
633
634
0
    if (xsum_debug)
635
0
    { c_printf("Small chunks %d, %d, %d after add or subtract:\n",
636
0
              (int)high_exp, (int)high_exp+1, (int)high_exp+2);
637
0
      pbinary_int64 (lacc->sacc.chunk[high_exp], 64); c_printf("\n");
638
0
      pbinary_int64 (lacc->sacc.chunk[high_exp+1], 64); c_printf("\n");
639
0
      pbinary_int64 (lacc->sacc.chunk[high_exp+2], 64); c_printf("\n");
640
0
    }
641
642
    /* The above additions/subtractions reduce by one the number we can
643
       do before we need to do carry propagation again. */
644
645
0
    lacc->sacc.adds_until_propagate -= 1;
646
0
  }
647
648
  /* We now clear the chunk to zero, and set the count to the number
649
     of adds we can do before the mantissa would overflow.  We also
650
     set the bit in chunks_used to indicate that this chunk is in use
651
     (if that is enabled). */
652
653
0
  lacc->chunk[ix] = 0;
654
0
  lacc->count[ix] = 1 << XSUM_LCOUNT_BITS;
655
656
0
# if USE_USED_LARGE
657
0
    lacc->chunks_used[ix>>6] |= (xsum_used)1 << (ix & 0x3f);
658
0
    lacc->used_used |= (xsum_used)1 << (ix>>6);
659
0
# endif
660
0
}
661
662
663
/* ADD A CHUNK TO THE LARGE ACCUMULATOR OR PROCESS NAN OR INF.  This routine
664
   is called when the count for a chunk is negative after decrementing, which
665
   indicates either inf/nan, or that the chunk has not been initialized, or
666
   that the chunk needs to be transferred to the small accumulator. */
667
668
#if INLINE_LARGE
669
  INLINE
670
#endif
671
static void xsum_large_add_value_inf_nan (xsum_large_accumulator *restrict lacc,
672
                                          xsum_expint ix, xsum_lchunk uintv)
673
0
{
674
0
  if ((ix & XSUM_EXP_MASK) == XSUM_EXP_MASK)
675
0
  { xsum_small_add_inf_nan (&lacc->sacc, uintv);
676
0
  }
677
0
  else
678
0
  { xsum_add_lchunk_to_small (lacc, ix);
679
0
    lacc->count[ix] -= 1;
680
0
    lacc->chunk[ix] += uintv;
681
0
  }
682
0
}
683
684
685
/* TRANSFER ALL CHUNKS IN LARGE ACCUMULATOR TO ITS SMALL ACCUMULATOR. */
686
687
static void xsum_large_transfer_to_small (xsum_large_accumulator *restrict lacc)
688
0
{
689
0
  if (xsum_debug) c_printf("\nTRANSFERRING CHUNKS IN LARGE ACCUMULATOR\n");
690
691
0
# if USE_USED_LARGE
692
0
  {
693
0
    xsum_used *p, *e;
694
0
    xsum_used u, uu;
695
0
    int ix;
696
697
0
    p = lacc->chunks_used;
698
0
    e = p + XSUM_LCHUNKS/64;
699
700
    /* Very quickly skip some unused low-order blocks of chunks by looking
701
       at the used_used flags. */
702
703
0
    uu = lacc->used_used;
704
0
    if ((uu & 0xffffffff) == 0)
705
0
    { uu >>= 32;
706
0
      p += 32;
707
0
    }
708
0
    if ((uu & 0xffff) == 0)
709
0
    { uu >>= 16;
710
0
      p += 16;
711
0
    }
712
0
    if ((uu & 0xff) == 0)
713
0
    { p += 8;
714
0
    }
715
716
    /* Loop over remaining blocks of chunks. */
717
718
0
    do
719
0
    {
720
      /* Loop to quickly find the next non-zero block of used flags, or finish
721
         up if we've added all the used blocks to the small accumulator. */
722
723
0
      for (;;)
724
0
      { u = *p;
725
0
        if (u != 0)
726
0
        { break;
727
0
        }
728
0
        p += 1;
729
0
        if (p == e)
730
0
        { return;
731
0
        }
732
0
        u = *p;
733
0
        if (u != 0)
734
0
        { break;
735
0
        }
736
0
        p += 1;
737
0
        if (p == e)
738
0
        { return;
739
0
        }
740
0
        u = *p;
741
0
        if (u != 0)
742
0
        { break;
743
0
        }
744
0
        p += 1;
745
0
        if (p == e)
746
0
        { return;
747
0
        }
748
0
        u = *p;
749
0
        if (u != 0)
750
0
        { break;
751
0
        }
752
0
        p += 1;
753
0
        if (p == e)
754
0
        { return;
755
0
        }
756
0
      }
757
758
      /* Find and process the chunks in this block that are used.  We skip
759
         forward based on the chunks_used flags until we're within eight
760
         bits of a chunk that is in use. */
761
762
0
      ix = (p - lacc->chunks_used) << 6;
763
0
      if ((u & 0xffffffff) == 0)
764
0
      { u >>= 32;
765
0
        ix += 32;
766
0
      }
767
0
      if ((u & 0xffff) == 0)
768
0
      { u >>= 16;
769
0
        ix += 16;
770
0
      }
771
0
      if ((u & 0xff) == 0)
772
0
      { u >>= 8;
773
0
        ix += 8;
774
0
      }
775
776
0
      do
777
0
      { if (lacc->count[ix] >= 0)
778
0
        { xsum_add_lchunk_to_small (lacc, ix);
779
0
        }
780
0
        ix += 1;
781
0
        u >>= 1;
782
0
      } while (u != 0);
783
784
0
      p += 1;
785
786
0
    } while (p != e);
787
0
  }
788
# else
789
  { xsum_expint ix;
790
791
    /* When there are no used flags, we scan sequentially for chunks that
792
       need to be added to the small accumulator. */
793
794
    for (ix = 0; ix < XSUM_LCHUNKS; ix++)
795
    { if (lacc->count[ix] >= 0)
796
      { xsum_add_lchunk_to_small (lacc, ix);
797
      }
798
    }
799
  }
800
# endif
801
0
}
802
803
804
/* ------------------------ EXTERNAL ROUTINES ------------------------------- */
805
806
807
/* INITIALIZE A SMALL ACCUMULATOR TO ZERO. */
808
809
void xsum_small_init (xsum_small_accumulator *restrict sacc)
810
0
{
811
0
  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS;
812
0
  sacc->Inf = sacc->NaN = 0;
813
0
# if USE_MEMSET_SMALL
814
0
  { memset (sacc->chunk, 0, XSUM_SCHUNKS * sizeof(xsum_schunk));
815
0
  }
816
# elif USE_SIMD && __AVX__ && XSUM_SCHUNKS==67
817
  { xsum_schunk *ch = sacc->chunk;
818
    __m256i z = _mm256_setzero_si256();
819
    _mm256_storeu_si256 ((__m256i *)(ch+0), z);
820
    _mm256_storeu_si256 ((__m256i *)(ch+4), z);
821
    _mm256_storeu_si256 ((__m256i *)(ch+8), z);
822
    _mm256_storeu_si256 ((__m256i *)(ch+12), z);
823
    _mm256_storeu_si256 ((__m256i *)(ch+16), z);
824
    _mm256_storeu_si256 ((__m256i *)(ch+20), z);
825
    _mm256_storeu_si256 ((__m256i *)(ch+24), z);
826
    _mm256_storeu_si256 ((__m256i *)(ch+28), z);
827
    _mm256_storeu_si256 ((__m256i *)(ch+32), z);
828
    _mm256_storeu_si256 ((__m256i *)(ch+36), z);
829
    _mm256_storeu_si256 ((__m256i *)(ch+40), z);
830
    _mm256_storeu_si256 ((__m256i *)(ch+44), z);
831
    _mm256_storeu_si256 ((__m256i *)(ch+48), z);
832
    _mm256_storeu_si256 ((__m256i *)(ch+52), z);
833
    _mm256_storeu_si256 ((__m256i *)(ch+56), z);
834
    _mm256_storeu_si256 ((__m256i *)(ch+60), z);
835
    _mm_storeu_si128    ((__m128i *)(ch+64), _mm256_castsi256_si128(z));
836
    _mm_storeu_si64     (ch+66, _mm256_castsi256_si128(z));
837
  }
838
# else
839
  { xsum_schunk *p;
840
    int n;
841
    p = sacc->chunk;
842
    n = XSUM_SCHUNKS;
843
    do { *p++ = 0; n -= 1; } while (n > 0);
844
  }
845
# endif
846
0
}
847
848
849
/* ADD ONE NUMBER TO A SMALL ACCUMULATOR ASSUMING NO CARRY PROPAGATION REQ'D.
   The caller is responsible for ensuring that the carry budget
   (adds_until_propagate) has not been exhausted before calling this.
   This function is declared INLINE regardless of the setting of INLINE_SMALL
   and for good performance it must be inlined by the compiler (otherwise the
   procedure call overhead will result in substantial inefficiency). */

static INLINE void xsum_add1_no_carry (xsum_small_accumulator *restrict sacc,
                                       xsum_flt value)
{
  xsum_int ivalue;                     /* bits of 'value' as 64-bit integer */
  xsum_int mantissa;
  xsum_expint exp, low_exp, high_exp;
  xsum_schunk *chunk_ptr;

  if (xsum_debug)
  { c_printf ("ADD1 %+.17le\n     ", (double) value);
    pbinary_double ((double) value);
    c_printf("\n");
  }

  /* Extract exponent and mantissa.  Split exponent into high and low parts. */

  COPY64 (ivalue, value);

  exp = (ivalue >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK;
  mantissa = ivalue & XSUM_MANTISSA_MASK;
  high_exp = exp >> XSUM_LOW_EXP_BITS;
  low_exp = exp & XSUM_LOW_EXP_MASK;

  if (xsum_debug)
  { c_printf("  high exp: ");
    pbinary_int64 (high_exp, XSUM_HIGH_EXP_BITS);
    c_printf("  low exp: ");
    pbinary_int64 (low_exp, XSUM_LOW_EXP_BITS);
    c_printf("\n");
  }

  /* Categorize number as normal, denormalized, or Inf/NaN according to
     the value of the exponent field. */

  if (exp == 0) /* zero or denormalized */
  { /* If it's a zero (positive or negative), we do nothing. */
    if (mantissa == 0)
    { return;
    }
    /* Denormalized mantissa has no implicit 1, but exponent is 1 not 0. */
    exp = low_exp = 1;
  }
  else if (exp == XSUM_EXP_MASK)  /* Inf or NaN */
  { /* Just update flags in accumulator structure. */
    xsum_small_add_inf_nan (sacc, ivalue);
    return;
  }
  else /* normalized */
  { /* OR in implicit 1 bit at top of mantissa */
    mantissa |= (xsum_int)1 << XSUM_MANTISSA_BITS;
  }

  if (xsum_debug)
  { c_printf("  mantissa: ");
    pbinary_int64 (mantissa, XSUM_MANTISSA_BITS+1);
    c_printf("\n");
  }

  /* Use high part of exponent as index of chunk, and low part of
     exponent to give position within chunk.  Fetch the two chunks
     that will be modified. */

  chunk_ptr = sacc->chunk + high_exp;

  /* Separate mantissa into two parts, after shifting, and add to (or
     subtract from) this chunk and the next higher chunk (which always
     exists since there are three extra ones at the top).

     Note that low_mantissa will have at most XSUM_LOW_MANTISSA_BITS bits,
     while high_mantissa will have at most XSUM_MANTISSA_BITS bits, since
     even though the high mantissa includes the extra implicit 1 bit, it will
     also be shifted right by at least one bit. */

  xsum_int split_mantissa[2];
  split_mantissa[0] = ((xsum_uint)mantissa << low_exp) & XSUM_LOW_MANTISSA_MASK;
  split_mantissa[1] = mantissa >> (XSUM_LOW_MANTISSA_BITS - low_exp);

  /* Add to, or subtract from, the two affected chunks. */

# if OPT_SMALL==1
  /* Branch-free variant: multiply both halves by the sign of the value. */
  { xsum_int ivalue_sign = ivalue<0 ? -1 : 1;
    chunk_ptr[0] += ivalue_sign * split_mantissa[0];
    chunk_ptr[1] += ivalue_sign * split_mantissa[1];
  }
# elif OPT_SMALL==2
  /* Branch-free variant: two's-complement negation done as XOR with the
     all-ones sign mask plus 1 when the value is negative. */
  { xsum_int ivalue_neg
              = ivalue>>(XSUM_SCHUNK_BITS-1); /* all 0s if +ve, all 1s if -ve */
    chunk_ptr[0] += (split_mantissa[0] ^ ivalue_neg) + (ivalue_neg & 1);
    chunk_ptr[1] += (split_mantissa[1] ^ ivalue_neg) + (ivalue_neg & 1);
  }
# elif OPT_SMALL==3 && USE_SIMD && __SSE2__
  /* SSE2 variant: same XOR-plus-one trick as OPT_SMALL==2, but applied to
     both chunks at once with 128-bit unaligned loads/stores. */
  { xsum_int ivalue_neg
              = ivalue>>(XSUM_SCHUNK_BITS-1); /* all 0s if +ve, all 1s if -ve */
    _mm_storeu_si128 ((__m128i *)chunk_ptr,
                      _mm_add_epi64 (_mm_loadu_si128 ((__m128i *)chunk_ptr),
                       _mm_add_epi64 (_mm_set1_epi64((__m64)(ivalue_neg&1)),
                        _mm_xor_si128 (_mm_set1_epi64((__m64)ivalue_neg),
                         _mm_loadu_si128 ((__m128i *)split_mantissa)))));
  }
# else
  /* Plain variant: branch on the sign of the original value. */
  { if (ivalue < 0)
    { chunk_ptr[0] -= split_mantissa[0];
      chunk_ptr[1] -= split_mantissa[1];
    }
    else
    { chunk_ptr[0] += split_mantissa[0];
      chunk_ptr[1] += split_mantissa[1];
    }
  }
# endif

  if (xsum_debug)
  { if (ivalue < 0)
    { c_printf (" -high man: ");
      pbinary_int64 (-split_mantissa[1], XSUM_MANTISSA_BITS);
      c_printf ("\n  -low man: ");
      pbinary_int64 (-split_mantissa[0], XSUM_LOW_MANTISSA_BITS);
      c_printf("\n");
    }
    else
    { c_printf ("  high man: ");
      pbinary_int64 (split_mantissa[1], XSUM_MANTISSA_BITS);
      c_printf ("\n   low man: ");
      pbinary_int64 (split_mantissa[0], XSUM_LOW_MANTISSA_BITS);
      c_printf("\n");
    }
  }
}
982
983
984
/* ADD ONE DOUBLE TO A SMALL ACCUMULATOR.  This is equivalent to, but
985
   somewhat faster than, calling xsum_small_addv with a vector of one
986
   value. */
987
988
void xsum_small_add1 (xsum_small_accumulator *restrict sacc, xsum_flt value)
989
0
{
990
0
  if (sacc->adds_until_propagate == 0)
991
0
  { (void) xsum_carry_propagate(sacc);
992
0
  }
993
994
0
  xsum_add1_no_carry (sacc, value);
995
996
0
  sacc->adds_until_propagate -= 1;
997
0
}
998
999
1000
/* ADD A VECTOR OF FLOATING-POINT NUMBERS TO A SMALL ACCUMULATOR.  Mixes
1001
   calls of xsum_carry_propagate with calls of xsum_add1_no_carry. */
1002
1003
void xsum_small_addv (xsum_small_accumulator *restrict sacc,
1004
                      const xsum_flt *restrict vec,
1005
                      xsum_length n)
1006
0
{ xsum_length m, i;
1007
1008
0
  while (n > 0)
1009
0
  { if (sacc->adds_until_propagate == 0)
1010
0
    { (void) xsum_carry_propagate(sacc);
1011
0
    }
1012
0
    m = n <= sacc->adds_until_propagate ? n : sacc->adds_until_propagate;
1013
0
    for (i = 0; i < m; i++)
1014
0
    { xsum_add1_no_carry (sacc, vec[i]);
1015
0
    }
1016
0
    sacc->adds_until_propagate -= m;
1017
0
    vec += m;
1018
0
    n -= m;
1019
0
  }
1020
0
}
1021
1022
1023
/* ADD SQUARED NORM OF VECTOR OF FLOATING-POINT NUMBERS TO SMALL ACCUMULATOR.
1024
   Mixes calls of xsum_carry_propagate with calls of xsum_add1_no_carry. */
1025
1026
void xsum_small_add_sqnorm (xsum_small_accumulator *restrict sacc,
1027
                            const xsum_flt *restrict vec,
1028
                            xsum_length n)
1029
0
{ xsum_length m, i;
1030
1031
0
  while (n > 0)
1032
0
  { if (sacc->adds_until_propagate == 0)
1033
0
    { (void) xsum_carry_propagate(sacc);
1034
0
    }
1035
0
    m = n <= sacc->adds_until_propagate ? n : sacc->adds_until_propagate;
1036
0
    for (i = 0; i < m; i++)
1037
0
    { xsum_add1_no_carry (sacc, vec[i] * vec[i]);
1038
0
    }
1039
0
    sacc->adds_until_propagate -= m;
1040
0
    vec += m;
1041
0
    n -= m;
1042
0
  }
1043
0
}
1044
1045
1046
/* ADD DOT PRODUCT OF VECTORS OF FLOATING-POINT NUMBERS TO SMALL ACCUMULATOR.
1047
   Mixes calls of xsum_carry_propagate with calls of xsum_add1_no_carry. */
1048
1049
void xsum_small_add_dot (xsum_small_accumulator *restrict sacc,
1050
                         const xsum_flt *vec1, const xsum_flt *vec2,
1051
                         xsum_length n)
1052
0
{ xsum_length m, i;
1053
1054
0
  while (n > 0)
1055
0
  { if (sacc->adds_until_propagate == 0)
1056
0
    { (void) xsum_carry_propagate(sacc);
1057
0
    }
1058
0
    m = n <= sacc->adds_until_propagate ? n : sacc->adds_until_propagate;
1059
0
    for (i = 0; i < m; i++)
1060
0
    { xsum_add1_no_carry (sacc, vec1[i] * vec2[i]);
1061
0
    }
1062
0
    sacc->adds_until_propagate -= m;
1063
0
    vec1 += m;
1064
0
    vec2 += m;
1065
0
    n -= m;
1066
0
  }
1067
0
}
1068
1069
1070
/* ADD A SMALL ACCUMULATOR TO ANOTHER SMALL ACCUMULATOR.  The first argument
1071
   is the destination, which is modified.  The second is the accumulator to
1072
   add, which may also be modified, but should still represent the same
1073
   number.  Source and destination may be the same. */
1074
1075
void xsum_small_add_accumulator (xsum_small_accumulator *dst_sacc,
1076
                                 xsum_small_accumulator *src_sacc)
1077
0
{
1078
0
  int i;
1079
1080
0
  if (xsum_debug) c_printf("\nADDING ACCUMULATOR TO A SMALL ACCUMULATOR\n");
1081
1082
0
  xsum_carry_propagate (dst_sacc);
1083
1084
0
  if (dst_sacc == src_sacc)
1085
0
  { for (i = 0; i < XSUM_SCHUNKS; i++)
1086
0
    { dst_sacc->chunk[i] += dst_sacc->chunk[i];
1087
0
    }
1088
0
  }
1089
0
  else
1090
0
  {
1091
0
    xsum_carry_propagate (src_sacc);
1092
1093
0
    if (src_sacc->Inf) xsum_small_add_inf_nan (dst_sacc, src_sacc->Inf);
1094
0
    if (src_sacc->NaN) xsum_small_add_inf_nan (dst_sacc, src_sacc->NaN);
1095
1096
0
    for (i = 0; i < XSUM_SCHUNKS; i++)
1097
0
    { dst_sacc->chunk[i] += src_sacc->chunk[i];
1098
0
    }
1099
0
  }
1100
1101
0
  dst_sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS-2;
1102
0
}
1103
1104
1105
/* NEGATE THE VALUE IN A SMALL ACCUMULATOR. */
1106
1107
void xsum_small_negate (xsum_small_accumulator *restrict sacc)
1108
0
{
1109
0
  int i;
1110
1111
0
  if (xsum_debug) c_printf("\nNEGATING A SMALL ACCUMULATOR\n");
1112
1113
0
  for (i = 0; i < XSUM_SCHUNKS; i++)
1114
0
  { sacc->chunk[i] = -sacc->chunk[i];
1115
0
  }
1116
1117
0
  if (sacc->Inf != 0)
1118
0
  { sacc->Inf ^= XSUM_SIGN_MASK;
1119
0
  }
1120
0
}
1121
1122
1123
/* RETURN THE RESULT OF ROUNDING A SMALL ACCUMULATOR.  The rounding mode
1124
   is to nearest, with ties to even.  The small accumulator may be modified
1125
   by this operation (by carry propagation being done), but the value it
1126
   represents should not change. */
1127
1128
xsum_flt xsum_small_round (xsum_small_accumulator *restrict sacc)
1129
0
{
1130
0
  xsum_int ivalue;
1131
0
  xsum_schunk lower;
1132
0
  int i, j, e, more;
1133
0
  xsum_int intv;
1134
0
  double fltv;
1135
1136
0
  if (xsum_debug) c_printf("\nROUNDING SMALL ACCUMULATOR\n");
1137
1138
  /* See if we have a NaN from one of the numbers being a NaN, in
1139
     which case we return the NaN with largest payload, or an infinite
1140
     result (+Inf, -Inf, or a NaN if both +Inf and -Inf occurred).
1141
     Note that we do NOT return NaN if we have both an infinite number
1142
     and a sum of other numbers that overflows with opposite sign,
1143
     since there is no real ambiguity regarding the sign in such a case. */
1144
1145
0
  if (sacc->NaN != 0)
1146
0
  { COPY64(fltv, sacc->NaN);
1147
0
    return fltv;
1148
0
  }
1149
1150
0
  if (sacc->Inf != 0)
1151
0
  { COPY64 (fltv, sacc->Inf);
1152
0
    return fltv;
1153
0
  }
1154
1155
  /* If none of the numbers summed were infinite or NaN, we proceed to
1156
     propagate carries, as a preliminary to finding the magnitude of
1157
     the sum.  This also ensures that the sign of the result can be
1158
     determined from the uppermost non-zero chunk.
1159
1160
     We also find the index, i, of this uppermost non-zero chunk, as
1161
     the value returned by xsum_carry_propagate, and set ivalue to
1162
     sacc->chunk[i].  Note that ivalue will not be 0 or -1, unless
1163
     i is 0 (the lowest chunk), in which case it will be handled by
1164
     the code for denormalized numbers. */
1165
1166
0
  i = xsum_carry_propagate(sacc);
1167
1168
0
  if (xsum_debug) xsum_small_display(sacc);
1169
1170
0
  ivalue = sacc->chunk[i];
1171
1172
  /* Handle a possible denormalized number, including zero. */
1173
1174
0
  if (i <= 1)
1175
0
  {
1176
    /* Check for zero value, in which case we can return immediately. */
1177
1178
0
    if (ivalue == 0)
1179
0
    { return 0.0;
1180
0
    }
1181
1182
    /* Check if it is actually a denormalized number.  It always is if only
1183
       the lowest chunk is non-zero.  If the highest non-zero chunk is the
1184
       next-to-lowest, we check the magnitude of the absolute value.
1185
       Note that the real exponent is 1 (not 0), so we need to shift right
1186
       by 1 here. */
1187
1188
0
    if (i == 0)
1189
0
    { intv = ivalue >= 0 ? ivalue : -ivalue;
1190
0
      intv >>= 1;
1191
0
      if (ivalue < 0)
1192
0
      { intv |= XSUM_SIGN_MASK;
1193
0
      }
1194
0
      if (xsum_debug)
1195
0
      { c_printf("denormalized with i==0: intv %016llx\n",
1196
0
                (long long)intv);
1197
0
      }
1198
0
      COPY64 (fltv, intv);
1199
0
      return fltv;
1200
0
    }
1201
0
    else
1202
0
    { /* Note: Left shift of -ve number is undefined, so do a multiply instead,
1203
               which is probably optimized to a shift. */
1204
0
      intv = ivalue * ((xsum_int)1 << (XSUM_LOW_MANTISSA_BITS-1))
1205
0
               + (sacc->chunk[0] >> 1);
1206
0
      if (intv < 0)
1207
0
      { if (intv > - ((xsum_int)1 << XSUM_MANTISSA_BITS))
1208
0
        { intv = (-intv) | XSUM_SIGN_MASK;
1209
0
          if (xsum_debug)
1210
0
          { c_printf("denormalized with i==1: intv %016llx\n",
1211
0
                    (long long)intv);
1212
0
          }
1213
0
          COPY64 (fltv, intv);
1214
0
          return fltv;
1215
0
        }
1216
0
      }
1217
0
      else /* non-negative */
1218
0
      { if ((xsum_uint)intv < (xsum_uint)1 << XSUM_MANTISSA_BITS)
1219
0
        { if (xsum_debug)
1220
0
          { c_printf("denormalized with i==1: intv %016llx\n",
1221
0
                    (long long)intv);
1222
0
          }
1223
0
          COPY64 (fltv, intv);
1224
0
          return fltv;
1225
0
        }
1226
0
      }
1227
      /* otherwise, it's not actually denormalized, so fall through to below */
1228
0
    }
1229
0
  }
1230
1231
  /* Find the location of the uppermost 1 bit in the absolute value of
1232
     the upper chunk by converting it (as a signed integer) to a
1233
     floating point value, and looking at the exponent.  Then set
1234
     'more' to the number of bits from the lower chunk (and maybe the
1235
     next lower) that are needed to fill out the mantissa of the
1236
     result (including the top implicit 1 bit), plus two extra bits to
1237
     help decide on rounding.  For negative numbers, it may turn out
1238
     later that we need another bit, because negating a negative value
1239
     may carry out of the top here, but not carry out of the top once
1240
     more bits are shifted into the bottom later on. */
1241
1242
0
  fltv = (xsum_flt) ivalue;  /* finds position of topmost 1 bit of |ivalue| */
1243
0
  COPY64 (intv, fltv);
1244
0
  e = (intv >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK; /* e-bias is in 0..32 */
1245
0
  more = 2 + XSUM_MANTISSA_BITS + XSUM_EXP_BIAS - e;
1246
1247
0
  if (xsum_debug)
1248
0
  { c_printf("e: %d, more: %d,             ivalue: %016llx\n",
1249
0
            e,more,(long long)ivalue);
1250
0
  }
1251
1252
  /* Change 'ivalue' to put in 'more' bits from lower chunks into the bottom.
1253
     Also set 'j' to the index of the lowest chunk from which these bits came,
1254
     and 'lower' to the remaining bits of that chunk not now in 'ivalue'.
1255
     Note that 'lower' initially has at least one bit in it, which we can
1256
     later move into 'ivalue' if it turns out that one more bit is needed. */
1257
1258
0
  ivalue *= (xsum_int)1 << more;  /* multiply, since << of negative undefined */
1259
0
  if (xsum_debug)
1260
0
  { c_printf("after ivalue <<= more,         ivalue: %016llx\n",
1261
0
            (long long)ivalue);
1262
0
  }
1263
0
  j = i-1;
1264
0
  lower = sacc->chunk[j];  /* must exist, since denormalized if i==0 */
1265
0
  if (more >= XSUM_LOW_MANTISSA_BITS)
1266
0
  { more -= XSUM_LOW_MANTISSA_BITS;
1267
0
    ivalue += lower << more;
1268
0
    if (xsum_debug)
1269
0
    { c_printf("after ivalue += lower << more, ivalue: %016llx\n",
1270
0
              (long long)ivalue);
1271
0
    }
1272
0
    j -= 1;
1273
0
    lower = j < 0 ? 0 : sacc->chunk[j];
1274
0
  }
1275
0
  ivalue += lower >> (XSUM_LOW_MANTISSA_BITS - more);
1276
0
  lower &= ((xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - more)) - 1;
1277
1278
0
  if (xsum_debug)
1279
0
  { c_printf("after final add to ivalue,     ivalue: %016llx\n",
1280
0
            (long long)ivalue);
1281
0
    c_printf("j: %d, e: %d, |ivalue|: %016llx, lower: %016llx (a)\n",
1282
0
           j, e, (long long) (ivalue<0 ? -ivalue : ivalue), (long long)lower);
1283
0
    c_printf("   mask of low 55 bits:   007fffffffffffff,  mask: %016llx\n",
1284
0
            (long long)((xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - more)) - 1);
1285
0
  }
1286
1287
  /* Decide on rounding, with separate code for positive and negative values.
1288
1289
     At this point, 'ivalue' has the signed mantissa bits, plus two extra
1290
     bits, with 'e' recording the exponent position for these within their
1291
     top chunk.  For positive 'ivalue', the bits in 'lower' and chunks
1292
     below 'j' add to the absolute value; for negative 'ivalue' they
1293
     subtract.
1294
1295
     After setting 'ivalue' to the tentative unsigned mantissa
1296
     (shifted left 2), and 'intv' to have the correct sign, this
1297
     code goes to done_rounding if it finds that just discarding lower
1298
     order bits is correct, and to round_away_from_zero if instead the
1299
     magnitude should be increased by one in the lowest mantissa bit. */
1300
1301
0
  if (ivalue >= 0)  /* number is positive, lower bits are added to magnitude */
1302
0
  {
1303
0
    intv = 0;  /* positive sign */
1304
1305
0
    if ((ivalue & 2) == 0)  /* extra bits are 0x */
1306
0
    { if (xsum_debug)
1307
0
      { c_printf("+, no adjustment, since remainder adds <1/2\n");
1308
0
      }
1309
0
      goto done_rounding;
1310
0
    }
1311
1312
0
    if ((ivalue & 1) != 0)  /* extra bits are 11 */
1313
0
    { if (xsum_debug)
1314
0
      { c_printf("+, round away from 0, since remainder adds >1/2\n");
1315
0
      }
1316
0
      goto round_away_from_zero;
1317
0
    }
1318
1319
0
    if ((ivalue & 4) != 0)  /* low bit is 1 (odd), extra bits are 10 */
1320
0
    { if (xsum_debug)
1321
0
      { c_printf("+odd, round away from 0, since remainder adds >=1/2\n");
1322
0
      }
1323
0
      goto round_away_from_zero;
1324
0
    }
1325
1326
0
    if (lower == 0)  /* see if any lower bits are non-zero */
1327
0
    { while (j > 0)
1328
0
      { j -= 1;
1329
0
        if (sacc->chunk[j] != 0)
1330
0
        { lower = 1;
1331
0
          break;
1332
0
        }
1333
0
      }
1334
0
    }
1335
1336
0
    if (lower != 0)  /* low bit 0 (even), extra bits 10, non-zero lower bits */
1337
0
    { if (xsum_debug)
1338
0
      { c_printf("+even, round away from 0, since remainder adds >1/2\n");
1339
0
      }
1340
0
      goto round_away_from_zero;
1341
0
    }
1342
0
    else  /* low bit 0 (even), extra bits 10, all lower bits 0 */
1343
0
    { if (xsum_debug)
1344
0
      { c_printf("+even, no adjustment, since reaminder adds exactly 1/2\n");
1345
0
      }
1346
0
      goto done_rounding;
1347
0
    }
1348
0
  }
1349
1350
0
  else  /* number is negative, lower bits are subtracted from magnitude */
1351
0
  {
1352
    /* Check for a negative 'ivalue' that when negated doesn't contain a full
1353
       mantissa's worth of bits, plus one to help rounding.  If so, move one
1354
       more bit into 'ivalue' from 'lower' (and remove it from 'lower').
1355
       This happens when the negation of the upper part of 'ivalue' has the
1356
       form 10000... but the negation of the full 'ivalue' is not 10000... */
1357
1358
0
    if (((-ivalue) & ((xsum_int)1 << (XSUM_MANTISSA_BITS+2))) == 0)
1359
0
    { int pos = (xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - 1 - more);
1360
0
      ivalue *= 2;  /* note that left shift undefined if ivalue is negative */
1361
0
      if (lower & pos)
1362
0
      { ivalue += 1;
1363
0
        lower &= ~pos;
1364
0
      }
1365
0
      e -= 1;
1366
0
      if (xsum_debug)
1367
0
      { c_printf("j: %d, e: %d, |ivalue|: %016llx, lower: %016llx (b)\n",
1368
0
             j, e, (long long) (ivalue<0 ? -ivalue : ivalue), (long long)lower);
1369
0
      }
1370
0
    }
1371
1372
0
    intv = XSUM_SIGN_MASK;    /* negative sign */
1373
0
    ivalue = -ivalue;         /* ivalue now contains the absolute value */
1374
1375
0
    if ((ivalue & 3) == 3)  /* extra bits are 11 */
1376
0
    { if (xsum_debug)
1377
0
      { c_printf("-, round away from 0, since remainder adds >1/2\n");
1378
0
      }
1379
0
      goto round_away_from_zero;
1380
0
    }
1381
1382
0
    if ((ivalue & 3) <= 1)  /* extra bits are 00 or 01 */
1383
0
    { if (xsum_debug)
1384
0
      { c_printf(
1385
0
         "-, no adjustment, since remainder adds <=1/4 or subtracts <1/4\n");
1386
0
      }
1387
0
      goto done_rounding;
1388
0
    }
1389
1390
0
    if ((ivalue & 4) == 0)  /* low bit is 0 (even), extra bits are 10 */
1391
0
    { if (xsum_debug)
1392
0
      { c_printf("-even, no adjustment, since remainder adds <=1/2\n");
1393
0
      }
1394
0
      goto done_rounding;
1395
0
    }
1396
1397
0
    if (lower == 0)  /* see if any lower bits are non-zero */
1398
0
    { while (j > 0)
1399
0
      { j -= 1;
1400
0
        if (sacc->chunk[j] != 0)
1401
0
        { lower = 1;
1402
0
          break;
1403
0
        }
1404
0
      }
1405
0
    }
1406
1407
0
    if (lower != 0)  /* low bit 1 (odd), extra bits 10, non-zero lower bits */
1408
0
    { if (xsum_debug)
1409
0
      { c_printf("-odd, no adjustment, since remainder adds <1/2\n");
1410
0
      }
1411
0
      goto done_rounding;
1412
0
    }
1413
0
    else  /* low bit 1 (odd), extra bits are 10, lower bits are all 0 */
1414
0
    { if (xsum_debug)
1415
0
      { c_printf("-odd, round away from 0, since remainder adds exactly 1/2\n");
1416
0
      }
1417
0
      goto round_away_from_zero;
1418
0
    }
1419
1420
0
  }
1421
1422
0
round_away_from_zero:
1423
1424
  /* Round away from zero, then check for carry having propagated out the
1425
     top, and shift if so. */
1426
1427
0
  ivalue += 4;  /* add 1 to low-order mantissa bit */
1428
0
  if (ivalue & ((xsum_int)1 << (XSUM_MANTISSA_BITS+3)))
1429
0
  { ivalue >>= 1;
1430
0
    e += 1;
1431
0
  }
1432
1433
0
done_rounding: ;
1434
1435
  /* Get rid of the bottom 2 bits that were used to decide on rounding. */
1436
1437
0
  ivalue >>= 2;
1438
1439
  /* Adjust to the true exponent, accounting for where this chunk is. */
1440
1441
0
  e += (i<<XSUM_LOW_EXP_BITS) - XSUM_EXP_BIAS - XSUM_MANTISSA_BITS;
1442
1443
  /* If exponent has overflowed, change to plus or minus Inf and return. */
1444
1445
0
  if (e >= XSUM_EXP_MASK)
1446
0
  { intv |= (xsum_int) XSUM_EXP_MASK << XSUM_MANTISSA_BITS;
1447
0
    COPY64 (fltv, intv);
1448
0
    if (xsum_debug)
1449
0
    { c_printf ("Final rounded result: %.17le (overflowed)\n  ", fltv);
1450
0
      pbinary_double(fltv);
1451
0
      c_printf("\n");
1452
0
    }
1453
0
    return fltv;
1454
0
  }
1455
1456
  /* Put exponent and mantissa into intv, which already has the sign,
1457
     then copy into fltv. */
1458
1459
0
  intv += (xsum_int)e << XSUM_MANTISSA_BITS;
1460
0
  intv += ivalue & XSUM_MANTISSA_MASK;  /* mask out the implicit 1 bit */
1461
0
  COPY64 (fltv, intv);
1462
1463
0
  if (xsum_debug)
1464
0
  { c_printf ("Final rounded result: %.17le\n  ", fltv);
1465
0
    pbinary_double(fltv);
1466
0
    c_printf("\n");
1467
0
    if ((ivalue >> XSUM_MANTISSA_BITS) != 1) c_abort();
1468
0
  }
1469
1470
0
  return fltv;
1471
0
}
1472
1473
1474
/* INITIALIZE A LARGE ACCUMULATOR TO ZERO.  Clears the large chunks/counts
   and the embedded small accumulator that overflowing chunks are
   transferred into. */

void xsum_large_init (xsum_large_accumulator *restrict lacc)
{
  xsum_large_init_chunks (lacc);   /* reset the large chunks and counts */
  xsum_small_init (&lacc->sacc);   /* zero the embedded small accumulator */
}
1481
1482
1483
/* ADD A VECTOR OF FLOATING-POINT NUMBERS TO A LARGE ACCUMULATOR.  Each
   value's raw 64 bits are added to the chunk indexed by its sign+exponent
   field, with a per-chunk countdown deciding when the chunk must be
   transferred (or when the value is Inf/NaN or the chunk is uninitialized),
   which is handled by xsum_large_add_value_inf_nan. */

void xsum_large_addv (xsum_large_accumulator *restrict lacc,
                      const xsum_flt *restrict vec,
                      xsum_length n)
{
  if (xsum_debug) c_printf("\nLARGE ADDV OF %ld VALUES\n",(long)n);

# if OPT_LARGE_SUM
  {
    xsum_lcount count;
    xsum_expint ix;
    xsum_uint uintv;

    /* Main loop, manually unrolled to process four values per iteration. */

    while (n > 3)
    {
      /* Fetch the next number as raw bits, index its chunk by the upper
         sign+exponent bits, and either add it to the chunk (decrementing
         the count of additions still allowed) or hand it to the slow path
         when the decremented count goes negative. */

      COPY64 (uintv, *vec);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      COPY64 (uintv, *vec);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      COPY64 (uintv, *vec);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      COPY64 (uintv, *vec);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      n -= 4;
    }

    /* Handle the remaining zero to three values one at a time. */

    while (n > 0)
    {
      COPY64 (uintv, *vec);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      n -= 1;
    }
  }
# else
  {
    /* Version not manually optimized - maybe the compiler can do better. */

    if (n == 0)
    { return;
    }

    xsum_lcount count;
    xsum_expint ix;
    xsum_uint uintv;

    do
    {
      /* Fetch the next number, and convert to integer form in uintv. */

      COPY64 (uintv, *vec);
      vec += 1;

      /* Isolate the upper sign+exponent bits that index the chunk. */

      ix = uintv >> XSUM_MANTISSA_BITS;

      /* Find the count for this chunk, and subtract one. */

      count = lacc->count[ix] - 1;

      if (count < 0)
      {
        /* If the decremented count is negative, it's either a special
           Inf/NaN chunk (in which case count will stay at -1), or one that
           needs to be transferred to the small accumulator, or one that
           has never been used before and needs to be initialized. */

        xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      {
        /* Store the decremented count of additions allowed before transfer,
           and add this value to the chunk. */

        lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      n -= 1;

    } while (n > 0);
  }
# endif
}
1633
1634
1635
/* ADD ONE DOUBLE TO A LARGE ACCUMULATOR.  Just calls xsum_large_addv. */
1636
1637
void xsum_large_add1 (xsum_large_accumulator *restrict lacc, xsum_flt value)
1638
0
{
1639
0
  xsum_large_addv (lacc, &value, 1);
1640
0
}
1641
1642
1643
/* ADD SQUARED NORM OF VECTOR OF FLOATING-POINT NUMBERS TO LARGE ACCUMULATOR.
   Adds vec[k]*vec[k] for each element; each (rounded) square's raw bits
   are added to the chunk indexed by its sign+exponent field, with a
   per-chunk countdown deciding when xsum_large_add_value_inf_nan must
   handle the chunk instead. */

void xsum_large_add_sqnorm (xsum_large_accumulator *restrict lacc,
                            const xsum_flt *restrict vec,
                            xsum_length n)
{
  if (xsum_debug) c_printf("\nLARGE ADD_SQNORM OF %ld VALUES\n",(long)n);

# if OPT_LARGE_SQNORM
  {
    xsum_lcount count;
    xsum_expint ix;
    xsum_uint uintv;
    double fltv;

    /* Main loop, manually unrolled to process four values per iteration. */

    while (n > 3)
    {
      /* Square the next value, take its raw bits, and add it to the chunk
         indexed by its sign+exponent field - or hand it to the slow path
         when the chunk's decremented count goes negative. */

      fltv = *vec * *vec;
      COPY64 (uintv, fltv);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      fltv = *vec * *vec;
      COPY64 (uintv, fltv);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      fltv = *vec * *vec;
      COPY64 (uintv, fltv);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      fltv = *vec * *vec;
      COPY64 (uintv, fltv);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      n -= 4;
    }

    /* Handle the remaining zero to three values one at a time. */

    while (n > 0)
    {
      fltv = *vec * *vec;
      COPY64 (uintv, fltv);
      vec += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      n -= 1;
    }
  }
# else
  {
    /* Version not manually optimized - maybe the compiler can do better. */

    xsum_lcount count;
    xsum_expint ix;
    xsum_uint uintv;
    double fltv;

    if (n == 0)
    { return;
    }

    do
    {
      /* Fetch the next number, square it, and convert to integer form in
         uintv. */

      fltv = *vec * *vec;
      COPY64 (uintv, fltv);
      vec += 1;

      /* Isolate the upper sign+exponent bits that index the chunk. */

      ix = uintv >> XSUM_MANTISSA_BITS;

      /* Find the count for this chunk, and subtract one. */

      count = lacc->count[ix] - 1;

      if (count < 0)
      {
        /* If the decremented count is negative, it's either a special
           Inf/NaN chunk (in which case count will stay at -1), or one that
           needs to be transferred to the small accumulator, or one that
           has never been used before and needs to be initialized. */

        xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      {
        /* Store the decremented count of additions allowed before transfer,
           and add this value to the chunk. */

        lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      n -= 1;

    } while (n > 0);
  }
# endif
}
1802
1803
1804
/* ADD DOT PRODUCT OF VECTORS OF FLOATING-POINT NUMBERS TO LARGE ACCUMULATOR. */
1805
1806
void xsum_large_add_dot (xsum_large_accumulator *restrict lacc,
                         const xsum_flt *vec1,
                         const xsum_flt *vec2,
                         xsum_length n)
{
  if (xsum_debug) c_printf("\nLARGE ADD_DOT OF %ld VALUES\n",(long)n);

# if OPT_LARGE_DOT
  {
    /* Manually-optimized version, with the main loop unrolled four times.
       Each unrolled copy does the same work as one iteration of the loop
       in the non-optimized version below: multiply the next pair of
       elements, reinterpret the product's bits as an integer, index a
       chunk by the sign+exponent bits, and either add to the chunk or
       fall into the slow path for a used-up / Inf / NaN / uninitialized
       chunk. */

    xsum_lcount count;
    xsum_expint ix;
    xsum_uint uintv;
    double fltv;

    while (n > 3)
    {
      /* Unrolled copy 1. */

      fltv = *vec1 * *vec2;
      COPY64 (uintv, fltv);
      vec1 += 1; vec2 += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      /* Unrolled copy 2. */

      fltv = *vec1 * *vec2;
      COPY64 (uintv, fltv);
      vec1 += 1; vec2 += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      /* Unrolled copy 3. */

      fltv = *vec1 * *vec2;
      COPY64 (uintv, fltv);
      vec1 += 1; vec2 += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      /* Unrolled copy 4. */

      fltv = *vec1 * *vec2;
      COPY64 (uintv, fltv);
      vec1 += 1; vec2 += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      n -= 4;
    }

    /* Residual loop for the last n mod 4 products. */

    while (n > 0)
    {
      fltv = *vec1 * *vec2;
      COPY64 (uintv, fltv);
      vec1 += 1; vec2 += 1;

      ix = uintv >> XSUM_MANTISSA_BITS;

      count = lacc->count[ix] - 1;

      if (count < 0)
      { xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      { lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      n -= 1;
    }
  }
# else
  {
    /* Version not manually optimized - maybe the compiler can do better. */

    xsum_lcount count;
    xsum_expint ix;
    xsum_uint uintv;
    double fltv;

    if (n == 0)
    { return;
    }

    do
    {
      /* Fetch the next numbers, multiply them, and convert the result to
         integer form in uintv. */

      fltv = *vec1 * *vec2;
      COPY64 (uintv, fltv);
      vec1 += 1; vec2 += 1;

      /* Isolate the upper sign+exponent bits that index the chunk. */

      ix = uintv >> XSUM_MANTISSA_BITS;

      /* Find the count for this chunk, and subtract one. */

      count = lacc->count[ix] - 1;

      if (count < 0)
      {
        /* If the decremented count is negative, it's either a special
           Inf/NaN chunk (in which case count will stay at -1), or one that
           needs to be transferred to the small accumulator, or one that
           has never been used before and needs to be initialized. */

        xsum_large_add_value_inf_nan (lacc, ix, uintv);
      }
      else
      {
        /* Store the decremented count of additions allowed before transfer,
           and add this value to the chunk. */

        lacc->count[ix] = count;
        lacc->chunk[ix] += uintv;
      }

      n -= 1;

    } while (n > 0);
  }
# endif
}
1964
1965
1966
/* ADD A LARGE ACCUMULATOR TO ANOTHER LARGE ACCUMULATOR.  The first argument
1967
   is the destination, which is modified.  The second is the accumulator to
1968
   add, which may also be modified, but should still represent the same
1969
   number.  Source and destination may be the same. */
1970
1971
void xsum_large_add_accumulator (xsum_large_accumulator *dst_lacc,
                                 xsum_large_accumulator *src_lacc)
{
  if (xsum_debug)
  { c_printf("\nADDING ACCUMULATOR TO A LARGE ACCUMULATOR\n");
  }

  /* Flush the source's large chunks into its embedded small accumulator,
     then do the addition at the small-accumulator level.  The source's
     value is unchanged by the transfer, only its representation. */

  xsum_large_transfer_to_small (src_lacc);
  xsum_small_add_accumulator (&dst_lacc->sacc, &src_lacc->sacc);
}
1979
1980
1981
/* NEGATE THE VALUE IN A LARGE ACCUMULATOR. */
1982
1983
void xsum_large_negate (xsum_large_accumulator *restrict lacc)
{
  if (xsum_debug)
  { c_printf("\nNEGATING A LARGE ACCUMULATOR\n");
  }

  /* Move everything into the small accumulator first, then negate there. */

  xsum_large_transfer_to_small (lacc);
  xsum_small_negate (&lacc->sacc);
}
1990
1991
1992
1993
/* RETURN RESULT OF ROUNDING A LARGE ACCUMULATOR.  Rounding mode is to nearest,
1994
   with ties to even.
1995
1996
   This is done by adding all the chunks in the large accumulator to the
1997
   small accumulator, and then calling its rounding procedure. */
1998
1999
xsum_flt xsum_large_round (xsum_large_accumulator *restrict lacc)
{
  if (xsum_debug)
  { c_printf("\nROUNDING LARGE ACCUMULATOR\n");
  }

  /* Fold all large chunks into the small accumulator, then delegate the
     round-to-nearest-even step to the small accumulator's routine. */

  xsum_large_transfer_to_small (lacc);
  return xsum_small_round (&lacc->sacc);
}
2007
2008
2009
/* TRANSFER NUMBER FROM A LARGE ACCUMULATOR TO A SMALL ACCUMULATOR. */
2010
2011
void xsum_large_to_small_accumulator (xsum_small_accumulator *restrict sacc,
2012
                                      xsum_large_accumulator *restrict lacc)
2013
0
{
2014
0
  if (xsum_debug) c_printf("\nTRANSFERRING FROM LARGE TO SMALL ACCUMULATOR\n");
2015
0
  xsum_large_transfer_to_small (lacc);
2016
0
  *sacc = lacc->sacc;
2017
0
}
2018
2019
2020
/* TRANSFER NUMBER FROM A SMALL ACCUMULATOR TO A LARGE ACCUMULATOR. */
2021
2022
void xsum_small_to_large_accumulator (xsum_large_accumulator *restrict lacc,
2023
                                      xsum_small_accumulator *restrict sacc)
2024
0
{
2025
0
  if (xsum_debug) c_printf("\nTRANSFERRING FROM SMALL TO LARGE ACCUMULATOR\n");
2026
0
  xsum_large_init_chunks (lacc);
2027
0
  lacc->sacc = *sacc;
2028
0
}
2029
2030
2031
/* FIND RESULT OF DIVIDING SMALL ACCUMULATOR BY UNSIGNED INTEGER. */
2032
2033
xsum_flt xsum_small_div_unsigned
           (xsum_small_accumulator *restrict sacc, unsigned div)
{
  xsum_flt result;
  unsigned rem;       /* remainder left after dividing the bottom chunk */
  double fltv;
  int sign;           /* +1 or -1, sign of the accumulator's value */
  int i, j;           /* i: index of topmost nonzero chunk; j: loop index */

  if (xsum_debug) c_printf("\nDIVIDE SMALL ACCUMULATOR BY UNSIGNED INTEGER\n");

  /* Return NaN or an Inf if that's what's in the superaccumulator. */

  if (sacc->NaN != 0)
  { COPY64(fltv, sacc->NaN);
    return fltv;
  }

  if (sacc->Inf != 0)
  { COPY64 (fltv, sacc->Inf);
    return fltv;
  }

  /* Make a copy of the superaccumulator, so we can change it here without
     changing *sacc. */

  xsum_small_accumulator tacc = *sacc;

  /* Carry propagate in the temporary copy of the superaccumulator.
     Sets 'i' to the index of the topmost nonzero chunk. */

  i = xsum_carry_propagate(&tacc);

  /* Check for division by zero, and if so, return +Inf, -Inf, or NaN,
     depending on whether the superaccumulator is positive, negative,
     or zero. */

  if (div == 0)
  { if (xsum_debug)
    { c_printf("divide by zero, top chunk has index %d, value %lld\n",
              i, tacc.chunk[i]);
    }
    return tacc.chunk[i] > 0 ? INFINITY : tacc.chunk[i] < 0 ? -INFINITY : NAN;
  }

  /* Record sign of accumulator, and if it's negative, negate and
     re-propagate so that it will be positive. */

  sign = +1;

  if (tacc.chunk[i] < 0)
  { xsum_small_negate(&tacc);
    i = xsum_carry_propagate(&tacc);
    if (xsum_debug)
    { c_printf("Negated accumulator to make it non-negative\n");
      if (tacc.chunk[i] < 0) c_abort();
    }
    sign = -1;
  }

  /* Do the division in the small accumulator, putting the remainder after
     dividing the bottom chunk in 'rem'. */

  if (xsum_debug)
  { c_printf("\nBefore division by %u: ",div);
    xsum_small_display (&tacc);
  }

  /* Classic long division, top chunk down: the remainder from each chunk
     is carried into the next lower chunk (shifted up by the chunk width,
     XSUM_LOW_MANTISSA_BITS). */

  rem = 0;
  for (j = i; j>=0; j--)
  { xsum_uint num = ((xsum_uint) rem << XSUM_LOW_MANTISSA_BITS) + tacc.chunk[j];
    xsum_uint quo = num / div;
    rem = num - quo*div;
    tacc.chunk[j] = quo;
  }

  if (xsum_debug)
  { c_printf("After division by %u: ",div);
    xsum_small_display (&tacc);
  }

  /* Find new top chunk (the quotient may have fewer chunks). */

  while (i > 0 && tacc.chunk[i] == 0)
  { i -= 1;
  }

  /* Do rounding, with separate approaches for a normal number with biased
     exponent greater than 1, and for a normal number with exponent of 1 
     or a denormalized number (also having true biased exponent of 1). */

  if (i > 1 || tacc.chunk[1] >= (1 << (XSUM_HIGH_MANTISSA_BITS+2)))
  {
    /* Normalized number with at least two bits at bottom of chunk 0
       below the mantissa.  Just need to 'or' in a 1 at the bottom if
       remainder is non-zero to break a tie if bits below bottom of
       mantissa are exactly 1/2. */

    if (xsum_debug) 
    { c_printf("normalized (2+ bits below), low %016llx %016llx, remainder %u\n",
              tacc.chunk[1],tacc.chunk[0],rem);
    }

    if (rem > 0)
    { tacc.chunk[0] |= 1;
    }
  }
  else
  {
    /* Denormalized number or normal number with biased exponent of 1.
       Lowest bit of bottom chunk is just below lowest bit of
       mantissa.  Need to explicitly round here using the bottom bit
       and the remainder - round up if lower > 1/2 or >= 1/2 and
       odd. */

    if (xsum_debug)
    { if (tacc.chunk[1] >= (1 << (XSUM_HIGH_MANTISSA_BITS+1)))
      { c_printf("small normalized, low %016llx %016llx, remainder %u\n",
                tacc.chunk[1],tacc.chunk[0],rem);
      }
      else
      { c_printf("denormalized, low %016llx %016llx, remainder %u\n",
                tacc.chunk[1],tacc.chunk[0],rem);
      }
    }

    if (tacc.chunk[0] & 1)  /* lower part is >= 1/2 */
    {
      if (tacc.chunk[0] & 2)  /* lowest bit of mantissa is 1 (odd) */
      { tacc.chunk[0] += 2;     /* round up */
      }
      else                    /* lowest bit of mantissa is 0 (even) */
      { if (rem > 0)            /* lower part is > 1/2 */
        { tacc.chunk[0] += 2;     /* round up */
        }
      }

      tacc.chunk[0] &= ~1;  /* clear low bit (but should anyway be ignored) */
    }
  }

  if (xsum_debug)
  { c_printf(
     "New low chunk after adjusting to correct rounding with remainder %u:\n\n",
      rem);
    pbinary_int64 (tacc.chunk[0], XSUM_SCHUNK_BITS);
  }

  /* Do the final rounding, with the lowest bit set as above. */

  result = xsum_small_round (&tacc);

  /* Re-attach the sign recorded before the division. */

  return sign*result;
}
2187
2188
2189
/* FIND RESULT OF DIVIDING SMALL ACCUMULATOR BY SIGNED INTEGER. */
2190
2191
xsum_flt xsum_small_div_int
2192
           (xsum_small_accumulator *restrict sacc, int div)
2193
0
{ if (xsum_debug) c_printf("\nDIVIDE SMALL ACCUMULATOR BY SIGNED INTEGER\n");
2194
0
  if (div < 0)
2195
0
  { return -xsum_small_div_unsigned (sacc, (unsigned) -div);
2196
0
  }
2197
0
  else
2198
0
  { return xsum_small_div_unsigned (sacc, (unsigned) div);
2199
0
  }
2200
0
}
2201
2202
2203
/* FIND RESULT OF DIVIDING LARGE ACCUMULATOR BY UNSIGNED INTEGER. */
2204
2205
xsum_flt xsum_large_div_unsigned
2206
           (xsum_large_accumulator *restrict lacc, unsigned div)
2207
0
{ if (xsum_debug) c_printf("\nDIVIDE LARGE ACCUMULATOR BY UNSIGNED INTEGER\n");
2208
0
  xsum_large_transfer_to_small (lacc);
2209
0
  return xsum_small_div_unsigned (&lacc->sacc, div);
2210
0
}
2211
2212
2213
/* FIND RESULT OF DIVIDING LARGE ACCUMULATOR BY SIGNED INTEGER. */
2214
2215
xsum_flt xsum_large_div_int
2216
           (xsum_large_accumulator *restrict lacc, int div)
2217
0
{ if (xsum_debug) c_printf("\nDIVIDE LARGE ACCUMULATOR BY SIGNED INTEGER\n");
2218
0
  xsum_large_transfer_to_small (lacc);
2219
0
  return xsum_small_div_int (&lacc->sacc, div);
2220
0
}
2221
2222
2223
/* ------------------- ROUTINES FOR NON-EXACT SUMMATION --------------------- */
2224
2225
2226
/* SUM A VECTOR WITH DOUBLE FP ACCUMULATOR. */
2227
2228
xsum_flt xsum_sum_double (const xsum_flt *restrict vec,
2229
                          xsum_length n)
2230
0
{ double s;
2231
0
  xsum_length j;
2232
0
  s = 0.0;
2233
0
# if OPT_SIMPLE_SUM
2234
0
  { for (j = 3; j < n; j += 4)
2235
0
    { s += vec[j-3];
2236
0
      s += vec[j-2];
2237
0
      s += vec[j-1];
2238
0
      s += vec[j];
2239
0
    }
2240
0
    for (j = j-3; j < n; j++)
2241
0
    { s += vec[j];
2242
0
    }
2243
0
  }
2244
# else
2245
  { for (j = 0; j < n; j++)
2246
    { s += vec[j];
2247
    }
2248
  }
2249
# endif
2250
0
  return (xsum_flt) s;
2251
0
}
2252
2253
2254
/* SUM A VECTOR WITH FLOAT128 ACCUMULATOR. */
2255
2256
#ifdef FLOAT128
2257
2258
#include <quadmath.h>
2259
2260
xsum_flt xsum_sum_float128 (const xsum_flt *restrict vec,
2261
                            xsum_length n)
2262
{ __float128 s;
2263
  xsum_length j;
2264
  s = 0.0;
2265
  for (j = 0; j < n; j++)
2266
  { s += vec[j];
2267
  }
2268
  return (xsum_flt) s;
2269
}
2270
2271
#endif
2272
2273
2274
/* SUM A VECTOR WITH DOUBLE FP, NOT IN ORDER. */
2275
2276
xsum_flt xsum_sum_double_not_ordered (const xsum_flt *restrict vec,
2277
                                      xsum_length n)
2278
0
{ double s[2] = { 0, 0 };
2279
0
  xsum_length j;
2280
0
  for (j = 1; j < n; j += 2)
2281
0
  { s[0] += vec[j-1];
2282
0
    s[1] += vec[j];
2283
0
  }
2284
0
  if (j == n)
2285
0
  { s[0] += vec[j-1];
2286
0
  }
2287
0
  return (xsum_flt) (s[0]+s[1]);
2288
0
}
2289
2290
2291
/* SUM A VECTOR WITH KAHAN'S METHOD. */
2292
2293
xsum_flt xsum_sum_kahan (const xsum_flt *restrict vec,
2294
                         xsum_length n)
2295
0
{ double s, t, c, y;
2296
0
  xsum_length j;
2297
0
  s = 0.0;
2298
0
  c = 0.0;
2299
# if OPT_KAHAN_SUM
2300
  { for (j = 1; j < n; j += 2)
2301
    { y = vec[j-1] - c;
2302
      t = s;
2303
      s += y;
2304
      c = (s - t) - y;
2305
      y = vec[j] - c;
2306
      t = s;
2307
      s += y;
2308
      c = (s - t) - y;
2309
    }
2310
    for (j = j-1; j < n; j++)
2311
    { y = vec[j] - c;
2312
      t = s;
2313
      s += y;
2314
      c = (s - t) - y;
2315
    }
2316
  }
2317
# else
2318
0
  { for (j = 0; j < n; j++)
2319
0
    { y = vec[j] - c;
2320
0
      t = s;
2321
0
      s += y;
2322
0
      c = (s - t) - y;
2323
0
    }
2324
0
  }
2325
0
# endif
2326
0
  return (xsum_flt) s;
2327
0
}
2328
2329
2330
/* SQUARED NORM OF A VECTOR WITH DOUBLE FP ACCUMULATOR. */
2331
2332
xsum_flt xsum_sqnorm_double (const xsum_flt *restrict vec,
2333
                             xsum_length n)
2334
0
{ double s;
2335
0
  xsum_length j;
2336
2337
0
  s = 0.0;
2338
0
# if OPT_SIMPLE_SQNORM
2339
0
  { double a, b, c, d;
2340
0
    for (j = 3; j < n; j += 4)
2341
0
    { a = vec[j-3];
2342
0
      b = vec[j-2];
2343
0
      c = vec[j-1];
2344
0
      d = vec[j];
2345
0
      s += a*a;
2346
0
      s += b*b;
2347
0
      s += c*c;
2348
0
      s += d*d;
2349
0
    }
2350
0
    for (j = j-3; j < n; j++)
2351
0
    { a = vec[j];
2352
0
      s += a*a;
2353
0
    }
2354
0
  }
2355
# else
2356
  { double a;
2357
    for (j = 0; j < n; j++)
2358
    { a = vec[j];
2359
      s += a*a;
2360
    }
2361
  }
2362
# endif
2363
0
  return (xsum_flt) s;
2364
0
}
2365
2366
2367
/* SQUARED NORM OF A VECTOR WITH DOUBLE FP, NOT IN ORDER. */
2368
2369
xsum_flt xsum_sqnorm_double_not_ordered (const xsum_flt *restrict vec,
2370
                                         xsum_length n)
2371
0
{ double s[2] = { 0, 0 };
2372
0
  double a[2];
2373
0
  xsum_length j;
2374
0
  for (j = 1; j < n; j += 2)
2375
0
  { a[0] = vec[j-1];
2376
0
    a[1] = vec[j];
2377
0
    s[0] += a[0]*a[0];
2378
0
    s[1] += a[1]*a[1];
2379
0
  }
2380
0
  if (j == n)
2381
0
  { a[0] = vec[j-1];
2382
0
    s[0] += a[0]*a[0];
2383
0
  }
2384
0
  return (xsum_flt) (s[0]+s[1]);
2385
0
}
2386
2387
2388
/* DOT PRODUCT OF VECTORS WITH DOUBLE FP ACCUMULATOR. */
2389
2390
xsum_flt xsum_dot_double (const xsum_flt *vec1,
2391
                          const xsum_flt *vec2,
2392
                          xsum_length n)
2393
0
{ double s;
2394
0
  xsum_length j;
2395
2396
0
  s = 0.0;
2397
0
# if OPT_SIMPLE_DOT
2398
0
  { for (j = 3; j < n; j += 4)
2399
0
    { s += vec1[j-3] * vec2[j-3];
2400
0
      s += vec1[j-2] * vec2[j-2];
2401
0
      s += vec1[j-1] * vec2[j-1];
2402
0
      s += vec1[j] * vec2[j];
2403
0
    }
2404
0
    for (j = j-3; j < n; j++)
2405
0
    { s += vec1[j] * vec2[j];
2406
0
    }
2407
0
  }
2408
# else
2409
  { for (j = 0; j < n; j++)
2410
    { s += vec1[j] * vec2[j];
2411
    }
2412
  }
2413
# endif
2414
0
  return (xsum_flt) s;
2415
0
}
2416
2417
2418
/* DOT PRODUCT OF VECTORS WITH DOUBLE FP, NOT IN ORDER. */
2419
2420
xsum_flt xsum_dot_double_not_ordered (const xsum_flt *vec1,
2421
                                      const xsum_flt *vec2,
2422
                                      xsum_length n)
2423
0
{ double s[2] = { 0, 0 };
2424
0
  xsum_length j;
2425
0
  for (j = 1; j < n; j += 2)
2426
0
  { s[0] += vec1[j-1] * vec2[j-1];
2427
0
    s[1] += vec1[j] * vec2[j];
2428
0
  }
2429
0
  if (j == n)
2430
0
  { s[0] += vec1[j-1] * vec2[j-1];
2431
0
  }
2432
0
  return (xsum_flt) (s[0]+s[1]);
2433
0
}
2434
2435
2436
/* ------------------------- DEBUGGING ROUTINES ----------------------------- */
2437
2438
2439
/* DISPLAY A SMALL ACCUMULATOR. */
2440
2441
void xsum_small_display (xsum_small_accumulator *restrict sacc)
{
  int i, dots;   /* dots: nonzero while inside a run of zero chunks */
  c_printf("Small accumulator:");

  /* Report an Inf flag, with a sanity check that its exponent field is
     all ones as an Inf encoding requires. */

  if (sacc->Inf)
  { c_printf (" %cInf", sacc->Inf>0 ? '+' : '-');
    if ((sacc->Inf & ((xsum_uint)XSUM_EXP_MASK << XSUM_MANTISSA_BITS))
          != ((xsum_uint)XSUM_EXP_MASK << XSUM_MANTISSA_BITS))
    { c_printf(" BUT WRONG CONTENTS: %llx", (long long) sacc->Inf);
    }
  }
  if (sacc->NaN)
  { c_printf (" NaN (%llx)", (long long) sacc->NaN);
  }
  c_printf("\n");

  /* Print chunks top-down, collapsing runs of zero chunks to "...".
     Each nonzero chunk line shows its index, the power of two that its
     lowest bit represents, and the chunk in binary (split 32/32). */

  dots = 0;
  for (i = XSUM_SCHUNKS-1; i >= 0; i--)
  { if (sacc->chunk[i] == 0)
    { if (!dots) c_printf("            ...\n");
      dots = 1;
    }
    else
    { c_printf ("%5d %5d ", i, (int)
       ((i<<XSUM_LOW_EXP_BITS) - XSUM_EXP_BIAS - XSUM_MANTISSA_BITS));
      pbinary_int64 ((int64_t) sacc->chunk[i] >> 32, XSUM_SCHUNK_BITS-32);
      c_printf(" ");
      pbinary_int64 ((int64_t) sacc->chunk[i] & 0xffffffff, 32);
      c_printf ("\n");
      dots = 0;
    }
  }
  c_printf("\n");
}
2474
2475
2476
/* RETURN NUMBER OF CHUNKS IN USE IN SMALL ACCUMULATOR. */
2477
2478
int xsum_small_chunks_used (xsum_small_accumulator *restrict sacc)
{
  /* Count how many chunks of the small accumulator are nonzero. */

  int n_used = 0;
  int ix;

  for (ix = 0; ix < XSUM_SCHUNKS; ix++)
  {
    if (sacc->chunk[ix] != 0)
    {
      n_used += 1;
    }
  }

  return n_used;
}
2489
2490
2491
/* DISPLAY A LARGE ACCUMULATOR. */
2492
2493
void xsum_large_display (xsum_large_accumulator *restrict lacc)
{
  int i, dots;   /* dots: nonzero while inside a run of unused chunks */
  c_printf("Large accumulator:\n");
  dots = 0;

  /* Print chunks top-down, collapsing runs of unused chunks (negative
     count) to "...".  Each in-use chunk line shows the sign and exponent
     bits decoded from the index, the remaining addition count, and the
     chunk in binary (split 32/32). */

  for (i = XSUM_LCHUNKS-1; i >= 0; i--)
  { if (lacc->count[i] < 0)
    { if (!dots) c_printf("            ...\n");
      dots = 1;
    }
    else
    { c_printf ("%c%4d %5d ", i & 0x800 ? '-' : '+', i & 0x7ff, lacc->count[i]);
      pbinary_int64 ((int64_t) lacc->chunk[i] >> 32, XSUM_LCHUNK_BITS-32);
      c_printf(" ");
      pbinary_int64 ((int64_t) lacc->chunk[i] & 0xffffffff, 32);
      c_printf ("\n");
      dots = 0;
    }
  }

  /* Also show the embedded small accumulator. */

  c_printf("\nWithin large accumulator:  ");
  xsum_small_display (&lacc->sacc);

}
2516
2517
2518
/* RETURN NUMBER OF CHUNKS IN USE IN LARGE ACCUMULATOR. */
2519
2520
int xsum_large_chunks_used (xsum_large_accumulator *restrict lacc)
{
  /* Count how many chunks of the large accumulator are in use, which is
     indicated by a non-negative count. */

  int n_used = 0;
  int ix;

  for (ix = 0; ix < XSUM_LCHUNKS; ix++)
  {
    if (lacc->count[ix] >= 0)
    {
      n_used += 1;
    }
  }

  return n_used;
}