meshopt_decodeIndexBuffer: 379| 4.51k|{ 380| 4.51k| using namespace meshopt; 381| | 382| 4.51k| assert(index_count % 3 == 0); ------------------ | Branch (382:2): [True: 4.51k, False: 0] ------------------ 383| 4.51k| assert(index_size == 2 || index_size == 4); ------------------ | Branch (383:2): [True: 2.25k, False: 2.25k] | Branch (383:2): [True: 2.25k, False: 0] | Branch (383:2): [True: 4.51k, False: 0] ------------------ 384| | 385| | // the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table 386| 4.51k| if (buffer_size < 1 + index_count / 3 + 16) ------------------ | Branch (386:6): [True: 1.79k, False: 2.71k] ------------------ 387| 1.79k| return -2; 388| | 389| 2.71k| if ((buffer[0] & 0xf0) != kIndexHeader) ------------------ | Branch (389:6): [True: 1.96k, False: 752] ------------------ 390| 1.96k| return -1; 391| | 392| 752| int version = buffer[0] & 0x0f; 393| 752| if (version > kDecodeIndexVersion) ------------------ | Branch (393:6): [True: 80, False: 672] ------------------ 394| 80| return -1; 395| | 396| 672| EdgeFifo edgefifo; 397| 672| memset(edgefifo, -1, sizeof(edgefifo)); 398| | 399| 672| VertexFifo vertexfifo; 400| 672| memset(vertexfifo, -1, sizeof(vertexfifo)); 401| | 402| 672| size_t edgefifooffset = 0; 403| 672| size_t vertexfifooffset = 0; 404| | 405| 672| unsigned int next = 0; 406| 672| unsigned int last = 0; 407| | 408| 672| int fecmax = version >= 1 ? 13 : 15; ------------------ | Branch (408:15): [True: 164, False: 508] ------------------ 409| | 410| | // since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end 411| 672| const unsigned char* code = buffer + 1; 412| 672| const unsigned char* data = code + index_count / 3; 413| 672| const unsigned char* data_safe_end = buffer + buffer_size - 16; 414| | 415| 672| const unsigned char* codeaux_table = data_safe_end; 416| | 417| 11.8k| for (size_t i = 0; i < index_count; i += 3) ------------------ | Branch (417:21): [True: 11.4k, False: 368] ------------------ 418| 11.4k| { 419| | // make sure we have enough data to read for a triangle 420| | // each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index 421| | // after this we can be sure we can read without extra bounds checks 422| 11.4k| if (data > data_safe_end) ------------------ | Branch (422:7): [True: 304, False: 11.1k] ------------------ 423| 304| return -2; 424| | 425| 11.1k| unsigned char codetri = *code++; 426| | 427| 11.1k| if (codetri < 0xf0) ------------------ | Branch (427:7): [True: 6.75k, False: 4.40k] ------------------ 428| 6.75k| { 429| 6.75k| int fe = codetri >> 4; 430| | 431| | // fifo reads are wrapped around 16 entry buffer 432| 6.75k| unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0]; 433| 6.75k| unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1]; 434| 6.75k| unsigned int c = 0; 435| | 436| 6.75k| int fec = codetri & 15; 437| | 438| | // note: this is the most common path in the entire decoder 439| | // inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable 440| 6.75k| if (fec < fecmax) ------------------ | Branch (440:8): [True: 5.60k, False: 1.15k] ------------------ 441| 5.60k| { 442| | // fifo reads are wrapped around 16 entry buffer 443| 5.60k| unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15]; 444| 5.60k| c = (fec == 0) ? next : cf; ------------------ | Branch (444:9): [True: 2.39k, False: 3.20k] ------------------ 445| | 446| 5.60k| int fec0 = fec == 0; 447| 5.60k| next += fec0; 448| | 449| | // push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 450| 5.60k| pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0); 451| 5.60k| } 452| 1.15k| else 453| 1.15k| { 454| | // fec - (fec ^ 3) decodes 13, 14 into -1, 1 455| | // note that we need to update the last index since free indices are delta-encoded 456| 1.15k| last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last); ------------------ | Branch (456:16): [True: 318, False: 834] ------------------ 457| | 458| | // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 459| 1.15k| pushVertexFifo(vertexfifo, c, vertexfifooffset); 460| 1.15k| } 461| | 462| | // push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 463| 6.75k| pushEdgeFifo(edgefifo, c, b, edgefifooffset); 464| 6.75k| pushEdgeFifo(edgefifo, a, c, edgefifooffset); 465| | 466| | // output triangle 467| 6.75k| writeTriangle(destination, i, index_size, a, b, c); 468| 6.75k| } 469| 4.40k| else 470| 4.40k| { 471| | // fast path: read codeaux from the table 472| 4.40k| if (codetri < 0xfe) ------------------ | Branch (472:8): [True: 1.36k, False: 3.04k] ------------------ 473| 1.36k| { 474| 1.36k| unsigned char codeaux = codeaux_table[codetri & 15]; 475| | 476| | // note: table can't contain feb/fec=15 477| 1.36k| int feb = codeaux >> 4; 478| 1.36k| int fec = codeaux & 15; 479| | 480| | // fifo reads are wrapped around 16 entry buffer 481| | // also note that we increment next for all three vertices before decoding indices - this matches encoder behavior 482| 1.36k| unsigned int a = next++; 483| | 484| 1.36k| unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15]; 485| 1.36k| unsigned int b = (feb == 0) ? next : bf; ------------------ | Branch (485:22): [True: 492, False: 872] ------------------ 486| | 487| 1.36k| int feb0 = feb == 0; 488| 1.36k| next += feb0; 489| | 490| 1.36k| unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15]; 491| 1.36k| unsigned int c = (fec == 0) ? next : cf; ------------------ | Branch (491:22): [True: 440, False: 924] ------------------ 492| | 493| 1.36k| int fec0 = fec == 0; 494| 1.36k| next += fec0; 495| | 496| | // output triangle 497| 1.36k| writeTriangle(destination, i, index_size, a, b, c); 498| | 499| | // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 500| 1.36k| pushVertexFifo(vertexfifo, a, vertexfifooffset); 501| 1.36k| pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0); 502| 1.36k| pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0); 503| | 504| 1.36k| pushEdgeFifo(edgefifo, b, a, edgefifooffset); 505| 1.36k| pushEdgeFifo(edgefifo, c, b, edgefifooffset); 506| 1.36k| pushEdgeFifo(edgefifo, a, c, edgefifooffset); 507| 1.36k| } 508| 3.04k| else 509| 3.04k| { 510| | // slow path: read a full byte for codeaux instead of using a table lookup 511| 3.04k| unsigned char codeaux = *data++; 512| | 513| 3.04k| int fea = codetri == 0xfe ? 0 : 15; ------------------ | Branch (513:15): [True: 1.11k, False: 1.93k] ------------------ 514| 3.04k| int feb = codeaux >> 4; 515| 3.04k| int fec = codeaux & 15; 516| | 517| | // reset: codeaux is 0 but encoded as not-a-table 518| 3.04k| if (codeaux == 0) ------------------ | Branch (518:9): [True: 520, False: 2.52k] ------------------ 519| 520| next = 0; 520| | 521| | // fifo reads are wrapped around 16 entry buffer 522| | // also note that we increment next for all three vertices before decoding indices - this matches encoder behavior 523| 3.04k| unsigned int a = (fea == 0) ? next++ : 0; ------------------ | Branch (523:22): [True: 1.11k, False: 1.93k] ------------------ 524| 3.04k| unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15]; ------------------ | Branch (524:22): [True: 656, False: 2.38k] ------------------ 525| 3.04k| unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15]; ------------------ | Branch (525:22): [True: 684, False: 2.36k] ------------------ 526| | 527| | // note that we need to update the last index since free indices are delta-encoded 528| 3.04k| if (fea == 15) ------------------ | Branch (528:9): [True: 1.93k, False: 1.11k] ------------------ 529| 1.93k| last = a = decodeIndex(data, last); 530| | 531| 3.04k| if (feb == 15) ------------------ | Branch (531:9): [True: 970, False: 2.07k] ------------------ 532| 970| last = b = decodeIndex(data, last); 533| | 534| 3.04k| if (fec == 15) ------------------ | Branch (534:9): [True: 936, False: 2.10k] ------------------ 535| 936| last = c = decodeIndex(data, last); 536| | 537| | // output triangle 538| 3.04k| writeTriangle(destination, i, index_size, a, b, c); 539| | 540| | // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 541| 3.04k| pushVertexFifo(vertexfifo, a, vertexfifooffset); 542| 3.04k| pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15)); 543| 3.04k| pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15)); 544| | 545| 3.04k| pushEdgeFifo(edgefifo, b, a, edgefifooffset); 546| 3.04k| pushEdgeFifo(edgefifo, c, b, edgefifooffset); 547| 3.04k| pushEdgeFifo(edgefifo, a, c, edgefifooffset); 548| 3.04k| } 549| 4.40k| } 550| 11.1k| } 551| | 552| | // we should've read all data bytes and stopped at the boundary between data and codeaux table 553| 368| if (data != data_safe_end) ------------------ | Branch (553:6): [True: 348, False: 20] ------------------ 554| 348| return -3; 555| | 556| 20| return 0; 557| 368|} meshopt_decodeIndexSequence: 629| 4.51k|{ 630| 4.51k| using namespace meshopt; 631| | 632| | // the minimum valid encoding is header, 1 byte per index and a 4-byte tail 633| 4.51k| if (buffer_size < 1 + index_count + 4) ------------------ | Branch (633:6): [True: 2.39k, False: 2.11k] ------------------ 634| 2.39k| return -2; 635| | 636| 2.11k| if ((buffer[0] & 0xf0) != kSequenceHeader) ------------------ | Branch (636:6): [True: 1.79k, False: 318] ------------------ 637| 1.79k| return -1; 638| | 639| 318| int version = buffer[0] & 0x0f; 640| 318| if (version > kDecodeIndexVersion) ------------------ | Branch (640:6): [True: 20, False: 298] ------------------ 641| 20| return -1; 642| | 643| 298| const unsigned char* data = buffer + 1; 644| 298| const unsigned char* data_safe_end = buffer + buffer_size - 4; 645| | 646| 298| unsigned int last[2] = {}; 647| | 648| 18.3k| for (size_t i = 0; i < index_count; ++i) ------------------ | Branch (648:21): [True: 18.0k, False: 234] ------------------ 649| 18.0k| { 650| | // make sure we have enough data to read 651| | // each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end 652| | // after this we can be sure we can read without extra bounds checks 653| 18.0k| if (data >= data_safe_end) ------------------ | Branch (653:7): [True: 64, False: 18.0k] ------------------ 654| 64| return -2; 655| | 656| 18.0k| unsigned int v = decodeVByte(data); 657| | 658| | // decode the index of the last baseline 659| 18.0k| unsigned int current = v & 1; 660| 18.0k| v >>= 1; 661| | 662| | // reconstruct index as a delta 663| 18.0k| unsigned int d = (v >> 1) ^ -int(v & 1); 664| 18.0k| unsigned int index = last[current] + d; 665| | 666| | // update last for the next iteration that uses it 667| 18.0k| last[current] = index; 668| | 669| 18.0k| if (index_size == 2) ------------------ | Branch (669:7): [True: 9.01k, False: 9.01k] ------------------ 670| 9.01k| { 671| 9.01k| static_cast(destination)[i] = (unsigned short)(index); 672| 9.01k| } 673| 9.01k| else 674| 9.01k| { 675| 9.01k| static_cast(destination)[i] = index; 676| 9.01k| } 677| 18.0k| } 678| | 679| | // we should've read all data bytes and stopped at the boundary between data and tail 680| 234| if (data != data_safe_end) ------------------ | Branch (680:6): [True: 232, False: 2] ------------------ 681| 232| return -3; 682| | 683| 2| return 0; 684| 234|} indexcodec.cpp:_ZN7meshoptL14pushVertexFifoEPjjRmi: 75| 19.9k|{ 76| 19.9k| fifo[offset] = v; 77| 19.9k| offset = (offset + cond) & 15; 78| 19.9k|} indexcodec.cpp:_ZN7meshoptL12pushEdgeFifoEPA2_jjjRm: 55| 26.7k|{ 56| 26.7k| fifo[offset][0] = a; 57| 26.7k| fifo[offset][1] = b; 58| 26.7k| offset = (offset + 1) & 15; 59| 26.7k|} indexcodec.cpp:_ZN7meshoptL11decodeIndexERPKhj: 125| 4.67k|{ 126| 4.67k| unsigned int v = decodeVByte(data); 127| 4.67k| unsigned int d = (v >> 1) ^ -int(v & 1); 128| | 129| 4.67k| return last + d; 130| 4.67k|} indexcodec.cpp:_ZN7meshoptL13writeTriangleEPvmmjjj: 142| 11.1k|{ 143| 11.1k| if (index_size == 2) ------------------ | Branch (143:6): [True: 5.58k, False: 5.58k] ------------------ 144| 5.58k| { 145| 5.58k| static_cast(destination)[offset + 0] = (unsigned short)(a); 146| 5.58k| static_cast(destination)[offset + 1] = (unsigned short)(b); 147| 5.58k| static_cast(destination)[offset + 2] = (unsigned short)(c); 148| 5.58k| } 149| 5.58k| else 150| 5.58k| { 151| 5.58k| static_cast(destination)[offset + 0] = a; 152| 5.58k| static_cast(destination)[offset + 1] = b; 153| 5.58k| static_cast(destination)[offset + 2] = c; 154| 5.58k| } 155| 11.1k|} indexcodec.cpp:_ZN7meshoptL11decodeVByteERPKh: 91| 22.6k|{ 92| 22.6k| unsigned char lead = *data++; 93| | 94| | // fast path: single byte 95| 22.6k| if (lead < 128) ------------------ | Branch (95:6): [True: 13.5k, False: 9.19k] ------------------ 96| 13.5k| return lead; 97| | 98| | // slow path: up to 4 extra bytes 99| | // note that this loop always terminates, which is important for malformed data 100| 9.19k| unsigned int result = lead & 127; 101| 9.19k| unsigned int shift = 7; 102| | 103| 31.4k| for (int i = 0; i < 4; ++i) ------------------ | Branch (103:18): [True: 26.8k, False: 4.66k] ------------------ 104| 26.8k| { 105| 26.8k| unsigned char group = *data++; 106| 26.8k| result |= unsigned(group & 127) << shift; 107| 26.8k| shift += 7; 108| | 109| 26.8k| if (group < 128) ------------------ | Branch (109:7): [True: 4.53k, False: 22.2k] ------------------ 110| 4.53k| break; 111| 26.8k| } 112| | 113| 9.19k| return result; 114| 22.6k|} meshopt_encodeMeshletBound: 899| 4.51k|{ 900| 4.51k| size_t codes_size = (max_triangles + 1) / 2; 901| 4.51k| size_t extra_size = max_triangles * 3; 902| | 903| 4.51k| size_t ctrl_size = (max_vertices + 3) / 4; 904| 4.51k| size_t data_size = (max_vertices + 3) / 4 * 16; // worst case: 16 bytes per vertex group 905| | 906| 4.51k| size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0; ------------------ | Branch (906:20): [True: 2.82k, False: 1.68k] ------------------ 907| | 908| 4.51k| return codes_size + extra_size + ctrl_size + data_size + gap_size; 909| 4.51k|} meshopt_encodeMeshlet: 912| 4.51k|{ 913| 4.51k| using namespace meshopt; 914| | 915| 4.51k| assert(triangle_count <= 256 && vertex_count <= 256); ------------------ | Branch (915:2): [True: 4.51k, False: 0] | Branch (915:2): [True: 4.51k, False: 0] | Branch (915:2): [True: 4.51k, False: 0] ------------------ 916| | 917| | // 4 bits per triangle + up to three bytes of extra data 918| 4.51k| unsigned char codes[256 / 2]; 919| 4.51k| unsigned char extra[256 * 3]; 920| 4.51k| size_t codes_size = (triangle_count + 1) / 2; 921| 4.51k| size_t extra_size = encodeTriangles(codes, extra, triangles, triangle_count); 922| 4.51k| assert(extra_size <= sizeof(extra)); ------------------ | Branch (922:2): [True: 4.51k, False: 0] ------------------ 923| | 924| | // 2 bits per vertex + up to 4 bytes of actual data 925| 4.51k| unsigned char ctrl[256 / 4]; 926| 4.51k| unsigned char data[256 * 4]; 927| 4.51k| size_t ctrl_size = (vertex_count + 3) / 4; 928| 4.51k| size_t data_size = encodeVertices(ctrl, data, vertices, vertex_count); 929| 4.51k| assert(data_size <= sizeof(data)); ------------------ | Branch (929:2): [True: 4.51k, False: 0] ------------------ 930| | 931| | // we need to ensure that up to 16 bytes after extra+data are available for SIMD decoding 932| | // to minimize overhead, we place fixed-size codes+control at the end of the buffer 933| 4.51k| size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0; ------------------ | Branch (933:20): [True: 2.82k, False: 1.68k] ------------------ 934| | 935| 4.51k| size_t result = codes_size + extra_size + ctrl_size + data_size + gap_size; 936| | 937| 4.51k| if (result > buffer_size) ------------------ | Branch (937:6): [True: 0, False: 4.51k] ------------------ 938| 0| return 0; 939| | 940| | // variable-size data first 941| 4.51k| memcpy(buffer, data, data_size); 942| 4.51k| buffer += data_size; 943| 4.51k| memcpy(buffer, extra, extra_size); 944| 4.51k| buffer += extra_size; 945| | 946| | // gap (for accelerated decoding) separates variable-size and fixed-size data 947| 4.51k| memset(buffer, 0, gap_size); 948| 4.51k| buffer += gap_size; 949| | 950| | // fixed-size data last; it can be located from buffer end during decoding 951| 4.51k| memcpy(buffer, ctrl, ctrl_size); 952| 4.51k| buffer += ctrl_size; 953| 4.51k| memcpy(buffer, codes, codes_size); 954| 4.51k| buffer += codes_size; 955| | 956| |#if TRACE > 1 957| | printf("extra:"); 958| | for (size_t i = 0; i < extra_size; ++i) 959| | printf(" %d", extra[i]); 960| | printf("\n"); 961| | 962| | unsigned int minv = ~0u; 963| | for (size_t i = 0; i < vertex_count; ++i) 964| | minv = minv < vertices[i] ? minv : vertices[i]; 965| | 966| | printf("vertices: [%d+]", minv); 967| | for (size_t i = 0; i < vertex_count; ++i) 968| | printf(" %d", vertices[i] - minv); 969| | printf("\n"); 970| |#endif 971| | 972| |#if TRACE 973| | printf("stats: %d vertices, %d triangles => %d bytes (triangles: %d codes, %d extra; vertices: %d control, %d data; %d gap)\n", 974| | int(vertex_count), int(triangle_count), int(result), 975| | int(codes_size), int(extra_size), int(ctrl_size), int(data_size), int(gap_size)); 976| |#endif 977| | 978| 4.51k| return result; 979| 4.51k|} meshopt_decodeMeshlet: 982| 17.9k|{ 983| 17.9k| using namespace meshopt; 984| | 985| 17.9k| assert(triangle_count <= 256 && vertex_count <= 256); ------------------ | Branch (985:2): [True: 17.9k, False: 0] | Branch (985:2): [True: 17.9k, False: 0] | Branch (985:2): [True: 17.9k, False: 0] ------------------ 986| 17.9k| assert(vertex_size == 4 || vertex_size == 2); ------------------ | Branch (986:2): [True: 11.2k, False: 6.73k] | Branch (986:2): [True: 6.73k, False: 0] | Branch (986:2): [True: 17.9k, False: 0] ------------------ 987| 17.9k| assert(triangle_size == 4 || triangle_size == 3); ------------------ | Branch (987:2): [True: 6.73k, False: 11.2k] | Branch (987:2): [True: 11.2k, False: 0] | Branch (987:2): [True: 17.9k, False: 0] ------------------ 988| | 989| | // layout must match encoding 990| 17.9k| size_t codes_size = (triangle_count + 1) / 2; 991| 17.9k| size_t ctrl_size = (vertex_count + 3) / 4; 992| 17.9k| size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0; ------------------ | Branch (992:20): [True: 7.10k, False: 10.8k] ------------------ 993| | 994| 17.9k| if (buffer_size < codes_size + ctrl_size + gap_size) ------------------ | Branch (994:6): [True: 4.29k, False: 13.6k] ------------------ 995| 4.29k| return -2; 996| | 997| 13.6k| const unsigned char* end = buffer + buffer_size; 998| 13.6k| const unsigned char* codes = end - codes_size; 999| 13.6k| const unsigned char* ctrl = codes - ctrl_size; 1000| 13.6k| const unsigned char* data = buffer; 1001| | 1002| | // gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely 1003| 13.6k| const unsigned char* bound = ctrl - gap_size; 1004| 13.6k| assert(bound >= buffer && bound + 16 <= buffer + buffer_size); ------------------ | Branch (1004:2): [True: 13.6k, False: 0] | Branch (1004:2): [True: 13.6k, False: 0] | Branch (1004:2): [True: 13.6k, False: 0] ------------------ 1005| | 1006| 13.6k|#if defined(SIMD_FALLBACK) 1007| 13.6k| return (gDecodeTablesInitialized ? decodeMeshletSimd<0> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size); ------------------ | Branch (1007:10): [True: 13.6k, False: 0] ------------------ 1008| |#elif defined(SIMD_SSE) || defined(SIMD_NEON) 1009| | return decodeMeshletSimd<0>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size); 1010| |#else 1011| | return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size); 1012| |#endif 1013| 13.6k|} meshopt_decodeMeshletRaw: 1016| 2.24k|{ 1017| 2.24k| using namespace meshopt; 1018| | 1019| 2.24k| assert(triangle_count <= 256 && vertex_count <= 256); ------------------ | Branch (1019:2): [True: 2.24k, False: 0] | Branch (1019:2): [True: 2.24k, False: 0] | Branch (1019:2): [True: 2.24k, False: 0] ------------------ 1020| | 1021| | // layout must match encoding 1022| 2.24k| size_t codes_size = (triangle_count + 1) / 2; 1023| 2.24k| size_t ctrl_size = (vertex_count + 3) / 4; 1024| 2.24k| size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0; ------------------ | Branch (1024:20): [True: 363, False: 1.87k] ------------------ 1025| | 1026| 2.24k| if (buffer_size < codes_size + ctrl_size + gap_size) ------------------ | Branch (1026:6): [True: 1.07k, False: 1.16k] ------------------ 1027| 1.07k| return -2; 1028| | 1029| 1.16k| const unsigned char* end = buffer + buffer_size; 1030| 1.16k| const unsigned char* codes = end - codes_size; 1031| 1.16k| const unsigned char* ctrl = codes - ctrl_size; 1032| 1.16k| const unsigned char* data = buffer; 1033| | 1034| | // gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely 1035| 1.16k| const unsigned char* bound = ctrl - gap_size; 1036| 1.16k| assert(bound >= buffer && bound + 16 <= buffer + buffer_size); ------------------ | Branch (1036:2): [True: 1.16k, False: 0] | Branch (1036:2): [True: 1.16k, False: 0] | Branch (1036:2): [True: 1.16k, False: 0] ------------------ 1037| | 1038| 1.16k|#if defined(SIMD_FALLBACK) 1039| 1.16k| return (gDecodeTablesInitialized ? decodeMeshletSimd<1> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4); ------------------ | Branch (1039:10): [True: 1.16k, False: 0] ------------------ 1040| |#elif defined(SIMD_SSE) || defined(SIMD_NEON) 1041| | return decodeMeshletSimd<1>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4); 1042| |#else 1043| | return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4); 1044| |#endif 1045| 1.16k|} meshletcodec.cpp:_ZN7meshoptL17decodeBuildTablesEv: 398| 2|{ 399| 2|#define NEXT(var, ec) \ 400| 2| shuf[var] = (ec) ? (unsigned char)extra : 15; \ 401| 2| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ 402| 2| extra += (ec), nextoff += 1 - (ec) 403| | 404| | // check for SSE4.1 support if we have a fallback path 405| 2|#if defined(SIMD_SSE) && defined(SIMD_FALLBACK) 406| 2| int cpuinfo[4] = {}; 407| |#ifdef _MSC_VER 408| | __cpuid(cpuinfo, 1); 409| |#else 410| 2| __cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); 411| 2|#endif 412| | // bit 19 = SSE4.1 413| 2| if ((cpuinfo[2] & (1 << 19)) == 0) ------------------ | Branch (413:6): [True: 0, False: 2] ------------------ 414| 0| return false; 415| 2|#endif 416| | 417| | // fill triangle decoding tables for each combination of two triangle codes 418| 514| for (int code = 0; code < 256; ++code) ------------------ | Branch (418:21): [True: 512, False: 2] ------------------ 419| 512| { 420| 512| unsigned char shuf[16] = {}; 421| 512| unsigned char next[16] = {}; 422| 512| int extra = 0; 423| 512| int nextoff = 0; 424| | 425| | // state 0..5 will be refilled every iteration, so we ignore that 426| | // state 6..8 will always contain the last decoded triangle because every triangle shifts fifo equally, so we can decode it independently 427| 512| shuf[6] = 12; 428| 512| shuf[7] = 13; 429| 512| shuf[8] = 14; 430| | 431| | // state 15 will contain next (potentially incremented a few times) 432| 512| shuf[15] = 15; 433| | 434| | // state 9..11 will contain the first decoded triangle (tri0), which can refer to extra/next and the original triangle history 435| | // state 12..14 will contain the second decoded triangle (tri1); when decoding edge reuse, we need to handle edge 0/1 specially as it was just decoded earlier 436| 1.53k| for (int k = 0; k < 2; ++k) ------------------ | Branch (436:19): [True: 1.02k, False: 512] ------------------ 437| 1.02k| { 438| 1.02k| int tri = (code >> (k * 4)) & 0xf; 439| | 440| 1.02k| if (tri < 12) ------------------ | Branch (440:8): [True: 768, False: 256] ------------------ 441| 768| { 442| 768| if (k == 1 && tri / 4 == 0) ------------------ | Branch (442:9): [True: 384, False: 384] | Branch (442:19): [True: 128, False: 256] ------------------ 443| 128| { 444| | // we need to decode one of two edges from the triangle we just decoded earlier 445| | // for that we simply need to copy shuf/next values for the two decoded indices 446| 128| shuf[9 + k * 3] = shuf[9 + ((tri & 2) ? 2 : 0)]; ------------------ | Branch (446:34): [True: 64, False: 64] ------------------ 447| 128| next[9 + k * 3] = next[9 + ((tri & 2) ? 2 : 0)]; ------------------ | Branch (447:34): [True: 64, False: 64] ------------------ 448| | 449| 128| shuf[10 + k * 3] = shuf[9 + ((tri & 2) ? 1 : 2)]; ------------------ | Branch (449:35): [True: 64, False: 64] ------------------ 450| 128| next[10 + k * 3] = next[9 + ((tri & 2) ? 1 : 2)]; ------------------ | Branch (450:35): [True: 64, False: 64] ------------------ 451| 128| } 452| 640| else 453| 640| { 454| | // reuse: edge comes from the history based on edge index 455| | // note: we reuse with an offset because last triangle in the original history was consumed by tri0 456| 640| int trioff = 6 + k * 3 + (2 - tri / 4) * 3; 457| | 458| | // edge cb or ac 459| 640| shuf[9 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 2 : 0)); ------------------ | Branch (459:50): [True: 320, False: 320] ------------------ 460| 640| shuf[10 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 1 : 2)); ------------------ | Branch (460:51): [True: 320, False: 320] ------------------ 461| 640| } 462| | 463| | // third vertex is either next or comes from extra 464| 768| NEXT(11 + k * 3, tri & 1); ------------------ | | 400| 768| shuf[var] = (ec) ? (unsigned char)extra : 15; \ | | ------------------ | | | Branch (400:14): [True: 384, False: 384] | | ------------------ | | 401| 768| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ | | ------------------ | | | Branch (401:14): [True: 384, False: 384] | | ------------------ | | 402| 768| extra += (ec), nextoff += 1 - (ec) ------------------ 465| 768| } 466| 256| else 467| 256| { 468| | // restart: three vertices, each comes from next or extra 469| 256| int fea = tri > 12; 470| 256| int feb = tri > 13; 471| 256| int fec = tri > 14; 472| | 473| 256| NEXT(9 + k * 3, fea); ------------------ | | 400| 256| shuf[var] = (ec) ? (unsigned char)extra : 15; \ | | ------------------ | | | Branch (400:14): [True: 192, False: 64] | | ------------------ | | 401| 256| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ | | ------------------ | | | Branch (401:14): [True: 192, False: 64] | | ------------------ | | 402| 256| extra += (ec), nextoff += 1 - (ec) ------------------ 474| 256| NEXT(10 + k * 3, feb); ------------------ | | 400| 256| shuf[var] = (ec) ? (unsigned char)extra : 15; \ | | ------------------ | | | Branch (400:14): [True: 128, False: 128] | | ------------------ | | 401| 256| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ | | ------------------ | | | Branch (401:14): [True: 128, False: 128] | | ------------------ | | 402| 256| extra += (ec), nextoff += 1 - (ec) ------------------ 475| 256| NEXT(11 + k * 3, fec); ------------------ | | 400| 256| shuf[var] = (ec) ? (unsigned char)extra : 15; \ | | ------------------ | | | Branch (400:14): [True: 64, False: 192] | | ------------------ | | 401| 256| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ | | ------------------ | | | Branch (401:14): [True: 64, False: 192] | | ------------------ | | 402| 256| extra += (ec), nextoff += 1 - (ec) ------------------ 476| 256| } 477| 1.02k| } 478| | 479| | // next needs to advance 480| 512| next[15] = (unsigned char)nextoff; 481| | 482| | // next[0..8] = 0 trivially (never written to); next[9] must also be 0 because nextoff is 0 initially 483| | // shuf[0..5] is not used, which allows us to pack next[10..15] + shuf[6..15] into a single 16-byte entry 484| 512| assert(next[9] == 0); ------------------ | Branch (484:3): [True: 512, False: 0] ------------------ 485| 512| memcpy(&kDecodeTableMasks[code][0], &next[10], 6); 486| 512| memcpy(&kDecodeTableMasks[code][6], &shuf[6], 10); 487| 512| kDecodeTableExtra[code] = (unsigned char)extra; 488| 512| } 489| | 490| | // fill vertex decoding tables for each combination of four vertex references 491| 514| for (unsigned int i = 0; i < 256; ++i) ------------------ | Branch (491:27): [True: 512, False: 2] ------------------ 492| 512| { 493| 512| unsigned char shuf[16] = {}; 494| 512| int offset = 0; 495| | 496| 2.56k| for (int k = 0; k < 4; ++k) ------------------ | Branch (496:19): [True: 2.04k, False: 512] ------------------ 497| 2.04k| { 498| 2.04k| int code = ((i >> k) & 1) | ((i >> (k + 3)) & 2); 499| 2.04k| int length = i == 0xff ? 4 : code; // 0/1/2/3 bytes, or all 4 bytes if code==0xff ------------------ | Branch (499:17): [True: 8, False: 2.04k] ------------------ 500| | 501| 2.04k| shuf[k * 4 + 0] = (length > 0) ? (unsigned char)(offset + 0) : 0x80; ------------------ | Branch (501:22): [True: 1.53k, False: 512] ------------------ 502| 2.04k| shuf[k * 4 + 1] = (length > 1) ? (unsigned char)(offset + 1) : 0x80; ------------------ | Branch (502:22): [True: 1.02k, False: 1.02k] ------------------ 503| 2.04k| shuf[k * 4 + 2] = (length > 2) ? (unsigned char)(offset + 2) : 0x80; ------------------ | Branch (503:22): [True: 512, False: 1.53k] ------------------ 504| 2.04k| shuf[k * 4 + 3] = (length > 3) ? (unsigned char)(offset + 3) : 0x80; ------------------ | Branch (504:22): [True: 8, False: 2.04k] ------------------ 505| | 506| 2.04k| offset += length; 507| 2.04k| } 508| | 509| 512| memcpy(kDecodeTableVerts[i], shuf, sizeof(shuf)); 510| 512| kDecodeTableLength[i] = (unsigned char)offset; 511| 512| } 512| | 513| 2| return true; 514| | 515| 2|#undef NEXT 516| 2|} meshletcodec.cpp:_ZN7meshoptL15encodeTrianglesEPhS0_PKhm: 109| 4.51k|{ 110| 4.51k| EdgeFifo8 edgefifo; 111| 4.51k| memset(edgefifo, -1, sizeof(edgefifo)); 112| | 113| 4.51k| size_t edgefifooffset = 0; 114| | 115| 4.51k| unsigned int next = 0; 116| | 117| | // 4-bit triangle codes give us 16 options that we use as follows: 118| | // 3*2 edge reuse (2 edges * 3 last triangles) * 2 next/explicit = 12 options 119| | // 4 remaining options = next bits; 000, 001, 011, 111. 120| | // triangles are rotated to make next bits line up. 121| 4.51k| memset(codes, 0, (triangle_count + 1) / 2); 122| | 123| 4.51k| static const int rotations[] = {0, 1, 2, 0, 1}; 124| | 125| 4.51k| unsigned char* start = extra; 126| | 127| 188k| for (size_t i = 0; i < triangle_count; ++i) ------------------ | Branch (127:21): [True: 184k, False: 4.51k] ------------------ 128| 184k| { 129| |#if TRACE > 1 130| | unsigned int last = next; 131| |#endif 132| | 133| 184k| int fer = getEdgeFifo8(edgefifo, triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2], edgefifooffset); 134| | 135| 184k| if (fer >= 0 && (fer >> 2) < 6) ------------------ | Branch (135:7): [True: 95.4k, False: 88.6k] | Branch (135:19): [True: 92.0k, False: 3.41k] ------------------ 136| 92.0k| { 137| | // note: getEdgeFifo8 implicitly rotates triangles by matching a/b to existing edge 138| 92.0k| const int* order = rotations + (fer & 3); 139| | 140| 92.0k| unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]]; 141| | 142| 92.0k| int fec = (c == next) ? (next++, 0) : 1; ------------------ | Branch (142:14): [True: 830, False: 91.1k] ------------------ 143| | 144| |#if TRACE > 1 145| | printf("%3d+ | %3d %3d %3d | edge: e%d c%d\n", last, a, b, c, fer >> 2, fec); 146| |#endif 147| | 148| 92.0k| unsigned int code = (fer >> 2) * 2 + fec; 149| | 150| 92.0k| codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4)); 151| | 152| 92.0k| if (fec) ------------------ | Branch (152:8): [True: 91.1k, False: 830] ------------------ 153| 91.1k| *extra++ = (unsigned char)c; 154| | 155| 92.0k| pushEdgeFifo8(edgefifo, c, b, edgefifooffset); 156| 92.0k| pushEdgeFifo8(edgefifo, a, c, edgefifooffset); 157| 92.0k| } 158| 92.0k| else 159| 92.0k| { 160| | // rotate triangles to minimize the need for extra vertices 161| 92.0k| int rotation = rotateTriangle(triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2]); 162| 92.0k| const int* order = rotations + rotation; 163| | 164| 92.0k| unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]]; 165| | 166| | // fe must be continuous: once a vertex is encoded with next, further vertices must also be encoded with next 167| 92.0k| int fea = (a == next && b == next + 1 && c == next + 2) ? (next++, 0) : 1; ------------------ | Branch (167:15): [True: 8.73k, False: 83.2k] | Branch (167:28): [True: 3.80k, False: 4.93k] | Branch (167:45): [True: 3.32k, False: 478] ------------------ 168| 92.0k| int feb = (b == next && c == next + 1) ? (next++, 0) : 1; ------------------ | Branch (168:15): [True: 8.77k, False: 83.2k] | Branch (168:28): [True: 3.58k, False: 5.18k] ------------------ 169| 92.0k| int fec = (c == next) ? (next++, 0) : 1; ------------------ | Branch (169:14): [True: 4.76k, False: 87.2k] ------------------ 170| | 171| 92.0k| assert(fea == 1 || feb == 0); ------------------ | Branch (171:4): [True: 88.7k, False: 3.32k] | Branch (171:4): [True: 3.32k, False: 0] | Branch (171:4): [True: 92.0k, False: 0] ------------------ 172| 92.0k| assert(feb == 1 || fec == 0); ------------------ | Branch (172:4): [True: 88.4k, False: 3.58k] | Branch (172:4): [True: 3.58k, False: 0] | Branch (172:4): [True: 92.0k, False: 0] ------------------ 173| | 174| |#if TRACE > 1 175| | printf("%3d+ | %3d %3d %3d | restart: %d%d%d\n", last, a, b, c, fea, feb, fec); 176| |#endif 177| | 178| 92.0k| unsigned int code = 12 + (fea + feb + fec); 179| | 180| 92.0k| codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4)); 181| | 182| 92.0k| if (fea) ------------------ | Branch (182:8): [True: 88.7k, False: 3.32k] ------------------ 183| 88.7k| *extra++ = (unsigned char)a; 184| 92.0k| if (feb) ------------------ | Branch (184:8): [True: 88.4k, False: 3.58k] ------------------ 185| 88.4k| *extra++ = (unsigned char)b; 186| 92.0k| if (fec) ------------------ | Branch (186:8): [True: 87.2k, False: 4.76k] ------------------ 187| 87.2k| *extra++ = (unsigned char)c; 188| | 189| 92.0k| pushEdgeFifo8(edgefifo, c, b, edgefifooffset); 190| 92.0k| pushEdgeFifo8(edgefifo, a, c, edgefifooffset); 191| 92.0k| } 192| 184k| } 193| | 194| 4.51k| return extra - start; 195| 4.51k|} meshletcodec.cpp:_ZN7meshoptL12getEdgeFifo8EPA2_jjjjm: 82| 184k|{ 83| 957k| for (int i = 0; i < 8; ++i) ------------------ | Branch (83:18): [True: 869k, False: 88.6k] ------------------ 84| 869k| { 85| 869k| size_t index = (offset - 1 - i) & 7; 86| | 87| 869k| unsigned int e0 = fifo[index][0]; 88| 869k| unsigned int e1 = fifo[index][1]; 89| | 90| 869k| if (e0 == a && e1 == b) ------------------ | Branch (90:7): [True: 144k, False: 725k] | Branch (90:18): [True: 78.4k, False: 65.8k] ------------------ 91| 78.4k| return (i << 2) | 0; 92| 790k| if (e0 == b && e1 == c) ------------------ | Branch (92:7): [True: 55.5k, False: 735k] | Branch (92:18): [True: 9.38k, False: 46.1k] ------------------ 93| 9.38k| return (i << 2) | 1; 94| 781k| if (e0 == c && e1 == a) ------------------ | Branch (94:7): [True: 54.4k, False: 727k] | Branch (94:18): [True: 7.59k, False: 46.8k] ------------------ 95| 7.59k| return (i << 2) | 2; 96| 781k| } 97| | 98| 88.6k| return -1; 99| 184k|} meshletcodec.cpp:_ZN7meshoptL13pushEdgeFifo8EPA2_jjjRm: 102| 368k|{ 103| 368k| fifo[offset][0] = a; 104| 368k| fifo[offset][1] = b; 105| 368k| offset = (offset + 1) & 7; 106| 368k|} meshletcodec.cpp:_ZN7meshoptL14rotateTriangleEjjj: 77| 92.0k|{ 78| 92.0k| return (a > b && a > c) ? 1 : (b > c ? 2 : 0); ------------------ | Branch (78:10): [True: 38.3k, False: 53.6k] | Branch (78:19): [True: 26.6k, False: 11.6k] | Branch (78:33): [True: 25.2k, False: 40.1k] ------------------ 79| 92.0k|} meshletcodec.cpp:_ZN7meshoptL14encodeVerticesEPhS0_PKjm: 198| 4.51k|{ 199| | // grouped varint, 2 bit per value to indicate 0/1/2/3 byte deltas, with per-group 4-byte fallback 200| 4.51k| memset(ctrl, 0, (vertex_count + 3) / 4); 201| | 202| 4.51k| unsigned char* start = data; 203| | 204| 4.51k| unsigned int last = ~0u; 205| | 206| 47.2k| for (size_t i = 0; i < vertex_count; i += 4) ------------------ | Branch (206:21): [True: 42.7k, False: 4.51k] ------------------ 207| 42.7k| { 208| 42.7k| unsigned int gv[4] = {}; 209| | 210| 211k| for (int k = 0; k < 4 && i + k < vertex_count; ++k) ------------------ | Branch (210:19): [True: 169k, False: 41.5k] | Branch (210:28): [True: 168k, False: 1.19k] ------------------ 211| 168k| { 212| 168k| unsigned int d = vertices[i + k] - last - 1; 213| 168k| unsigned int v = (d << 1) ^ (int(d) >> 31); 214| | 215| 168k| gv[k] = v; 216| 168k| last = vertices[i + k]; 217| 168k| } 218| | 219| | // if any value needs 4 bytes, or if *all* values need 3 bytes, we use 4 bytes for all values 220| | // this allows us to encode most 3-byte deltas with 3 bytes which saves space overall 221| 42.7k| bool use4 = (gv[0] | gv[1] | gv[2] | gv[3]) > 0xffffff || (gv[0] > 0xffff && gv[1] > 0xffff && gv[2] > 0xffff && gv[3] > 0xffff); ------------------ | Branch (221:15): [True: 28.7k, False: 13.9k] | Branch (221:62): [True: 1.34k, False: 12.6k] | Branch (221:80): [True: 800, False: 542] | Branch (221:98): [True: 390, False: 410] | Branch (221:116): [True: 251, False: 139] ------------------ 222| | 223| 213k| for (int k = 0; k < 4; ++k) ------------------ | Branch (223:19): [True: 170k, False: 42.7k] ------------------ 224| 170k| { 225| 170k| unsigned int v = gv[k]; 226| | 227| | // 0/1/2/3 bytes per value, or all 4 values use 4 bytes 228| 170k| int code = use4 ? 3 : (v == 0 ? 0 : (v < 256 ? 1 : (v < 65536 ? 2 : 3))); ------------------ | Branch (228:15): [True: 116k, False: 54.7k] | Branch (228:27): [True: 1.58k, False: 53.2k] | Branch (228:41): [True: 47.6k, False: 5.52k] | Branch (228:56): [True: 2.44k, False: 3.07k] ------------------ 229| | 230| 170k| if (code > 0) ------------------ | Branch (230:8): [True: 169k, False: 1.58k] ------------------ 231| 169k| *data++ = (unsigned char)(v & 0xff); 232| 170k| if (code > 1) ------------------ | Branch (232:8): [True: 121k, False: 49.2k] ------------------ 233| 121k| *data++ = (unsigned char)((v >> 8) & 0xff); 234| 170k| if (code > 2) ------------------ | Branch (234:8): [True: 119k, False: 51.7k] ------------------ 235| 119k| *data++ = (unsigned char)((v >> 16) & 0xff); 236| 170k| if (use4) ------------------ | Branch (236:8): [True: 116k, False: 54.7k] ------------------ 237| 116k| *data++ = (unsigned char)((v >> 24) & 0xff); 238| | 239| | // split low and high bits into two nibbles for better packing 240| 170k| ctrl[i / 4] |= ((code & 1) << k) | ((code >> 1) << (k + 4)); 241| 170k| } 242| 42.7k| } 243| | 244| 4.51k| return data - start; 245| 4.51k|} meshletcodec.cpp:_ZN7meshoptL17decodeMeshletSimdILi0EEEiPvS1_PKhS3_S3_S3_mmmm: 865| 13.6k|{ 866| 13.6k| assert(gDecodeTablesInitialized); ------------------ | Branch (866:2): [True: 13.6k, False: 0] ------------------ 867| 13.6k| (void)gDecodeTablesInitialized; 868| | 869| 13.6k|#ifdef __clang__ 870| | // data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null 871| 13.6k| __builtin_assume(data); 872| 13.6k|#endif 873| | 874| | // decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4) 875| | // raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0 876| 13.6k| if (vertex_size == 4 || Raw) ------------------ | Branch (876:6): [True: 9.10k, False: 4.59k] | Branch (876:26): [Folded, False: 0] ------------------ 877| 9.10k| data = decodeVerticesSimd(static_cast(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count); ------------------ | Branch (877:86): [Folded, False: 9.10k] ------------------ 878| 4.59k| else 879| 4.59k| data = decodeVerticesSimd(static_cast(vertices), ctrl, data, bound, vertex_count); 880| 13.6k| if (!data) ------------------ | Branch (880:6): [True: 1.34k, False: 12.3k] ------------------ 881| 1.34k| return -2; 882| | 883| | // decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4) 884| | // raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0 885| 12.3k| if (triangle_size == 4 || Raw) ------------------ | Branch (885:6): [True: 3.91k, False: 8.42k] | Branch (885:28): [Folded, False: 0] ------------------ 886| 3.91k| data = decodeTrianglesSimd(static_cast(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count); ------------------ | Branch (886:89): [Folded, False: 3.91k] ------------------ 887| 8.42k| else 888| 8.42k| data = decodeTrianglesSimd(static_cast(triangles), codes, data, bound, triangle_count); 889| 12.3k| if (!data) ------------------ | Branch (889:6): [True: 278, False: 12.0k] ------------------ 890| 278| return -2; 891| | 892| 12.0k| return (data == bound) ? 0 : -3; ------------------ | Branch (892:9): [True: 9.06k, False: 2.99k] ------------------ 893| 12.3k|} meshletcodec.cpp:_ZN7meshoptL18decodeVerticesSimdEPjPKhS2_S2_m: 750| 10.2k|{ 751| 10.2k|#if defined(SIMD_SSE) 752| 10.2k| __m128i last = _mm_set1_epi32(-1); 753| |#elif defined(SIMD_NEON) 754| | uint32x4_t last = vdupq_n_u32(~0u); 755| |#endif 756| | 757| 10.2k| size_t groups = vertex_count / 4; 758| | 759| | // process all complete groups 760| 141k| for (size_t i = 0; i < groups; ++i) ------------------ | Branch (760:21): [True: 131k, False: 9.27k] ------------------ 761| 131k| { 762| 131k| unsigned char code = *ctrl++; 763| 131k| if (data > bound) ------------------ | Branch (763:7): [True: 997, False: 130k] ------------------ 764| 997| return NULL; 765| | 766| 130k| last = decodeVertexGroup(last, code, data); 767| | 768| 130k|#if defined(SIMD_SSE) 769| 130k| _mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last); 770| |#elif defined(SIMD_NEON) 771| | vst1q_u32(&vertices[i * 4], last); 772| |#endif 773| 130k| } 774| | 775| | // process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements 776| 9.27k| if (vertex_count & 3) ------------------ | Branch (776:6): [True: 2.66k, False: 6.61k] ------------------ 777| 2.66k| { 778| 2.66k| unsigned char code = *ctrl++; 779| | 780| 2.66k| if (data > bound) ------------------ | Branch (780:7): [True: 14, False: 2.64k] ------------------ 781| 14| return NULL; 782| | 783| 2.64k| last = decodeVertexGroup(last, code, data); 784| | 785| 2.64k| unsigned int* tail = &vertices[vertex_count & ~3u]; 786| | 787| 2.64k|#if defined(SIMD_SSE) 788| 2.64k| tail[0] = _mm_cvtsi128_si32(last); 789| 2.64k| if ((vertex_count & 3) > 1) ------------------ | Branch (789:7): [True: 1.19k, False: 1.45k] ------------------ 790| 1.19k| tail[1] = _mm_extract_epi32(last, 1); 791| 2.64k| if ((vertex_count & 3) > 2) ------------------ | Branch (791:7): [True: 418, False: 2.22k] ------------------ 792| 418| tail[2] = _mm_extract_epi32(last, 2); 793| |#elif defined(SIMD_NEON) 794| | vst1q_lane_u32(&tail[0], last, 0); 795| | if ((vertex_count & 3) > 1) 796| | vst1q_lane_u32(&tail[1], last, 1); 797| | if ((vertex_count & 3) > 2) 798| | vst1q_lane_u32(&tail[2], last, 2); 799| |#endif 800| 2.64k| } 801| | 802| 9.25k| return data; 803| 9.27k|} _ZN7meshopt17decodeVertexGroupEDv2_xhRPKh: 540| 236k|{ 541| 236k| __m128i word = _mm_loadu_si128(reinterpret_cast(data)); 542| 236k| __m128i shuf = _mm_loadu_si128(reinterpret_cast(kDecodeTableVerts[code])); 543| | 544| 236k| __m128i v = _mm_shuffle_epi8(word, shuf); 545| | 546| | // unzigzag+1 547| 236k| __m128i xl = _mm_sub_epi32(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi32(1))); 548| 236k| __m128i xr = _mm_srli_epi32(v, 1); 549| 236k| __m128i x = _mm_add_epi32(_mm_xor_si128(xl, xr), _mm_set1_epi32(1)); 550| | 551| | // prefix sum 552| 236k| x = _mm_add_epi32(x, _mm_slli_si128(x, 8)); 553| 236k| x = _mm_add_epi32(x, _mm_slli_si128(x, 4)); 554| 236k| x = _mm_add_epi32(x, _mm_shuffle_epi32(last, 0xff)); 555| | 556| 236k| data += kDecodeTableLength[code]; 557| | 558| 236k| return x; 559| 236k|} meshletcodec.cpp:_ZN7meshoptL18decodeVerticesSimdEPtPKhS2_S2_m: 807| 4.59k|{ 808| 4.59k|#if defined(SIMD_SSE) 809| 4.59k| __m128i repack = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); 810| 4.59k| __m128i last = _mm_set1_epi32(-1); 811| |#elif defined(SIMD_NEON) 812| | uint32x4_t last = vdupq_n_u32(~0u); 813| |#endif 814| | 815| | // because the output buffer is guaranteed to have 32-bit aligned size available, we can simplify tail processing 816| | // if the number of vertices mod 4 is 3, we'd normally need to write 8+6 bytes, but we can instead overwrite up to 2 bytes in the main loop 817| 4.59k| size_t groups = (vertex_count + 1) / 4; 818| | 819| | // process all complete groups 820| 105k| for (size_t i = 0; i < groups; ++i) ------------------ | Branch (820:21): [True: 101k, False: 3.93k] ------------------ 821| 101k| { 822| 101k| unsigned char code = *ctrl++; 823| | 824| 101k| if (data > bound) ------------------ | Branch (824:7): [True: 660, False: 100k] ------------------ 825| 660| return NULL; 826| | 827| 100k| last = decodeVertexGroup(last, code, data); 828| | 829| 100k|#if defined(SIMD_SSE) 830| 100k| __m128i r = _mm_shuffle_epi8(last, repack); 831| 100k| _mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r); 832| |#elif defined(SIMD_NEON) 833| | uint16x4_t r = vmovn_u32(last); 834| | vst1_u16(&vertices[i * 4], r); 835| |#endif 836| 100k| } 837| | 838| | // process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element 839| 3.93k| if (groups * 4 < vertex_count) ------------------ | Branch (839:6): [True: 2.24k, False: 1.68k] ------------------ 840| 2.24k| { 841| 2.24k| unsigned char code = *ctrl++; 842| | 843| 2.24k| if (data > bound) ------------------ | Branch (843:7): [True: 14, False: 2.22k] ------------------ 844| 14| return NULL; 845| | 846| 2.22k| last = decodeVertexGroup(last, code, data); 847| | 848| 2.22k| unsigned short* tail = &vertices[vertex_count & ~3u]; 849| | 850| 2.22k|#if defined(SIMD_SSE) 851| 2.22k| __m128i r = _mm_shufflelo_epi16(last, 8); 852| 2.22k| *reinterpret_cast(tail) = _mm_cvtsi128_si32(r); 853| |#elif defined(SIMD_NEON) 854| | uint16x4_t r = vmovn_u32(last); 855| | vst1_lane_u32(reinterpret_cast(tail), vreinterpret_u32_u16(r), 0); 856| |#endif 857| 2.22k| } 858| | 859| 3.91k| return data; 860| 3.93k|} meshletcodec.cpp:_ZN7meshoptL19decodeTrianglesSimdEPjPKhS2_S2_m: 615| 4.74k|{ 616| 4.74k|#if defined(SIMD_SSE) 617| 4.74k| __m128i repack = _mm_setr_epi8(9, 10, 11, -1, 12, 13, 14, -1, 0, 0, 0, 0, 0, 0, 0, 0); 618| 4.74k| __m128i state = _mm_setzero_si128(); 619| |#elif defined(SIMD_NEON) 620| | uint8x8_t repack = vcreate_u8(0xff0e0d0cff0b0a09ull); 621| | uint8x16_t state = vdupq_n_u8(0); 622| |#endif 623| | 624| 4.74k| size_t groups = triangle_count / 2; 625| | 626| | // process all complete groups 627| 202k| for (size_t i = 0; i < groups; ++i) ------------------ | Branch (627:21): [True: 198k, False: 4.57k] ------------------ 628| 198k| { 629| 198k| unsigned char code = *codes++; 630| | 631| 198k| if (extra > bound) ------------------ | Branch (631:7): [True: 173, False: 198k] ------------------ 632| 173| return NULL; 633| | 634| 198k| state = decodeTriangleGroup(state, code, extra); 635| | 636| | // write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding 637| 198k|#if defined(SIMD_SSE) 638| 198k| __m128i r = _mm_shuffle_epi8(state, repack); 639| 198k| _mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r); 640| |#elif defined(SIMD_NEON) 641| | uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack)); 642| | vst1_u32(&triangles[i * 2], r); 643| |#endif 644| 198k| } 645| | 646| | // process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element 647| 4.57k| if (triangle_count & 1) ------------------ | Branch (647:6): [True: 1.76k, False: 2.81k] ------------------ 648| 1.76k| { 649| 1.76k| unsigned char code = *codes++; 650| | 651| 1.76k| if (extra > bound) ------------------ | Branch (651:7): [True: 40, False: 1.72k] ------------------ 652| 40| return NULL; 653| | 654| 1.72k| state = decodeTriangleGroup(state, code, extra); 655| | 656| 1.72k| unsigned int* tail = &triangles[triangle_count & ~1u]; 657| | 658| 1.72k|#if defined(SIMD_SSE) 659| 1.72k| __m128i r = _mm_shuffle_epi8(state, repack); 660| 1.72k| *tail = unsigned(_mm_cvtsi128_si32(r)); 661| |#elif defined(SIMD_NEON) 662| | uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack)); 663| | vst1_lane_u32(tail, r, 0); 664| |#endif 665| 1.72k| } 666| | 667| 4.53k| return extra; 668| 4.57k|} _ZN7meshopt19decodeTriangleGroupEDv2_xhRPKh: 524| 368k|{ 525| 368k| __m128i shuf = _mm_loadu_si128(reinterpret_cast(kDecodeTableMasks[code])); 526| 368k| __m128i next = _mm_slli_si128(shuf, 10); 527| | 528| | // patch first 6 bytes with current extra and roll state forward 529| 368k| __m128i ext = _mm_loadl_epi64(reinterpret_cast(extra)); 530| 368k| state = _mm_blend_epi16(state, ext, 7); 531| 368k| state = _mm_add_epi8(_mm_shuffle_epi8(state, shuf), next); 532| | 533| 368k| extra += kDecodeTableExtra[code]; 534| | 535| 368k| return state; 536| 368k|} meshletcodec.cpp:_ZN7meshoptL19decodeTrianglesSimdEPhPKhS2_S2_m: 672| 8.42k|{ 673| 8.42k|#if defined(SIMD_SSE) 674| 8.42k| __m128i state = _mm_setzero_si128(); 675| |#elif defined(SIMD_NEON) 676| | uint8x16_t state = vdupq_n_u8(0); 677| |#endif 678| | 679| | // because the output buffer is guaranteed to have 32-bit aligned size available, we can optimize writes and tail processing 680| | // instead of processing triangles 2 at a time, we process 2 *pairs* at a time (12-byte write) followed by a tail pair, if present 681| | // if the number of triangles mod 4 is 3, we'd normally need to write 12k+9 bytes, but we can instead overwrite up to 3 bytes in the main loop 682| 8.42k| size_t groups = (triangle_count + 1) / 4; 683| | 684| | // process all complete groups 685| 89.2k| for (size_t i = 0; i < groups; ++i) ------------------ | Branch (685:21): [True: 80.9k, False: 8.33k] ------------------ 686| 80.9k| { 687| 80.9k| unsigned char code0 = *codes++; 688| 80.9k| unsigned char code1 = *codes++; 689| | 690| | // each triangle pair reads <=6 bytes from extra, so two pairs need <=12 bytes and gap guarantees 16 byte of overread 691| 80.9k| if (extra > bound) ------------------ | Branch (691:7): [True: 90, False: 80.8k] ------------------ 692| 90| return NULL; 693| | 694| 80.8k| state = decodeTriangleGroup(state, code0, extra); 695| | 696| | // write first decoded triangle and first index of second decoded triangle 697| 80.8k|#if defined(SIMD_SSE) 698| 80.8k| __m128i r0 = _mm_srli_si128(state, 9); 699| 80.8k| *reinterpret_cast(&triangles[i * 12]) = _mm_cvtsi128_si32(r0); 700| |#elif defined(SIMD_NEON) 701| | uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9); 702| | vst1q_lane_u32(reinterpret_cast(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0); 703| |#endif 704| | 705| 80.8k| state = decodeTriangleGroup(state, code1, extra); 706| | 707| | // write last two indices of second decoded triangle that we didn't write above plus two new ones 708| | // note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7 709| 80.8k|#if defined(SIMD_SSE) 710| 80.8k| __m128i r1 = _mm_srli_si128(state, 7); 711| 80.8k| _mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1); 712| |#elif defined(SIMD_NEON) 713| | uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7); 714| | vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1)); 715| |#endif 716| 80.8k| } 717| | 718| | // process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements 719| 8.33k| if (groups * 4 < triangle_count) ------------------ | Branch (719:6): [True: 6.61k, False: 1.71k] ------------------ 720| 6.61k| { 721| 6.61k| unsigned char code = *codes++; 722| | 723| 6.61k| if (extra > bound) ------------------ | Branch (723:7): [True: 46, False: 6.57k] ------------------ 724| 46| return NULL; 725| | 726| 6.57k| state = decodeTriangleGroup(state, code, extra); 727| | 728| 6.57k| unsigned char* tail = &triangles[(triangle_count & ~3u) * 3]; 729| | 730| 6.57k|#if defined(SIMD_SSE) 731| 6.57k| __m128i r = _mm_srli_si128(state, 9); 732| | 733| 6.57k| *reinterpret_cast(tail) = _mm_cvtsi128_si32(r); 734| 6.57k| if ((triangle_count & 3) > 1) ------------------ | Branch (734:7): [True: 787, False: 5.78k] ------------------ 735| 787| *reinterpret_cast(tail + 4) = _mm_extract_epi32(r, 1); 736| |#elif defined(SIMD_NEON) 737| | uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9); 738| | 739| | vst1q_lane_u32(reinterpret_cast(tail), vreinterpretq_u32_u8(r), 0); 740| | if ((triangle_count & 3) > 1) 741| | vst1q_lane_u32(reinterpret_cast(tail + 4), vreinterpretq_u32_u8(r), 1); 742| |#endif 743| 6.57k| } 744| | 745| 8.29k| return extra; 746| 8.33k|} meshletcodec.cpp:_ZN7meshoptL17decodeMeshletSimdILi1EEEiPvS1_PKhS3_S3_S3_mmmm: 865| 1.16k|{ 866| 1.16k| assert(gDecodeTablesInitialized); ------------------ | Branch (866:2): [True: 1.16k, False: 0] ------------------ 867| 1.16k| (void)gDecodeTablesInitialized; 868| | 869| 1.16k|#ifdef __clang__ 870| | // data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null 871| 1.16k| __builtin_assume(data); 872| 1.16k|#endif 873| | 874| | // decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4) 875| | // raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0 876| 1.16k| if (vertex_size == 4 || Raw) ------------------ | Branch (876:6): [True: 1.16k, False: 0] | Branch (876:26): [True: 0, Folded] ------------------ 877| 1.16k| data = decodeVerticesSimd(static_cast(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count); ------------------ | Branch (877:86): [True: 1.16k, Folded] ------------------ 878| 0| else 879| 0| data = decodeVerticesSimd(static_cast(vertices), ctrl, data, bound, vertex_count); 880| 1.16k| if (!data) ------------------ | Branch (880:6): [True: 337, False: 831] ------------------ 881| 337| return -2; 882| | 883| | // decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4) 884| | // raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0 885| 831| if (triangle_size == 4 || Raw) ------------------ | Branch (885:6): [True: 831, False: 0] | Branch (885:28): [True: 0, Folded] ------------------ 886| 831| data = decodeTrianglesSimd(static_cast(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count); ------------------ | Branch (886:89): [True: 831, Folded] ------------------ 887| 0| else 888| 0| data = decodeTrianglesSimd(static_cast(triangles), codes, data, bound, triangle_count); 889| 831| if (!data) ------------------ | Branch (889:6): [True: 71, False: 760] ------------------ 890| 71| return -2; 891| | 892| 760| return (data == bound) ? 0 : -3; ------------------ | Branch (892:9): [True: 12, False: 748] ------------------ 893| 831|} _Z21meshopt_decodeMeshletIjjEiPT_mPT0_mPKhm: 1406| 2.25k|{ 1407| 2.25k| char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1]; 1408| 2.25k| (void)types_valid; 1409| | 1410| 2.25k| return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size); ------------------ | Branch (1410:93): [Folded, False: 2.25k] ------------------ 1411| 2.25k|} _Z21meshopt_decodeMeshletIjhEiPT_mPT0_mPKhm: 1406| 4.51k|{ 1407| 4.51k| char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1]; 1408| 4.51k| (void)types_valid; 1409| | 1410| 4.51k| return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size); ------------------ | Branch (1410:93): [True: 4.51k, Folded] ------------------ 1411| 4.51k|} _Z21meshopt_decodeMeshletIthEiPT_mPT0_mPKhm: 1406| 2.25k|{ 1407| 2.25k| char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1]; 1408| 2.25k| (void)types_valid; 1409| | 1410| 2.25k| return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size); ------------------ | Branch (1410:93): [True: 2.25k, Folded] ------------------ 1411| 2.25k|} meshopt_encodeVertexBufferLevel: 1647| 18.0k|{ 1648| 18.0k| using namespace meshopt; 1649| | 1650| 18.0k| assert(vertex_size > 0 && vertex_size <= 256); ------------------ | Branch (1650:2): [True: 18.0k, False: 0] | Branch (1650:2): [True: 18.0k, False: 0] | Branch (1650:2): [True: 18.0k, False: 0] ------------------ 1651| 18.0k| assert(vertex_size % 4 == 0); ------------------ | Branch (1651:2): [True: 18.0k, False: 0] ------------------ 1652| 18.0k| assert(level >= 0 && level <= 9); // only a subset of this range is used right now ------------------ | Branch (1652:2): [True: 18.0k, False: 0] | Branch (1652:2): [True: 18.0k, False: 0] | Branch (1652:2): [True: 18.0k, False: 0] ------------------ 1653| 18.0k| assert(version < 0 || unsigned(version) <= kDecodeVertexVersion); ------------------ | Branch (1653:2): [True: 18.0k, False: 0] | Branch (1653:2): [True: 0, False: 0] | Branch (1653:2): [True: 18.0k, False: 0] ------------------ 1654| | 1655| 18.0k| version = version < 0 ? gEncodeVertexVersion : version; ------------------ | Branch (1655:12): [True: 18.0k, False: 0] ------------------ 1656| | 1657| |#if TRACE 1658| | memset(vertexstats, 0, sizeof(vertexstats)); 1659| |#endif 1660| | 1661| 18.0k| const unsigned char* vertex_data = static_cast(vertices); 1662| | 1663| 18.0k| unsigned char* data = buffer; 1664| 18.0k| unsigned char* data_end = buffer + buffer_size; 1665| | 1666| 18.0k| if (size_t(data_end - data) < 1) ------------------ | Branch (1666:6): [True: 0, False: 18.0k] ------------------ 1667| 0| return 0; 1668| | 1669| 18.0k| *data++ = (unsigned char)(kVertexHeader | version); 1670| | 1671| 18.0k| unsigned char first_vertex[256] = {}; 1672| 18.0k| if (vertex_count > 0) ------------------ | Branch (1672:6): [True: 14.2k, False: 3.74k] ------------------ 1673| 14.2k| memcpy(first_vertex, vertex_data, vertex_size); 1674| | 1675| 18.0k| unsigned char last_vertex[256] = {}; 1676| 18.0k| memcpy(last_vertex, first_vertex, vertex_size); 1677| | 1678| 18.0k| size_t vertex_block_size = getVertexBlockSize(vertex_size); 1679| | 1680| 18.0k| unsigned char channels[64] = {}; 1681| 18.0k| if (version != 0 && level > 1 && vertex_count > 1) ------------------ | Branch (1681:6): [True: 14.0k, False: 3.94k] | Branch (1681:22): [True: 6.16k, False: 7.92k] | Branch (1681:35): [True: 4.08k, False: 2.08k] ------------------ 1682| 21.2k| for (size_t k = 0; k < vertex_size; k += 4) ------------------ | Branch (1682:22): [True: 17.1k, False: 4.08k] ------------------ 1683| 17.1k| { 1684| 17.1k| int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0; ------------------ | Branch (1684:14): [True: 14.9k, False: 2.24k] ------------------ 1685| 17.1k| int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot); ------------------ | Branch (1685:137): [True: 14.9k, False: 2.24k] ------------------ 1686| | 1687| 17.1k| assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8)); ------------------ | Branch (1687:4): [True: 2.35k, False: 0] | Branch (1687:4): [True: 2.35k, False: 0] | Branch (1687:4): [True: 14.8k, False: 2.35k] | Branch (1687:4): [True: 17.1k, False: 0] ------------------ 1688| 17.1k| channels[k / 4] = (unsigned char)channel; 1689| 17.1k| } 1690| | 1691| 18.0k| size_t vertex_offset = 0; 1692| | 1693| 329k| while (vertex_offset < vertex_count) ------------------ | Branch (1693:9): [True: 311k, False: 18.0k] ------------------ 1694| 311k| { 1695| 311k| size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; ------------------ | Branch (1695:23): [True: 297k, False: 14.2k] ------------------ 1696| | 1697| 311k| data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level); 1698| 311k| if (!data) ------------------ | Branch (1698:7): [True: 0, False: 311k] ------------------ 1699| 0| return 0; 1700| | 1701| 311k| vertex_offset += block_size; 1702| 311k| } 1703| | 1704| 18.0k| size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4); ------------------ | Branch (1704:36): [True: 3.94k, False: 14.0k] ------------------ 1705| 18.0k| size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1; ------------------ | Branch (1705:25): [True: 3.94k, False: 14.0k] ------------------ 1706| 18.0k| size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size; ------------------ | Branch (1706:25): [True: 10.0k, False: 8.03k] ------------------ 1707| | 1708| 18.0k| if (size_t(data_end - data) < tail_size_pad) ------------------ | Branch (1708:6): [True: 0, False: 18.0k] ------------------ 1709| 0| return 0; 1710| | 1711| 18.0k| if (tail_size < tail_size_pad) ------------------ | Branch (1711:6): [True: 10.0k, False: 8.03k] ------------------ 1712| 10.0k| { 1713| 10.0k| memset(data, 0, tail_size_pad - tail_size); 1714| 10.0k| data += tail_size_pad - tail_size; 1715| 10.0k| } 1716| | 1717| 18.0k| memcpy(data, first_vertex, vertex_size); 1718| 18.0k| data += vertex_size; 1719| | 1720| 18.0k| if (version != 0) ------------------ | Branch (1720:6): [True: 14.0k, False: 3.94k] ------------------ 1721| 14.0k| { 1722| 14.0k| memcpy(data, channels, vertex_size / 4); 1723| 14.0k| data += vertex_size / 4; 1724| 14.0k| } 1725| | 1726| 18.0k| assert(data >= buffer + tail_size); ------------------ | Branch (1726:2): [True: 18.0k, False: 0] ------------------ 1727| 18.0k| assert(data <= buffer + buffer_size); ------------------ | Branch (1727:2): [True: 18.0k, False: 0] ------------------ 1728| | 1729| |#if TRACE 1730| | size_t total_size = data - buffer; 1731| | 1732| | for (size_t k = 0; k < vertex_size; ++k) 1733| | { 1734| | const Stats& vsk = vertexstats[k]; 1735| | 1736| | printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8); 1737| | 1738| | size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8]; 1739| | double total_kr = total_k ? 1.0 / double(total_k) : 0; 1740| | 1741| | if (version != 0) 1742| | { 1743| | int channel = channels[k / 4]; 1744| | 1745| | if ((channel & 3) == 2 && k % 4 == 0) 1746| | printf(" | ^%d", channel >> 4); 1747| | else 1748| | printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : ".")); 1749| | } 1750| | 1751| | printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]", 1752| | double(vsk.header) * total_kr * 100, 1753| | double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100, 1754| | double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100); 1755| | 1756| | size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3]; 1757| | 1758| | if (total_ctrl) 1759| | { 1760| | printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%", 1761| | double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100, 1762| | double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100); 1763| | } 1764| | 1765| | if (level >= 3) 1766| | printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]", 1767| | double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100, 1768| | double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100, 1769| | double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100, 1770| | double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100); 1771| | 1772| | printf("\n"); 1773| | } 1774| |#endif 1775| | 1776| 18.0k| return data - buffer; 1777| 18.0k|} meshopt_encodeVertexBufferBound: 1785| 9.02k|{ 1786| 9.02k| using namespace meshopt; 1787| | 1788| 9.02k| assert(vertex_size > 0 && vertex_size <= 256); ------------------ | Branch (1788:2): [True: 9.02k, False: 0] | Branch (1788:2): [True: 9.02k, False: 0] | Branch (1788:2): [True: 9.02k, False: 0] ------------------ 1789| 9.02k| assert(vertex_size % 4 == 0); ------------------ | Branch (1789:2): [True: 9.02k, False: 0] ------------------ 1790| | 1791| 9.02k| size_t vertex_block_size = getVertexBlockSize(vertex_size); 1792| 9.02k| size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size; 1793| | 1794| 9.02k| size_t vertex_block_control_size = vertex_size / 4; 1795| 9.02k| size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4; 1796| 9.02k| size_t vertex_block_data_size = vertex_block_size; 1797| | 1798| 9.02k| size_t tail_size = vertex_size + (vertex_size / 4); 1799| 9.02k| size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1; ------------------ | Branch (1799:25): [True: 9.02k, Folded] ------------------ 1800| 9.02k| size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size; ------------------ | Branch (1800:25): [True: 6.76k, False: 2.25k] ------------------ 1801| 9.02k| assert(tail_size_pad >= kByteGroupDecodeLimit); ------------------ | Branch (1801:2): [True: 9.02k, False: 0] ------------------ 1802| | 1803| 9.02k| return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad; 1804| 9.02k|} meshopt_encodeVertexVersion: 1807| 2.25k|{ 1808| 2.25k| assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion)); ------------------ | Branch (1808:2): [True: 2.25k, False: 0] ------------------ 1809| | 1810| 2.25k| meshopt::gEncodeVertexVersion = version; 1811| 2.25k|} meshopt_decodeVertexBuffer: 1831| 18.0k|{ 1832| 18.0k| using namespace meshopt; 1833| | 1834| 18.0k| assert(vertex_size > 0 && vertex_size <= 256); ------------------ | Branch (1834:2): [True: 18.0k, False: 0] | Branch (1834:2): [True: 18.0k, False: 0] | Branch (1834:2): [True: 18.0k, False: 0] ------------------ 1835| 18.0k| assert(vertex_size % 4 == 0); ------------------ | Branch (1835:2): [True: 18.0k, False: 0] ------------------ 1836| | 1837| 18.0k| const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL; 1838| | 1839| 18.0k|#if defined(SIMD_SSE) && defined(SIMD_FALLBACK) 1840| 18.0k| decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock; ------------------ | Branch (1840:11): [True: 18.0k, False: 0] ------------------ 1841| |#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) 1842| | decode = decodeVertexBlockSimd; 1843| |#else 1844| | decode = decodeVertexBlock; 1845| |#endif 1846| | 1847| 18.0k|#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) 1848| 18.0k| assert(gDecodeBytesGroupInitialized); ------------------ | Branch (1848:2): [True: 18.0k, False: 0] ------------------ 1849| 18.0k| (void)gDecodeBytesGroupInitialized; 1850| 18.0k|#endif 1851| | 1852| 18.0k| unsigned char* vertex_data = static_cast(destination); 1853| | 1854| 18.0k| const unsigned char* data = buffer; 1855| 18.0k| const unsigned char* data_end = buffer + buffer_size; 1856| | 1857| 18.0k| if (size_t(data_end - data) < 1) ------------------ | Branch (1857:6): [True: 0, False: 18.0k] ------------------ 1858| 0| return -2; 1859| | 1860| 18.0k| unsigned char data_header = *data++; 1861| | 1862| 18.0k| if ((data_header & 0xf0) != kVertexHeader) ------------------ | Branch (1862:6): [True: 6.99k, False: 11.0k] ------------------ 1863| 6.99k| return -1; 1864| | 1865| 11.0k| int version = data_header & 0x0f; 1866| 11.0k| if (version > kDecodeVertexVersion) ------------------ | Branch (1866:6): [True: 172, False: 10.8k] ------------------ 1867| 172| return -1; 1868| | 1869| 10.8k| size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4); ------------------ | Branch (1869:36): [True: 2.54k, False: 8.33k] ------------------ 1870| 10.8k| size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1; ------------------ | Branch (1870:25): [True: 2.54k, False: 8.33k] ------------------ 1871| 10.8k| size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size; ------------------ | Branch (1871:25): [True: 6.07k, False: 4.80k] ------------------ 1872| | 1873| 10.8k| if (size_t(data_end - data) < tail_size_pad) ------------------ | Branch (1873:6): [True: 219, False: 10.6k] ------------------ 1874| 219| return -2; 1875| | 1876| 10.6k| const unsigned char* tail = data_end - tail_size; 1877| | 1878| 10.6k| unsigned char last_vertex[256]; 1879| 10.6k| memcpy(last_vertex, tail, vertex_size); 1880| | 1881| 10.6k| const unsigned char* channels = version == 0 ? NULL : tail + vertex_size; ------------------ | Branch (1881:34): [True: 2.49k, False: 8.15k] ------------------ 1882| | 1883| 10.6k| size_t vertex_block_size = getVertexBlockSize(vertex_size); 1884| | 1885| 10.6k| size_t vertex_offset = 0; 1886| | 1887| 166k| while (vertex_offset < vertex_count) ------------------ | Branch (1887:9): [True: 157k, False: 9.63k] ------------------ 1888| 157k| { 1889| 157k| size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; ------------------ | Branch (1889:23): [True: 148k, False: 8.78k] ------------------ 1890| | 1891| 157k| data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version); 1892| 157k| if (!data) ------------------ | Branch (1892:7): [True: 1.01k, False: 156k] ------------------ 1893| 1.01k| return -2; 1894| | 1895| 156k| vertex_offset += block_size; 1896| 156k| } 1897| | 1898| 9.63k| if (size_t(data_end - data) != tail_size_pad) ------------------ | Branch (1898:6): [True: 616, False: 9.02k] ------------------ 1899| 616| return -3; 1900| | 1901| 9.02k| return 0; 1902| 9.63k|} vertexcodec.cpp:_ZN7meshoptL27decodeBytesGroupBuildTablesEv: 783| 2|{ 784| 514| for (int mask = 0; mask < 256; ++mask) ------------------ | Branch (784:21): [True: 512, False: 2] ------------------ 785| 512| { 786| 512| unsigned char shuffle[8]; 787| 512| unsigned char count = 0; 788| | 789| 4.60k| for (int i = 0; i < 8; ++i) ------------------ | Branch (789:19): [True: 4.09k, False: 512] ------------------ 790| 4.09k| { 791| 4.09k| int maski = (mask >> i) & 1; 792| 4.09k| shuffle[i] = maski ? count : 0x80; ------------------ | Branch (792:17): [True: 2.04k, False: 2.04k] ------------------ 793| 4.09k| count += (unsigned char)(maski); 794| 4.09k| } 795| | 796| 512| memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8); 797| 512| kDecodeBytesGroupCount[mask] = count; 798| 512| } 799| | 800| 2| return true; 801| 2|} vertexcodec.cpp:_ZN7meshoptL14getCpuFeaturesEv: 1631| 2|{ 1632| 2| int cpuinfo[4] = {}; 1633| |#ifdef _MSC_VER 1634| | __cpuid(cpuinfo, 1); 1635| |#else 1636| | __cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); 1637| 2|#endif 1638| 2| return cpuinfo[2]; 1639| 2|} vertexcodec.cpp:_ZN7meshoptL18getVertexBlockSizeEm: 141| 37.7k|{ 142| | // make sure the entire block fits into the scratch buffer and is aligned to byte group size 143| | // note: the block size is implicitly part of the format, so we can't change it without breaking compatibility 144| 37.7k| size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1); 145| | 146| 37.7k| return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize; ------------------ | Branch (146:9): [True: 0, False: 37.7k] ------------------ 147| 37.7k|} vertexcodec.cpp:_ZN7meshoptL14estimateRotateEPKhmmmm: 370| 14.9k|{ 371| 14.9k| size_t sizes[8] = {}; 372| | 373| 14.9k| const unsigned char* vertex = vertex_data + k; 374| 14.9k| unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); 375| | 376| 3.89M| for (size_t i = 0; i < vertex_count; i += group_size) ------------------ | Branch (376:21): [True: 3.88M, False: 14.9k] ------------------ 377| 3.88M| { 378| 3.88M| unsigned int bitg = 0; 379| | 380| | // calculate bit consistency mask for the group 381| 65.9M| for (size_t j = 0; j < group_size && i + j < vertex_count; ++j) ------------------ | Branch (381:22): [True: 62.0M, False: 3.87M] | Branch (381:40): [True: 62.0M, False: 12.7k] ------------------ 382| 62.0M| { 383| 62.0M| unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); 384| 62.0M| unsigned int d = v ^ last; 385| | 386| 62.0M| bitg |= d; 387| 62.0M| last = v; 388| 62.0M| vertex += vertex_size; 389| 62.0M| } 390| | 391| |#if TRACE 392| | for (int j = 0; j < 32; ++j) 393| | vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1)); 394| |#endif 395| | 396| 34.9M| for (int j = 0; j < 8; ++j) ------------------ | Branch (396:19): [True: 31.0M, False: 3.88M] ------------------ 397| 31.0M| { 398| 31.0M| unsigned int bitr = rotate(bitg, j); 399| | 400| 31.0M| sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8)); 401| 31.0M| sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24)); 402| 31.0M| } 403| 3.88M| } 404| | 405| 14.9k| int best_rot = 0; 406| 119k| for (int rot = 1; rot < 8; ++rot) ------------------ | Branch (406:20): [True: 104k, False: 14.9k] ------------------ 407| 104k| best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot; ------------------ | Branch (407:14): [True: 6.76k, False: 97.6k] ------------------ 408| | 409| 14.9k| return best_rot; 410| 14.9k|} _ZN7meshopt6rotateEji: 150| 165M|{ 151| 165M| return (v << r) | (v >> ((32 - r) & 31)); 152| 165M|} vertexcodec.cpp:_ZN7meshoptL12estimateBitsEh: 365| 124M|{ 366| 124M| return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8; ------------------ | Branch (366:9): [True: 43.9M, False: 80.2M] | Branch (366:20): [True: 42.1M, False: 1.83M] | Branch (366:30): [True: 40.5M, False: 1.59M] ------------------ 367| 124M|} vertexcodec.cpp:_ZN7meshoptL15estimateChannelEPKhmmmmmii: 413| 17.1k|{ 414| 17.1k| unsigned char block[kVertexBlockMaxSize]; 415| 17.1k| assert(vertex_block_size <= kVertexBlockMaxSize); ------------------ | Branch (415:2): [True: 17.1k, False: 0] ------------------ 416| | 417| 17.1k| unsigned char last_vertex[256] = {}; 418| | 419| 17.1k| size_t sizes[3] = {}; 420| 17.1k| assert(max_channel <= 3); ------------------ | Branch (420:2): [True: 17.1k, False: 0] ------------------ 421| | 422| 148k| for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip) ------------------ | Branch (422:21): [True: 130k, False: 17.1k] ------------------ 423| 130k| { 424| 130k| size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i; ------------------ | Branch (424:23): [True: 118k, False: 12.7k] ------------------ 425| 130k| size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1); 426| | 427| 130k| memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size); ------------------ | Branch (427:38): [True: 17.1k, False: 113k] ------------------ 428| | 429| | // we sometimes encode elements we didn't fill when rounding to kByteGroupSize 430| 130k| if (block_size < block_size_aligned) ------------------ | Branch (430:7): [True: 11.1k, False: 119k] ------------------ 431| 11.1k| memset(block + block_size, 0, block_size_aligned - block_size); 432| | 433| 485k| for (int channel = 0; channel < max_channel; ++channel) ------------------ | Branch (433:25): [True: 354k, False: 130k] ------------------ 434| 1.77M| for (size_t j = 0; j < 4; ++j) ------------------ | Branch (434:23): [True: 1.41M, False: 354k] ------------------ 435| 1.41M| { 436| 1.41M| encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4)); 437| | 438| 22.3M| for (size_t ig = 0; ig < block_size; ig += kByteGroupSize) ------------------ | Branch (438:25): [True: 20.9M, False: 1.41M] ------------------ 439| 20.9M| { 440| | // to maximize encoding performance we only evaluate 1/2/4/8 bit groups 441| 20.9M| size_t size1 = encodeBytesGroupMeasure(block + ig, 1); 442| 20.9M| size_t size2 = encodeBytesGroupMeasure(block + ig, 2); 443| 20.9M| size_t size4 = encodeBytesGroupMeasure(block + ig, 4); 444| 20.9M| size_t size8 = encodeBytesGroupMeasure(block + ig, 8); 445| | 446| 20.9M| size_t best_size = size1 < size2 ? size1 : size2; ------------------ | Branch (446:25): [True: 19.5M, False: 1.36M] ------------------ 447| 20.9M| best_size = best_size < size4 ? best_size : size4; ------------------ | Branch (447:18): [True: 20.7M, False: 149k] ------------------ 448| 20.9M| best_size = best_size < size8 ? best_size : size8; ------------------ | Branch (448:18): [True: 12.3M, False: 8.50M] ------------------ 449| | 450| 20.9M| sizes[channel] += best_size; 451| 20.9M| } 452| 1.41M| } 453| 130k| } 454| | 455| 17.1k| int best_channel = 0; 456| 49.2k| for (int channel = 1; channel < max_channel; ++channel) ------------------ | Branch (456:24): [True: 32.0k, False: 17.1k] ------------------ 457| 32.0k| best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel; ------------------ | Branch (457:18): [True: 4.98k, False: 27.0k] ------------------ 458| | 459| 17.1k| return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel; ------------------ | Branch (459:9): [True: 2.35k, False: 14.8k] ------------------ 460| 17.1k|} vertexcodec.cpp:_ZN7meshoptL12encodeDeltasEPhPKhmmS2_mi: 350| 4.73M|{ 351| 4.73M| switch (channel & 3) 352| 4.73M| { 353| 3.52M| case 0: ------------------ | Branch (353:2): [True: 3.52M, False: 1.21M] ------------------ 354| 3.52M| return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); 355| 646k| case 1: ------------------ | Branch (355:2): [True: 646k, False: 4.08M] ------------------ 356| 646k| return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); 357| 565k| case 2: ------------------ | Branch (357:2): [True: 565k, False: 4.16M] ------------------ 358| 565k| return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4); 359| 0| default: ------------------ | Branch (359:2): [True: 0, False: 4.73M] ------------------ 360| | assert(!"Unsupported channel encoding"); // unreachable ------------------ | Branch (360:3): [Folded, False: 0] ------------------ 361| 4.73M| } 362| 4.73M|} vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1IhLb0EEEvPhPKhmmS3_mi: 325| 3.52M|{ 326| 3.52M| size_t k0 = k & ~(sizeof(T) - 1); 327| 3.52M| int ks = (k & (sizeof(T) - 1)) * 8; 328| | 329| 3.52M| T p = last_vertex[k0]; 330| 3.52M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (330:21): [True: 0, False: 3.52M] ------------------ 331| 0| p |= T(last_vertex[k0 + j]) << (j * 8); 332| | 333| 3.52M| const unsigned char* vertex = vertex_data + k0; 334| | 335| 845M| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (335:21): [True: 841M, False: 3.52M] ------------------ 336| 841M| { 337| 841M| T v = vertex[0]; 338| 841M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (338:22): [True: 0, False: 841M] ------------------ 339| 0| v |= vertex[j] << (j * 8); 340| | 341| 841M| T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); ------------------ | Branch (341:9): [Folded, False: 841M] ------------------ 342| | 343| 841M| buffer[i] = (unsigned char)(d >> ks); 344| 841M| p = v; 345| 841M| vertex += vertex_size; 346| 841M| } 347| 3.52M|} _ZN7meshopt6zigzagIhEET_S1_: 156| 841M|{ 157| 841M| return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1); 158| 841M|} vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1ItLb0EEEvPhPKhmmS3_mi: 325| 646k|{ 326| 646k| size_t k0 = k & ~(sizeof(T) - 1); 327| 646k| int ks = (k & (sizeof(T) - 1)) * 8; 328| | 329| 646k| T p = last_vertex[k0]; 330| 1.29M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (330:21): [True: 646k, False: 646k] ------------------ 331| 646k| p |= T(last_vertex[k0 + j]) << (j * 8); 332| | 333| 646k| const unsigned char* vertex = vertex_data + k0; 334| | 335| 153M| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (335:21): [True: 153M, False: 646k] ------------------ 336| 153M| { 337| 153M| T v = vertex[0]; 338| 306M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (338:22): [True: 153M, False: 153M] ------------------ 339| 153M| v |= vertex[j] << (j * 8); 340| | 341| 153M| T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); ------------------ | Branch (341:9): [Folded, False: 153M] ------------------ 342| | 343| 153M| buffer[i] = (unsigned char)(d >> ks); 344| 153M| p = v; 345| 153M| vertex += vertex_size; 346| 153M| } 347| 646k|} _ZN7meshopt6zigzagItEET_S1_: 156| 153M|{ 157| 153M| return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1); 158| 153M|} vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1IjLb1EEEvPhPKhmmS3_mi: 325| 565k|{ 326| 565k| size_t k0 = k & ~(sizeof(T) - 1); 327| 565k| int ks = (k & (sizeof(T) - 1)) * 8; 328| | 329| 565k| T p = last_vertex[k0]; 330| 2.26M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (330:21): [True: 1.69M, False: 565k] ------------------ 331| 1.69M| p |= T(last_vertex[k0 + j]) << (j * 8); 332| | 333| 565k| const unsigned char* vertex = vertex_data + k0; 334| | 335| 134M| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (335:21): [True: 134M, False: 565k] ------------------ 336| 134M| { 337| 134M| T v = vertex[0]; 338| 537M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (338:22): [True: 403M, False: 134M] ------------------ 339| 403M| v |= vertex[j] << (j * 8); 340| | 341| 134M| T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); ------------------ | Branch (341:9): [True: 134M, Folded] ------------------ 342| | 343| 134M| buffer[i] = (unsigned char)(d >> ks); 344| 134M| p = v; 345| 134M| vertex += vertex_size; 346| 134M| } 347| 565k|} vertexcodec.cpp:_ZN7meshoptL23encodeBytesGroupMeasureEPKhi: 191| 257M|{ 192| 257M| assert(bits >= 0 && bits <= 8); ------------------ | Branch (192:2): [True: 257M, False: 0] | Branch (192:2): [True: 257M, False: 0] | Branch (192:2): [True: 257M, False: 0] ------------------ 193| | 194| 257M| if (bits == 0) ------------------ | Branch (194:6): [True: 25.0M, False: 232M] ------------------ 195| 25.0M| return encodeBytesGroupZero(buffer) ? 0 : size_t(-1); ------------------ | Branch (195:10): [True: 7.07M, False: 17.9M] ------------------ 196| | 197| 232M| if (bits == 8) ------------------ | Branch (197:6): [True: 56.3M, False: 176M] ------------------ 198| 56.3M| return kByteGroupSize; 199| | 200| 176M| size_t result = kByteGroupSize * bits / 8; 201| | 202| 176M| unsigned char sentinel = (1 << bits) - 1; 203| | 204| 2.99G| for (size_t i = 0; i < kByteGroupSize; ++i) ------------------ | Branch (204:21): [True: 2.81G, False: 176M] ------------------ 205| 2.81G| result += buffer[i] >= sentinel; 206| | 207| 176M| return result; 208| 232M|} vertexcodec.cpp:_ZN7meshoptL20encodeBytesGroupZeroEPKh: 181| 46.2M|{ 182| 46.2M| assert(kByteGroupSize == sizeof(unsigned long long) * 2); ------------------ | Branch (182:2): [True: 46.2M, Folded] ------------------ 183| | 184| 46.2M| unsigned long long v[2]; 185| 46.2M| memcpy(v, buffer, sizeof(v)); 186| | 187| 46.2M| return (v[0] | v[1]) == 0; 188| 46.2M|} vertexcodec.cpp:_ZN7meshoptL17encodeVertexBlockEPhS0_PKhmmS0_S2_ii: 510| 311k|{ 511| 311k| assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); ------------------ | Branch (511:2): [True: 311k, False: 0] | Branch (511:2): [True: 311k, False: 0] | Branch (511:2): [True: 311k, False: 0] ------------------ 512| 311k| assert(vertex_size % 4 == 0); ------------------ | Branch (512:2): [True: 311k, False: 0] ------------------ 513| | 514| 311k| unsigned char buffer[kVertexBlockMaxSize]; 515| 311k| assert(sizeof(buffer) % kByteGroupSize == 0); ------------------ | Branch (515:2): [True: 311k, Folded] ------------------ 516| | 517| 311k| size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); 518| | 519| | // we sometimes encode elements we didn't fill when rounding to kByteGroupSize 520| 311k| memset(buffer, 0, sizeof(buffer)); 521| | 522| 311k| size_t control_size = version == 0 ? 0 : vertex_size / 4; ------------------ | Branch (522:24): [True: 27.9k, False: 283k] ------------------ 523| 311k| if (size_t(data_end - data) < control_size) ------------------ | Branch (523:6): [True: 0, False: 311k] ------------------ 524| 0| return NULL; 525| | 526| 311k| unsigned char* control = data; 527| 311k| data += control_size; 528| | 529| 311k| memset(control, 0, control_size); 530| | 531| 3.62M| for (size_t k = 0; k < vertex_size; ++k) ------------------ | Branch (531:21): [True: 3.31M, False: 311k] ------------------ 532| 3.31M| { 533| 3.31M| encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]); ------------------ | Branch (533:80): [True: 311k, False: 3.00M] ------------------ 534| | 535| |#if TRACE 536| | const unsigned char* olddata = data; 537| | bytestats = &vertexstats[k]; 538| |#endif 539| | 540| 3.31M| int ctrl = 0; 541| | 542| 3.31M| if (version != 0) ------------------ | Branch (542:7): [True: 3.00M, False: 311k] ------------------ 543| 3.00M| { 544| 3.00M| ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level); 545| | 546| 3.00M| assert(unsigned(ctrl) < 4); ------------------ | Branch (546:4): [True: 3.00M, False: 0] ------------------ 547| 3.00M| control[k / 4] |= ctrl << ((k % 4) * 2); 548| | 549| |#if TRACE 550| | vertexstats[k].ctrl[ctrl]++; 551| |#endif 552| 3.00M| } 553| | 554| 3.31M| if (ctrl == 3) ------------------ | Branch (554:7): [True: 648k, False: 2.66M] ------------------ 555| 648k| { 556| | // literal encoding 557| 648k| if (size_t(data_end - data) < vertex_count) ------------------ | Branch (557:8): [True: 0, False: 648k] ------------------ 558| 0| return NULL; 559| | 560| 648k| memcpy(data, buffer, vertex_count); 561| 648k| data += vertex_count; 562| 648k| } 563| 2.66M| else if (ctrl != 2) // non-zero encoding ------------------ | Branch (563:12): [True: 1.47M, False: 1.19M] ------------------ 564| 1.47M| { 565| 1.47M| data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); ------------------ | Branch (565:69): [True: 311k, False: 1.16M] ------------------ 566| 1.47M| if (!data) ------------------ | Branch (566:8): [True: 0, False: 1.47M] ------------------ 567| 0| return NULL; 568| 1.47M| } 569| | 570| |#if TRACE 571| | bytestats = NULL; 572| | vertexstats[k].size += data - olddata; 573| |#endif 574| 3.31M| } 575| | 576| 311k| memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size); 577| | 578| 311k| return data; 579| 311k|} vertexcodec.cpp:_ZN7meshoptL15estimateControlEPKhmmi: 472| 3.00M|{ 473| 3.00M| if (estimateControlZero(buffer, vertex_count_aligned)) ------------------ | Branch (473:6): [True: 1.19M, False: 1.81M] ------------------ 474| 1.19M| return 2; // zero encoding 475| | 476| 1.81M| if (level == 0) ------------------ | Branch (476:6): [True: 677k, False: 1.13M] ------------------ 477| 677k| return 1; // 1248 encoding in level 0 for encoding speed 478| | 479| | // round number of groups to 4 to get number of header bytes 480| 1.13M| size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4; 481| | 482| 1.13M| size_t est_bytes0 = header_size, est_bytes1 = header_size; 483| | 484| 18.2M| for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) ------------------ | Branch (484:21): [True: 17.1M, False: 1.13M] ------------------ 485| 17.1M| { 486| | // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance 487| 17.1M| size_t size0 = encodeBytesGroupMeasure(buffer + i, 0); 488| 17.1M| size_t size1 = encodeBytesGroupMeasure(buffer + i, 1); 489| 17.1M| size_t size2 = encodeBytesGroupMeasure(buffer + i, 2); 490| 17.1M| size_t size4 = encodeBytesGroupMeasure(buffer + i, 4); 491| 17.1M| size_t size8 = encodeBytesGroupMeasure(buffer + i, 8); 492| | 493| | // both control modes have access to 1/2/4 bit encoding 494| 17.1M| size_t size12 = size1 < size2 ? size1 : size2; ------------------ | Branch (494:19): [True: 14.9M, False: 2.15M] ------------------ 495| 17.1M| size_t size124 = size12 < size4 ? size12 : size4; ------------------ | Branch (495:20): [True: 16.9M, False: 184k] ------------------ 496| | 497| | // each control mode has access to 0/8 bit encoding respectively 498| 17.1M| est_bytes0 += size124 < size0 ? size124 : size0; ------------------ | Branch (498:17): [True: 14.3M, False: 2.80M] ------------------ 499| 17.1M| est_bytes1 += size124 < size8 ? size124 : size8; ------------------ | Branch (499:17): [True: 6.42M, False: 10.7M] ------------------ 500| 17.1M| } 501| | 502| | // pick shortest control entry but prefer literal encoding 503| 1.13M| if (est_bytes0 < vertex_count || est_bytes1 < vertex_count) ------------------ | Branch (503:6): [True: 445k, False: 691k] | Branch (503:35): [True: 43.1k, False: 648k] ------------------ 504| 488k| return est_bytes0 < est_bytes1 ? 0 : 1; ------------------ | Branch (504:10): [True: 235k, False: 252k] ------------------ 505| 648k| else 506| 648k| return 3; // literal encoding 507| 1.13M|} vertexcodec.cpp:_ZN7meshoptL19estimateControlZeroEPKhm: 463| 3.00M|{ 464| 22.4M| for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) ------------------ | Branch (464:21): [True: 21.2M, False: 1.19M] ------------------ 465| 21.2M| if (!encodeBytesGroupZero(buffer + i)) ------------------ | Branch (465:7): [True: 1.81M, False: 19.3M] ------------------ 466| 1.81M| return false; 467| | 468| 1.19M| return true; 469| 3.00M|} vertexcodec.cpp:_ZN7meshoptL11encodeBytesEPhS0_PKhmPKi: 264| 1.47M|{ 265| 1.47M| assert(buffer_size % kByteGroupSize == 0); ------------------ | Branch (265:2): [True: 1.47M, False: 0] ------------------ 266| | 267| 1.47M| unsigned char* header = data; 268| | 269| | // round number of groups to 4 to get number of header bytes 270| 1.47M| size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; 271| | 272| 1.47M| if (size_t(data_end - data) < header_size) ------------------ | Branch (272:6): [True: 0, False: 1.47M] ------------------ 273| 0| return NULL; 274| | 275| 1.47M| data += header_size; 276| | 277| 1.47M| memset(header, 0, header_size); 278| | 279| 1.47M| int last_bits = -1; 280| | 281| 23.5M| for (size_t i = 0; i < buffer_size; i += kByteGroupSize) ------------------ | Branch (281:21): [True: 22.0M, False: 1.47M] ------------------ 282| 22.0M| { 283| 22.0M| if (size_t(data_end - data) < kByteGroupDecodeLimit) ------------------ | Branch (283:7): [True: 0, False: 22.0M] ------------------ 284| 0| return NULL; 285| | 286| 22.0M| int best_bitk = 3; 287| 22.0M| size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]); 288| | 289| 88.1M| for (int bitk = 0; bitk < 3; ++bitk) ------------------ | Branch (289:22): [True: 66.1M, False: 22.0M] ------------------ 290| 66.1M| { 291| 66.1M| size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]); 292| | 293| | // favor consistent bit selection across groups, but never replace literals 294| 66.1M| if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8)) ------------------ | Branch (294:8): [True: 11.1M, False: 55.0M] | Branch (294:29): [True: 1.34M, False: 53.6M] | Branch (294:50): [True: 270k, False: 1.06M] | Branch (294:77): [True: 56.4k, False: 213k] ------------------ 295| 11.1M| { 296| 11.1M| best_bitk = bitk; 297| 11.1M| best_size = size; 298| 11.1M| } 299| 66.1M| } 300| | 301| 22.0M| size_t header_offset = i / kByteGroupSize; 302| 22.0M| header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2); 303| | 304| 22.0M| int best_bits = bits[best_bitk]; 305| 22.0M| unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits); 306| | 307| 22.0M| assert(data + best_size == next); ------------------ | Branch (307:3): [True: 22.0M, False: 0] ------------------ 308| 22.0M| data = next; 309| 22.0M| last_bits = best_bits; 310| | 311| |#if TRACE 312| | bytestats->bitg[best_bits] += best_size; 313| |#endif 314| 22.0M| } 315| | 316| |#if TRACE 317| | bytestats->header += header_size; 318| |#endif 319| | 320| 1.47M| return data; 321| 1.47M|} vertexcodec.cpp:_ZN7meshoptL16encodeBytesGroupEPhPKhi: 211| 22.0M|{ 212| 22.0M| assert(bits >= 0 && bits <= 8); ------------------ | Branch (212:2): [True: 22.0M, False: 0] | Branch (212:2): [True: 22.0M, False: 0] | Branch (212:2): [True: 22.0M, False: 0] ------------------ 213| 22.0M| assert(kByteGroupSize % 8 == 0); ------------------ | Branch (213:2): [True: 22.0M, Folded] ------------------ 214| | 215| 22.0M| if (bits == 0) ------------------ | Branch (215:6): [True: 4.27M, False: 17.7M] ------------------ 216| 4.27M| return data; 217| | 218| 17.7M| if (bits == 8) ------------------ | Branch (218:6): [True: 12.3M, False: 5.40M] ------------------ 219| 12.3M| { 220| 12.3M| memcpy(data, buffer, kByteGroupSize); 221| 12.3M| return data + kByteGroupSize; 222| 12.3M| } 223| | 224| 5.40M| size_t byte_size = 8 / bits; 225| 5.40M| assert(kByteGroupSize % byte_size == 0); ------------------ | Branch (225:2): [True: 5.40M, False: 0] ------------------ 226| | 227| | // fixed portion: bits bits for each value 228| | // variable portion: full byte for each out-of-range value (using 1...1 as sentinel) 229| 5.40M| unsigned char sentinel = (1 << bits) - 1; 230| | 231| 20.7M| for (size_t i = 0; i < kByteGroupSize; i += byte_size) ------------------ | Branch (231:21): [True: 15.3M, False: 5.40M] ------------------ 232| 15.3M| { 233| 15.3M| unsigned char byte = 0; 234| | 235| 101M| for (size_t k = 0; k < byte_size; ++k) ------------------ | Branch (235:22): [True: 86.4M, False: 15.3M] ------------------ 236| 86.4M| { 237| 86.4M| unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k]; ------------------ | Branch (237:24): [True: 28.3M, False: 58.1M] ------------------ 238| | 239| 86.4M| byte <<= bits; 240| 86.4M| byte |= enc; 241| 86.4M| } 242| | 243| | // encode 1-bit groups in reverse bit order 244| | // this makes them faster to decode alongside other groups 245| 15.3M| if (bits == 1) ------------------ | Branch (245:7): [True: 6.68M, False: 8.68M] ------------------ 246| 6.68M| byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); 247| | 248| 15.3M| *data++ = byte; 249| 15.3M| } 250| | 251| 91.8M| for (size_t i = 0; i < kByteGroupSize; ++i) ------------------ | Branch (251:21): [True: 86.4M, False: 5.40M] ------------------ 252| 86.4M| { 253| 86.4M| unsigned char v = buffer[i]; 254| | 255| | // branchless append of out-of-range values 256| 86.4M| *data = v; 257| 86.4M| data += v >= sentinel; 258| 86.4M| } 259| | 260| 5.40M| return data; 261| 5.40M|} vertexcodec.cpp:_ZN7meshoptL21decodeVertexBlockSimdEPKhS1_PhmmS2_S1_i: 1555| 157k|{ 1556| 157k| assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); ------------------ | Branch (1556:2): [True: 157k, False: 0] | Branch (1556:2): [True: 157k, False: 0] | Branch (1556:2): [True: 157k, False: 0] ------------------ 1557| | 1558| 157k| unsigned char buffer[kVertexBlockMaxSize * 4]; 1559| 157k| unsigned char transposed[kVertexBlockSizeBytes]; 1560| | 1561| 157k| size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); 1562| | 1563| 157k| size_t control_size = version == 0 ? 0 : vertex_size / 4; ------------------ | Branch (1563:24): [True: 14.4k, False: 142k] ------------------ 1564| 157k| if (size_t(data_end - data) < control_size) ------------------ | Branch (1564:6): [True: 0, False: 157k] ------------------ 1565| 0| return NULL; 1566| | 1567| 157k| const unsigned char* control = data; 1568| 157k| data += control_size; 1569| | 1570| 575k| for (size_t k = 0; k < vertex_size; k += 4) ------------------ | Branch (1570:21): [True: 418k, False: 156k] ------------------ 1571| 418k| { 1572| 418k| unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4]; ------------------ | Branch (1572:29): [True: 40.4k, False: 378k] ------------------ 1573| | 1574| 2.09M| for (size_t j = 0; j < 4; ++j) ------------------ | Branch (1574:22): [True: 1.67M, False: 418k] ------------------ 1575| 1.67M| { 1576| 1.67M| int ctrl = (ctrl_byte >> (j * 2)) & 3; 1577| | 1578| 1.67M| if (ctrl == 3) ------------------ | Branch (1578:8): [True: 325k, False: 1.34M] ------------------ 1579| 325k| { 1580| | // literal encoding; safe to over-copy due to tail 1581| 325k| if (size_t(data_end - data) < vertex_count_aligned) ------------------ | Branch (1581:9): [True: 104, False: 325k] ------------------ 1582| 104| return NULL; 1583| | 1584| 325k| memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned); 1585| 325k| data += vertex_count; 1586| 325k| } 1587| 1.34M| else if (ctrl == 2) ------------------ | Branch (1587:13): [True: 598k, False: 750k] ------------------ 1588| 598k| { 1589| | // zero encoding 1590| 598k| memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned); 1591| 598k| } 1592| 750k| else 1593| 750k| { 1594| | // for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8 1595| 750k| int hshift = version == 0 ? 0 : 4 + ctrl; ------------------ | Branch (1595:18): [True: 161k, False: 589k] ------------------ 1596| | 1597| 750k| data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift); 1598| 750k| if (!data) ------------------ | Branch (1598:9): [True: 728, False: 750k] ------------------ 1599| 728| return NULL; 1600| 750k| } 1601| 1.67M| } 1602| | 1603| 418k| int channel = version == 0 ? 0 : channels[k / 4]; ------------------ | Branch (1603:17): [True: 40.1k, False: 377k] ------------------ 1604| | 1605| 418k| switch (channel & 3) 1606| 418k| { 1607| 377k| case 0: ------------------ | Branch (1607:3): [True: 377k, False: 40.8k] ------------------ 1608| 377k| decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0); 1609| 377k| break; 1610| 15.8k| case 1: ------------------ | Branch (1610:3): [True: 15.8k, False: 402k] ------------------ 1611| 15.8k| decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0); 1612| 15.8k| break; 1613| 24.8k| case 2: ------------------ | Branch (1613:3): [True: 24.8k, False: 393k] ------------------ 1614| 24.8k| decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31); 1615| 24.8k| break; 1616| 182| default: ------------------ | Branch (1616:3): [True: 182, False: 417k] ------------------ 1617| 182| return NULL; // invalid channel type 1618| 418k| } 1619| 418k| } 1620| | 1621| 156k| memcpy(vertex_data, transposed, vertex_count * vertex_size); 1622| | 1623| 156k| memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size); 1624| | 1625| 156k| return data; 1626| 157k|} vertexcodec.cpp:_ZN7meshoptL15decodeBytesSimdEPKhS1_Phmi: 1426| 750k|{ 1427| 750k| assert(buffer_size % kByteGroupSize == 0); ------------------ | Branch (1427:2): [True: 750k, False: 0] ------------------ 1428| 750k| assert(kByteGroupSize == 16); ------------------ | Branch (1428:2): [True: 750k, Folded] ------------------ 1429| | 1430| | // round number of groups to 4 to get number of header bytes 1431| 750k| size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; 1432| 750k| if (size_t(data_end - data) < header_size) ------------------ | Branch (1432:6): [True: 20, False: 750k] ------------------ 1433| 20| return NULL; 1434| | 1435| 750k| const unsigned char* header = data; 1436| 750k| data += header_size; 1437| | 1438| 750k| size_t i = 0; 1439| | 1440| | // fast-path: process 4 groups at a time, do a shared bounds check 1441| 3.48M| for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4) ------------------ | Branch (1441:9): [True: 2.74M, False: 744k] | Branch (1441:50): [True: 2.73M, False: 5.99k] ------------------ 1442| 2.73M| { 1443| 2.73M| size_t header_offset = i / kByteGroupSize; 1444| 2.73M| unsigned char header_byte = header[header_offset / 4]; 1445| | 1446| 2.73M| data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3)); 1447| 2.73M| data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3)); 1448| 2.73M| data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3)); 1449| 2.73M| data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3)); 1450| 2.73M| } 1451| | 1452| | // slow-path: process remaining groups 1453| 883k| for (; i < buffer_size; i += kByteGroupSize) ------------------ | Branch (1453:9): [True: 133k, False: 750k] ------------------ 1454| 133k| { 1455| 133k| if (size_t(data_end - data) < kByteGroupDecodeLimit) ------------------ | Branch (1455:7): [True: 708, False: 133k] ------------------ 1456| 708| return NULL; 1457| | 1458| 133k| size_t header_offset = i / kByteGroupSize; 1459| 133k| unsigned char header_byte = header[header_offset / 4]; 1460| | 1461| 133k| data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3)); 1462| 133k| } 1463| | 1464| 750k| return data; 1465| 750k|} _ZN7meshopt20decodeBytesGroupSimdEPKhPhi: 827| 11.0M|{ 828| 11.0M| switch (hbits) 829| 11.0M| { 830| 782k| case 0: ------------------ | Branch (830:2): [True: 782k, False: 10.2M] ------------------ 831| 2.17M| case 4: ------------------ | Branch (831:2): [True: 1.38M, False: 9.68M] ------------------ 832| 2.17M| { 833| 2.17M| __m128i result = _mm_setzero_si128(); 834| | 835| 2.17M| _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); 836| | 837| 2.17M| return data; 838| 782k| } 839| | 840| 129k| case 1: ------------------ | Branch (840:2): [True: 129k, False: 10.9M] ------------------ 841| 982k| case 6: ------------------ | Branch (841:2): [True: 852k, False: 10.2M] ------------------ 842| 982k| { 843| 982k|#ifdef SIMD_LATENCYOPT 844| 982k| unsigned int data32; 845| 982k| memcpy(&data32, data, 4); 846| 982k| data32 &= data32 >> 1; 847| | 848| | // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32 849| 982k| unsigned long long data64 = ((unsigned long long)data32 << 30) | data32; 850| | 851| | // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3 852| 982k| int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60); 853| 982k|#endif 854| | 855| 982k| __m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast(data)); 856| 982k| __m128i rest = _mm_loadu_si128(reinterpret_cast(data + 4)); 857| | 858| 982k| __m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2); 859| 982k| __m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22); 860| 982k| __m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3)); 861| | 862| 982k| __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3)); 863| 982k| int mask16 = _mm_movemask_epi8(mask); 864| 982k| unsigned char mask0 = (unsigned char)(mask16 & 255); 865| 982k| unsigned char mask1 = (unsigned char)(mask16 >> 8); 866| | 867| 982k| __m128i shuf = decodeShuffleMask(mask0, mask1); 868| 982k| __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); 869| | 870| 982k| _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); 871| | 872| 982k|#ifdef SIMD_LATENCYOPT 873| 982k| return data + 4 + datacnt; 874| |#else 875| | return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; 876| |#endif 877| 129k| } 878| | 879| 7.40k| case 2: ------------------ | Branch (879:2): [True: 7.40k, False: 11.0M] ------------------ 880| 59.4k| case 7: ------------------ | Branch (880:2): [True: 52.0k, False: 11.0M] ------------------ 881| 59.4k| { 882| 59.4k|#ifdef SIMD_LATENCYOPT 883| 59.4k| unsigned long long data64; 884| 59.4k| memcpy(&data64, data, 8); 885| 59.4k| data64 &= data64 >> 1; 886| 59.4k| data64 &= data64 >> 2; 887| | 888| | // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3 889| 59.4k| int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60); 890| 59.4k|#endif 891| | 892| 59.4k| __m128i sel4 = _mm_loadl_epi64(reinterpret_cast(data)); 893| 59.4k| __m128i rest = _mm_loadu_si128(reinterpret_cast(data + 8)); 894| | 895| 59.4k| __m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4); 896| 59.4k| __m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15)); 897| | 898| 59.4k| __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15)); 899| 59.4k| int mask16 = _mm_movemask_epi8(mask); 900| 59.4k| unsigned char mask0 = (unsigned char)(mask16 & 255); 901| 59.4k| unsigned char mask1 = (unsigned char)(mask16 >> 8); 902| | 903| 59.4k| __m128i shuf = decodeShuffleMask(mask0, mask1); 904| 59.4k| __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); 905| | 906| 59.4k| _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); 907| | 908| 59.4k|#ifdef SIMD_LATENCYOPT 909| 59.4k| return data + 8 + datacnt; 910| |#else 911| | return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; 912| |#endif 913| 7.40k| } 914| | 915| 1.21M| case 3: ------------------ | Branch (915:2): [True: 1.21M, False: 9.86M] ------------------ 916| 6.18M| case 8: ------------------ | Branch (916:2): [True: 4.97M, False: 6.10M] ------------------ 917| 6.18M| { 918| 6.18M| __m128i result = _mm_loadu_si128(reinterpret_cast(data)); 919| | 920| 6.18M| _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); 921| | 922| 6.18M| return data + 16; 923| 1.21M| } 924| | 925| 1.67M| case 5: ------------------ | Branch (925:2): [True: 1.67M, False: 9.40M] ------------------ 926| 1.67M| { 927| 1.67M| __m128i rest = _mm_loadu_si128(reinterpret_cast(data + 2)); 928| | 929| 1.67M| unsigned char mask0 = data[0]; 930| 1.67M| unsigned char mask1 = data[1]; 931| | 932| 1.67M| __m128i shuf = decodeShuffleMask(mask0, mask1); 933| 1.67M| __m128i result = _mm_shuffle_epi8(rest, shuf); 934| | 935| 1.67M| _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); 936| | 937| 1.67M| return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; 938| 1.21M| } 939| | 940| 0| default: ------------------ | Branch (940:2): [True: 0, False: 11.0M] ------------------ 941| 0| SIMD_UNREACHABLE(); // unreachable ------------------ | | 65| 0|#define SIMD_UNREACHABLE() __builtin_unreachable() ------------------ 942| 11.0M| } 943| 11.0M|} _ZN7meshopt17decodeShuffleMaskEhh: 809| 2.71M|{ 810| 2.71M| __m128i sm0 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask0])); 811| 2.71M| __m128i sm1 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask1])); 812| 2.71M| __m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]); 813| | 814| 2.71M| __m128i sm1r = _mm_add_epi8(sm1, sm1off); 815| | 816| 2.71M| return _mm_unpacklo_epi64(sm0, sm1r); 817| 2.71M|} vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi0EEEvPKhPhmmS3_i: 1470| 377k|{ 1471| 377k|#if defined(SIMD_SSE) || defined(SIMD_AVX) 1472| 377k|#define TEMP __m128i 1473| 377k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) 1474| 377k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) 1475| 377k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) 1476| 377k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) 1477| 377k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size 1478| 377k|#endif 1479| | 1480| |#ifdef SIMD_NEON 1481| |#define TEMP uint8x8_t 1482| |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0)) 1483| |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) 1484| |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) 1485| |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i)) 1486| |#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size 1487| |#endif 1488| | 1489| |#ifdef SIMD_WASM 1490| |#define TEMP v128_t 1491| |#define PREP() v128_t pi = wasm_v128_load(last_vertex) 1492| |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) 1493| |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) 1494| |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i)) 1495| |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size 1496| |#endif 1497| | 1498| 377k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) 1499| | 1500| 377k| PREP(); ------------------ | | 1473| 377k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) ------------------ 1501| | 1502| 377k| unsigned char* savep = transposed; 1503| | 1504| 6.01M| for (size_t j = 0; j < vertex_count_aligned; j += 16) ------------------ | Branch (1504:21): [True: 5.63M, False: 377k] ------------------ 1505| 5.63M| { 1506| 5.63M| LOAD(0); ------------------ | | 1474| 5.63M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1507| 5.63M| LOAD(1); ------------------ | | 1474| 5.63M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1508| 5.63M| LOAD(2); ------------------ | | 1474| 5.63M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1509| 5.63M| LOAD(3); ------------------ | | 1474| 5.63M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1510| | 1511| 5.63M| transpose8(r0, r1, r2, r3); 1512| | 1513| 5.63M| TEMP t0, t1, t2, t3; ------------------ | | 1472| 5.63M|#define TEMP __m128i ------------------ 1514| 5.63M| TEMP npi = pi; ------------------ | | 1472| 5.63M|#define TEMP __m128i ------------------ 1515| | 1516| 5.63M| UNZR(0); ------------------ | | 1498| 5.63M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [True: 5.63M, Folded] | | | Branch (1498:58): [Folded, False: 0] | | ------------------ ------------------ 1517| 5.63M| GRP4(0); ------------------ | | 1475| 5.63M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1518| 5.63M| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ 1519| 5.63M| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1520| | 1521| 5.63M| UNZR(1); ------------------ | | 1498| 5.63M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [True: 5.63M, Folded] | | | Branch (1498:58): [Folded, False: 0] | | ------------------ ------------------ 1522| 5.63M| GRP4(1); ------------------ | | 1475| 5.63M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1523| 5.63M| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ 1524| 5.63M| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1525| | 1526| 5.63M| UNZR(2); ------------------ | | 1498| 5.63M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [True: 5.63M, Folded] | | | Branch (1498:58): [Folded, False: 0] | | ------------------ ------------------ 1527| 5.63M| GRP4(2); ------------------ | | 1475| 5.63M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1528| 5.63M| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ 1529| 5.63M| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1530| | 1531| 5.63M| UNZR(3); ------------------ | | 1498| 5.63M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [True: 5.63M, Folded] | | | Branch (1498:58): [Folded, False: 0] | | ------------------ ------------------ 1532| 5.63M| GRP4(3); ------------------ | | 1475| 5.63M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1533| 5.63M| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [True: 5.63M, Folded] | | | Branch (1476:70): [Folded, False: 0] | | ------------------ ------------------ 1534| 5.63M| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 5.63M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1535| | 1536| |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32)) 1537| | // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations 1538| | pi = rebase(npi, r0, r1, r2, r3); 1539| |#else 1540| 5.63M| (void)npi; 1541| 5.63M|#endif 1542| | 1543| 5.63M|#undef UNZR 1544| 5.63M|#undef TEMP 1545| 5.63M|#undef PREP 1546| 5.63M|#undef LOAD 1547| 5.63M|#undef GRP4 1548| 5.63M|#undef FIXD 1549| 5.63M|#undef SAVE 1550| 5.63M| } 1551| 377k|} _ZN7meshopt10transpose8ERDv2_xS1_S1_S1_: 1275| 6.25M|{ 1276| 6.25M| __m128i t0 = _mm_unpacklo_epi8(x0, x1); 1277| 6.25M| __m128i t1 = _mm_unpackhi_epi8(x0, x1); 1278| 6.25M| __m128i t2 = _mm_unpacklo_epi8(x2, x3); 1279| 6.25M| __m128i t3 = _mm_unpackhi_epi8(x2, x3); 1280| | 1281| 6.25M| x0 = _mm_unpacklo_epi16(t0, t2); 1282| 6.25M| x1 = _mm_unpackhi_epi16(t0, t2); 1283| 6.25M| x2 = _mm_unpacklo_epi16(t1, t3); 1284| 6.25M| x3 = _mm_unpackhi_epi16(t1, t3); 1285| 6.25M|} _ZN7meshopt9unzigzag8EDv2_x: 1289| 22.5M|{ 1290| 22.5M| __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); 1291| 22.5M| __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); 1292| | 1293| 22.5M| return _mm_xor_si128(xl, xr); 1294| 22.5M|} vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi1EEEvPKhPhmmS3_i: 1470| 15.8k|{ 1471| 15.8k|#if defined(SIMD_SSE) || defined(SIMD_AVX) 1472| 15.8k|#define TEMP __m128i 1473| 15.8k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) 1474| 15.8k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) 1475| 15.8k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) 1476| 15.8k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) 1477| 15.8k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size 1478| 15.8k|#endif 1479| | 1480| |#ifdef SIMD_NEON 1481| |#define TEMP uint8x8_t 1482| |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0)) 1483| |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) 1484| |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) 1485| |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i)) 1486| |#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size 1487| |#endif 1488| | 1489| |#ifdef SIMD_WASM 1490| |#define TEMP v128_t 1491| |#define PREP() v128_t pi = wasm_v128_load(last_vertex) 1492| |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) 1493| |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) 1494| |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i)) 1495| |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size 1496| |#endif 1497| | 1498| 15.8k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) 1499| | 1500| 15.8k| PREP(); ------------------ | | 1473| 15.8k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) ------------------ 1501| | 1502| 15.8k| unsigned char* savep = transposed; 1503| | 1504| 249k| for (size_t j = 0; j < vertex_count_aligned; j += 16) ------------------ | Branch (1504:21): [True: 233k, False: 15.8k] ------------------ 1505| 233k| { 1506| 233k| LOAD(0); ------------------ | | 1474| 233k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1507| 233k| LOAD(1); ------------------ | | 1474| 233k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1508| 233k| LOAD(2); ------------------ | | 1474| 233k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1509| 233k| LOAD(3); ------------------ | | 1474| 233k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1510| | 1511| 233k| transpose8(r0, r1, r2, r3); 1512| | 1513| 233k| TEMP t0, t1, t2, t3; ------------------ | | 1472| 233k|#define TEMP __m128i ------------------ 1514| 233k| TEMP npi = pi; ------------------ | | 1472| 233k|#define TEMP __m128i ------------------ 1515| | 1516| 233k| UNZR(0); ------------------ | | 1498| 233k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [Folded, False: 233k] | | | Branch (1498:58): [True: 233k, Folded] | | ------------------ ------------------ 1517| 233k| GRP4(0); ------------------ | | 1475| 233k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1518| 233k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ 1519| 233k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1520| | 1521| 233k| UNZR(1); ------------------ | | 1498| 233k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [Folded, False: 233k] | | | Branch (1498:58): [True: 233k, Folded] | | ------------------ ------------------ 1522| 233k| GRP4(1); ------------------ | | 1475| 233k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1523| 233k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ 1524| 233k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1525| | 1526| 233k| UNZR(2); ------------------ | | 1498| 233k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [Folded, False: 233k] | | | Branch (1498:58): [True: 233k, Folded] | | ------------------ ------------------ 1527| 233k| GRP4(2); ------------------ | | 1475| 233k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1528| 233k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ 1529| 233k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1530| | 1531| 233k| UNZR(3); ------------------ | | 1498| 233k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [Folded, False: 233k] | | | Branch (1498:58): [True: 233k, Folded] | | ------------------ ------------------ 1532| 233k| GRP4(3); ------------------ | | 1475| 233k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1533| 233k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 233k] | | | Branch (1476:70): [True: 233k, Folded] | | ------------------ ------------------ 1534| 233k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 233k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1535| | 1536| |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32)) 1537| | // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations 1538| | pi = rebase(npi, r0, r1, r2, r3); 1539| |#else 1540| 233k| (void)npi; 1541| 233k|#endif 1542| | 1543| 233k|#undef UNZR 1544| 233k|#undef TEMP 1545| 233k|#undef PREP 1546| 233k|#undef LOAD 1547| 233k|#undef GRP4 1548| 233k|#undef FIXD 1549| 233k|#undef SAVE 1550| 233k| } 1551| 15.8k|} _ZN7meshopt10unzigzag16EDv2_x: 1298| 934k|{ 1299| 934k| __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1))); 1300| 934k| __m128i xr = _mm_srli_epi16(v, 1); 1301| | 1302| 934k| return _mm_xor_si128(xl, xr); 1303| 934k|} vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi2EEEvPKhPhmmS3_i: 1470| 24.8k|{ 1471| 24.8k|#if defined(SIMD_SSE) || defined(SIMD_AVX) 1472| 24.8k|#define TEMP __m128i 1473| 24.8k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) 1474| 24.8k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) 1475| 24.8k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) 1476| 24.8k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) 1477| 24.8k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size 1478| 24.8k|#endif 1479| | 1480| |#ifdef SIMD_NEON 1481| |#define TEMP uint8x8_t 1482| |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0)) 1483| |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) 1484| |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) 1485| |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i)) 1486| |#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size 1487| |#endif 1488| | 1489| |#ifdef SIMD_WASM 1490| |#define TEMP v128_t 1491| |#define PREP() v128_t pi = wasm_v128_load(last_vertex) 1492| |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) 1493| |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) 1494| |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i)) 1495| |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size 1496| |#endif 1497| | 1498| 24.8k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) 1499| | 1500| 24.8k| PREP(); ------------------ | | 1473| 24.8k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) ------------------ 1501| | 1502| 24.8k| unsigned char* savep = transposed; 1503| | 1504| 407k| for (size_t j = 0; j < vertex_count_aligned; j += 16) ------------------ | Branch (1504:21): [True: 382k, False: 24.8k] ------------------ 1505| 382k| { 1506| 382k| LOAD(0); ------------------ | | 1474| 382k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1507| 382k| LOAD(1); ------------------ | | 1474| 382k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1508| 382k| LOAD(2); ------------------ | | 1474| 382k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1509| 382k| LOAD(3); ------------------ | | 1474| 382k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1510| | 1511| 382k| transpose8(r0, r1, r2, r3); 1512| | 1513| 382k| TEMP t0, t1, t2, t3; ------------------ | | 1472| 382k|#define TEMP __m128i ------------------ 1514| 382k| TEMP npi = pi; ------------------ | | 1472| 382k|#define TEMP __m128i ------------------ 1515| | 1516| 382k| UNZR(0); ------------------ | | 1498| 382k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [Folded, False: 382k] | | | Branch (1498:58): [Folded, False: 382k] | | ------------------ ------------------ 1517| 382k| GRP4(0); ------------------ | | 1475| 382k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1518| 382k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ 1519| 382k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1520| | 1521| 382k| UNZR(1); ------------------ | | 1498| 382k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [Folded, False: 382k] | | | Branch (1498:58): [Folded, False: 382k] | | ------------------ ------------------ 1522| 382k| GRP4(1); ------------------ | | 1475| 382k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1523| 382k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ 1524| 382k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1525| | 1526| 382k| UNZR(2); ------------------ | | 1498| 382k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [Folded, False: 382k] | | | Branch (1498:58): [Folded, False: 382k] | | ------------------ ------------------ 1527| 382k| GRP4(2); ------------------ | | 1475| 382k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1528| 382k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ 1529| 382k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1530| | 1531| 382k| UNZR(3); ------------------ | | 1498| 382k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1498:24): [Folded, False: 382k] | | | Branch (1498:58): [Folded, False: 382k] | | ------------------ ------------------ 1532| 382k| GRP4(3); ------------------ | | 1475| 382k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1533| 382k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1476| 382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1476:29): [Folded, False: 382k] | | | Branch (1476:70): [Folded, False: 382k] | | ------------------ ------------------ 1534| 382k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1477| 382k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1535| | 1536| |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32)) 1537| | // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations 1538| | pi = rebase(npi, r0, r1, r2, r3); 1539| |#else 1540| 382k| (void)npi; 1541| 382k|#endif 1542| | 1543| 382k|#undef UNZR 1544| 382k|#undef TEMP 1545| 382k|#undef PREP 1546| 382k|#undef LOAD 1547| 382k|#undef GRP4 1548| 382k|#undef FIXD 1549| 382k|#undef SAVE 1550| 382k| } 1551| 24.8k|} _ZN7meshopt8rotate32EDv2_xi: 1307| 1.53M|{ 1308| 1.53M| return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r)); 1309| 1.53M|} _Z11fuzzDecoderPKhmmPFiPvmmS0_mE: 8| 18.0k|{ 9| 18.0k| size_t count = 66; // must be divisible by 3 for decodeIndexBuffer; should be >=64 to cover large vertex blocks 10| | 11| 18.0k| void* destination = malloc(count * stride); 12| 18.0k| assert(destination); ------------------ | Branch (12:2): [True: 18.0k, False: 0] ------------------ 13| | 14| 18.0k| int rc = decode(destination, count, stride, reinterpret_cast(data), size); 15| 18.0k| (void)rc; 16| | 17| 18.0k| free(destination); 18| 18.0k|} _Z13fuzzRoundtripPKhmmi: 21| 9.02k|{ 22| 9.02k| size_t count = size / stride; 23| | 24| 9.02k| size_t bound = meshopt_encodeVertexBufferBound(count, stride); 25| 9.02k| void* encoded = malloc(bound); 26| 9.02k| void* decoded = malloc(count * stride); 27| 9.02k| assert(encoded && decoded); ------------------ | Branch (27:2): [True: 9.02k, False: 0] | Branch (27:2): [True: 9.02k, False: 0] | Branch (27:2): [True: 9.02k, False: 0] ------------------ 28| | 29| 9.02k| size_t res = meshopt_encodeVertexBufferLevel(static_cast(encoded), bound, data, count, stride, level, -1); 30| 9.02k| assert(res > 0 && res <= bound); ------------------ | Branch (30:2): [True: 9.02k, False: 0] | Branch (30:2): [True: 9.02k, False: 0] | Branch (30:2): [True: 9.02k, False: 0] ------------------ 31| | 32| | // encode again at the boundary to check for memory safety 33| | // this should produce the same output because encoder is deterministic 34| 9.02k| size_t rese = meshopt_encodeVertexBufferLevel(static_cast(encoded) + bound - res, res, data, count, stride, level, -1); 35| 9.02k| assert(rese == res); ------------------ | Branch (35:2): [True: 9.02k, False: 0] ------------------ 36| | 37| 9.02k| int rc = meshopt_decodeVertexBuffer(decoded, count, stride, static_cast(encoded) + bound - res, res); 38| 9.02k| assert(rc == 0); ------------------ | Branch (38:2): [True: 9.02k, False: 0] ------------------ 39| | 40| 9.02k| assert(memcmp(data, decoded, count * stride) == 0); ------------------ | Branch (40:2): [True: 9.02k, False: 0] ------------------ 41| | 42| 9.02k| free(decoded); 43| 9.02k| free(encoded); 44| 9.02k|} _Z5alignmm: 47| 13.4k|{ 48| 13.4k| return (value + alignment - 1) & ~(alignment - 1); 49| 13.4k|} _Z17fuzzDecodeMeshletmmPKhm: 52| 2.24k|{ 53| | // raw decoding: allowed to write align(count, 4) elements 54| 2.24k| unsigned int rt[256]; 55| 2.24k| unsigned int rv[256]; 56| 2.24k| meshopt_decodeMeshletRaw(rv + 256 - align(vertex_count, 4), vertex_count, rt + 256 - align(triangle_count, 4), triangle_count, data, size); 57| | 58| | // regular decoding: allowed to write align(count * size, 4) bytes 59| | // with variations for 3-byte triangles and 2-byte vertex references 60| 2.24k| unsigned short rsv[256]; 61| 2.24k| unsigned char rbt[256 * 3]; 62| | 63| 2.24k| meshopt_decodeMeshlet(rv + 256 - vertex_count, vertex_count, 4, rt + 256 - triangle_count, triangle_count, 4, data, size); 64| 2.24k| meshopt_decodeMeshlet(rsv + 256 - align(vertex_count, 2), vertex_count, 2, rt + 256 - triangle_count, triangle_count, 4, data, size); 65| 2.24k| meshopt_decodeMeshlet(rv + 256 - vertex_count, vertex_count, 4, rbt + 256 * 3 - align(triangle_count * 3, 4), triangle_count, 3, data, size); 66| 2.24k| meshopt_decodeMeshlet(rsv + 256 - align(vertex_count, 2), vertex_count, 2, rbt + 256 * 3 - align(triangle_count * 3, 4), triangle_count, 3, data, size); 67| 2.24k|} _Z20fuzzRoundtripMeshletPKhm: 70| 2.25k|{ 71| 2.25k| size_t triangle_count = size / 3; 72| 2.25k| if (triangle_count > 256) ------------------ | Branch (72:6): [True: 518, False: 1.73k] ------------------ 73| 518| triangle_count = 256; 74| | 75| 2.25k| unsigned char buf[4096]; 76| 2.25k| size_t enc = meshopt_encodeMeshlet(buf, sizeof(buf), NULL, 0, reinterpret_cast(data), triangle_count); 77| 2.25k| assert(enc > 0); ------------------ | Branch (77:2): [True: 2.25k, False: 0] ------------------ 78| 2.25k| assert(enc <= meshopt_encodeMeshletBound(0, triangle_count)); ------------------ | Branch (78:2): [True: 2.25k, False: 0] ------------------ 79| | 80| 2.25k| unsigned int rt4[256]; 81| 2.25k| int rc4 = meshopt_decodeMeshlet(static_cast(NULL), 0, rt4, triangle_count, buf, enc); 82| 2.25k| assert(rc4 == 0); ------------------ | Branch (82:2): [True: 2.25k, False: 0] ------------------ 83| | 84| 184k| for (size_t i = 0; i < triangle_count; ++i) ------------------ | Branch (84:21): [True: 181k, False: 2.25k] ------------------ 85| 181k| { 86| 181k| unsigned char a = data[i * 3 + 0], b = data[i * 3 + 1], c = data[i * 3 + 2]; 87| | 88| 181k| unsigned int abc = (a << 0) | (b << 8) | (c << 16); 89| 181k| unsigned int bca = (b << 0) | (c << 8) | (a << 16); 90| 181k| unsigned int cba = (c << 0) | (a << 8) | (b << 16); 91| | 92| 181k| unsigned int tri = rt4[i]; 93| | 94| 181k| assert(tri == abc || tri == bca || tri == cba); ------------------ | Branch (94:3): [True: 114k, False: 67.3k] | Branch (94:3): [True: 35.2k, False: 32.0k] | Branch (94:3): [True: 32.0k, False: 0] | Branch (94:3): [True: 181k, False: 0] ------------------ 95| 181k| } 96| | 97| 2.25k| unsigned char rt3[256 * 3]; 98| 2.25k| int rc3 = meshopt_decodeMeshlet(static_cast(NULL), 0, rt3, triangle_count, buf, enc); 99| 2.25k| assert(rc3 == 0); ------------------ | Branch (99:2): [True: 2.25k, False: 0] ------------------ 100| | 101| 184k| for (size_t i = 0; i < triangle_count; ++i) ------------------ | Branch (101:21): [True: 181k, False: 2.25k] ------------------ 102| 181k| { 103| 181k| unsigned char a = data[i * 3 + 0], b = data[i * 3 + 1], c = data[i * 3 + 2]; 104| | 105| 181k| unsigned int abc = (a << 0) | (b << 8) | (c << 16); 106| 181k| unsigned int bca = (b << 0) | (c << 8) | (a << 16); 107| 181k| unsigned int cba = (c << 0) | (a << 8) | (b << 16); 108| | 109| 181k| unsigned int tri = rt3[i * 3 + 0] | (rt3[i * 3 + 1] << 8) | (rt3[i * 3 + 2] << 16); 110| | 111| | assert(tri == abc || tri == bca || tri == cba); ------------------ | Branch (111:3): [True: 114k, False: 67.3k] | Branch (111:3): [True: 35.2k, False: 32.0k] | Branch (111:3): [True: 32.0k, False: 0] | Branch (111:3): [True: 181k, False: 0] ------------------ 112| 181k| } 113| 2.25k|} _Z21fuzzRoundtripMeshletVPKhm: 116| 2.25k|{ 117| 2.25k| size_t vertex_count = size / 4; 118| 2.25k| if (vertex_count > 256) ------------------ | Branch (118:6): [True: 468, False: 1.78k] ------------------ 119| 468| vertex_count = 256; 120| | 121| 2.25k| unsigned char tri[4] = {0, 1, 2}; 122| | 123| 2.25k| unsigned char buf[4096]; 124| 2.25k| size_t enc = meshopt_encodeMeshlet(buf, sizeof(buf), reinterpret_cast(data), vertex_count, tri, 1); 125| 2.25k| assert(enc > 0); ------------------ | Branch (125:2): [True: 2.25k, False: 0] ------------------ 126| 2.25k| assert(enc <= meshopt_encodeMeshletBound(vertex_count, 1)); ------------------ | Branch (126:2): [True: 2.25k, False: 0] ------------------ 127| | 128| 2.25k| unsigned int rv4[256]; 129| 2.25k| int rc4 = meshopt_decodeMeshlet(rv4, vertex_count, tri, 1, buf, enc); 130| 2.25k| assert(rc4 == 0); ------------------ | Branch (130:2): [True: 2.25k, False: 0] ------------------ 131| | 132| 170k| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (132:21): [True: 168k, False: 2.25k] ------------------ 133| 168k| assert(rv4[i] == reinterpret_cast(data)[i]); ------------------ | Branch (133:3): [True: 168k, False: 0] ------------------ 134| | 135| 2.25k| unsigned short rv2[256]; 136| 2.25k| int rc2 = meshopt_decodeMeshlet(rv2, vertex_count, tri, 1, buf, enc); 137| 2.25k| assert(rc2 == 0); ------------------ | Branch (137:2): [True: 2.25k, False: 0] ------------------ 138| | 139| 170k| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (139:21): [True: 168k, False: 2.25k] ------------------ 140| | assert(rv2[i] == uint16_t(reinterpret_cast(data)[i])); ------------------ | Branch (140:3): [True: 168k, False: 0] ------------------ 141| 2.25k|} LLVMFuzzerTestOneInput: 144| 2.25k|{ 145| | // decodeIndexBuffer supports 2 and 4-byte indices 146| 2.25k| fuzzDecoder(data, size, 2, meshopt_decodeIndexBuffer); 147| 2.25k| fuzzDecoder(data, size, 4, meshopt_decodeIndexBuffer); 148| | 149| | // decodeIndexSequence supports 2 and 4-byte indices 150| 2.25k| fuzzDecoder(data, size, 2, meshopt_decodeIndexSequence); 151| 2.25k| fuzzDecoder(data, size, 4, meshopt_decodeIndexSequence); 152| | 153| | // decodeVertexBuffer supports any strides divisible by 4 in 4-256 interval 154| | // it's a waste of time to check all of them, so we'll just check a few with different alignment mod 16 155| 2.25k| fuzzDecoder(data, size, 4, meshopt_decodeVertexBuffer); 156| 2.25k| fuzzDecoder(data, size, 16, meshopt_decodeVertexBuffer); 157| 2.25k| fuzzDecoder(data, size, 24, meshopt_decodeVertexBuffer); 158| 2.25k| fuzzDecoder(data, size, 32, meshopt_decodeVertexBuffer); 159| | 160| | // encodeVertexBuffer/decodeVertexBuffer should roundtrip for any stride, check a few with different alignment mod 16 161| | // this also checks memory safety properties of the encoder 162| | // to conserve time, we only check one version/level combination, biased towards version 1 163| 2.25k| uint8_t data0 = size > 0 ? data[0] : 0; ------------------ | Branch (163:18): [True: 2.25k, False: 0] ------------------ 164| 2.25k| int level = data0 % 5; 165| | 166| 2.25k| meshopt_encodeVertexVersion(level < 4 ? 1 : 0); ------------------ | Branch (166:30): [True: 1.76k, False: 493] ------------------ 167| | 168| 2.25k| fuzzRoundtrip(data, size, 4, level); 169| 2.25k| fuzzRoundtrip(data, size, 16, level); 170| 2.25k| fuzzRoundtrip(data, size, 24, level); 171| 2.25k| fuzzRoundtrip(data, size, 32, level); 172| | 173| | // validate that decodeMeshlet works on untrusted data and is memory safe within documented limits 174| 2.25k| if (size > 2) ------------------ | Branch (174:6): [True: 2.24k, False: 14] ------------------ 175| 2.24k| fuzzDecodeMeshlet(data[0] + 1, data[1] + 1, reinterpret_cast(data + 2), size - 2); 176| | 177| | // validate that index data roundtrips in meshlet encoding modulo rotation 178| 2.25k| fuzzRoundtripMeshlet(data, size); 179| | 180| | // validate that vertex data roundtrips in meshlet encoding 181| 2.25k| fuzzRoundtripMeshletV(data, size); 182| | 183| 2.25k| return 0; 184| 2.25k|}