meshopt_decodeIndexBuffer: 379| 4.14k|{ 380| 4.14k| using namespace meshopt; 381| | 382| 4.14k| assert(index_count % 3 == 0); ------------------ | Branch (382:2): [True: 4.14k, False: 0] ------------------ 383| 4.14k| assert(index_size == 2 || index_size == 4); ------------------ | Branch (383:2): [True: 2.07k, False: 2.07k] | Branch (383:2): [True: 2.07k, False: 0] | Branch (383:2): [True: 4.14k, False: 0] ------------------ 384| | 385| | // the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table 386| 4.14k| if (buffer_size < 1 + index_count / 3 + 16) ------------------ | Branch (386:6): [True: 1.75k, False: 2.39k] ------------------ 387| 1.75k| return -2; 388| | 389| 2.39k| if ((buffer[0] & 0xf0) != kIndexHeader) ------------------ | Branch (389:6): [True: 1.71k, False: 684] ------------------ 390| 1.71k| return -1; 391| | 392| 684| int version = buffer[0] & 0x0f; 393| 684| if (version > kDecodeIndexVersion) ------------------ | Branch (393:6): [True: 64, False: 620] ------------------ 394| 64| return -1; 395| | 396| 620| EdgeFifo edgefifo; 397| 620| memset(edgefifo, -1, sizeof(edgefifo)); 398| | 399| 620| VertexFifo vertexfifo; 400| 620| memset(vertexfifo, -1, sizeof(vertexfifo)); 401| | 402| 620| size_t edgefifooffset = 0; 403| 620| size_t vertexfifooffset = 0; 404| | 405| 620| unsigned int next = 0; 406| 620| unsigned int last = 0; 407| | 408| 620| int fecmax = version >= 1 ? 13 : 15; ------------------ | Branch (408:15): [True: 126, False: 494] ------------------ 409| | 410| | // since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end 411| 620| const unsigned char* code = buffer + 1; 412| 620| const unsigned char* data = code + index_count / 3; 413| 620| const unsigned char* data_safe_end = buffer + buffer_size - 16; 414| | 415| 620| const unsigned char* codeaux_table = data_safe_end; 416| | 417| 10.7k| for (size_t i = 0; i < index_count; i += 3) ------------------ | Branch (417:21): [True: 10.4k, False: 338] ------------------ 418| 10.4k| { 419| | // make sure we have enough data to read for a triangle 420| | // each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index 421| | // after this we can be sure we can read without extra bounds checks 422| 10.4k| if (data > data_safe_end) ------------------ | Branch (422:7): [True: 282, False: 10.1k] ------------------ 423| 282| return -2; 424| | 425| 10.1k| unsigned char codetri = *code++; 426| | 427| 10.1k| if (codetri < 0xf0) ------------------ | Branch (427:7): [True: 5.78k, False: 4.33k] ------------------ 428| 5.78k| { 429| 5.78k| int fe = codetri >> 4; 430| | 431| | // fifo reads are wrapped around 16 entry buffer 432| 5.78k| unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0]; 433| 5.78k| unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1]; 434| 5.78k| unsigned int c = 0; 435| | 436| 5.78k| int fec = codetri & 15; 437| | 438| | // note: this is the most common path in the entire decoder 439| | // inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable 440| 5.78k| if (fec < fecmax) ------------------ | Branch (440:8): [True: 4.72k, False: 1.06k] ------------------ 441| 4.72k| { 442| | // fifo reads are wrapped around 16 entry buffer 443| 4.72k| unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15]; 444| 4.72k| c = (fec == 0) ? next : cf; ------------------ | Branch (444:9): [True: 1.82k, False: 2.90k] ------------------ 445| | 446| 4.72k| int fec0 = fec == 0; 447| 4.72k| next += fec0; 448| | 449| | // push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 450| 4.72k| pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0); 451| 4.72k| } 452| 1.06k| else 453| 1.06k| { 454| | // fec - (fec ^ 3) decodes 13, 14 into -1, 1 455| | // note that we need to update the last index since free indices are delta-encoded 456| 1.06k| last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last); ------------------ | Branch (456:16): [True: 176, False: 886] ------------------ 457| | 458| | // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 459| 1.06k| pushVertexFifo(vertexfifo, c, vertexfifooffset); 460| 1.06k| } 461| | 462| | // push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 463| 5.78k| pushEdgeFifo(edgefifo, c, b, edgefifooffset); 464| 5.78k| pushEdgeFifo(edgefifo, a, c, edgefifooffset); 465| | 466| | // output triangle 467| 5.78k| writeTriangle(destination, i, index_size, a, b, c); 468| 5.78k| } 469| 4.33k| else 470| 4.33k| { 471| | // fast path: read codeaux from the table 472| 4.33k| if (codetri < 0xfe) ------------------ | Branch (472:8): [True: 1.30k, False: 3.02k] ------------------ 473| 1.30k| { 474| 1.30k| unsigned char codeaux = codeaux_table[codetri & 15]; 475| | 476| | // note: table can't contain feb/fec=15 477| 1.30k| int feb = codeaux >> 4; 478| 1.30k| int fec = codeaux & 15; 479| | 480| | // fifo reads are wrapped around 16 entry buffer 481| | // also note that we increment next for all three vertices before decoding indices - this matches encoder behavior 482| 1.30k| unsigned int a = next++; 483| | 484| 1.30k| unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15]; 485| 1.30k| unsigned int b = (feb == 0) ? next : bf; ------------------ | Branch (485:22): [True: 464, False: 838] ------------------ 486| | 487| 1.30k| int feb0 = feb == 0; 488| 1.30k| next += feb0; 489| | 490| 1.30k| unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15]; 491| 1.30k| unsigned int c = (fec == 0) ? next : cf; ------------------ | Branch (491:22): [True: 460, False: 842] ------------------ 492| | 493| 1.30k| int fec0 = fec == 0; 494| 1.30k| next += fec0; 495| | 496| | // output triangle 497| 1.30k| writeTriangle(destination, i, index_size, a, b, c); 498| | 499| | // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 500| 1.30k| pushVertexFifo(vertexfifo, a, vertexfifooffset); 501| 1.30k| pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0); 502| 1.30k| pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0); 503| | 504| 1.30k| pushEdgeFifo(edgefifo, b, a, edgefifooffset); 505| 1.30k| pushEdgeFifo(edgefifo, c, b, edgefifooffset); 506| 1.30k| pushEdgeFifo(edgefifo, a, c, edgefifooffset); 507| 1.30k| } 508| 3.02k| else 509| 3.02k| { 510| | // slow path: read a full byte for codeaux instead of using a table lookup 511| 3.02k| unsigned char codeaux = *data++; 512| | 513| 3.02k| int fea = codetri == 0xfe ? 0 : 15; ------------------ | Branch (513:15): [True: 1.00k, False: 2.02k] ------------------ 514| 3.02k| int feb = codeaux >> 4; 515| 3.02k| int fec = codeaux & 15; 516| | 517| | // reset: codeaux is 0 but encoded as not-a-table 518| 3.02k| if (codeaux == 0) ------------------ | Branch (518:9): [True: 532, False: 2.49k] ------------------ 519| 532| next = 0; 520| | 521| | // fifo reads are wrapped around 16 entry buffer 522| | // also note that we increment next for all three vertices before decoding indices - this matches encoder behavior 523| 3.02k| unsigned int a = (fea == 0) ? next++ : 0; ------------------ | Branch (523:22): [True: 1.00k, False: 2.02k] ------------------ 524| 3.02k| unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15]; ------------------ | Branch (524:22): [True: 748, False: 2.28k] ------------------ 525| 3.02k| unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15]; ------------------ | Branch (525:22): [True: 688, False: 2.34k] ------------------ 526| | 527| | // note that we need to update the last index since free indices are delta-encoded 528| 3.02k| if (fea == 15) ------------------ | Branch (528:9): [True: 2.02k, False: 1.00k] ------------------ 529| 2.02k| last = a = decodeIndex(data, last); 530| | 531| 3.02k| if (feb == 15) ------------------ | Branch (531:9): [True: 1.00k, False: 2.02k] ------------------ 532| 1.00k| last = b = decodeIndex(data, last); 533| | 534| 3.02k| if (fec == 15) ------------------ | Branch (534:9): [True: 1.04k, False: 1.98k] ------------------ 535| 1.04k| last = c = decodeIndex(data, last); 536| | 537| | // output triangle 538| 3.02k| writeTriangle(destination, i, index_size, a, b, c); 539| | 540| | // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly 541| 3.02k| pushVertexFifo(vertexfifo, a, vertexfifooffset); 542| 3.02k| pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15)); 543| 3.02k| pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15)); 544| | 545| 3.02k| pushEdgeFifo(edgefifo, b, a, edgefifooffset); 546| 3.02k| pushEdgeFifo(edgefifo, c, b, edgefifooffset); 547| 3.02k| pushEdgeFifo(edgefifo, a, c, edgefifooffset); 548| 3.02k| } 549| 4.33k| } 550| 10.1k| } 551| | 552| | // we should've read all data bytes and stopped at the boundary between data and codeaux table 553| 338| if (data != data_safe_end) ------------------ | Branch (553:6): [True: 320, False: 18] ------------------ 554| 320| return -3; 555| | 556| 18| return 0; 557| 338|} meshopt_decodeIndexSequence: 629| 4.14k|{ 630| 4.14k| using namespace meshopt; 631| | 632| | // the minimum valid encoding is header, 1 byte per index and a 4-byte tail 633| 4.14k| if (buffer_size < 1 + index_count + 4) ------------------ | Branch (633:6): [True: 2.30k, False: 1.84k] ------------------ 634| 2.30k| return -2; 635| | 636| 1.84k| if ((buffer[0] & 0xf0) != kSequenceHeader) ------------------ | Branch (636:6): [True: 1.44k, False: 406] ------------------ 637| 1.44k| return -1; 638| | 639| 406| int version = buffer[0] & 0x0f; 640| 406| if (version > kDecodeIndexVersion) ------------------ | Branch (640:6): [True: 28, False: 378] ------------------ 641| 28| return -1; 642| | 643| 378| const unsigned char* data = buffer + 1; 644| 378| const unsigned char* data_safe_end = buffer + buffer_size - 4; 645| | 646| 378| unsigned int last[2] = {}; 647| | 648| 23.9k| for (size_t i = 0; i < index_count; ++i) ------------------ | Branch (648:21): [True: 23.6k, False: 312] ------------------ 649| 23.6k| { 650| | // make sure we have enough data to read 651| | // each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end 652| | // after this we can be sure we can read without extra bounds checks 653| 23.6k| if (data >= data_safe_end) ------------------ | Branch (653:7): [True: 66, False: 23.5k] ------------------ 654| 66| return -2; 655| | 656| 23.5k| unsigned int v = decodeVByte(data); 657| | 658| | // decode the index of the last baseline 659| 23.5k| unsigned int current = v & 1; 660| 23.5k| v >>= 1; 661| | 662| | // reconstruct index as a delta 663| 23.5k| unsigned int d = (v >> 1) ^ -int(v & 1); 664| 23.5k| unsigned int index = last[current] + d; 665| | 666| | // update last for the next iteration that uses it 667| 23.5k| last[current] = index; 668| | 669| 23.5k| if (index_size == 2) ------------------ | Branch (669:7): [True: 11.7k, False: 11.7k] ------------------ 670| 11.7k| { 671| 11.7k| static_cast(destination)[i] = (unsigned short)(index); 672| 11.7k| } 673| 11.7k| else 674| 11.7k| { 675| 11.7k| static_cast(destination)[i] = index; 676| 11.7k| } 677| 23.5k| } 678| | 679| | // we should've read all data bytes and stopped at the boundary between data and tail 680| 312| if (data != data_safe_end) ------------------ | Branch (680:6): [True: 310, False: 2] ------------------ 681| 310| return -3; 682| | 683| 2| return 0; 684| 312|} indexcodec.cpp:_ZN7meshoptL14pushVertexFifoEPjjRmi: 75| 18.7k|{ 76| 18.7k| fifo[offset] = v; 77| 18.7k| offset = (offset + cond) & 15; 78| 18.7k|} indexcodec.cpp:_ZN7meshoptL12pushEdgeFifoEPA2_jjjRm: 55| 24.5k|{ 56| 24.5k| fifo[offset][0] = a; 57| 24.5k| fifo[offset][1] = b; 58| 24.5k| offset = (offset + 1) & 15; 59| 24.5k|} indexcodec.cpp:_ZN7meshoptL11decodeIndexERPKhj: 125| 4.95k|{ 126| 4.95k| unsigned int v = decodeVByte(data); 127| 4.95k| unsigned int d = (v >> 1) ^ -int(v & 1); 128| | 129| 4.95k| return last + d; 130| 4.95k|} indexcodec.cpp:_ZN7meshoptL13writeTriangleEPvmmjjj: 142| 10.1k|{ 143| 10.1k| if (index_size == 2) ------------------ | Branch (143:6): [True: 5.05k, False: 5.05k] ------------------ 144| 5.05k| { 145| 5.05k| static_cast(destination)[offset + 0] = (unsigned short)(a); 146| 5.05k| static_cast(destination)[offset + 1] = (unsigned short)(b); 147| 5.05k| static_cast(destination)[offset + 2] = (unsigned short)(c); 148| 5.05k| } 149| 5.05k| else 150| 5.05k| { 151| 5.05k| static_cast(destination)[offset + 0] = a; 152| 5.05k| static_cast(destination)[offset + 1] = b; 153| 5.05k| static_cast(destination)[offset + 2] = c; 154| 5.05k| } 155| 10.1k|} indexcodec.cpp:_ZN7meshoptL11decodeVByteERPKh: 91| 28.5k|{ 92| 28.5k| unsigned char lead = *data++; 93| | 94| | // fast path: single byte 95| 28.5k| if (lead < 128) ------------------ | Branch (95:6): [True: 17.6k, False: 10.9k] ------------------ 96| 17.6k| return lead; 97| | 98| | // slow path: up to 4 extra bytes 99| | // note that this loop always terminates, which is important for malformed data 100| 10.9k| unsigned int result = lead & 127; 101| 10.9k| unsigned int shift = 7; 102| | 103| 34.6k| for (int i = 0; i < 4; ++i) ------------------ | Branch (103:18): [True: 29.9k, False: 4.63k] ------------------ 104| 29.9k| { 105| 29.9k| unsigned char group = *data++; 106| 29.9k| result |= unsigned(group & 127) << shift; 107| 29.9k| shift += 7; 108| | 109| 29.9k| if (group < 128) ------------------ | Branch (109:7): [True: 6.28k, False: 23.7k] ------------------ 110| 6.28k| break; 111| 29.9k| } 112| | 113| 10.9k| return result; 114| 28.5k|} meshopt_encodeMeshletBound: 899| 4.14k|{ 900| 4.14k| size_t codes_size = (max_triangles + 1) / 2; 901| 4.14k| size_t extra_size = max_triangles * 3; 902| | 903| 4.14k| size_t ctrl_size = (max_vertices + 3) / 4; 904| 4.14k| size_t data_size = (max_vertices + 3) / 4 * 16; // worst case: 16 bytes per vertex group 905| | 906| 4.14k| size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0; ------------------ | Branch (906:20): [True: 2.61k, False: 1.53k] ------------------ 907| | 908| 4.14k| return codes_size + extra_size + ctrl_size + data_size + gap_size; 909| 4.14k|} meshopt_encodeMeshlet: 912| 4.14k|{ 913| 4.14k| using namespace meshopt; 914| | 915| 4.14k| assert(triangle_count <= 256 && vertex_count <= 256); ------------------ | Branch (915:2): [True: 4.14k, False: 0] | Branch (915:2): [True: 4.14k, False: 0] | Branch (915:2): [True: 4.14k, False: 0] ------------------ 916| | 917| | // 4 bits per triangle + up to three bytes of extra data 918| 4.14k| unsigned char codes[256 / 2]; 919| 4.14k| unsigned char extra[256 * 3]; 920| 4.14k| size_t codes_size = (triangle_count + 1) / 2; 921| 4.14k| size_t extra_size = encodeTriangles(codes, extra, triangles, triangle_count); 922| 4.14k| assert(extra_size <= sizeof(extra)); ------------------ | Branch (922:2): [True: 4.14k, False: 0] ------------------ 923| | 924| | // 2 bits per vertex + up to 4 bytes of actual data 925| 4.14k| unsigned char ctrl[256 / 4]; 926| 4.14k| unsigned char data[256 * 4]; 927| 4.14k| size_t ctrl_size = (vertex_count + 3) / 4; 928| 4.14k| size_t data_size = encodeVertices(ctrl, data, vertices, vertex_count); 929| 4.14k| assert(data_size <= sizeof(data)); ------------------ | Branch (929:2): [True: 4.14k, False: 0] ------------------ 930| | 931| | // we need to ensure that up to 16 bytes after extra+data are available for SIMD decoding 932| | // to minimize overhead, we place fixed-size codes+control at the end of the buffer 933| 4.14k| size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0; ------------------ | Branch (933:20): [True: 2.61k, False: 1.53k] ------------------ 934| | 935| 4.14k| size_t result = codes_size + extra_size + ctrl_size + data_size + gap_size; 936| | 937| 4.14k| if (result > buffer_size) ------------------ | Branch (937:6): [True: 0, False: 4.14k] ------------------ 938| 0| return 0; 939| | 940| | // variable-size data first 941| 4.14k| memcpy(buffer, data, data_size); 942| 4.14k| buffer += data_size; 943| 4.14k| memcpy(buffer, extra, extra_size); 944| 4.14k| buffer += extra_size; 945| | 946| | // gap (for accelerated decoding) separates variable-size and fixed-size data 947| 4.14k| memset(buffer, 0, gap_size); 948| 4.14k| buffer += gap_size; 949| | 950| | // fixed-size data last; it can be located from buffer end during decoding 951| 4.14k| memcpy(buffer, ctrl, ctrl_size); 952| 4.14k| buffer += ctrl_size; 953| 4.14k| memcpy(buffer, codes, codes_size); 954| 4.14k| buffer += codes_size; 955| | 956| |#if TRACE > 1 957| | printf("extra:"); 958| | for (size_t i = 0; i < extra_size; ++i) 959| | printf(" %d", extra[i]); 960| | printf("\n"); 961| | 962| | unsigned int minv = ~0u; 963| | for (size_t i = 0; i < vertex_count; ++i) 964| | minv = minv < vertices[i] ? minv : vertices[i]; 965| | 966| | printf("vertices: [%d+]", minv); 967| | for (size_t i = 0; i < vertex_count; ++i) 968| | printf(" %d", vertices[i] - minv); 969| | printf("\n"); 970| |#endif 971| | 972| |#if TRACE 973| | printf("stats: %d vertices, %d triangles => %d bytes (triangles: %d codes, %d extra; vertices: %d control, %d data; %d gap)\n", 974| | int(vertex_count), int(triangle_count), int(result), 975| | int(codes_size), int(extra_size), int(ctrl_size), int(data_size), int(gap_size)); 976| |#endif 977| | 978| 4.14k| return result; 979| 4.14k|} meshopt_decodeMeshlet: 982| 16.5k|{ 983| 16.5k| using namespace meshopt; 984| | 985| 16.5k| assert(triangle_count <= 256 && vertex_count <= 256); ------------------ | Branch (985:2): [True: 16.5k, False: 0] | Branch (985:2): [True: 16.5k, False: 0] | Branch (985:2): [True: 16.5k, False: 0] ------------------ 986| 16.5k| assert(vertex_size == 4 || vertex_size == 2); ------------------ | Branch (986:2): [True: 10.3k, False: 6.19k] | Branch (986:2): [True: 6.19k, False: 0] | Branch (986:2): [True: 16.5k, False: 0] ------------------ 987| 16.5k| assert(triangle_size == 4 || triangle_size == 3); ------------------ | Branch (987:2): [True: 6.19k, False: 10.3k] | Branch (987:2): [True: 10.3k, False: 0] | Branch (987:2): [True: 16.5k, False: 0] ------------------ 988| | 989| | // layout must match encoding 990| 16.5k| size_t codes_size = (triangle_count + 1) / 2; 991| 16.5k| size_t ctrl_size = (vertex_count + 3) / 4; 992| 16.5k| size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0; ------------------ | Branch (992:20): [True: 6.67k, False: 9.86k] ------------------ 993| | 994| 16.5k| if (buffer_size < codes_size + ctrl_size + gap_size) ------------------ | Branch (994:6): [True: 4.16k, False: 12.3k] ------------------ 995| 4.16k| return -2; 996| | 997| 12.3k| const unsigned char* end = buffer + buffer_size; 998| 12.3k| const unsigned char* codes = end - codes_size; 999| 12.3k| const unsigned char* ctrl = codes - ctrl_size; 1000| 12.3k| const unsigned char* data = buffer; 1001| | 1002| | // gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely 1003| 12.3k| const unsigned char* bound = ctrl - gap_size; 1004| 12.3k| assert(bound >= buffer && bound + 16 <= buffer + buffer_size); ------------------ | Branch (1004:2): [True: 12.3k, False: 0] | Branch (1004:2): [True: 12.3k, False: 0] | Branch (1004:2): [True: 12.3k, False: 0] ------------------ 1005| | 1006| 12.3k|#if defined(SIMD_FALLBACK) 1007| 12.3k| return (gDecodeTablesInitialized ? decodeMeshletSimd<0> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size); ------------------ | Branch (1007:10): [True: 12.3k, False: 0] ------------------ 1008| |#elif defined(SIMD_SSE) || defined(SIMD_NEON) 1009| | return decodeMeshletSimd<0>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size); 1010| |#else 1011| | return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size); 1012| |#endif 1013| 12.3k|} meshopt_decodeMeshletRaw: 1016| 2.06k|{ 1017| 2.06k| using namespace meshopt; 1018| | 1019| 2.06k| assert(triangle_count <= 256 && vertex_count <= 256); ------------------ | Branch (1019:2): [True: 2.06k, False: 0] | Branch (1019:2): [True: 2.06k, False: 0] | Branch (1019:2): [True: 2.06k, False: 0] ------------------ 1020| | 1021| | // layout must match encoding 1022| 2.06k| size_t codes_size = (triangle_count + 1) / 2; 1023| 2.06k| size_t ctrl_size = (vertex_count + 3) / 4; 1024| 2.06k| size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0; ------------------ | Branch (1024:20): [True: 362, False: 1.69k] ------------------ 1025| | 1026| 2.06k| if (buffer_size < codes_size + ctrl_size + gap_size) ------------------ | Branch (1026:6): [True: 1.04k, False: 1.02k] ------------------ 1027| 1.04k| return -2; 1028| | 1029| 1.02k| const unsigned char* end = buffer + buffer_size; 1030| 1.02k| const unsigned char* codes = end - codes_size; 1031| 1.02k| const unsigned char* ctrl = codes - ctrl_size; 1032| 1.02k| const unsigned char* data = buffer; 1033| | 1034| | // gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely 1035| 1.02k| const unsigned char* bound = ctrl - gap_size; 1036| 1.02k| assert(bound >= buffer && bound + 16 <= buffer + buffer_size); ------------------ | Branch (1036:2): [True: 1.02k, False: 0] | Branch (1036:2): [True: 1.02k, False: 0] | Branch (1036:2): [True: 1.02k, False: 0] ------------------ 1037| | 1038| 1.02k|#if defined(SIMD_FALLBACK) 1039| 1.02k| return (gDecodeTablesInitialized ? decodeMeshletSimd<1> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4); ------------------ | Branch (1039:10): [True: 1.02k, False: 0] ------------------ 1040| |#elif defined(SIMD_SSE) || defined(SIMD_NEON) 1041| | return decodeMeshletSimd<1>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4); 1042| |#else 1043| | return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4); 1044| |#endif 1045| 1.02k|} meshletcodec.cpp:_ZN7meshoptL17decodeBuildTablesEv: 398| 2|{ 399| 2|#define NEXT(var, ec) \ 400| 2| shuf[var] = (ec) ? (unsigned char)extra : 15; \ 401| 2| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ 402| 2| extra += (ec), nextoff += 1 - (ec) 403| | 404| | // check for SSE4.1 support if we have a fallback path 405| 2|#if defined(SIMD_SSE) && defined(SIMD_FALLBACK) 406| 2| int cpuinfo[4] = {}; 407| |#ifdef _MSC_VER 408| | __cpuid(cpuinfo, 1); 409| |#else 410| 2| __cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); 411| 2|#endif 412| | // bit 19 = SSE4.1 413| 2| if ((cpuinfo[2] & (1 << 19)) == 0) ------------------ | Branch (413:6): [True: 0, False: 2] ------------------ 414| 0| return false; 415| 2|#endif 416| | 417| | // fill triangle decoding tables for each combination of two triangle codes 418| 514| for (int code = 0; code < 256; ++code) ------------------ | Branch (418:21): [True: 512, False: 2] ------------------ 419| 512| { 420| 512| unsigned char shuf[16] = {}; 421| 512| unsigned char next[16] = {}; 422| 512| int extra = 0; 423| 512| int nextoff = 0; 424| | 425| | // state 0..5 will be refilled every iteration, so we ignore that 426| | // state 6..8 will always contain the last decoded triangle because every triangle shifts fifo equally, so we can decode it independently 427| 512| shuf[6] = 12; 428| 512| shuf[7] = 13; 429| 512| shuf[8] = 14; 430| | 431| | // state 15 will contain next (potentially incremented a few times) 432| 512| shuf[15] = 15; 433| | 434| | // state 9..11 will contain the first decoded triangle (tri0), which can refer to extra/next and the original triangle history 435| | // state 12..14 will contain the second decoded triangle (tri1); when decoding edge reuse, we need to handle edge 0/1 specially as it was just decoded earlier 436| 1.53k| for (int k = 0; k < 2; ++k) ------------------ | Branch (436:19): [True: 1.02k, False: 512] ------------------ 437| 1.02k| { 438| 1.02k| int tri = (code >> (k * 4)) & 0xf; 439| | 440| 1.02k| if (tri < 12) ------------------ | Branch (440:8): [True: 768, False: 256] ------------------ 441| 768| { 442| 768| if (k == 1 && tri / 4 == 0) ------------------ | Branch (442:9): [True: 384, False: 384] | Branch (442:19): [True: 128, False: 256] ------------------ 443| 128| { 444| | // we need to decode one of two edges from the triangle we just decoded earlier 445| | // for that we simply need to copy shuf/next values for the two decoded indices 446| 128| shuf[9 + k * 3] = shuf[9 + ((tri & 2) ? 2 : 0)]; ------------------ | Branch (446:34): [True: 64, False: 64] ------------------ 447| 128| next[9 + k * 3] = next[9 + ((tri & 2) ? 2 : 0)]; ------------------ | Branch (447:34): [True: 64, False: 64] ------------------ 448| | 449| 128| shuf[10 + k * 3] = shuf[9 + ((tri & 2) ? 1 : 2)]; ------------------ | Branch (449:35): [True: 64, False: 64] ------------------ 450| 128| next[10 + k * 3] = next[9 + ((tri & 2) ? 1 : 2)]; ------------------ | Branch (450:35): [True: 64, False: 64] ------------------ 451| 128| } 452| 640| else 453| 640| { 454| | // reuse: edge comes from the history based on edge index 455| | // note: we reuse with an offset because last triangle in the original history was consumed by tri0 456| 640| int trioff = 6 + k * 3 + (2 - tri / 4) * 3; 457| | 458| | // edge cb or ac 459| 640| shuf[9 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 2 : 0)); ------------------ | Branch (459:50): [True: 320, False: 320] ------------------ 460| 640| shuf[10 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 1 : 2)); ------------------ | Branch (460:51): [True: 320, False: 320] ------------------ 461| 640| } 462| | 463| | // third vertex is either next or comes from extra 464| 768| NEXT(11 + k * 3, tri & 1); ------------------ | | 400| 768| shuf[var] = (ec) ? (unsigned char)extra : 15; \ | | ------------------ | | | Branch (400:14): [True: 384, False: 384] | | ------------------ | | 401| 768| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ | | ------------------ | | | Branch (401:14): [True: 384, False: 384] | | ------------------ | | 402| 768| extra += (ec), nextoff += 1 - (ec) ------------------ 465| 768| } 466| 256| else 467| 256| { 468| | // restart: three vertices, each comes from next or extra 469| 256| int fea = tri > 12; 470| 256| int feb = tri > 13; 471| 256| int fec = tri > 14; 472| | 473| 256| NEXT(9 + k * 3, fea); ------------------ | | 400| 256| shuf[var] = (ec) ? (unsigned char)extra : 15; \ | | ------------------ | | | Branch (400:14): [True: 192, False: 64] | | ------------------ | | 401| 256| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ | | ------------------ | | | Branch (401:14): [True: 192, False: 64] | | ------------------ | | 402| 256| extra += (ec), nextoff += 1 - (ec) ------------------ 474| 256| NEXT(10 + k * 3, feb); ------------------ | | 400| 256| shuf[var] = (ec) ? (unsigned char)extra : 15; \ | | ------------------ | | | Branch (400:14): [True: 128, False: 128] | | ------------------ | | 401| 256| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ | | ------------------ | | | Branch (401:14): [True: 128, False: 128] | | ------------------ | | 402| 256| extra += (ec), nextoff += 1 - (ec) ------------------ 475| 256| NEXT(11 + k * 3, fec); ------------------ | | 400| 256| shuf[var] = (ec) ? (unsigned char)extra : 15; \ | | ------------------ | | | Branch (400:14): [True: 64, False: 192] | | ------------------ | | 401| 256| next[var] = (ec) ? 0 : (unsigned char)nextoff; \ | | ------------------ | | | Branch (401:14): [True: 64, False: 192] | | ------------------ | | 402| 256| extra += (ec), nextoff += 1 - (ec) ------------------ 476| 256| } 477| 1.02k| } 478| | 479| | // next needs to advance 480| 512| next[15] = (unsigned char)nextoff; 481| | 482| | // next[0..8] = 0 trivially (never written to); next[9] must also be 0 because nextoff is 0 initially 483| | // shuf[0..5] is not used, which allows us to pack next[10..15] + shuf[6..15] into a single 16-byte entry 484| 512| assert(next[9] == 0); ------------------ | Branch (484:3): [True: 512, False: 0] ------------------ 485| 512| memcpy(&kDecodeTableMasks[code][0], &next[10], 6); 486| 512| memcpy(&kDecodeTableMasks[code][6], &shuf[6], 10); 487| 512| kDecodeTableExtra[code] = (unsigned char)extra; 488| 512| } 489| | 490| | // fill vertex decoding tables for each combination of four vertex references 491| 514| for (unsigned int i = 0; i < 256; ++i) ------------------ | Branch (491:27): [True: 512, False: 2] ------------------ 492| 512| { 493| 512| unsigned char shuf[16] = {}; 494| 512| int offset = 0; 495| | 496| 2.56k| for (int k = 0; k < 4; ++k) ------------------ | Branch (496:19): [True: 2.04k, False: 512] ------------------ 497| 2.04k| { 498| 2.04k| int code = ((i >> k) & 1) | ((i >> (k + 3)) & 2); 499| 2.04k| int length = i == 0xff ? 4 : code; // 0/1/2/3 bytes, or all 4 bytes if code==0xff ------------------ | Branch (499:17): [True: 8, False: 2.04k] ------------------ 500| | 501| 2.04k| shuf[k * 4 + 0] = (length > 0) ? (unsigned char)(offset + 0) : 0x80; ------------------ | Branch (501:22): [True: 1.53k, False: 512] ------------------ 502| 2.04k| shuf[k * 4 + 1] = (length > 1) ? (unsigned char)(offset + 1) : 0x80; ------------------ | Branch (502:22): [True: 1.02k, False: 1.02k] ------------------ 503| 2.04k| shuf[k * 4 + 2] = (length > 2) ? (unsigned char)(offset + 2) : 0x80; ------------------ | Branch (503:22): [True: 512, False: 1.53k] ------------------ 504| 2.04k| shuf[k * 4 + 3] = (length > 3) ? (unsigned char)(offset + 3) : 0x80; ------------------ | Branch (504:22): [True: 8, False: 2.04k] ------------------ 505| | 506| 2.04k| offset += length; 507| 2.04k| } 508| | 509| 512| memcpy(kDecodeTableVerts[i], shuf, sizeof(shuf)); 510| 512| kDecodeTableLength[i] = (unsigned char)offset; 511| 512| } 512| | 513| 2| return true; 514| | 515| 2|#undef NEXT 516| 2|} meshletcodec.cpp:_ZN7meshoptL15encodeTrianglesEPhS0_PKhm: 109| 4.14k|{ 110| 4.14k| EdgeFifo8 edgefifo; 111| 4.14k| memset(edgefifo, -1, sizeof(edgefifo)); 112| | 113| 4.14k| size_t edgefifooffset = 0; 114| | 115| 4.14k| unsigned int next = 0; 116| | 117| | // 4-bit triangle codes give us 16 options that we use as follows: 118| | // 3*2 edge reuse (2 edges * 3 last triangles) * 2 next/explicit = 12 options 119| | // 4 remaining options = next bits; 000, 001, 011, 111. 120| | // triangles are rotated to make next bits line up. 121| 4.14k| memset(codes, 0, (triangle_count + 1) / 2); 122| | 123| 4.14k| static const int rotations[] = {0, 1, 2, 0, 1}; 124| | 125| 4.14k| unsigned char* start = extra; 126| | 127| 179k| for (size_t i = 0; i < triangle_count; ++i) ------------------ | Branch (127:21): [True: 175k, False: 4.14k] ------------------ 128| 175k| { 129| |#if TRACE > 1 130| | unsigned int last = next; 131| |#endif 132| | 133| 175k| int fer = getEdgeFifo8(edgefifo, triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2], edgefifooffset); 134| | 135| 175k| if (fer >= 0 && (fer >> 2) < 6) ------------------ | Branch (135:7): [True: 87.5k, False: 87.8k] | Branch (135:19): [True: 84.0k, False: 3.50k] ------------------ 136| 84.0k| { 137| | // note: getEdgeFifo8 implicitly rotates triangles by matching a/b to existing edge 138| 84.0k| const int* order = rotations + (fer & 3); 139| | 140| 84.0k| unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]]; 141| | 142| 84.0k| int fec = (c == next) ? (next++, 0) : 1; ------------------ | Branch (142:14): [True: 712, False: 83.3k] ------------------ 143| | 144| |#if TRACE > 1 145| | printf("%3d+ | %3d %3d %3d | edge: e%d c%d\n", last, a, b, c, fer >> 2, fec); 146| |#endif 147| | 148| 84.0k| unsigned int code = (fer >> 2) * 2 + fec; 149| | 150| 84.0k| codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4)); 151| | 152| 84.0k| if (fec) ------------------ | Branch (152:8): [True: 83.3k, False: 712] ------------------ 153| 83.3k| *extra++ = (unsigned char)c; 154| | 155| 84.0k| pushEdgeFifo8(edgefifo, c, b, edgefifooffset); 156| 84.0k| pushEdgeFifo8(edgefifo, a, c, edgefifooffset); 157| 84.0k| } 158| 91.3k| else 159| 91.3k| { 160| | // rotate triangles to minimize the need for extra vertices 161| 91.3k| int rotation = rotateTriangle(triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2]); 162| 91.3k| const int* order = rotations + rotation; 163| | 164| 91.3k| unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]]; 165| | 166| | // fe must be continuous: once a vertex is encoded with next, further vertices must also be encoded with next 167| 91.3k| int fea = (a == next && b == next + 1 && c == next + 2) ? (next++, 0) : 1; ------------------ | Branch (167:15): [True: 8.03k, False: 83.3k] | Branch (167:28): [True: 3.42k, False: 4.60k] | Branch (167:45): [True: 2.96k, False: 466] ------------------ 168| 91.3k| int feb = (b == next && c == next + 1) ? (next++, 0) : 1; ------------------ | Branch (168:15): [True: 8.26k, False: 83.0k] | Branch (168:28): [True: 3.20k, False: 5.05k] ------------------ 169| 91.3k| int fec = (c == next) ? (next++, 0) : 1; ------------------ | Branch (169:14): [True: 4.19k, False: 87.1k] ------------------ 170| | 171| 91.3k| assert(fea == 1 || feb == 0); ------------------ | Branch (171:4): [True: 88.3k, False: 2.96k] | Branch (171:4): [True: 2.96k, False: 0] | Branch (171:4): [True: 91.3k, False: 0] ------------------ 172| 91.3k| assert(feb == 1 || fec == 0); ------------------ | Branch (172:4): [True: 88.1k, False: 3.20k] | Branch (172:4): [True: 3.20k, False: 0] | Branch (172:4): [True: 91.3k, False: 0] ------------------ 173| | 174| |#if TRACE > 1 175| | printf("%3d+ | %3d %3d %3d | restart: %d%d%d\n", last, a, b, c, fea, feb, fec); 176| |#endif 177| | 178| 91.3k| unsigned int code = 12 + (fea + feb + fec); 179| | 180| 91.3k| codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4)); 181| | 182| 91.3k| if (fea) ------------------ | Branch (182:8): [True: 88.3k, False: 2.96k] ------------------ 183| 88.3k| *extra++ = (unsigned char)a; 184| 91.3k| if (feb) ------------------ | Branch (184:8): [True: 88.1k, False: 3.20k] ------------------ 185| 88.1k| *extra++ = (unsigned char)b; 186| 91.3k| if (fec) ------------------ | Branch (186:8): [True: 87.1k, False: 4.19k] ------------------ 187| 87.1k| *extra++ = (unsigned char)c; 188| | 189| 91.3k| pushEdgeFifo8(edgefifo, c, b, edgefifooffset); 190| 91.3k| pushEdgeFifo8(edgefifo, a, c, edgefifooffset); 191| 91.3k| } 192| 175k| } 193| | 194| 4.14k| return extra - start; 195| 4.14k|} meshletcodec.cpp:_ZN7meshoptL12getEdgeFifo8EPA2_jjjjm: 82| 175k|{ 83| 941k| for (int i = 0; i < 8; ++i) ------------------ | Branch (83:18): [True: 853k, False: 87.8k] ------------------ 84| 853k| { 85| 853k| size_t index = (offset - 1 - i) & 7; 86| | 87| 853k| unsigned int e0 = fifo[index][0]; 88| 853k| unsigned int e1 = fifo[index][1]; 89| | 90| 853k| if (e0 == a && e1 == b) ------------------ | Branch (90:7): [True: 135k, False: 718k] | Branch (90:18): [True: 71.1k, False: 64.1k] ------------------ 91| 71.1k| return (i << 2) | 0; 92| 782k| if (e0 == b && e1 == c) ------------------ | Branch (92:7): [True: 55.9k, False: 726k] | Branch (92:18): [True: 8.81k, False: 47.1k] ------------------ 93| 8.81k| return (i << 2) | 1; 94| 773k| if (e0 == c && e1 == a) ------------------ | Branch (94:7): [True: 53.2k, False: 720k] | Branch (94:18): [True: 7.57k, False: 45.6k] ------------------ 95| 7.57k| return (i << 2) | 2; 96| 773k| } 97| | 98| 87.8k| return -1; 99| 175k|} meshletcodec.cpp:_ZN7meshoptL13pushEdgeFifo8EPA2_jjjRm: 102| 350k|{ 103| 350k| fifo[offset][0] = a; 104| 350k| fifo[offset][1] = b; 105| 350k| offset = (offset + 1) & 7; 106| 350k|} meshletcodec.cpp:_ZN7meshoptL14rotateTriangleEjjj: 77| 91.3k|{ 78| 91.3k| return (a > b && a > c) ? 1 : (b > c ? 2 : 0); ------------------ | Branch (78:10): [True: 38.8k, False: 52.4k] | Branch (78:19): [True: 27.2k, False: 11.6k] | Branch (78:33): [True: 24.9k, False: 39.1k] ------------------ 79| 91.3k|} meshletcodec.cpp:_ZN7meshoptL14encodeVerticesEPhS0_PKjm: 198| 4.14k|{ 199| | // grouped varint, 2 bit per value to indicate 0/1/2/3 byte deltas, with per-group 4-byte fallback 200| 4.14k| memset(ctrl, 0, (vertex_count + 3) / 4); 201| | 202| 4.14k| unsigned char* start = data; 203| | 204| 4.14k| unsigned int last = ~0u; 205| | 206| 45.1k| for (size_t i = 0; i < vertex_count; i += 4) ------------------ | Branch (206:21): [True: 40.9k, False: 4.14k] ------------------ 207| 40.9k| { 208| 40.9k| unsigned int gv[4] = {}; 209| | 210| 202k| for (int k = 0; k < 4 && i + k < vertex_count; ++k) ------------------ | Branch (210:19): [True: 162k, False: 39.9k] | Branch (210:28): [True: 161k, False: 1.06k] ------------------ 211| 161k| { 212| 161k| unsigned int d = vertices[i + k] - last - 1; 213| 161k| unsigned int v = (d << 1) ^ (int(d) >> 31); 214| | 215| 161k| gv[k] = v; 216| 161k| last = vertices[i + k]; 217| 161k| } 218| | 219| | // if any value needs 4 bytes, or if *all* values need 3 bytes, we use 4 bytes for all values 220| | // this allows us to encode most 3-byte deltas with 3 bytes which saves space overall 221| 40.9k| bool use4 = (gv[0] | gv[1] | gv[2] | gv[3]) > 0xffffff || (gv[0] > 0xffff && gv[1] > 0xffff && gv[2] > 0xffff && gv[3] > 0xffff); ------------------ | Branch (221:15): [True: 28.1k, False: 12.8k] | Branch (221:62): [True: 1.23k, False: 11.6k] | Branch (221:80): [True: 755, False: 477] | Branch (221:98): [True: 358, False: 397] | Branch (221:116): [True: 261, False: 97] ------------------ 222| | 223| 204k| for (int k = 0; k < 4; ++k) ------------------ | Branch (223:19): [True: 163k, False: 40.9k] ------------------ 224| 163k| { 225| 163k| unsigned int v = gv[k]; 226| | 227| | // 0/1/2/3 bytes per value, or all 4 values use 4 bytes 228| 163k| int code = use4 ? 3 : (v == 0 ? 0 : (v < 256 ? 1 : (v < 65536 ? 2 : 3))); ------------------ | Branch (228:15): [True: 113k, False: 50.2k] | Branch (228:27): [True: 1.46k, False: 48.8k] | Branch (228:41): [True: 43.6k, False: 5.18k] | Branch (228:56): [True: 2.46k, False: 2.72k] ------------------ 229| | 230| 163k| if (code > 0) ------------------ | Branch (230:8): [True: 162k, False: 1.46k] ------------------ 231| 162k| *data++ = (unsigned char)(v & 0xff); 232| 163k| if (code > 1) ------------------ | Branch (232:8): [True: 118k, False: 45.0k] ------------------ 233| 118k| *data++ = (unsigned char)((v >> 8) & 0xff); 234| 163k| if (code > 2) ------------------ | Branch (234:8): [True: 116k, False: 47.5k] ------------------ 235| 116k| *data++ = (unsigned char)((v >> 16) & 0xff); 236| 163k| if (use4) ------------------ | Branch (236:8): [True: 113k, False: 50.2k] ------------------ 237| 113k| *data++ = (unsigned char)((v >> 24) & 0xff); 238| | 239| | // split low and high bits into two nibbles for better packing 240| 163k| ctrl[i / 4] |= ((code & 1) << k) | ((code >> 1) << (k + 4)); 241| 163k| } 242| 40.9k| } 243| | 244| 4.14k| return data - start; 245| 4.14k|} meshletcodec.cpp:_ZN7meshoptL17decodeMeshletSimdILi0EEEiPvS1_PKhS3_S3_S3_mmmm: 865| 12.3k|{ 866| 12.3k| assert(gDecodeTablesInitialized); ------------------ | Branch (866:2): [True: 12.3k, False: 0] ------------------ 867| 12.3k| (void)gDecodeTablesInitialized; 868| | 869| 12.3k|#ifdef __clang__ 870| | // data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null 871| 12.3k| __builtin_assume(data); 872| 12.3k|#endif 873| | 874| | // decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4) 875| | // raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0 876| 12.3k| if (vertex_size == 4 || Raw) ------------------ | Branch (876:6): [True: 8.26k, False: 4.11k] | Branch (876:26): [Folded, False: 0] ------------------ 877| 8.26k| data = decodeVerticesSimd(static_cast(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count); ------------------ | Branch (877:86): [Folded, False: 8.26k] ------------------ 878| 4.11k| else 879| 4.11k| data = decodeVerticesSimd(static_cast(vertices), ctrl, data, bound, vertex_count); 880| 12.3k| if (!data) ------------------ | Branch (880:6): [True: 792, False: 11.5k] ------------------ 881| 792| return -2; 882| | 883| | // decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4) 884| | // raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0 885| 11.5k| if (triangle_size == 4 || Raw) ------------------ | Branch (885:6): [True: 3.71k, False: 7.86k] | Branch (885:28): [Folded, False: 0] ------------------ 886| 3.71k| data = decodeTrianglesSimd(static_cast(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count); ------------------ | Branch (886:89): [Folded, False: 3.71k] ------------------ 887| 7.86k| else 888| 7.86k| data = decodeTrianglesSimd(static_cast(triangles), codes, data, bound, triangle_count); 889| 11.5k| if (!data) ------------------ | Branch (889:6): [True: 298, False: 11.2k] ------------------ 890| 298| return -2; 891| | 892| 11.2k| return (data == bound) ? 0 : -3; ------------------ | Branch (892:9): [True: 8.36k, False: 2.91k] ------------------ 893| 11.5k|} meshletcodec.cpp:_ZN7meshoptL18decodeVerticesSimdEPjPKhS2_S2_m: 750| 9.28k|{ 751| 9.28k|#if defined(SIMD_SSE) 752| 9.28k| __m128i last = _mm_set1_epi32(-1); 753| |#elif defined(SIMD_NEON) 754| | uint32x4_t last = vdupq_n_u32(~0u); 755| |#endif 756| | 757| 9.28k| size_t groups = vertex_count / 4; 758| | 759| | // process all complete groups 760| 131k| for (size_t i = 0; i < groups; ++i) ------------------ | Branch (760:21): [True: 122k, False: 8.71k] ------------------ 761| 122k| { 762| 122k| unsigned char code = *ctrl++; 763| 122k| if (data > bound) ------------------ | Branch (763:7): [True: 572, False: 121k] ------------------ 764| 572| return NULL; 765| | 766| 121k| last = decodeVertexGroup(last, code, data); 767| | 768| 121k|#if defined(SIMD_SSE) 769| 121k| _mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last); 770| |#elif defined(SIMD_NEON) 771| | vst1q_u32(&vertices[i * 4], last); 772| |#endif 773| 121k| } 774| | 775| | // process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements 776| 8.71k| if (vertex_count & 3) ------------------ | Branch (776:6): [True: 2.52k, False: 6.18k] ------------------ 777| 2.52k| { 778| 2.52k| unsigned char code = *ctrl++; 779| | 780| 2.52k| if (data > bound) ------------------ | Branch (780:7): [True: 22, False: 2.50k] ------------------ 781| 22| return NULL; 782| | 783| 2.50k| last = decodeVertexGroup(last, code, data); 784| | 785| 2.50k| unsigned int* tail = &vertices[vertex_count & ~3u]; 786| | 787| 2.50k|#if defined(SIMD_SSE) 788| 2.50k| tail[0] = _mm_cvtsi128_si32(last); 789| 2.50k| if ((vertex_count & 3) > 1) ------------------ | Branch (789:7): [True: 1.10k, False: 1.40k] ------------------ 790| 1.10k| tail[1] = _mm_extract_epi32(last, 1); 791| 2.50k| if ((vertex_count & 3) > 2) ------------------ | Branch (791:7): [True: 377, False: 2.12k] ------------------ 792| 377| tail[2] = _mm_extract_epi32(last, 2); 793| |#elif defined(SIMD_NEON) 794| | vst1q_lane_u32(&tail[0], last, 0); 795| | if ((vertex_count & 3) > 1) 796| | vst1q_lane_u32(&tail[1], last, 1); 797| | if ((vertex_count & 3) > 2) 798| | vst1q_lane_u32(&tail[2], last, 2); 799| |#endif 800| 2.50k| } 801| | 802| 8.68k| return data; 803| 8.71k|} _ZN7meshopt17decodeVertexGroupEDv2_xhRPKh: 540| 220k|{ 541| 220k| __m128i word = _mm_loadu_si128(reinterpret_cast(data)); 542| 220k| __m128i shuf = _mm_loadu_si128(reinterpret_cast(kDecodeTableVerts[code])); 543| | 544| 220k| __m128i v = _mm_shuffle_epi8(word, shuf); 545| | 546| | // unzigzag+1 547| 220k| __m128i xl = _mm_sub_epi32(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi32(1))); 548| 220k| __m128i xr = _mm_srli_epi32(v, 1); 549| 220k| __m128i x = _mm_add_epi32(_mm_xor_si128(xl, xr), _mm_set1_epi32(1)); 550| | 551| | // prefix sum 552| 220k| x = _mm_add_epi32(x, _mm_slli_si128(x, 8)); 553| 220k| x = _mm_add_epi32(x, _mm_slli_si128(x, 4)); 554| 220k| x = _mm_add_epi32(x, _mm_shuffle_epi32(last, 0xff)); 555| | 556| 220k| data += kDecodeTableLength[code]; 557| | 558| 220k| return x; 559| 220k|} meshletcodec.cpp:_ZN7meshoptL18decodeVerticesSimdEPtPKhS2_S2_m: 807| 4.11k|{ 808| 4.11k|#if defined(SIMD_SSE) 809| 4.11k| __m128i repack = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); 810| 4.11k| __m128i last = _mm_set1_epi32(-1); 811| |#elif defined(SIMD_NEON) 812| | uint32x4_t last = vdupq_n_u32(~0u); 813| |#endif 814| | 815| | // because the output buffer is guaranteed to have 32-bit aligned size available, we can simplify tail processing 816| | // if the number of vertices mod 4 is 3, we'd normally need to write 8+6 bytes, but we can instead overwrite up to 2 bytes in the main loop 817| 4.11k| size_t groups = (vertex_count + 1) / 4; 818| | 819| | // process all complete groups 820| 98.5k| for (size_t i = 0; i < groups; ++i) ------------------ | Branch (820:21): [True: 94.7k, False: 3.74k] ------------------ 821| 94.7k| { 822| 94.7k| unsigned char code = *ctrl++; 823| | 824| 94.7k| if (data > bound) ------------------ | Branch (824:7): [True: 374, False: 94.3k] ------------------ 825| 374| return NULL; 826| | 827| 94.3k| last = decodeVertexGroup(last, code, data); 828| | 829| 94.3k|#if defined(SIMD_SSE) 830| 94.3k| __m128i r = _mm_shuffle_epi8(last, repack); 831| 94.3k| _mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r); 832| |#elif defined(SIMD_NEON) 833| | uint16x4_t r = vmovn_u32(last); 834| | vst1_u16(&vertices[i * 4], r); 835| |#endif 836| 94.3k| } 837| | 838| | // process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element 839| 3.74k| if (groups * 4 < vertex_count) ------------------ | Branch (839:6): [True: 2.15k, False: 1.58k] ------------------ 840| 2.15k| { 841| 2.15k| unsigned char code = *ctrl++; 842| | 843| 2.15k| if (data > bound) ------------------ | Branch (843:7): [True: 22, False: 2.12k] ------------------ 844| 22| return NULL; 845| | 846| 2.12k| last = decodeVertexGroup(last, code, data); 847| | 848| 2.12k| unsigned short* tail = &vertices[vertex_count & ~3u]; 849| | 850| 2.12k|#if defined(SIMD_SSE) 851| 2.12k| __m128i r = _mm_shufflelo_epi16(last, 8); 852| 2.12k| *reinterpret_cast(tail) = _mm_cvtsi128_si32(r); 853| |#elif defined(SIMD_NEON) 854| | uint16x4_t r = vmovn_u32(last); 855| | vst1_lane_u32(reinterpret_cast(tail), vreinterpret_u32_u16(r), 0); 856| |#endif 857| 2.12k| } 858| | 859| 3.71k| return data; 860| 3.74k|} meshletcodec.cpp:_ZN7meshoptL19decodeTrianglesSimdEPjPKhS2_S2_m: 615| 4.54k|{ 616| 4.54k|#if defined(SIMD_SSE) 617| 4.54k| __m128i repack = _mm_setr_epi8(9, 10, 11, -1, 12, 13, 14, -1, 0, 0, 0, 0, 0, 0, 0, 0); 618| 4.54k| __m128i state = _mm_setzero_si128(); 619| |#elif defined(SIMD_NEON) 620| | uint8x8_t repack = vcreate_u8(0xff0e0d0cff0b0a09ull); 621| | uint8x16_t state = vdupq_n_u8(0); 622| |#endif 623| | 624| 4.54k| size_t groups = triangle_count / 2; 625| | 626| | // process all complete groups 627| 195k| for (size_t i = 0; i < groups; ++i) ------------------ | Branch (627:21): [True: 191k, False: 4.34k] ------------------ 628| 191k| { 629| 191k| unsigned char code = *codes++; 630| | 631| 191k| if (extra > bound) ------------------ | Branch (631:7): [True: 199, False: 190k] ------------------ 632| 199| return NULL; 633| | 634| 190k| state = decodeTriangleGroup(state, code, extra); 635| | 636| | // write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding 637| 190k|#if defined(SIMD_SSE) 638| 190k| __m128i r = _mm_shuffle_epi8(state, repack); 639| 190k| _mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r); 640| |#elif defined(SIMD_NEON) 641| | uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack)); 642| | vst1_u32(&triangles[i * 2], r); 643| |#endif 644| 190k| } 645| | 646| | // process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element 647| 4.34k| if (triangle_count & 1) ------------------ | Branch (647:6): [True: 1.59k, False: 2.74k] ------------------ 648| 1.59k| { 649| 1.59k| unsigned char code = *codes++; 650| | 651| 1.59k| if (extra > bound) ------------------ | Branch (651:7): [True: 32, False: 1.56k] ------------------ 652| 32| return NULL; 653| | 654| 1.56k| state = decodeTriangleGroup(state, code, extra); 655| | 656| 1.56k| unsigned int* tail = &triangles[triangle_count & ~1u]; 657| | 658| 1.56k|#if defined(SIMD_SSE) 659| 1.56k| __m128i r = _mm_shuffle_epi8(state, repack); 660| 1.56k| *tail = unsigned(_mm_cvtsi128_si32(r)); 661| |#elif defined(SIMD_NEON) 662| | uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack)); 663| | vst1_lane_u32(tail, r, 0); 664| |#endif 665| 1.56k| } 666| | 667| 4.30k| return extra; 668| 4.34k|} _ZN7meshopt19decodeTriangleGroupEDv2_xhRPKh: 524| 353k|{ 525| 353k| __m128i shuf = _mm_loadu_si128(reinterpret_cast(kDecodeTableMasks[code])); 526| 353k| __m128i next = _mm_slli_si128(shuf, 10); 527| | 528| | // patch first 6 bytes with current extra and roll state forward 529| 353k| __m128i ext = _mm_loadl_epi64(reinterpret_cast(extra)); 530| 353k| state = _mm_blend_epi16(state, ext, 7); 531| 353k| state = _mm_add_epi8(_mm_shuffle_epi8(state, shuf), next); 532| | 533| 353k| extra += kDecodeTableExtra[code]; 534| | 535| 353k| return state; 536| 353k|} meshletcodec.cpp:_ZN7meshoptL19decodeTrianglesSimdEPhPKhS2_S2_m: 672| 7.86k|{ 673| 7.86k|#if defined(SIMD_SSE) 674| 7.86k| __m128i state = _mm_setzero_si128(); 675| |#elif defined(SIMD_NEON) 676| | uint8x16_t state = vdupq_n_u8(0); 677| |#endif 678| | 679| | // because the output buffer is guaranteed to have 32-bit aligned size available, we can optimize writes and tail processing 680| | // instead of processing triangles 2 at a time, we process 2 *pairs* at a time (12-byte write) followed by a tail pair, if present 681| | // if the number of triangles mod 4 is 3, we'd normally need to write 12k+9 bytes, but we can instead overwrite up to 3 bytes in the main loop 682| 7.86k| size_t groups = (triangle_count + 1) / 4; 683| | 684| | // process all complete groups 685| 85.5k| for (size_t i = 0; i < groups; ++i) ------------------ | Branch (685:21): [True: 77.7k, False: 7.76k] ------------------ 686| 77.7k| { 687| 77.7k| unsigned char code0 = *codes++; 688| 77.7k| unsigned char code1 = *codes++; 689| | 690| | // each triangle pair reads <=6 bytes from extra, so two pairs need <=12 bytes and gap guarantees 16 byte of overread 691| 77.7k| if (extra > bound) ------------------ | Branch (691:7): [True: 106, False: 77.6k] ------------------ 692| 106| return NULL; 693| | 694| 77.6k| state = decodeTriangleGroup(state, code0, extra); 695| | 696| | // write first decoded triangle and first index of second decoded triangle 697| 77.6k|#if defined(SIMD_SSE) 698| 77.6k| __m128i r0 = _mm_srli_si128(state, 9); 699| 77.6k| *reinterpret_cast(&triangles[i * 12]) = _mm_cvtsi128_si32(r0); 700| |#elif defined(SIMD_NEON) 701| | uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9); 702| | vst1q_lane_u32(reinterpret_cast(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0); 703| |#endif 704| | 705| 77.6k| state = decodeTriangleGroup(state, code1, extra); 706| | 707| | // write last two indices of second decoded triangle that we didn't write above plus two new ones 708| | // note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7 709| 77.6k|#if defined(SIMD_SSE) 710| 77.6k| __m128i r1 = _mm_srli_si128(state, 7); 711| 77.6k| _mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1); 712| |#elif defined(SIMD_NEON) 713| | uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7); 714| | vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1)); 715| |#endif 716| 77.6k| } 717| | 718| | // process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements 719| 7.76k| if (groups * 4 < triangle_count) ------------------ | Branch (719:6): [True: 6.13k, False: 1.63k] ------------------ 720| 6.13k| { 721| 6.13k| unsigned char code = *codes++; 722| | 723| 6.13k| if (extra > bound) ------------------ | Branch (723:7): [True: 38, False: 6.09k] ------------------ 724| 38| return NULL; 725| | 726| 6.09k| state = decodeTriangleGroup(state, code, extra); 727| | 728| 6.09k| unsigned char* tail = &triangles[(triangle_count & ~3u) * 3]; 729| | 730| 6.09k|#if defined(SIMD_SSE) 731| 6.09k| __m128i r = _mm_srli_si128(state, 9); 732| | 733| 6.09k| *reinterpret_cast(tail) = _mm_cvtsi128_si32(r); 734| 6.09k| if ((triangle_count & 3) > 1) ------------------ | Branch (734:7): [True: 776, False: 5.31k] ------------------ 735| 776| *reinterpret_cast(tail + 4) = _mm_extract_epi32(r, 1); 736| |#elif defined(SIMD_NEON) 737| | uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9); 738| | 739| | vst1q_lane_u32(reinterpret_cast(tail), vreinterpretq_u32_u8(r), 0); 740| | if ((triangle_count & 3) > 1) 741| | vst1q_lane_u32(reinterpret_cast(tail + 4), vreinterpretq_u32_u8(r), 1); 742| |#endif 743| 6.09k| } 744| | 745| 7.72k| return extra; 746| 7.76k|} meshletcodec.cpp:_ZN7meshoptL17decodeMeshletSimdILi1EEEiPvS1_PKhS3_S3_S3_mmmm: 865| 1.02k|{ 866| 1.02k| assert(gDecodeTablesInitialized); ------------------ | Branch (866:2): [True: 1.02k, False: 0] ------------------ 867| 1.02k| (void)gDecodeTablesInitialized; 868| | 869| 1.02k|#ifdef __clang__ 870| | // data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null 871| 1.02k| __builtin_assume(data); 872| 1.02k|#endif 873| | 874| | // decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4) 875| | // raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0 876| 1.02k| if (vertex_size == 4 || Raw) ------------------ | Branch (876:6): [True: 1.02k, False: 0] | Branch (876:26): [True: 0, Folded] ------------------ 877| 1.02k| data = decodeVerticesSimd(static_cast(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count); ------------------ | Branch (877:86): [True: 1.02k, Folded] ------------------ 878| 0| else 879| 0| data = decodeVerticesSimd(static_cast(vertices), ctrl, data, bound, vertex_count); 880| 1.02k| if (!data) ------------------ | Branch (880:6): [True: 198, False: 822] ------------------ 881| 198| return -2; 882| | 883| | // decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4) 884| | // raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0 885| 822| if (triangle_size == 4 || Raw) ------------------ | Branch (885:6): [True: 822, False: 0] | Branch (885:28): [True: 0, Folded] ------------------ 886| 822| data = decodeTrianglesSimd(static_cast(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count); ------------------ | Branch (886:89): [True: 822, Folded] ------------------ 887| 0| else 888| 0| data = decodeTrianglesSimd(static_cast(triangles), codes, data, bound, triangle_count); 889| 822| if (!data) ------------------ | Branch (889:6): [True: 77, False: 745] ------------------ 890| 77| return -2; 891| | 892| 745| return (data == bound) ? 0 : -3; ------------------ | Branch (892:9): [True: 18, False: 727] ------------------ 893| 822|} _Z21meshopt_decodeMeshletIjjEiPT_mPT0_mPKhm: 1456| 2.07k|{ 1457| 2.07k| char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1]; 1458| 2.07k| (void)types_valid; 1459| | 1460| 2.07k| return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size); ------------------ | Branch (1460:93): [Folded, False: 2.07k] ------------------ 1461| 2.07k|} _Z21meshopt_decodeMeshletIjhEiPT_mPT0_mPKhm: 1456| 4.14k|{ 1457| 4.14k| char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1]; 1458| 4.14k| (void)types_valid; 1459| | 1460| 4.14k| return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size); ------------------ | Branch (1460:93): [True: 4.14k, Folded] ------------------ 1461| 4.14k|} _Z21meshopt_decodeMeshletIthEiPT_mPT0_mPKhm: 1456| 2.07k|{ 1457| 2.07k| char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1]; 1458| 2.07k| (void)types_valid; 1459| | 1460| 2.07k| return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size); ------------------ | Branch (1460:93): [True: 2.07k, Folded] ------------------ 1461| 2.07k|} meshopt_encodeVertexBufferLevel: 1616| 16.5k|{ 1617| 16.5k| using namespace meshopt; 1618| | 1619| 16.5k| assert(vertex_size > 0 && vertex_size <= 256); ------------------ | Branch (1619:2): [True: 16.5k, False: 0] | Branch (1619:2): [True: 16.5k, False: 0] | Branch (1619:2): [True: 16.5k, False: 0] ------------------ 1620| 16.5k| assert(vertex_size % 4 == 0); ------------------ | Branch (1620:2): [True: 16.5k, False: 0] ------------------ 1621| 16.5k| assert(level >= 0 && level <= 9); // only a subset of this range is used right now ------------------ | Branch (1621:2): [True: 16.5k, False: 0] | Branch (1621:2): [True: 16.5k, False: 0] | Branch (1621:2): [True: 16.5k, False: 0] ------------------ 1622| 16.5k| assert(version < 0 || unsigned(version) <= kDecodeVertexVersion); ------------------ | Branch (1622:2): [True: 16.5k, False: 0] | Branch (1622:2): [True: 0, False: 0] | Branch (1622:2): [True: 16.5k, False: 0] ------------------ 1623| | 1624| 16.5k| version = version < 0 ? gEncodeVertexVersion : version; ------------------ | Branch (1624:12): [True: 16.5k, False: 0] ------------------ 1625| | 1626| |#if TRACE 1627| | memset(vertexstats, 0, sizeof(vertexstats)); 1628| |#endif 1629| | 1630| 16.5k| const unsigned char* vertex_data = static_cast(vertices); 1631| | 1632| 16.5k| unsigned char* data = buffer; 1633| 16.5k| unsigned char* data_end = buffer + buffer_size; 1634| | 1635| 16.5k| if (size_t(data_end - data) < 1) ------------------ | Branch (1635:6): [True: 0, False: 16.5k] ------------------ 1636| 0| return 0; 1637| | 1638| 16.5k| *data++ = (unsigned char)(kVertexHeader | version); 1639| | 1640| 16.5k| unsigned char first_vertex[256] = {}; 1641| 16.5k| if (vertex_count > 0) ------------------ | Branch (1641:6): [True: 12.8k, False: 3.70k] ------------------ 1642| 12.8k| memcpy(first_vertex, vertex_data, vertex_size); 1643| | 1644| 16.5k| unsigned char last_vertex[256] = {}; 1645| 16.5k| memcpy(last_vertex, first_vertex, vertex_size); 1646| | 1647| 16.5k| size_t vertex_block_size = getVertexBlockSize(vertex_size); 1648| | 1649| 16.5k| unsigned char channels[64] = {}; 1650| 16.5k| if (version != 0 && level > 1 && vertex_count > 1) ------------------ | Branch (1650:6): [True: 12.7k, False: 3.88k] | Branch (1650:22): [True: 6.17k, False: 6.52k] | Branch (1650:35): [True: 4.22k, False: 1.95k] ------------------ 1651| 22.0k| for (size_t k = 0; k < vertex_size; k += 4) ------------------ | Branch (1651:22): [True: 17.8k, False: 4.22k] ------------------ 1652| 17.8k| { 1653| 17.8k| int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0; ------------------ | Branch (1653:14): [True: 15.5k, False: 2.31k] ------------------ 1654| 17.8k| int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot); ------------------ | Branch (1654:137): [True: 15.5k, False: 2.31k] ------------------ 1655| | 1656| 17.8k| assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8)); ------------------ | Branch (1656:4): [True: 2.24k, False: 0] | Branch (1656:4): [True: 2.24k, False: 0] | Branch (1656:4): [True: 15.6k, False: 2.24k] | Branch (1656:4): [True: 17.8k, False: 0] ------------------ 1657| 17.8k| channels[k / 4] = (unsigned char)channel; 1658| 17.8k| } 1659| | 1660| 16.5k| size_t vertex_offset = 0; 1661| | 1662| 334k| while (vertex_offset < vertex_count) ------------------ | Branch (1662:9): [True: 318k, False: 16.5k] ------------------ 1663| 318k| { 1664| 318k| size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; ------------------ | Branch (1664:23): [True: 305k, False: 12.8k] ------------------ 1665| | 1666| 318k| data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level); 1667| 318k| if (!data) ------------------ | Branch (1667:7): [True: 0, False: 318k] ------------------ 1668| 0| return 0; 1669| | 1670| 318k| vertex_offset += block_size; 1671| 318k| } 1672| | 1673| 16.5k| size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4); ------------------ | Branch (1673:36): [True: 3.88k, False: 12.7k] ------------------ 1674| 16.5k| size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1; ------------------ | Branch (1674:25): [True: 3.88k, False: 12.7k] ------------------ 1675| 16.5k| size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size; ------------------ | Branch (1675:25): [True: 9.26k, False: 7.32k] ------------------ 1676| | 1677| 16.5k| if (size_t(data_end - data) < tail_size_pad) ------------------ | Branch (1677:6): [True: 0, False: 16.5k] ------------------ 1678| 0| return 0; 1679| | 1680| 16.5k| if (tail_size < tail_size_pad) ------------------ | Branch (1680:6): [True: 9.26k, False: 7.32k] ------------------ 1681| 9.26k| { 1682| 9.26k| memset(data, 0, tail_size_pad - tail_size); 1683| 9.26k| data += tail_size_pad - tail_size; 1684| 9.26k| } 1685| | 1686| 16.5k| memcpy(data, first_vertex, vertex_size); 1687| 16.5k| data += vertex_size; 1688| | 1689| 16.5k| if (version != 0) ------------------ | Branch (1689:6): [True: 12.7k, False: 3.88k] ------------------ 1690| 12.7k| { 1691| 12.7k| memcpy(data, channels, vertex_size / 4); 1692| 12.7k| data += vertex_size / 4; 1693| 12.7k| } 1694| | 1695| 16.5k| assert(data >= buffer + tail_size); ------------------ | Branch (1695:2): [True: 16.5k, False: 0] ------------------ 1696| 16.5k| assert(data <= buffer + buffer_size); ------------------ | Branch (1696:2): [True: 16.5k, False: 0] ------------------ 1697| | 1698| |#if TRACE 1699| | size_t total_size = data - buffer; 1700| | 1701| | for (size_t k = 0; k < vertex_size; ++k) 1702| | { 1703| | const Stats& vsk = vertexstats[k]; 1704| | 1705| | printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8); 1706| | 1707| | size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8]; 1708| | double total_kr = total_k ? 1.0 / double(total_k) : 0; 1709| | 1710| | if (version != 0) 1711| | { 1712| | int channel = channels[k / 4]; 1713| | 1714| | if ((channel & 3) == 2 && k % 4 == 0) 1715| | printf(" | ^%d", channel >> 4); 1716| | else 1717| | printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : ".")); 1718| | } 1719| | 1720| | printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]", 1721| | double(vsk.header) * total_kr * 100, 1722| | double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100, 1723| | double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100); 1724| | 1725| | size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3]; 1726| | 1727| | if (total_ctrl) 1728| | { 1729| | printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%", 1730| | double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100, 1731| | double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100); 1732| | } 1733| | 1734| | if (level >= 3) 1735| | printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]", 1736| | double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100, 1737| | double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100, 1738| | double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100, 1739| | double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100); 1740| | 1741| | printf("\n"); 1742| | } 1743| |#endif 1744| | 1745| 16.5k| return data - buffer; 1746| 16.5k|} meshopt_encodeVertexBufferBound: 1754| 8.29k|{ 1755| 8.29k| using namespace meshopt; 1756| | 1757| 8.29k| assert(vertex_size > 0 && vertex_size <= 256); ------------------ | Branch (1757:2): [True: 8.29k, False: 0] | Branch (1757:2): [True: 8.29k, False: 0] | Branch (1757:2): [True: 8.29k, False: 0] ------------------ 1758| 8.29k| assert(vertex_size % 4 == 0); ------------------ | Branch (1758:2): [True: 8.29k, False: 0] ------------------ 1759| | 1760| 8.29k| size_t vertex_block_size = getVertexBlockSize(vertex_size); 1761| 8.29k| size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size; 1762| | 1763| 8.29k| size_t vertex_block_control_size = vertex_size / 4; 1764| 8.29k| size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4; 1765| 8.29k| size_t vertex_block_data_size = vertex_block_size; 1766| | 1767| 8.29k| size_t tail_size = vertex_size + (vertex_size / 4); 1768| 8.29k| size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1; ------------------ | Branch (1768:25): [True: 8.29k, Folded] ------------------ 1769| 8.29k| size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size; ------------------ | Branch (1769:25): [True: 6.22k, False: 2.07k] ------------------ 1770| 8.29k| assert(tail_size_pad >= kByteGroupDecodeLimit); ------------------ | Branch (1770:2): [True: 8.29k, False: 0] ------------------ 1771| | 1772| 8.29k| return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad; 1773| 8.29k|} meshopt_encodeVertexVersion: 1776| 2.07k|{ 1777| 2.07k| assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion)); ------------------ | Branch (1777:2): [True: 2.07k, False: 0] ------------------ 1778| | 1779| 2.07k| meshopt::gEncodeVertexVersion = version; 1780| 2.07k|} meshopt_decodeVertexBuffer: 1800| 16.5k|{ 1801| 16.5k| using namespace meshopt; 1802| | 1803| 16.5k| assert(vertex_size > 0 && vertex_size <= 256); ------------------ | Branch (1803:2): [True: 16.5k, False: 0] | Branch (1803:2): [True: 16.5k, False: 0] | Branch (1803:2): [True: 16.5k, False: 0] ------------------ 1804| 16.5k| assert(vertex_size % 4 == 0); ------------------ | Branch (1804:2): [True: 16.5k, False: 0] ------------------ 1805| | 1806| 16.5k| const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL; 1807| | 1808| 16.5k|#if defined(SIMD_SSE) && defined(SIMD_FALLBACK) 1809| 16.5k| const unsigned int cpumask = (1 << 9) | (1 << 23); // SSSE3+POPCNT 1810| 16.5k| decode = (cpuid & cpumask) == cpumask ? decodeVertexBlockSimd : decodeVertexBlock; ------------------ | Branch (1810:11): [True: 16.5k, False: 0] ------------------ 1811| |#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) 1812| | decode = decodeVertexBlockSimd; 1813| |#else 1814| | decode = decodeVertexBlock; 1815| |#endif 1816| | 1817| 16.5k|#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) 1818| 16.5k| assert(gDecodeBytesGroupInitialized); ------------------ | Branch (1818:2): [True: 16.5k, False: 0] ------------------ 1819| 16.5k| (void)gDecodeBytesGroupInitialized; 1820| 16.5k|#endif 1821| | 1822| 16.5k| unsigned char* vertex_data = static_cast(destination); 1823| | 1824| 16.5k| const unsigned char* data = buffer; 1825| 16.5k| const unsigned char* data_end = buffer + buffer_size; 1826| | 1827| 16.5k| if (size_t(data_end - data) < 1) ------------------ | Branch (1827:6): [True: 0, False: 16.5k] ------------------ 1828| 0| return -2; 1829| | 1830| 16.5k| unsigned char data_header = *data++; 1831| | 1832| 16.5k| if ((data_header & 0xf0) != kVertexHeader) ------------------ | Branch (1832:6): [True: 6.88k, False: 9.70k] ------------------ 1833| 6.88k| return -1; 1834| | 1835| 9.70k| int version = data_header & 0x0f; 1836| 9.70k| if (version > kDecodeVertexVersion) ------------------ | Branch (1836:6): [True: 188, False: 9.52k] ------------------ 1837| 188| return -1; 1838| | 1839| 9.52k| size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4); ------------------ | Branch (1839:36): [True: 2.22k, False: 7.29k] ------------------ 1840| 9.52k| size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1; ------------------ | Branch (1840:25): [True: 2.22k, False: 7.29k] ------------------ 1841| 9.52k| size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size; ------------------ | Branch (1841:25): [True: 5.31k, False: 4.20k] ------------------ 1842| | 1843| 9.52k| if (size_t(data_end - data) < tail_size_pad) ------------------ | Branch (1843:6): [True: 214, False: 9.30k] ------------------ 1844| 214| return -2; 1845| | 1846| 9.30k| const unsigned char* tail = data_end - tail_size; 1847| | 1848| 9.30k| unsigned char last_vertex[256]; 1849| 9.30k| memcpy(last_vertex, tail, vertex_size); 1850| | 1851| 9.30k| const unsigned char* channels = version == 0 ? NULL : tail + vertex_size; ------------------ | Branch (1851:34): [True: 2.18k, False: 7.12k] ------------------ 1852| | 1853| 9.30k| size_t vertex_block_size = getVertexBlockSize(vertex_size); 1854| | 1855| 9.30k| size_t vertex_offset = 0; 1856| | 1857| 168k| while (vertex_offset < vertex_count) ------------------ | Branch (1857:9): [True: 160k, False: 8.75k] ------------------ 1858| 160k| { 1859| 160k| size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; ------------------ | Branch (1859:23): [True: 152k, False: 7.45k] ------------------ 1860| | 1861| 160k| data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version); 1862| 160k| if (!data) ------------------ | Branch (1862:7): [True: 548, False: 159k] ------------------ 1863| 548| return -2; 1864| | 1865| 159k| vertex_offset += block_size; 1866| 159k| } 1867| | 1868| 8.75k| if (size_t(data_end - data) != tail_size_pad) ------------------ | Branch (1868:6): [True: 459, False: 8.29k] ------------------ 1869| 459| return -3; 1870| | 1871| 8.29k| return 0; 1872| 8.75k|} vertexcodec.cpp:_ZN7meshoptL27decodeBytesGroupBuildTablesEv: 792| 2|{ 793| 514| for (int mask = 0; mask < 256; ++mask) ------------------ | Branch (793:21): [True: 512, False: 2] ------------------ 794| 512| { 795| 512| unsigned char shuffle[8]; 796| 512| unsigned char count = 0; 797| | 798| 4.60k| for (int i = 0; i < 8; ++i) ------------------ | Branch (798:19): [True: 4.09k, False: 512] ------------------ 799| 4.09k| { 800| 4.09k| int maski = (mask >> i) & 1; 801| 4.09k| shuffle[i] = maski ? count : 0x80; ------------------ | Branch (801:17): [True: 2.04k, False: 2.04k] ------------------ 802| 4.09k| count += (unsigned char)(maski); 803| 4.09k| } 804| | 805| 512| memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8); 806| 512| kDecodeBytesGroupCount[mask] = count; 807| 512| } 808| | 809| 2| return true; 810| 2|} vertexcodec.cpp:_ZN7meshoptL14getCpuFeaturesEv: 1600| 2|{ 1601| 2| int cpuinfo[4] = {}; 1602| |#ifdef _MSC_VER 1603| | __cpuid(cpuinfo, 1); 1604| |#else 1605| | __cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); 1606| 2|#endif 1607| 2| return cpuinfo[2]; 1608| 2|} vertexcodec.cpp:_ZN7meshoptL18getVertexBlockSizeEm: 141| 34.1k|{ 142| | // make sure the entire block fits into the scratch buffer and is aligned to byte group size 143| | // note: the block size is implicitly part of the format, so we can't change it without breaking compatibility 144| 34.1k| size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1); 145| | 146| 34.1k| return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize; ------------------ | Branch (146:9): [True: 0, False: 34.1k] ------------------ 147| 34.1k|} vertexcodec.cpp:_ZN7meshoptL14estimateRotateEPKhmmmm: 370| 15.5k|{ 371| 15.5k| size_t sizes[8] = {}; 372| | 373| 15.5k| const unsigned char* vertex = vertex_data + k; 374| 15.5k| unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); 375| | 376| 4.10M| for (size_t i = 0; i < vertex_count; i += group_size) ------------------ | Branch (376:21): [True: 4.09M, False: 15.5k] ------------------ 377| 4.09M| { 378| 4.09M| unsigned int bitg = 0; 379| | 380| | // calculate bit consistency mask for the group 381| 69.4M| for (size_t j = 0; j < group_size && i + j < vertex_count; ++j) ------------------ | Branch (381:22): [True: 65.3M, False: 4.07M] | Branch (381:40): [True: 65.3M, False: 13.4k] ------------------ 382| 65.3M| { 383| 65.3M| unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); 384| 65.3M| unsigned int d = v ^ last; 385| | 386| 65.3M| bitg |= d; 387| 65.3M| last = v; 388| 65.3M| vertex += vertex_size; 389| 65.3M| } 390| | 391| |#if TRACE 392| | for (int j = 0; j < 32; ++j) 393| | vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1)); 394| |#endif 395| | 396| 36.8M| for (int j = 0; j < 8; ++j) ------------------ | Branch (396:19): [True: 32.7M, False: 4.09M] ------------------ 397| 32.7M| { 398| 32.7M| unsigned int bitr = rotate(bitg, j); 399| | 400| 32.7M| sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8)); 401| 32.7M| sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24)); 402| 32.7M| } 403| 4.09M| } 404| | 405| 15.5k| int best_rot = 0; 406| 124k| for (int rot = 1; rot < 8; ++rot) ------------------ | Branch (406:20): [True: 108k, False: 15.5k] ------------------ 407| 108k| best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot; ------------------ | Branch (407:14): [True: 6.76k, False: 102k] ------------------ 408| | 409| 15.5k| return best_rot; 410| 15.5k|} _ZN7meshopt6rotateEji: 150| 162M|{ 151| 162M| return (v << r) | (v >> ((32 - r) & 31)); 152| 162M|} vertexcodec.cpp:_ZN7meshoptL12estimateBitsEh: 365| 130M|{ 366| 130M| return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8; ------------------ | Branch (366:9): [True: 48.8M, False: 82.1M] | Branch (366:20): [True: 46.9M, False: 1.87M] | Branch (366:30): [True: 45.2M, False: 1.67M] ------------------ 367| 130M|} vertexcodec.cpp:_ZN7meshoptL15estimateChannelEPKhmmmmmii: 413| 17.8k|{ 414| 17.8k| unsigned char block[kVertexBlockMaxSize]; 415| 17.8k| assert(vertex_block_size <= kVertexBlockMaxSize); ------------------ | Branch (415:2): [True: 17.8k, False: 0] ------------------ 416| | 417| 17.8k| unsigned char last_vertex[256] = {}; 418| | 419| 17.8k| size_t sizes[3] = {}; 420| 17.8k| assert(max_channel <= 3); ------------------ | Branch (420:2): [True: 17.8k, False: 0] ------------------ 421| | 422| 151k| for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip) ------------------ | Branch (422:21): [True: 133k, False: 17.8k] ------------------ 423| 133k| { 424| 133k| size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i; ------------------ | Branch (424:23): [True: 120k, False: 13.4k] ------------------ 425| 133k| size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1); 426| | 427| 133k| memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size); ------------------ | Branch (427:38): [True: 17.8k, False: 115k] ------------------ 428| | 429| | // we sometimes encode elements we didn't fill when rounding to kByteGroupSize 430| 133k| if (block_size < block_size_aligned) ------------------ | Branch (430:7): [True: 11.9k, False: 121k] ------------------ 431| 11.9k| memset(block + block_size, 0, block_size_aligned - block_size); 432| | 433| 498k| for (int channel = 0; channel < max_channel; ++channel) ------------------ | Branch (433:25): [True: 365k, False: 133k] ------------------ 434| 1.82M| for (size_t j = 0; j < 4; ++j) ------------------ | Branch (434:23): [True: 1.46M, False: 365k] ------------------ 435| 1.46M| { 436| 1.46M| encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4)); 437| | 438| 22.9M| for (size_t ig = 0; ig < block_size; ig += kByteGroupSize) ------------------ | Branch (438:25): [True: 21.4M, False: 1.46M] ------------------ 439| 21.4M| { 440| | // to maximize encoding performance we only evaluate 1/2/4/8 bit groups 441| 21.4M| size_t size1 = encodeBytesGroupMeasure(block + ig, 1); 442| 21.4M| size_t size2 = encodeBytesGroupMeasure(block + ig, 2); 443| 21.4M| size_t size4 = encodeBytesGroupMeasure(block + ig, 4); 444| 21.4M| size_t size8 = encodeBytesGroupMeasure(block + ig, 8); 445| | 446| 21.4M| size_t best_size = size1 < size2 ? size1 : size2; ------------------ | Branch (446:25): [True: 20.3M, False: 1.11M] ------------------ 447| 21.4M| best_size = best_size < size4 ? best_size : size4; ------------------ | Branch (447:18): [True: 21.4M, False: 44.0k] ------------------ 448| 21.4M| best_size = best_size < size8 ? best_size : size8; ------------------ | Branch (448:18): [True: 11.9M, False: 9.52M] ------------------ 449| | 450| 21.4M| sizes[channel] += best_size; 451| 21.4M| } 452| 1.46M| } 453| 133k| } 454| | 455| 17.8k| int best_channel = 0; 456| 51.2k| for (int channel = 1; channel < max_channel; ++channel) ------------------ | Branch (456:24): [True: 33.4k, False: 17.8k] ------------------ 457| 33.4k| best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel; ------------------ | Branch (457:18): [True: 4.94k, False: 28.4k] ------------------ 458| | 459| 17.8k| return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel; ------------------ | Branch (459:9): [True: 2.24k, False: 15.6k] ------------------ 460| 17.8k|} vertexcodec.cpp:_ZN7meshoptL12encodeDeltasEPhPKhmmS2_mi: 350| 4.83M|{ 351| 4.83M| switch (channel & 3) 352| 4.83M| { 353| 3.62M| case 0: ------------------ | Branch (353:2): [True: 3.62M, False: 1.21M] ------------------ 354| 3.62M| return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); 355| 662k| case 1: ------------------ | Branch (355:2): [True: 662k, False: 4.17M] ------------------ 356| 662k| return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); 357| 549k| case 2: ------------------ | Branch (357:2): [True: 549k, False: 4.28M] ------------------ 358| 549k| return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4); 359| 0| default: ------------------ | Branch (359:2): [True: 0, False: 4.83M] ------------------ 360| | assert(!"Unsupported channel encoding"); // unreachable ------------------ | Branch (360:3): [Folded, False: 0] ------------------ 361| 4.83M| } 362| 4.83M|} vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1IhLb0EEEvPhPKhmmS3_mi: 325| 3.62M|{ 326| 3.62M| size_t k0 = k & ~(sizeof(T) - 1); 327| 3.62M| int ks = (k & (sizeof(T) - 1)) * 8; 328| | 329| 3.62M| T p = last_vertex[k0]; 330| 3.62M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (330:21): [True: 0, False: 3.62M] ------------------ 331| 0| p |= T(last_vertex[k0 + j]) << (j * 8); 332| | 333| 3.62M| const unsigned char* vertex = vertex_data + k0; 334| | 335| 877M| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (335:21): [True: 874M, False: 3.62M] ------------------ 336| 874M| { 337| 874M| T v = vertex[0]; 338| 874M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (338:22): [True: 0, False: 874M] ------------------ 339| 0| v |= vertex[j] << (j * 8); 340| | 341| 874M| T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); ------------------ | Branch (341:9): [Folded, False: 874M] ------------------ 342| | 343| 874M| buffer[i] = (unsigned char)(d >> ks); 344| 874M| p = v; 345| 874M| vertex += vertex_size; 346| 874M| } 347| 3.62M|} _ZN7meshopt6zigzagIhEET_S1_: 156| 874M|{ 157| 874M| return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1); 158| 874M|} vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1ItLb0EEEvPhPKhmmS3_mi: 325| 662k|{ 326| 662k| size_t k0 = k & ~(sizeof(T) - 1); 327| 662k| int ks = (k & (sizeof(T) - 1)) * 8; 328| | 329| 662k| T p = last_vertex[k0]; 330| 1.32M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (330:21): [True: 662k, False: 662k] ------------------ 331| 662k| p |= T(last_vertex[k0 + j]) << (j * 8); 332| | 333| 662k| const unsigned char* vertex = vertex_data + k0; 334| | 335| 157M| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (335:21): [True: 156M, False: 662k] ------------------ 336| 156M| { 337| 156M| T v = vertex[0]; 338| 313M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (338:22): [True: 156M, False: 156M] ------------------ 339| 156M| v |= vertex[j] << (j * 8); 340| | 341| 156M| T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); ------------------ | Branch (341:9): [Folded, False: 156M] ------------------ 342| | 343| 156M| buffer[i] = (unsigned char)(d >> ks); 344| 156M| p = v; 345| 156M| vertex += vertex_size; 346| 156M| } 347| 662k|} _ZN7meshopt6zigzagItEET_S1_: 156| 156M|{ 157| 156M| return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1); 158| 156M|} vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1IjLb1EEEvPhPKhmmS3_mi: 325| 549k|{ 326| 549k| size_t k0 = k & ~(sizeof(T) - 1); 327| 549k| int ks = (k & (sizeof(T) - 1)) * 8; 328| | 329| 549k| T p = last_vertex[k0]; 330| 2.19M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (330:21): [True: 1.64M, False: 549k] ------------------ 331| 1.64M| p |= T(last_vertex[k0 + j]) << (j * 8); 332| | 333| 549k| const unsigned char* vertex = vertex_data + k0; 334| | 335| 130M| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (335:21): [True: 129M, False: 549k] ------------------ 336| 129M| { 337| 129M| T v = vertex[0]; 338| 518M| for (size_t j = 1; j < sizeof(T); ++j) ------------------ | Branch (338:22): [True: 388M, False: 129M] ------------------ 339| 388M| v |= vertex[j] << (j * 8); 340| | 341| 129M| T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); ------------------ | Branch (341:9): [True: 129M, Folded] ------------------ 342| | 343| 129M| buffer[i] = (unsigned char)(d >> ks); 344| 129M| p = v; 345| 129M| vertex += vertex_size; 346| 129M| } 347| 549k|} vertexcodec.cpp:_ZN7meshoptL23encodeBytesGroupMeasureEPKhi: 191| 277M|{ 192| 277M| assert(bits >= 0 && bits <= 8); ------------------ | Branch (192:2): [True: 277M, False: 0] | Branch (192:2): [True: 277M, False: 0] | Branch (192:2): [True: 277M, False: 0] ------------------ 193| | 194| 277M| if (bits == 0) ------------------ | Branch (194:6): [True: 30.4M, False: 247M] ------------------ 195| 30.4M| return encodeBytesGroupZero(buffer) ? 0 : size_t(-1); ------------------ | Branch (195:10): [True: 11.2M, False: 19.1M] ------------------ 196| | 197| 247M| if (bits == 8) ------------------ | Branch (197:6): [True: 60.2M, False: 186M] ------------------ 198| 60.2M| return kByteGroupSize; 199| | 200| 186M| size_t result = kByteGroupSize * bits / 8; 201| | 202| 186M| unsigned char sentinel = (1 << bits) - 1; 203| | 204| 3.17G| for (size_t i = 0; i < kByteGroupSize; ++i) ------------------ | Branch (204:21): [True: 2.98G, False: 186M] ------------------ 205| 2.98G| result += buffer[i] >= sentinel; 206| | 207| 186M| return result; 208| 247M|} vertexcodec.cpp:_ZN7meshoptL20encodeBytesGroupZeroEPKh: 181| 48.8M|{ 182| 48.8M| assert(kByteGroupSize == sizeof(unsigned long long) * 2); ------------------ | Branch (182:2): [True: 48.8M, Folded] ------------------ 183| | 184| 48.8M| unsigned long long v[2]; 185| 48.8M| memcpy(v, buffer, sizeof(v)); 186| | 187| 48.8M| return (v[0] | v[1]) == 0; 188| 48.8M|} vertexcodec.cpp:_ZN7meshoptL17encodeVertexBlockEPhS0_PKhmmS0_S2_ii: 510| 318k|{ 511| 318k| assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); ------------------ | Branch (511:2): [True: 318k, False: 0] | Branch (511:2): [True: 318k, False: 0] | Branch (511:2): [True: 318k, False: 0] ------------------ 512| 318k| assert(vertex_size % 4 == 0); ------------------ | Branch (512:2): [True: 318k, False: 0] ------------------ 513| | 514| 318k| unsigned char buffer[kVertexBlockMaxSize]; 515| 318k| assert(sizeof(buffer) % kByteGroupSize == 0); ------------------ | Branch (515:2): [True: 318k, Folded] ------------------ 516| | 517| 318k| size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); 518| | 519| | // we sometimes encode elements we didn't fill when rounding to kByteGroupSize 520| 318k| memset(buffer, 0, sizeof(buffer)); 521| | 522| 318k| size_t control_size = version == 0 ? 0 : vertex_size / 4; ------------------ | Branch (522:24): [True: 45.4k, False: 272k] ------------------ 523| 318k| if (size_t(data_end - data) < control_size) ------------------ | Branch (523:6): [True: 0, False: 318k] ------------------ 524| 0| return NULL; 525| | 526| 318k| unsigned char* control = data; 527| 318k| data += control_size; 528| | 529| 318k| memset(control, 0, control_size); 530| | 531| 3.69M| for (size_t k = 0; k < vertex_size; ++k) ------------------ | Branch (531:21): [True: 3.37M, False: 318k] ------------------ 532| 3.37M| { 533| 3.37M| encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]); ------------------ | Branch (533:80): [True: 493k, False: 2.88M] ------------------ 534| | 535| |#if TRACE 536| | const unsigned char* olddata = data; 537| | bytestats = &vertexstats[k]; 538| |#endif 539| | 540| 3.37M| int ctrl = 0; 541| | 542| 3.37M| if (version != 0) ------------------ | Branch (542:7): [True: 2.88M, False: 493k] ------------------ 543| 2.88M| { 544| 2.88M| ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level); 545| | 546| 2.88M| assert(unsigned(ctrl) < 4); ------------------ | Branch (546:4): [True: 2.88M, False: 0] ------------------ 547| 2.88M| control[k / 4] |= ctrl << ((k % 4) * 2); 548| | 549| |#if TRACE 550| | vertexstats[k].ctrl[ctrl]++; 551| |#endif 552| 2.88M| } 553| | 554| 3.37M| if (ctrl == 3) ------------------ | Branch (554:7): [True: 774k, False: 2.60M] ------------------ 555| 774k| { 556| | // literal encoding 557| 774k| if (size_t(data_end - data) < vertex_count) ------------------ | Branch (557:8): [True: 0, False: 774k] ------------------ 558| 0| return NULL; 559| | 560| 774k| memcpy(data, buffer, vertex_count); 561| 774k| data += vertex_count; 562| 774k| } 563| 2.60M| else if (ctrl != 2) // non-zero encoding ------------------ | Branch (563:12): [True: 1.60M, False: 996k] ------------------ 564| 1.60M| { 565| 1.60M| data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); ------------------ | Branch (565:69): [True: 493k, False: 1.11M] ------------------ 566| 1.60M| if (!data) ------------------ | Branch (566:8): [True: 0, False: 1.60M] ------------------ 567| 0| return NULL; 568| 1.60M| } 569| | 570| |#if TRACE 571| | bytestats = NULL; 572| | vertexstats[k].size += data - olddata; 573| |#endif 574| 3.37M| } 575| | 576| 318k| memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size); 577| | 578| 318k| return data; 579| 318k|} vertexcodec.cpp:_ZN7meshoptL15estimateControlEPKhmmi: 472| 2.88M|{ 473| 2.88M| if (estimateControlZero(buffer, vertex_count_aligned)) ------------------ | Branch (473:6): [True: 996k, False: 1.88M] ------------------ 474| 996k| return 2; // zero encoding 475| | 476| 1.88M| if (level == 0) ------------------ | Branch (476:6): [True: 649k, False: 1.23M] ------------------ 477| 649k| return 1; // 1248 encoding in level 0 for encoding speed 478| | 479| | // round number of groups to 4 to get number of header bytes 480| 1.23M| size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4; 481| | 482| 1.23M| size_t est_bytes0 = header_size, est_bytes1 = header_size; 483| | 484| 20.1M| for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) ------------------ | Branch (484:21): [True: 18.9M, False: 1.23M] ------------------ 485| 18.9M| { 486| | // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance 487| 18.9M| size_t size0 = encodeBytesGroupMeasure(buffer + i, 0); 488| 18.9M| size_t size1 = encodeBytesGroupMeasure(buffer + i, 1); 489| 18.9M| size_t size2 = encodeBytesGroupMeasure(buffer + i, 2); 490| 18.9M| size_t size4 = encodeBytesGroupMeasure(buffer + i, 4); 491| 18.9M| size_t size8 = encodeBytesGroupMeasure(buffer + i, 8); 492| | 493| | // both control modes have access to 1/2/4 bit encoding 494| 18.9M| size_t size12 = size1 < size2 ? size1 : size2; ------------------ | Branch (494:19): [True: 16.9M, False: 1.96M] ------------------ 495| 18.9M| size_t size124 = size12 < size4 ? size12 : size4; ------------------ | Branch (495:20): [True: 18.8M, False: 70.4k] ------------------ 496| | 497| | // each control mode has access to 0/8 bit encoding respectively 498| 18.9M| est_bytes0 += size124 < size0 ? size124 : size0; ------------------ | Branch (498:17): [True: 15.5M, False: 3.35M] ------------------ 499| 18.9M| est_bytes1 += size124 < size8 ? size124 : size8; ------------------ | Branch (499:17): [True: 6.54M, False: 12.3M] ------------------ 500| 18.9M| } 501| | 502| | // pick shortest control entry but prefer literal encoding 503| 1.23M| if (est_bytes0 < vertex_count || est_bytes1 < vertex_count) ------------------ | Branch (503:6): [True: 441k, False: 797k] | Branch (503:35): [True: 22.8k, False: 774k] ------------------ 504| 464k| return est_bytes0 < est_bytes1 ? 0 : 1; ------------------ | Branch (504:10): [True: 277k, False: 186k] ------------------ 505| 774k| else 506| 774k| return 3; // literal encoding 507| 1.23M|} vertexcodec.cpp:_ZN7meshoptL19estimateControlZeroEPKhm: 463| 2.88M|{ 464| 19.4M| for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) ------------------ | Branch (464:21): [True: 18.4M, False: 996k] ------------------ 465| 18.4M| if (!encodeBytesGroupZero(buffer + i)) ------------------ | Branch (465:7): [True: 1.88M, False: 16.5M] ------------------ 466| 1.88M| return false; 467| | 468| 996k| return true; 469| 2.88M|} vertexcodec.cpp:_ZN7meshoptL11encodeBytesEPhS0_PKhmPKi: 264| 1.60M|{ 265| 1.60M| assert(buffer_size % kByteGroupSize == 0); ------------------ | Branch (265:2): [True: 1.60M, False: 0] ------------------ 266| | 267| 1.60M| unsigned char* header = data; 268| | 269| | // round number of groups to 4 to get number of header bytes 270| 1.60M| size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; 271| | 272| 1.60M| if (size_t(data_end - data) < header_size) ------------------ | Branch (272:6): [True: 0, False: 1.60M] ------------------ 273| 0| return NULL; 274| | 275| 1.60M| data += header_size; 276| | 277| 1.60M| memset(header, 0, header_size); 278| | 279| 1.60M| int last_bits = -1; 280| | 281| 25.9M| for (size_t i = 0; i < buffer_size; i += kByteGroupSize) ------------------ | Branch (281:21): [True: 24.3M, False: 1.60M] ------------------ 282| 24.3M| { 283| 24.3M| if (size_t(data_end - data) < kByteGroupDecodeLimit) ------------------ | Branch (283:7): [True: 0, False: 24.3M] ------------------ 284| 0| return NULL; 285| | 286| 24.3M| int best_bitk = 3; 287| 24.3M| size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]); 288| | 289| 97.2M| for (int bitk = 0; bitk < 3; ++bitk) ------------------ | Branch (289:22): [True: 72.9M, False: 24.3M] ------------------ 290| 72.9M| { 291| 72.9M| size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]); 292| | 293| | // favor consistent bit selection across groups, but never replace literals 294| 72.9M| if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8)) ------------------ | Branch (294:8): [True: 15.1M, False: 57.7M] | Branch (294:29): [True: 1.21M, False: 56.5M] | Branch (294:50): [True: 297k, False: 916k] | Branch (294:77): [True: 48.5k, False: 249k] ------------------ 295| 15.2M| { 296| 15.2M| best_bitk = bitk; 297| 15.2M| best_size = size; 298| 15.2M| } 299| 72.9M| } 300| | 301| 24.3M| size_t header_offset = i / kByteGroupSize; 302| 24.3M| header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2); 303| | 304| 24.3M| int best_bits = bits[best_bitk]; 305| 24.3M| unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits); 306| | 307| 24.3M| assert(data + best_size == next); ------------------ | Branch (307:3): [True: 24.3M, False: 0] ------------------ 308| 24.3M| data = next; 309| 24.3M| last_bits = best_bits; 310| | 311| |#if TRACE 312| | bytestats->bitg[best_bits] += best_size; 313| |#endif 314| 24.3M| } 315| | 316| |#if TRACE 317| | bytestats->header += header_size; 318| |#endif 319| | 320| 1.60M| return data; 321| 1.60M|} vertexcodec.cpp:_ZN7meshoptL16encodeBytesGroupEPhPKhi: 211| 24.3M|{ 212| 24.3M| assert(bits >= 0 && bits <= 8); ------------------ | Branch (212:2): [True: 24.3M, False: 0] | Branch (212:2): [True: 24.3M, False: 0] | Branch (212:2): [True: 24.3M, False: 0] ------------------ 213| 24.3M| assert(kByteGroupSize % 8 == 0); ------------------ | Branch (213:2): [True: 24.3M, Folded] ------------------ 214| | 215| 24.3M| if (bits == 0) ------------------ | Branch (215:6): [True: 7.92M, False: 16.3M] ------------------ 216| 7.92M| return data; 217| | 218| 16.3M| if (bits == 8) ------------------ | Branch (218:6): [True: 10.5M, False: 5.83M] ------------------ 219| 10.5M| { 220| 10.5M| memcpy(data, buffer, kByteGroupSize); 221| 10.5M| return data + kByteGroupSize; 222| 10.5M| } 223| | 224| 5.83M| size_t byte_size = 8 / bits; 225| 5.83M| assert(kByteGroupSize % byte_size == 0); ------------------ | Branch (225:2): [True: 5.83M, False: 0] ------------------ 226| | 227| | // fixed portion: bits bits for each value 228| | // variable portion: full byte for each out-of-range value (using 1...1 as sentinel) 229| 5.83M| unsigned char sentinel = (1 << bits) - 1; 230| | 231| 22.3M| for (size_t i = 0; i < kByteGroupSize; i += byte_size) ------------------ | Branch (231:21): [True: 16.4M, False: 5.83M] ------------------ 232| 16.4M| { 233| 16.4M| unsigned char byte = 0; 234| | 235| 109M| for (size_t k = 0; k < byte_size; ++k) ------------------ | Branch (235:22): [True: 93.3M, False: 16.4M] ------------------ 236| 93.3M| { 237| 93.3M| unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k]; ------------------ | Branch (237:24): [True: 32.6M, False: 60.7M] ------------------ 238| | 239| 93.3M| byte <<= bits; 240| 93.3M| byte |= enc; 241| 93.3M| } 242| | 243| | // encode 1-bit groups in reverse bit order 244| | // this makes them faster to decode alongside other groups 245| 16.4M| if (bits == 1) ------------------ | Branch (245:7): [True: 7.18M, False: 9.29M] ------------------ 246| 7.18M| byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); 247| | 248| 16.4M| *data++ = byte; 249| 16.4M| } 250| | 251| 99.1M| for (size_t i = 0; i < kByteGroupSize; ++i) ------------------ | Branch (251:21): [True: 93.3M, False: 5.83M] ------------------ 252| 93.3M| { 253| 93.3M| unsigned char v = buffer[i]; 254| | 255| | // branchless append of out-of-range values 256| 93.3M| *data = v; 257| 93.3M| data += v >= sentinel; 258| 93.3M| } 259| | 260| 5.83M| return data; 261| 5.83M|} vertexcodec.cpp:_ZN7meshoptL21decodeVertexBlockSimdEPKhS1_PhmmS2_S1_i: 1515| 160k|{ 1516| 160k| assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); ------------------ | Branch (1516:2): [True: 160k, False: 0] | Branch (1516:2): [True: 160k, False: 0] | Branch (1516:2): [True: 160k, False: 0] ------------------ 1517| | 1518| 160k| unsigned char buffer[kVertexBlockMaxSize * 4]; 1519| 160k| unsigned char transposed[kVertexBlockSizeBytes]; 1520| | 1521| 160k| size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); 1522| | 1523| | // we can decode directly into the output buffer if vertex count is aligned to 16 (delta decode works 16 vertices at a time) 1524| | // this uses strided writes and also reads the last vertex once, which is bad for performance for write-combined memory so we only enable this if configured 1525| |#ifdef MESHOPTIMIZER_VERTEXCODEC_ZEROCOPY 1526| | unsigned char* target = vertex_count == vertex_count_aligned ? vertex_data : transposed; 1527| |#else 1528| 160k| unsigned char* target = transposed; 1529| 160k|#endif 1530| | 1531| 160k| size_t control_size = version == 0 ? 0 : vertex_size / 4; ------------------ | Branch (1531:24): [True: 22.9k, False: 137k] ------------------ 1532| 160k| if (size_t(data_end - data) < control_size) ------------------ | Branch (1532:6): [True: 0, False: 160k] ------------------ 1533| 0| return NULL; 1534| | 1535| 160k| const unsigned char* control = data; 1536| 160k| data += control_size; 1537| | 1538| 584k| for (size_t k = 0; k < vertex_size; k += 4) ------------------ | Branch (1538:21): [True: 425k, False: 159k] ------------------ 1539| 425k| { 1540| 425k| unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4]; ------------------ | Branch (1540:29): [True: 62.3k, False: 362k] ------------------ 1541| | 1542| 2.12M| for (size_t j = 0; j < 4; ++j) ------------------ | Branch (1542:22): [True: 1.70M, False: 424k] ------------------ 1543| 1.70M| { 1544| 1.70M| int ctrl = (ctrl_byte >> (j * 2)) & 3; 1545| | 1546| 1.70M| if (ctrl == 3) ------------------ | Branch (1546:8): [True: 388k, False: 1.31M] ------------------ 1547| 388k| { 1548| | // literal encoding; safe to over-copy due to tail 1549| 388k| if (size_t(data_end - data) < vertex_count_aligned) ------------------ | Branch (1549:9): [True: 72, False: 388k] ------------------ 1550| 72| return NULL; 1551| | 1552| 388k| memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned); 1553| 388k| data += vertex_count; 1554| 388k| } 1555| 1.31M| else if (ctrl == 2) ------------------ | Branch (1555:13): [True: 501k, False: 810k] ------------------ 1556| 501k| { 1557| | // zero encoding 1558| 501k| memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned); 1559| 501k| } 1560| 810k| else 1561| 810k| { 1562| | // for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8 1563| 810k| int hshift = version == 0 ? 0 : 4 + ctrl; ------------------ | Branch (1563:18): [True: 249k, False: 561k] ------------------ 1564| | 1565| 810k| data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift); 1566| 810k| if (!data) ------------------ | Branch (1566:9): [True: 307, False: 810k] ------------------ 1567| 307| return NULL; 1568| 810k| } 1569| 1.70M| } 1570| | 1571| 424k| int channel = version == 0 ? 0 : channels[k / 4]; ------------------ | Branch (1571:17): [True: 62.2k, False: 362k] ------------------ 1572| | 1573| 424k| switch (channel & 3) 1574| 424k| { 1575| 388k| case 0: ------------------ | Branch (1575:3): [True: 388k, False: 36.6k] ------------------ 1576| 388k| decodeDeltas4Simd<0>(buffer, target + k, vertex_count_aligned, vertex_size, last_vertex + k, 0); 1577| 388k| break; 1578| 16.2k| case 1: ------------------ | Branch (1578:3): [True: 16.2k, False: 408k] ------------------ 1579| 16.2k| decodeDeltas4Simd<1>(buffer, target + k, vertex_count_aligned, vertex_size, last_vertex + k, 0); 1580| 16.2k| break; 1581| 20.2k| case 2: ------------------ | Branch (1581:3): [True: 20.2k, False: 404k] ------------------ 1582| 20.2k| decodeDeltas4Simd<2>(buffer, target + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31); 1583| 20.2k| break; 1584| 169| default: ------------------ | Branch (1584:3): [True: 169, False: 424k] ------------------ 1585| 169| return NULL; // invalid channel type 1586| 424k| } 1587| 424k| } 1588| | 1589| 159k| if (target == transposed) ------------------ | Branch (1589:6): [True: 159k, False: 0] ------------------ 1590| 159k| memcpy(vertex_data, transposed, vertex_count * vertex_size); 1591| | 1592| 159k| memcpy(last_vertex, &target[vertex_size * (vertex_count - 1)], vertex_size); 1593| | 1594| 159k| return data; 1595| 160k|} vertexcodec.cpp:_ZN7meshoptL15decodeBytesSimdEPKhS1_Phmi: 1370| 810k|{ 1371| 810k| assert(buffer_size % kByteGroupSize == 0); ------------------ | Branch (1371:2): [True: 810k, False: 0] ------------------ 1372| 810k| assert(kByteGroupSize == 16); ------------------ | Branch (1372:2): [True: 810k, Folded] ------------------ 1373| | 1374| | // round number of groups to 4 to get number of header bytes 1375| 810k| size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; 1376| 810k| if (size_t(data_end - data) < header_size) ------------------ | Branch (1376:6): [True: 13, False: 810k] ------------------ 1377| 13| return NULL; 1378| | 1379| 810k| const unsigned char* header = data; 1380| 810k| data += header_size; 1381| | 1382| 810k| size_t i = 0; 1383| | 1384| | // fast-path: process 4 groups at a time, do a shared bounds check 1385| 3.82M| for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4) ------------------ | Branch (1385:9): [True: 3.02M, False: 805k] | Branch (1385:50): [True: 3.01M, False: 4.59k] ------------------ 1386| 3.01M| { 1387| 3.01M| size_t header_offset = i / kByteGroupSize; 1388| 3.01M| unsigned char header_byte = header[header_offset / 4]; 1389| | 1390| 3.01M|#if defined(SIMD_SSE) || defined(SIMD_AVX) 1391| | // very-fast-path: for consecutive 4 groups that are all 0-bit (v0/0, v1/0/0000) or 8-bit (v0/3333, v1/1/3333), 1392| | // the branchless decoders are slower than branching over the decoding of 4 groups and issuing a few load/store ops 1393| 3.01M| if (hshift != 5 && header_byte == 0) ------------------ | Branch (1393:7): [True: 1.42M, False: 1.58M] | Branch (1393:22): [True: 835k, False: 590k] ------------------ 1394| 835k| { 1395| 835k| memset(buffer + i, 0, kByteGroupSize * 4); 1396| 835k| continue; 1397| 835k| } 1398| 2.18M| else if (hshift != 4 && header_byte == 255) ------------------ | Branch (1398:12): [True: 1.91M, False: 263k] | Branch (1398:27): [True: 1.20M, False: 709k] ------------------ 1399| 1.20M| { 1400| 1.20M| memcpy(buffer + i, data, kByteGroupSize * 4); 1401| 1.20M| data += kByteGroupSize * 4; 1402| 1.20M| continue; 1403| 1.20M| } 1404| 973k|#endif 1405| | 1406| 973k| data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3)); 1407| 973k| data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3)); 1408| 973k| data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3)); 1409| 973k| data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3)); 1410| 973k| } 1411| | 1412| | // slow-path: process remaining groups 1413| 927k| for (; i < buffer_size; i += kByteGroupSize) ------------------ | Branch (1413:9): [True: 117k, False: 810k] ------------------ 1414| 117k| { 1415| 117k| if (size_t(data_end - data) < kByteGroupDecodeLimit) ------------------ | Branch (1415:7): [True: 294, False: 117k] ------------------ 1416| 294| return NULL; 1417| | 1418| 117k| size_t header_offset = i / kByteGroupSize; 1419| 117k| unsigned char header_byte = header[header_offset / 4]; 1420| | 1421| 117k| data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3)); 1422| 117k| } 1423| | 1424| 810k| return data; 1425| 810k|} vertexcodec.cpp:_ZN7meshoptL20decodeBytesGroupSimdEPKhPhi: 831| 4.01M|{ 832| | // 0 for 1-bit, 1 for 2-bit, 2 for 4-bit, 3 for 8-bit, and 4 for 0-bit as it makes some of the uses easier 833| 4.01M| static const int hbtn[9] = {4, 1, 2, 3, 4, 0, 1, 2, 3}; 834| | 835| 4.01M| int n = hbtn[hbits]; 836| | 837| 4.01M|#ifdef SIMD_LATENCYOPT 838| 4.01M| unsigned long long data64; 839| 4.01M| memcpy(&data64, data, 8); 840| 4.01M| data64 &= data64 >> n; 841| 4.01M| data64 &= data64 >> (n >> 1); 842| | 843| | // mask out one bit per group that is set if all group bits were 1 844| 4.01M| static const unsigned long long lanes[9] = {0, 0x55555555, 0x1111111111111111ull, 0, 0, 0xffff, 0x55555555, 0x1111111111111111ull, 0}; 845| 4.01M| int datacnt = int(_mm_popcnt_u64(data64 & lanes[hbits])); 846| 4.01M|#endif 847| | 848| | // for 8-bit groups, instead of loading the bytes through 'data', we load them through 'skip' as they are easier to preserve 849| | // for 0-bit groups, the load results get discarded because mask is always 0; in both cases the shift wraps to zero 850| 4.01M| const unsigned char* skip = data + ((2 << n) & 15); 851| | 852| 4.01M| __m128i selb = _mm_loadl_epi64(reinterpret_cast(data)); 853| 4.01M| __m128i rest = _mm_loadu_si128(reinterpret_cast(skip)); 854| | 855| | // unpack 1, 2 or 4-bit values: shuffle replicates each source byte into both halves of a 16-bit lane 856| | // mulhi extracts even and odd fields into the low byte; the results are interleaved back with shift/or 857| 4.01M| __m128i selw = _mm_shuffle_epi8(selb, _mm_loadu_si128(reinterpret_cast(kDecodeBytesGroupConfig[hbits][1]))); 858| 4.01M| __m128i sel0 = _mm_mulhi_epu16(selw, _mm_loadu_si128(reinterpret_cast(kDecodeBytesGroupConfig[hbits][2]))); 859| 4.01M| __m128i sel1 = _mm_mulhi_epu16(selw, _mm_loadu_si128(reinterpret_cast(kDecodeBytesGroupConfig[hbits][3]))); 860| 4.01M| __m128i seli = _mm_or_si128(sel0, _mm_slli_epi16(sel1, 8)); 861| | 862| | // the interleaved fields are masked by the bit count (special handling: for 0/8-bit values, mul produces 0) 863| 4.01M| __m128i sent = _mm_loadu_si128(reinterpret_cast(kDecodeBytesGroupConfig[hbits][0])); 864| 4.01M| __m128i sel = _mm_and_si128(seli, sent); 865| | 866| | // compare sel to sentinel; returns 0 for 0-bit (mul produces 0, sent is 1), 1 for 8-bit (mul produces 0, sent is 0) 867| 4.01M| __m128i mask = _mm_cmpeq_epi8(sel, sent); 868| 4.01M| int mask16 = _mm_movemask_epi8(mask); 869| 4.01M| unsigned char mask0 = (unsigned char)(mask16 & 255); 870| 4.01M| unsigned char mask1 = (unsigned char)(mask16 >> 8); 871| | 872| | // decode shuffle mask from two halves; second half needs to be shifted by popcount(mask0) 873| 4.01M| __m128i sm0 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask0])); 874| 4.01M| __m128i sm1 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask1])); 875| | 876| | // each lane of mask is 0x00 or 0xff; sad yields 255*popcount(mask0) in low word => low byte is -popcount(mask0) 877| 4.01M| __m128i npops = _mm_sad_epu8(mask, _mm_setzero_si128()); 878| 4.01M| __m128i sm1r = _mm_sub_epi8(sm1, _mm_shuffle_epi8(npops, _mm_setzero_si128())); 879| 4.01M| __m128i shuf = _mm_unpacklo_epi64(sm0, sm1r); 880| | 881| | // expand rest via shuffle mask and combine with sel; shuffle mask zeroes out bytes that are replaced by sel 882| 4.01M| __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); 883| | 884| 4.01M| _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); 885| | 886| 4.01M|#ifdef SIMD_LATENCYOPT 887| | // datacnt is 0 for 8-bit groups so we can't use skip to advance; 0-bit groups wrap the shift to zero 888| 4.01M| return data + ((2 << n) & 31) + datacnt; 889| |#else 890| | return skip + _mm_popcnt_u32(mask16); 891| |#endif 892| 4.01M|} vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi0EEEvPKhPhmmS3_i: 1430| 388k|{ 1431| 388k|#if defined(SIMD_SSE) || defined(SIMD_AVX) 1432| 388k|#define TEMP __m128i 1433| 388k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) 1434| 388k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) 1435| 388k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) 1436| 388k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) 1437| 388k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size 1438| 388k|#endif 1439| | 1440| |#ifdef SIMD_NEON 1441| |#define TEMP uint8x8_t 1442| |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0)) 1443| |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) 1444| |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) 1445| |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i)) 1446| |#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size 1447| |#endif 1448| | 1449| |#ifdef SIMD_WASM 1450| |#define TEMP v128_t 1451| |#define PREP() v128_t pi = wasm_v128_load(last_vertex) 1452| |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) 1453| |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) 1454| |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i)) 1455| |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size 1456| |#endif 1457| | 1458| 388k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) 1459| | 1460| 388k| PREP(); ------------------ | | 1433| 388k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) ------------------ 1461| | 1462| 388k| unsigned char* savep = transposed; 1463| | 1464| 6.25M| for (size_t j = 0; j < vertex_count_aligned; j += 16) ------------------ | Branch (1464:21): [True: 5.87M, False: 388k] ------------------ 1465| 5.87M| { 1466| 5.87M| LOAD(0); ------------------ | | 1434| 5.87M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1467| 5.87M| LOAD(1); ------------------ | | 1434| 5.87M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1468| 5.87M| LOAD(2); ------------------ | | 1434| 5.87M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1469| 5.87M| LOAD(3); ------------------ | | 1434| 5.87M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1470| | 1471| 5.87M| transpose8(r0, r1, r2, r3); 1472| | 1473| 5.87M| TEMP t0, t1, t2, t3; ------------------ | | 1432| 5.87M|#define TEMP __m128i ------------------ 1474| 5.87M| TEMP npi = pi; ------------------ | | 1432| 5.87M|#define TEMP __m128i ------------------ 1475| | 1476| 5.87M| UNZR(0); ------------------ | | 1458| 5.87M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [True: 5.87M, Folded] | | | Branch (1458:58): [Folded, False: 0] | | ------------------ ------------------ 1477| 5.87M| GRP4(0); ------------------ | | 1435| 5.87M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1478| 5.87M| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ 1479| 5.87M| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1480| | 1481| 5.87M| UNZR(1); ------------------ | | 1458| 5.87M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [True: 5.87M, Folded] | | | Branch (1458:58): [Folded, False: 0] | | ------------------ ------------------ 1482| 5.87M| GRP4(1); ------------------ | | 1435| 5.87M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1483| 5.87M| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ 1484| 5.87M| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1485| | 1486| 5.87M| UNZR(2); ------------------ | | 1458| 5.87M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [True: 5.87M, Folded] | | | Branch (1458:58): [Folded, False: 0] | | ------------------ ------------------ 1487| 5.87M| GRP4(2); ------------------ | | 1435| 5.87M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1488| 5.87M| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ 1489| 5.87M| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1490| | 1491| 5.87M| UNZR(3); ------------------ | | 1458| 5.87M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [True: 5.87M, Folded] | | | Branch (1458:58): [Folded, False: 0] | | ------------------ ------------------ 1492| 5.87M| GRP4(3); ------------------ | | 1435| 5.87M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1493| 5.87M| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [True: 5.87M, Folded] | | | Branch (1436:70): [Folded, False: 0] | | ------------------ ------------------ 1494| 5.87M| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 5.87M|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1495| | 1496| |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32)) 1497| | // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations 1498| | pi = rebase(npi, r0, r1, r2, r3); 1499| |#else 1500| 5.87M| (void)npi; 1501| 5.87M|#endif 1502| | 1503| 5.87M|#undef UNZR 1504| 5.87M|#undef TEMP 1505| 5.87M|#undef PREP 1506| 5.87M|#undef LOAD 1507| 5.87M|#undef GRP4 1508| 5.87M|#undef FIXD 1509| 5.87M|#undef SAVE 1510| 5.87M| } 1511| 388k|} _ZN7meshopt10transpose8ERDv2_xS1_S1_S1_: 1219| 6.42M|{ 1220| 6.42M| __m128i t0 = _mm_unpacklo_epi8(x0, x1); 1221| 6.42M| __m128i t1 = _mm_unpackhi_epi8(x0, x1); 1222| 6.42M| __m128i t2 = _mm_unpacklo_epi8(x2, x3); 1223| 6.42M| __m128i t3 = _mm_unpackhi_epi8(x2, x3); 1224| | 1225| 6.42M| x0 = _mm_unpacklo_epi16(t0, t2); 1226| 6.42M| x1 = _mm_unpackhi_epi16(t0, t2); 1227| 6.42M| x2 = _mm_unpacklo_epi16(t1, t3); 1228| 6.42M| x3 = _mm_unpackhi_epi16(t1, t3); 1229| 6.42M|} _ZN7meshopt9unzigzag8EDv2_x: 1233| 23.4M|{ 1234| 23.4M| __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); 1235| 23.4M| __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); 1236| | 1237| 23.4M| return _mm_xor_si128(xl, xr); 1238| 23.4M|} vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi1EEEvPKhPhmmS3_i: 1430| 16.2k|{ 1431| 16.2k|#if defined(SIMD_SSE) || defined(SIMD_AVX) 1432| 16.2k|#define TEMP __m128i 1433| 16.2k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) 1434| 16.2k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) 1435| 16.2k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) 1436| 16.2k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) 1437| 16.2k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size 1438| 16.2k|#endif 1439| | 1440| |#ifdef SIMD_NEON 1441| |#define TEMP uint8x8_t 1442| |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0)) 1443| |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) 1444| |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) 1445| |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i)) 1446| |#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size 1447| |#endif 1448| | 1449| |#ifdef SIMD_WASM 1450| |#define TEMP v128_t 1451| |#define PREP() v128_t pi = wasm_v128_load(last_vertex) 1452| |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) 1453| |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) 1454| |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i)) 1455| |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size 1456| |#endif 1457| | 1458| 16.2k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) 1459| | 1460| 16.2k| PREP(); ------------------ | | 1433| 16.2k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) ------------------ 1461| | 1462| 16.2k| unsigned char* savep = transposed; 1463| | 1464| 258k| for (size_t j = 0; j < vertex_count_aligned; j += 16) ------------------ | Branch (1464:21): [True: 241k, False: 16.2k] ------------------ 1465| 241k| { 1466| 241k| LOAD(0); ------------------ | | 1434| 241k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1467| 241k| LOAD(1); ------------------ | | 1434| 241k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1468| 241k| LOAD(2); ------------------ | | 1434| 241k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1469| 241k| LOAD(3); ------------------ | | 1434| 241k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1470| | 1471| 241k| transpose8(r0, r1, r2, r3); 1472| | 1473| 241k| TEMP t0, t1, t2, t3; ------------------ | | 1432| 241k|#define TEMP __m128i ------------------ 1474| 241k| TEMP npi = pi; ------------------ | | 1432| 241k|#define TEMP __m128i ------------------ 1475| | 1476| 241k| UNZR(0); ------------------ | | 1458| 241k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [Folded, False: 241k] | | | Branch (1458:58): [True: 241k, Folded] | | ------------------ ------------------ 1477| 241k| GRP4(0); ------------------ | | 1435| 241k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1478| 241k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ 1479| 241k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1480| | 1481| 241k| UNZR(1); ------------------ | | 1458| 241k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [Folded, False: 241k] | | | Branch (1458:58): [True: 241k, Folded] | | ------------------ ------------------ 1482| 241k| GRP4(1); ------------------ | | 1435| 241k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1483| 241k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ 1484| 241k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1485| | 1486| 241k| UNZR(2); ------------------ | | 1458| 241k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [Folded, False: 241k] | | | Branch (1458:58): [True: 241k, Folded] | | ------------------ ------------------ 1487| 241k| GRP4(2); ------------------ | | 1435| 241k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1488| 241k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ 1489| 241k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1490| | 1491| 241k| UNZR(3); ------------------ | | 1458| 241k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [Folded, False: 241k] | | | Branch (1458:58): [True: 241k, Folded] | | ------------------ ------------------ 1492| 241k| GRP4(3); ------------------ | | 1435| 241k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1493| 241k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 241k] | | | Branch (1436:70): [True: 241k, Folded] | | ------------------ ------------------ 1494| 241k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 241k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1495| | 1496| |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32)) 1497| | // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations 1498| | pi = rebase(npi, r0, r1, r2, r3); 1499| |#else 1500| 241k| (void)npi; 1501| 241k|#endif 1502| | 1503| 241k|#undef UNZR 1504| 241k|#undef TEMP 1505| 241k|#undef PREP 1506| 241k|#undef LOAD 1507| 241k|#undef GRP4 1508| 241k|#undef FIXD 1509| 241k|#undef SAVE 1510| 241k| } 1511| 16.2k|} _ZN7meshopt10unzigzag16EDv2_x: 1242| 967k|{ 1243| 967k| __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1))); 1244| 967k| __m128i xr = _mm_srli_epi16(v, 1); 1245| | 1246| 967k| return _mm_xor_si128(xl, xr); 1247| 967k|} vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi2EEEvPKhPhmmS3_i: 1430| 20.2k|{ 1431| 20.2k|#if defined(SIMD_SSE) || defined(SIMD_AVX) 1432| 20.2k|#define TEMP __m128i 1433| 20.2k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) 1434| 20.2k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) 1435| 20.2k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) 1436| 20.2k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) 1437| 20.2k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size 1438| 20.2k|#endif 1439| | 1440| |#ifdef SIMD_NEON 1441| |#define TEMP uint8x8_t 1442| |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast(last_vertex), vdup_n_u32(0), 0)) 1443| |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) 1444| |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) 1445| |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i)) 1446| |#define SAVE(i) vst1_lane_u32(reinterpret_cast(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size 1447| |#endif 1448| | 1449| |#ifdef SIMD_WASM 1450| |#define TEMP v128_t 1451| |#define PREP() v128_t pi = wasm_v128_load(last_vertex) 1452| |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) 1453| |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) 1454| |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i)) 1455| |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size 1456| |#endif 1457| | 1458| 20.2k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) 1459| | 1460| 20.2k| PREP(); ------------------ | | 1433| 20.2k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast(last_vertex)) ------------------ 1461| | 1462| 20.2k| unsigned char* savep = transposed; 1463| | 1464| 330k| for (size_t j = 0; j < vertex_count_aligned; j += 16) ------------------ | Branch (1464:21): [True: 309k, False: 20.2k] ------------------ 1465| 309k| { 1466| 309k| LOAD(0); ------------------ | | 1434| 309k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1467| 309k| LOAD(1); ------------------ | | 1434| 309k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1468| 309k| LOAD(2); ------------------ | | 1434| 309k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1469| 309k| LOAD(3); ------------------ | | 1434| 309k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast(buffer + j + i * vertex_count_aligned)) ------------------ 1470| | 1471| 309k| transpose8(r0, r1, r2, r3); 1472| | 1473| 309k| TEMP t0, t1, t2, t3; ------------------ | | 1432| 309k|#define TEMP __m128i ------------------ 1474| 309k| TEMP npi = pi; ------------------ | | 1432| 309k|#define TEMP __m128i ------------------ 1475| | 1476| 309k| UNZR(0); ------------------ | | 1458| 309k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [Folded, False: 309k] | | | Branch (1458:58): [Folded, False: 309k] | | ------------------ ------------------ 1477| 309k| GRP4(0); ------------------ | | 1435| 309k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1478| 309k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ 1479| 309k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1480| | 1481| 309k| UNZR(1); ------------------ | | 1458| 309k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [Folded, False: 309k] | | | Branch (1458:58): [Folded, False: 309k] | | ------------------ ------------------ 1482| 309k| GRP4(1); ------------------ | | 1435| 309k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1483| 309k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ 1484| 309k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1485| | 1486| 309k| UNZR(2); ------------------ | | 1458| 309k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [Folded, False: 309k] | | | Branch (1458:58): [Folded, False: 309k] | | ------------------ ------------------ 1487| 309k| GRP4(2); ------------------ | | 1435| 309k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1488| 309k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ 1489| 309k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1490| | 1491| 309k| UNZR(3); ------------------ | | 1458| 309k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot)) | | ------------------ | | | Branch (1458:24): [Folded, False: 309k] | | | Branch (1458:58): [Folded, False: 309k] | | ------------------ ------------------ 1492| 309k| GRP4(3); ------------------ | | 1435| 309k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) ------------------ 1493| 309k| FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ FIXD(0), FIXD(1), FIXD(2), FIXD(3); ------------------ | | 1436| 309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i)) | | ------------------ | | | Branch (1436:29): [Folded, False: 309k] | | | Branch (1436:70): [Folded, False: 309k] | | ------------------ ------------------ 1494| 309k| SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ SAVE(0), SAVE(1), SAVE(2), SAVE(3); ------------------ | | 1437| 309k|#define SAVE(i) *reinterpret_cast(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size ------------------ 1495| | 1496| |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32)) 1497| | // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations 1498| | pi = rebase(npi, r0, r1, r2, r3); 1499| |#else 1500| 309k| (void)npi; 1501| 309k|#endif 1502| | 1503| 309k|#undef UNZR 1504| 309k|#undef TEMP 1505| 309k|#undef PREP 1506| 309k|#undef LOAD 1507| 309k|#undef GRP4 1508| 309k|#undef FIXD 1509| 309k|#undef SAVE 1510| 309k| } 1511| 20.2k|} _ZN7meshopt8rotate32EDv2_xi: 1251| 1.23M|{ 1252| 1.23M| return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r)); 1253| 1.23M|} _Z11fuzzDecoderPKhmmPFiPvmmS0_mE: 8| 16.5k|{ 9| 16.5k| size_t count = 66; // must be divisible by 3 for decodeIndexBuffer; should be >=64 to cover large vertex blocks 10| | 11| 16.5k| void* destination = malloc(count * stride); 12| 16.5k| assert(destination); ------------------ | Branch (12:2): [True: 16.5k, False: 0] ------------------ 13| | 14| 16.5k| int rc = decode(destination, count, stride, reinterpret_cast(data), size); 15| 16.5k| (void)rc; 16| | 17| 16.5k| free(destination); 18| 16.5k|} _Z13fuzzRoundtripPKhmmi: 21| 8.29k|{ 22| 8.29k| size_t count = size / stride; 23| | 24| 8.29k| size_t bound = meshopt_encodeVertexBufferBound(count, stride); 25| 8.29k| void* encoded = malloc(bound); 26| 8.29k| void* decoded = malloc(count * stride); 27| 8.29k| assert(encoded && decoded); ------------------ | Branch (27:2): [True: 8.29k, False: 0] | Branch (27:2): [True: 8.29k, False: 0] | Branch (27:2): [True: 8.29k, False: 0] ------------------ 28| | 29| 8.29k| size_t res = meshopt_encodeVertexBufferLevel(static_cast(encoded), bound, data, count, stride, level, -1); 30| 8.29k| assert(res > 0 && res <= bound); ------------------ | Branch (30:2): [True: 8.29k, False: 0] | Branch (30:2): [True: 8.29k, False: 0] | Branch (30:2): [True: 8.29k, False: 0] ------------------ 31| | 32| | // encode again at the boundary to check for memory safety 33| | // this should produce the same output because encoder is deterministic 34| 8.29k| size_t rese = meshopt_encodeVertexBufferLevel(static_cast(encoded) + bound - res, res, data, count, stride, level, -1); 35| 8.29k| assert(rese == res); ------------------ | Branch (35:2): [True: 8.29k, False: 0] ------------------ 36| | 37| 8.29k| int rc = meshopt_decodeVertexBuffer(decoded, count, stride, static_cast(encoded) + bound - res, res); 38| 8.29k| assert(rc == 0); ------------------ | Branch (38:2): [True: 8.29k, False: 0] ------------------ 39| | 40| 8.29k| assert(memcmp(data, decoded, count * stride) == 0); ------------------ | Branch (40:2): [True: 8.29k, False: 0] ------------------ 41| | 42| 8.29k| free(decoded); 43| 8.29k| free(encoded); 44| 8.29k|} _Z5alignmm: 47| 12.3k|{ 48| 12.3k| return (value + alignment - 1) & ~(alignment - 1); 49| 12.3k|} _Z17fuzzDecodeMeshletmmPKhm: 52| 2.06k|{ 53| | // raw decoding: allowed to write align(count, 4) elements 54| 2.06k| unsigned int rt[256]; 55| 2.06k| unsigned int rv[256]; 56| 2.06k| meshopt_decodeMeshletRaw(rv + 256 - align(vertex_count, 4), vertex_count, rt + 256 - align(triangle_count, 4), triangle_count, data, size); 57| | 58| | // regular decoding: allowed to write align(count * size, 4) bytes 59| | // with variations for 3-byte triangles and 2-byte vertex references 60| 2.06k| unsigned short rsv[256]; 61| 2.06k| unsigned char rbt[256 * 3]; 62| | 63| 2.06k| meshopt_decodeMeshlet(rv + 256 - vertex_count, vertex_count, 4, rt + 256 - triangle_count, triangle_count, 4, data, size); 64| 2.06k| meshopt_decodeMeshlet(rsv + 256 - align(vertex_count, 2), vertex_count, 2, rt + 256 - triangle_count, triangle_count, 4, data, size); 65| 2.06k| meshopt_decodeMeshlet(rv + 256 - vertex_count, vertex_count, 4, rbt + 256 * 3 - align(triangle_count * 3, 4), triangle_count, 3, data, size); 66| 2.06k| meshopt_decodeMeshlet(rsv + 256 - align(vertex_count, 2), vertex_count, 2, rbt + 256 * 3 - align(triangle_count * 3, 4), triangle_count, 3, data, size); 67| 2.06k|} _Z20fuzzRoundtripMeshletPKhm: 70| 2.07k|{ 71| 2.07k| size_t triangle_count = size / 3; 72| 2.07k| if (triangle_count > 256) ------------------ | Branch (72:6): [True: 512, False: 1.56k] ------------------ 73| 512| triangle_count = 256; 74| | 75| 2.07k| unsigned char buf[4096]; 76| 2.07k| size_t enc = meshopt_encodeMeshlet(buf, sizeof(buf), NULL, 0, reinterpret_cast(data), triangle_count); 77| 2.07k| assert(enc > 0); ------------------ | Branch (77:2): [True: 2.07k, False: 0] ------------------ 78| 2.07k| assert(enc <= meshopt_encodeMeshletBound(0, triangle_count)); ------------------ | Branch (78:2): [True: 2.07k, False: 0] ------------------ 79| | 80| 2.07k| unsigned int rt4[256]; 81| 2.07k| int rc4 = meshopt_decodeMeshlet(static_cast(NULL), 0, rt4, triangle_count, buf, enc); 82| 2.07k| assert(rc4 == 0); ------------------ | Branch (82:2): [True: 2.07k, False: 0] ------------------ 83| | 84| 175k| for (size_t i = 0; i < triangle_count; ++i) ------------------ | Branch (84:21): [True: 173k, False: 2.07k] ------------------ 85| 173k| { 86| 173k| unsigned char a = data[i * 3 + 0], b = data[i * 3 + 1], c = data[i * 3 + 2]; 87| | 88| 173k| unsigned int abc = (a << 0) | (b << 8) | (c << 16); 89| 173k| unsigned int bca = (b << 0) | (c << 8) | (a << 16); 90| 173k| unsigned int cba = (c << 0) | (a << 8) | (b << 16); 91| | 92| 173k| unsigned int tri = rt4[i]; 93| | 94| 173k| assert(tri == abc || tri == bca || tri == cba); ------------------ | Branch (94:3): [True: 106k, False: 66.9k] | Branch (94:3): [True: 35.1k, False: 31.8k] | Branch (94:3): [True: 31.8k, False: 0] | Branch (94:3): [True: 173k, False: 0] ------------------ 95| 173k| } 96| | 97| 2.07k| unsigned char rt3[256 * 3]; 98| 2.07k| int rc3 = meshopt_decodeMeshlet(static_cast(NULL), 0, rt3, triangle_count, buf, enc); 99| 2.07k| assert(rc3 == 0); ------------------ | Branch (99:2): [True: 2.07k, False: 0] ------------------ 100| | 101| 175k| for (size_t i = 0; i < triangle_count; ++i) ------------------ | Branch (101:21): [True: 173k, False: 2.07k] ------------------ 102| 173k| { 103| 173k| unsigned char a = data[i * 3 + 0], b = data[i * 3 + 1], c = data[i * 3 + 2]; 104| | 105| 173k| unsigned int abc = (a << 0) | (b << 8) | (c << 16); 106| 173k| unsigned int bca = (b << 0) | (c << 8) | (a << 16); 107| 173k| unsigned int cba = (c << 0) | (a << 8) | (b << 16); 108| | 109| 173k| unsigned int tri = rt3[i * 3 + 0] | (rt3[i * 3 + 1] << 8) | (rt3[i * 3 + 2] << 16); 110| | 111| | assert(tri == abc || tri == bca || tri == cba); ------------------ | Branch (111:3): [True: 106k, False: 66.9k] | Branch (111:3): [True: 35.1k, False: 31.8k] | Branch (111:3): [True: 31.8k, False: 0] | Branch (111:3): [True: 173k, False: 0] ------------------ 112| 173k| } 113| 2.07k|} _Z21fuzzRoundtripMeshletVPKhm: 116| 2.07k|{ 117| 2.07k| size_t vertex_count = size / 4; 118| 2.07k| if (vertex_count > 256) ------------------ | Branch (118:6): [True: 460, False: 1.61k] ------------------ 119| 460| vertex_count = 256; 120| | 121| 2.07k| unsigned char tri[4] = {0, 1, 2}; 122| | 123| 2.07k| unsigned char buf[4096]; 124| 2.07k| size_t enc = meshopt_encodeMeshlet(buf, sizeof(buf), reinterpret_cast(data), vertex_count, tri, 1); 125| 2.07k| assert(enc > 0); ------------------ | Branch (125:2): [True: 2.07k, False: 0] ------------------ 126| 2.07k| assert(enc <= meshopt_encodeMeshletBound(vertex_count, 1)); ------------------ | Branch (126:2): [True: 2.07k, False: 0] ------------------ 127| | 128| 2.07k| unsigned int rv4[256]; 129| 2.07k| int rc4 = meshopt_decodeMeshlet(rv4, vertex_count, tri, 1, buf, enc); 130| 2.07k| assert(rc4 == 0); ------------------ | Branch (130:2): [True: 2.07k, False: 0] ------------------ 131| | 132| 163k| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (132:21): [True: 161k, False: 2.07k] ------------------ 133| 161k| assert(rv4[i] == reinterpret_cast(data)[i]); ------------------ | Branch (133:3): [True: 161k, False: 0] ------------------ 134| | 135| 2.07k| unsigned short rv2[256]; 136| 2.07k| int rc2 = meshopt_decodeMeshlet(rv2, vertex_count, tri, 1, buf, enc); 137| 2.07k| assert(rc2 == 0); ------------------ | Branch (137:2): [True: 2.07k, False: 0] ------------------ 138| | 139| 163k| for (size_t i = 0; i < vertex_count; ++i) ------------------ | Branch (139:21): [True: 161k, False: 2.07k] ------------------ 140| | assert(rv2[i] == uint16_t(reinterpret_cast(data)[i])); ------------------ | Branch (140:3): [True: 161k, False: 0] ------------------ 141| 2.07k|} LLVMFuzzerTestOneInput: 144| 2.07k|{ 145| | // decodeIndexBuffer supports 2 and 4-byte indices 146| 2.07k| fuzzDecoder(data, size, 2, meshopt_decodeIndexBuffer); 147| 2.07k| fuzzDecoder(data, size, 4, meshopt_decodeIndexBuffer); 148| | 149| | // decodeIndexSequence supports 2 and 4-byte indices 150| 2.07k| fuzzDecoder(data, size, 2, meshopt_decodeIndexSequence); 151| 2.07k| fuzzDecoder(data, size, 4, meshopt_decodeIndexSequence); 152| | 153| | // decodeVertexBuffer supports any strides divisible by 4 in 4-256 interval 154| | // it's a waste of time to check all of them, so we'll just check a few with different alignment mod 16 155| 2.07k| fuzzDecoder(data, size, 4, meshopt_decodeVertexBuffer); 156| 2.07k| fuzzDecoder(data, size, 16, meshopt_decodeVertexBuffer); 157| 2.07k| fuzzDecoder(data, size, 24, meshopt_decodeVertexBuffer); 158| 2.07k| fuzzDecoder(data, size, 32, meshopt_decodeVertexBuffer); 159| | 160| | // encodeVertexBuffer/decodeVertexBuffer should roundtrip for any stride, check a few with different alignment mod 16 161| | // this also checks memory safety properties of the encoder 162| | // to conserve time, we only check one version/level combination, biased towards version 1 163| 2.07k| uint8_t data0 = size > 0 ? data[0] : 0; ------------------ | Branch (163:18): [True: 2.07k, False: 0] ------------------ 164| 2.07k| int level = data0 % 5; 165| | 166| 2.07k| meshopt_encodeVertexVersion(level < 4 ? 1 : 0); ------------------ | Branch (166:30): [True: 1.58k, False: 486] ------------------ 167| | 168| 2.07k| fuzzRoundtrip(data, size, 4, level); 169| 2.07k| fuzzRoundtrip(data, size, 16, level); 170| 2.07k| fuzzRoundtrip(data, size, 24, level); 171| 2.07k| fuzzRoundtrip(data, size, 32, level); 172| | 173| | // validate that decodeMeshlet works on untrusted data and is memory safe within documented limits 174| 2.07k| if (size > 2) ------------------ | Branch (174:6): [True: 2.06k, False: 13] ------------------ 175| 2.06k| fuzzDecodeMeshlet(data[0] + 1, data[1] + 1, reinterpret_cast(data + 2), size - 2); 176| | 177| | // validate that index data roundtrips in meshlet encoding modulo rotation 178| 2.07k| fuzzRoundtripMeshlet(data, size); 179| | 180| | // validate that vertex data roundtrips in meshlet encoding 181| 2.07k| fuzzRoundtripMeshletV(data, size); 182| | 183| 2.07k| return 0; 184| 2.07k|}