meshopt_decodeIndexBuffer:
  379|  4.14k|{
  380|  4.14k|	using namespace meshopt;
  381|       |
  382|  4.14k|	assert(index_count % 3 == 0);
  ------------------
  |  Branch (382:2): [True: 4.14k, False: 0]
  ------------------
  383|  4.14k|	assert(index_size == 2 || index_size == 4);
  ------------------
  |  Branch (383:2): [True: 2.07k, False: 2.07k]
  |  Branch (383:2): [True: 2.07k, False: 0]
  |  Branch (383:2): [True: 4.14k, False: 0]
  ------------------
  384|       |
  385|       |	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
  386|  4.14k|	if (buffer_size < 1 + index_count / 3 + 16)
  ------------------
  |  Branch (386:6): [True: 1.75k, False: 2.39k]
  ------------------
  387|  1.75k|		return -2;
  388|       |
  389|  2.39k|	if ((buffer[0] & 0xf0) != kIndexHeader)
  ------------------
  |  Branch (389:6): [True: 1.71k, False: 684]
  ------------------
  390|  1.71k|		return -1;
  391|       |
  392|    684|	int version = buffer[0] & 0x0f;
  393|    684|	if (version > kDecodeIndexVersion)
  ------------------
  |  Branch (393:6): [True: 64, False: 620]
  ------------------
  394|     64|		return -1;
  395|       |
  396|    620|	EdgeFifo edgefifo;
  397|    620|	memset(edgefifo, -1, sizeof(edgefifo));
  398|       |
  399|    620|	VertexFifo vertexfifo;
  400|    620|	memset(vertexfifo, -1, sizeof(vertexfifo));
  401|       |
  402|    620|	size_t edgefifooffset = 0;
  403|    620|	size_t vertexfifooffset = 0;
  404|       |
  405|    620|	unsigned int next = 0;
  406|    620|	unsigned int last = 0;
  407|       |
  408|    620|	int fecmax = version >= 1 ? 13 : 15;
  ------------------
  |  Branch (408:15): [True: 126, False: 494]
  ------------------
  409|       |
  410|       |	// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
  411|    620|	const unsigned char* code = buffer + 1;
  412|    620|	const unsigned char* data = code + index_count / 3;
  413|    620|	const unsigned char* data_safe_end = buffer + buffer_size - 16;
  414|       |
  415|    620|	const unsigned char* codeaux_table = data_safe_end;
  416|       |
  417|  10.7k|	for (size_t i = 0; i < index_count; i += 3)
  ------------------
  |  Branch (417:21): [True: 10.4k, False: 338]
  ------------------
  418|  10.4k|	{
  419|       |		// make sure we have enough data to read for a triangle
  420|       |		// each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
  421|       |		// after this we can be sure we can read without extra bounds checks
  422|  10.4k|		if (data > data_safe_end)
  ------------------
  |  Branch (422:7): [True: 282, False: 10.1k]
  ------------------
  423|    282|			return -2;
  424|       |
  425|  10.1k|		unsigned char codetri = *code++;
  426|       |
  427|  10.1k|		if (codetri < 0xf0)
  ------------------
  |  Branch (427:7): [True: 5.78k, False: 4.33k]
  ------------------
  428|  5.78k|		{
  429|  5.78k|			int fe = codetri >> 4;
  430|       |
  431|       |			// fifo reads are wrapped around 16 entry buffer
  432|  5.78k|			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
  433|  5.78k|			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
  434|  5.78k|			unsigned int c = 0;
  435|       |
  436|  5.78k|			int fec = codetri & 15;
  437|       |
  438|       |			// note: this is the most common path in the entire decoder
  439|       |			// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
  440|  5.78k|			if (fec < fecmax)
  ------------------
  |  Branch (440:8): [True: 4.72k, False: 1.06k]
  ------------------
  441|  4.72k|			{
  442|       |				// fifo reads are wrapped around 16 entry buffer
  443|  4.72k|				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
  444|  4.72k|				c = (fec == 0) ? next : cf;
  ------------------
  |  Branch (444:9): [True: 1.82k, False: 2.90k]
  ------------------
  445|       |
  446|  4.72k|				int fec0 = fec == 0;
  447|  4.72k|				next += fec0;
  448|       |
  449|       |				// push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  450|  4.72k|				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
  451|  4.72k|			}
  452|  1.06k|			else
  453|  1.06k|			{
  454|       |				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
  455|       |				// note that we need to update the last index since free indices are delta-encoded
  456|  1.06k|				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
  ------------------
  |  Branch (456:16): [True: 176, False: 886]
  ------------------
  457|       |
  458|       |				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  459|  1.06k|				pushVertexFifo(vertexfifo, c, vertexfifooffset);
  460|  1.06k|			}
  461|       |
  462|       |			// push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  463|  5.78k|			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
  464|  5.78k|			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
  465|       |
  466|       |			// output triangle
  467|  5.78k|			writeTriangle(destination, i, index_size, a, b, c);
  468|  5.78k|		}
  469|  4.33k|		else
  470|  4.33k|		{
  471|       |			// fast path: read codeaux from the table
  472|  4.33k|			if (codetri < 0xfe)
  ------------------
  |  Branch (472:8): [True: 1.30k, False: 3.02k]
  ------------------
  473|  1.30k|			{
  474|  1.30k|				unsigned char codeaux = codeaux_table[codetri & 15];
  475|       |
  476|       |				// note: table can't contain feb/fec=15
  477|  1.30k|				int feb = codeaux >> 4;
  478|  1.30k|				int fec = codeaux & 15;
  479|       |
  480|       |				// fifo reads are wrapped around 16 entry buffer
  481|       |				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
  482|  1.30k|				unsigned int a = next++;
  483|       |
  484|  1.30k|				unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15];
  485|  1.30k|				unsigned int b = (feb == 0) ? next : bf;
  ------------------
  |  Branch (485:22): [True: 464, False: 838]
  ------------------
  486|       |
  487|  1.30k|				int feb0 = feb == 0;
  488|  1.30k|				next += feb0;
  489|       |
  490|  1.30k|				unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
  491|  1.30k|				unsigned int c = (fec == 0) ? next : cf;
  ------------------
  |  Branch (491:22): [True: 460, False: 842]
  ------------------
  492|       |
  493|  1.30k|				int fec0 = fec == 0;
  494|  1.30k|				next += fec0;
  495|       |
  496|       |				// output triangle
  497|  1.30k|				writeTriangle(destination, i, index_size, a, b, c);
  498|       |
  499|       |				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  500|  1.30k|				pushVertexFifo(vertexfifo, a, vertexfifooffset);
  501|  1.30k|				pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
  502|  1.30k|				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
  503|       |
  504|  1.30k|				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
  505|  1.30k|				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
  506|  1.30k|				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
  507|  1.30k|			}
  508|  3.02k|			else
  509|  3.02k|			{
  510|       |				// slow path: read a full byte for codeaux instead of using a table lookup
  511|  3.02k|				unsigned char codeaux = *data++;
  512|       |
  513|  3.02k|				int fea = codetri == 0xfe ? 0 : 15;
  ------------------
  |  Branch (513:15): [True: 1.00k, False: 2.02k]
  ------------------
  514|  3.02k|				int feb = codeaux >> 4;
  515|  3.02k|				int fec = codeaux & 15;
  516|       |
  517|       |				// reset: codeaux is 0 but encoded as not-a-table
  518|  3.02k|				if (codeaux == 0)
  ------------------
  |  Branch (518:9): [True: 532, False: 2.49k]
  ------------------
  519|    532|					next = 0;
  520|       |
  521|       |				// fifo reads are wrapped around 16 entry buffer
  522|       |				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
  523|  3.02k|				unsigned int a = (fea == 0) ? next++ : 0;
  ------------------
  |  Branch (523:22): [True: 1.00k, False: 2.02k]
  ------------------
  524|  3.02k|				unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
  ------------------
  |  Branch (524:22): [True: 748, False: 2.28k]
  ------------------
  525|  3.02k|				unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];
  ------------------
  |  Branch (525:22): [True: 688, False: 2.34k]
  ------------------
  526|       |
  527|       |				// note that we need to update the last index since free indices are delta-encoded
  528|  3.02k|				if (fea == 15)
  ------------------
  |  Branch (528:9): [True: 2.02k, False: 1.00k]
  ------------------
  529|  2.02k|					last = a = decodeIndex(data, last);
  530|       |
  531|  3.02k|				if (feb == 15)
  ------------------
  |  Branch (531:9): [True: 1.00k, False: 2.02k]
  ------------------
  532|  1.00k|					last = b = decodeIndex(data, last);
  533|       |
  534|  3.02k|				if (fec == 15)
  ------------------
  |  Branch (534:9): [True: 1.04k, False: 1.98k]
  ------------------
  535|  1.04k|					last = c = decodeIndex(data, last);
  536|       |
  537|       |				// output triangle
  538|  3.02k|				writeTriangle(destination, i, index_size, a, b, c);
  539|       |
  540|       |				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  541|  3.02k|				pushVertexFifo(vertexfifo, a, vertexfifooffset);
  542|  3.02k|				pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
  543|  3.02k|				pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));
  544|       |
  545|  3.02k|				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
  546|  3.02k|				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
  547|  3.02k|				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
  548|  3.02k|			}
  549|  4.33k|		}
  550|  10.1k|	}
  551|       |
  552|       |	// we should've read all data bytes and stopped at the boundary between data and codeaux table
  553|    338|	if (data != data_safe_end)
  ------------------
  |  Branch (553:6): [True: 320, False: 18]
  ------------------
  554|    320|		return -3;
  555|       |
  556|     18|	return 0;
  557|    338|}
meshopt_decodeIndexSequence:
  629|  4.14k|{
  630|  4.14k|	using namespace meshopt;
  631|       |
  632|       |	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
  633|  4.14k|	if (buffer_size < 1 + index_count + 4)
  ------------------
  |  Branch (633:6): [True: 2.30k, False: 1.84k]
  ------------------
  634|  2.30k|		return -2;
  635|       |
  636|  1.84k|	if ((buffer[0] & 0xf0) != kSequenceHeader)
  ------------------
  |  Branch (636:6): [True: 1.44k, False: 406]
  ------------------
  637|  1.44k|		return -1;
  638|       |
  639|    406|	int version = buffer[0] & 0x0f;
  640|    406|	if (version > kDecodeIndexVersion)
  ------------------
  |  Branch (640:6): [True: 28, False: 378]
  ------------------
  641|     28|		return -1;
  642|       |
  643|    378|	const unsigned char* data = buffer + 1;
  644|    378|	const unsigned char* data_safe_end = buffer + buffer_size - 4;
  645|       |
  646|    378|	unsigned int last[2] = {};
  647|       |
  648|  23.9k|	for (size_t i = 0; i < index_count; ++i)
  ------------------
  |  Branch (648:21): [True: 23.6k, False: 312]
  ------------------
  649|  23.6k|	{
  650|       |		// make sure we have enough data to read
  651|       |		// each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
  652|       |		// after this we can be sure we can read without extra bounds checks
  653|  23.6k|		if (data >= data_safe_end)
  ------------------
  |  Branch (653:7): [True: 66, False: 23.5k]
  ------------------
  654|     66|			return -2;
  655|       |
  656|  23.5k|		unsigned int v = decodeVByte(data);
  657|       |
  658|       |		// decode the index of the last baseline
  659|  23.5k|		unsigned int current = v & 1;
  660|  23.5k|		v >>= 1;
  661|       |
  662|       |		// reconstruct index as a delta
  663|  23.5k|		unsigned int d = (v >> 1) ^ -int(v & 1);
  664|  23.5k|		unsigned int index = last[current] + d;
  665|       |
  666|       |		// update last for the next iteration that uses it
  667|  23.5k|		last[current] = index;
  668|       |
  669|  23.5k|		if (index_size == 2)
  ------------------
  |  Branch (669:7): [True: 11.7k, False: 11.7k]
  ------------------
  670|  11.7k|		{
  671|  11.7k|			static_cast<unsigned short*>(destination)[i] = (unsigned short)(index);
  672|  11.7k|		}
  673|  11.7k|		else
  674|  11.7k|		{
  675|  11.7k|			static_cast<unsigned int*>(destination)[i] = index;
  676|  11.7k|		}
  677|  23.5k|	}
  678|       |
  679|       |	// we should've read all data bytes and stopped at the boundary between data and tail
  680|    312|	if (data != data_safe_end)
  ------------------
  |  Branch (680:6): [True: 310, False: 2]
  ------------------
  681|    310|		return -3;
  682|       |
  683|      2|	return 0;
  684|    312|}
indexcodec.cpp:_ZN7meshoptL14pushVertexFifoEPjjRmi:
   75|  18.7k|{
   76|  18.7k|	fifo[offset] = v;
   77|  18.7k|	offset = (offset + cond) & 15;
   78|  18.7k|}
indexcodec.cpp:_ZN7meshoptL12pushEdgeFifoEPA2_jjjRm:
   55|  24.5k|{
   56|  24.5k|	fifo[offset][0] = a;
   57|  24.5k|	fifo[offset][1] = b;
   58|  24.5k|	offset = (offset + 1) & 15;
   59|  24.5k|}
indexcodec.cpp:_ZN7meshoptL11decodeIndexERPKhj:
  125|  4.95k|{
  126|  4.95k|	unsigned int v = decodeVByte(data);
  127|  4.95k|	unsigned int d = (v >> 1) ^ -int(v & 1);
  128|       |
  129|  4.95k|	return last + d;
  130|  4.95k|}
indexcodec.cpp:_ZN7meshoptL13writeTriangleEPvmmjjj:
  142|  10.1k|{
  143|  10.1k|	if (index_size == 2)
  ------------------
  |  Branch (143:6): [True: 5.05k, False: 5.05k]
  ------------------
  144|  5.05k|	{
  145|  5.05k|		static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a);
  146|  5.05k|		static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b);
  147|  5.05k|		static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c);
  148|  5.05k|	}
  149|  5.05k|	else
  150|  5.05k|	{
  151|  5.05k|		static_cast<unsigned int*>(destination)[offset + 0] = a;
  152|  5.05k|		static_cast<unsigned int*>(destination)[offset + 1] = b;
  153|  5.05k|		static_cast<unsigned int*>(destination)[offset + 2] = c;
  154|  5.05k|	}
  155|  10.1k|}
indexcodec.cpp:_ZN7meshoptL11decodeVByteERPKh:
   91|  28.5k|{
   92|  28.5k|	unsigned char lead = *data++;
   93|       |
   94|       |	// fast path: single byte
   95|  28.5k|	if (lead < 128)
  ------------------
  |  Branch (95:6): [True: 17.6k, False: 10.9k]
  ------------------
   96|  17.6k|		return lead;
   97|       |
   98|       |	// slow path: up to 4 extra bytes
   99|       |	// note that this loop always terminates, which is important for malformed data
  100|  10.9k|	unsigned int result = lead & 127;
  101|  10.9k|	unsigned int shift = 7;
  102|       |
  103|  34.6k|	for (int i = 0; i < 4; ++i)
  ------------------
  |  Branch (103:18): [True: 29.9k, False: 4.63k]
  ------------------
  104|  29.9k|	{
  105|  29.9k|		unsigned char group = *data++;
  106|  29.9k|		result |= unsigned(group & 127) << shift;
  107|  29.9k|		shift += 7;
  108|       |
  109|  29.9k|		if (group < 128)
  ------------------
  |  Branch (109:7): [True: 6.28k, False: 23.7k]
  ------------------
  110|  6.28k|			break;
  111|  29.9k|	}
  112|       |
  113|  10.9k|	return result;
  114|  28.5k|}

meshopt_encodeMeshletBound:
  899|  4.14k|{
  900|  4.14k|	size_t codes_size = (max_triangles + 1) / 2;
  901|  4.14k|	size_t extra_size = max_triangles * 3;
  902|       |
  903|  4.14k|	size_t ctrl_size = (max_vertices + 3) / 4;
  904|  4.14k|	size_t data_size = (max_vertices + 3) / 4 * 16; // worst case: 16 bytes per vertex group
  905|       |
  906|  4.14k|	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
  ------------------
  |  Branch (906:20): [True: 2.61k, False: 1.53k]
  ------------------
  907|       |
  908|  4.14k|	return codes_size + extra_size + ctrl_size + data_size + gap_size;
  909|  4.14k|}
meshopt_encodeMeshlet:
  912|  4.14k|{
  913|  4.14k|	using namespace meshopt;
  914|       |
  915|  4.14k|	assert(triangle_count <= 256 && vertex_count <= 256);
  ------------------
  |  Branch (915:2): [True: 4.14k, False: 0]
  |  Branch (915:2): [True: 4.14k, False: 0]
  |  Branch (915:2): [True: 4.14k, False: 0]
  ------------------
  916|       |
  917|       |	// 4 bits per triangle + up to three bytes of extra data
  918|  4.14k|	unsigned char codes[256 / 2];
  919|  4.14k|	unsigned char extra[256 * 3];
  920|  4.14k|	size_t codes_size = (triangle_count + 1) / 2;
  921|  4.14k|	size_t extra_size = encodeTriangles(codes, extra, triangles, triangle_count);
  922|  4.14k|	assert(extra_size <= sizeof(extra));
  ------------------
  |  Branch (922:2): [True: 4.14k, False: 0]
  ------------------
  923|       |
  924|       |	// 2 bits per vertex + up to 4 bytes of actual data
  925|  4.14k|	unsigned char ctrl[256 / 4];
  926|  4.14k|	unsigned char data[256 * 4];
  927|  4.14k|	size_t ctrl_size = (vertex_count + 3) / 4;
  928|  4.14k|	size_t data_size = encodeVertices(ctrl, data, vertices, vertex_count);
  929|  4.14k|	assert(data_size <= sizeof(data));
  ------------------
  |  Branch (929:2): [True: 4.14k, False: 0]
  ------------------
  930|       |
  931|       |	// we need to ensure that up to 16 bytes after extra+data are available for SIMD decoding
  932|       |	// to minimize overhead, we place fixed-size codes+control at the end of the buffer
  933|  4.14k|	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
  ------------------
  |  Branch (933:20): [True: 2.61k, False: 1.53k]
  ------------------
  934|       |
  935|  4.14k|	size_t result = codes_size + extra_size + ctrl_size + data_size + gap_size;
  936|       |
  937|  4.14k|	if (result > buffer_size)
  ------------------
  |  Branch (937:6): [True: 0, False: 4.14k]
  ------------------
  938|      0|		return 0;
  939|       |
  940|       |	// variable-size data first
  941|  4.14k|	memcpy(buffer, data, data_size);
  942|  4.14k|	buffer += data_size;
  943|  4.14k|	memcpy(buffer, extra, extra_size);
  944|  4.14k|	buffer += extra_size;
  945|       |
  946|       |	// gap (for accelerated decoding) separates variable-size and fixed-size data
  947|  4.14k|	memset(buffer, 0, gap_size);
  948|  4.14k|	buffer += gap_size;
  949|       |
  950|       |	// fixed-size data last; it can be located from buffer end during decoding
  951|  4.14k|	memcpy(buffer, ctrl, ctrl_size);
  952|  4.14k|	buffer += ctrl_size;
  953|  4.14k|	memcpy(buffer, codes, codes_size);
  954|  4.14k|	buffer += codes_size;
  955|       |
  956|       |#if TRACE > 1
  957|       |	printf("extra:");
  958|       |	for (size_t i = 0; i < extra_size; ++i)
  959|       |		printf(" %d", extra[i]);
  960|       |	printf("\n");
  961|       |
  962|       |	unsigned int minv = ~0u;
  963|       |	for (size_t i = 0; i < vertex_count; ++i)
  964|       |		minv = minv < vertices[i] ? minv : vertices[i];
  965|       |
  966|       |	printf("vertices: [%d+]", minv);
  967|       |	for (size_t i = 0; i < vertex_count; ++i)
  968|       |		printf(" %d", vertices[i] - minv);
  969|       |	printf("\n");
  970|       |#endif
  971|       |
  972|       |#if TRACE
  973|       |	printf("stats: %d vertices, %d triangles => %d bytes (triangles: %d codes, %d extra; vertices: %d control, %d data; %d gap)\n",
  974|       |	    int(vertex_count), int(triangle_count), int(result),
  975|       |	    int(codes_size), int(extra_size), int(ctrl_size), int(data_size), int(gap_size));
  976|       |#endif
  977|       |
  978|  4.14k|	return result;
  979|  4.14k|}
meshopt_decodeMeshlet:
  982|  16.5k|{
  983|  16.5k|	using namespace meshopt;
  984|       |
  985|  16.5k|	assert(triangle_count <= 256 && vertex_count <= 256);
  ------------------
  |  Branch (985:2): [True: 16.5k, False: 0]
  |  Branch (985:2): [True: 16.5k, False: 0]
  |  Branch (985:2): [True: 16.5k, False: 0]
  ------------------
  986|  16.5k|	assert(vertex_size == 4 || vertex_size == 2);
  ------------------
  |  Branch (986:2): [True: 10.3k, False: 6.19k]
  |  Branch (986:2): [True: 6.19k, False: 0]
  |  Branch (986:2): [True: 16.5k, False: 0]
  ------------------
  987|  16.5k|	assert(triangle_size == 4 || triangle_size == 3);
  ------------------
  |  Branch (987:2): [True: 6.19k, False: 10.3k]
  |  Branch (987:2): [True: 10.3k, False: 0]
  |  Branch (987:2): [True: 16.5k, False: 0]
  ------------------
  988|       |
  989|       |	// layout must match encoding
  990|  16.5k|	size_t codes_size = (triangle_count + 1) / 2;
  991|  16.5k|	size_t ctrl_size = (vertex_count + 3) / 4;
  992|  16.5k|	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
  ------------------
  |  Branch (992:20): [True: 6.67k, False: 9.86k]
  ------------------
  993|       |
  994|  16.5k|	if (buffer_size < codes_size + ctrl_size + gap_size)
  ------------------
  |  Branch (994:6): [True: 4.16k, False: 12.3k]
  ------------------
  995|  4.16k|		return -2;
  996|       |
  997|  12.3k|	const unsigned char* end = buffer + buffer_size;
  998|  12.3k|	const unsigned char* codes = end - codes_size;
  999|  12.3k|	const unsigned char* ctrl = codes - ctrl_size;
 1000|  12.3k|	const unsigned char* data = buffer;
 1001|       |
 1002|       |	// gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely
 1003|  12.3k|	const unsigned char* bound = ctrl - gap_size;
 1004|  12.3k|	assert(bound >= buffer && bound + 16 <= buffer + buffer_size);
  ------------------
  |  Branch (1004:2): [True: 12.3k, False: 0]
  |  Branch (1004:2): [True: 12.3k, False: 0]
  |  Branch (1004:2): [True: 12.3k, False: 0]
  ------------------
 1005|       |
 1006|  12.3k|#if defined(SIMD_FALLBACK)
 1007|  12.3k|	return (gDecodeTablesInitialized ? decodeMeshletSimd<0> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
  ------------------
  |  Branch (1007:10): [True: 12.3k, False: 0]
  ------------------
 1008|       |#elif defined(SIMD_SSE) || defined(SIMD_NEON)
 1009|       |	return decodeMeshletSimd<0>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
 1010|       |#else
 1011|       |	return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
 1012|       |#endif
 1013|  12.3k|}
meshopt_decodeMeshletRaw:
 1016|  2.06k|{
 1017|  2.06k|	using namespace meshopt;
 1018|       |
 1019|  2.06k|	assert(triangle_count <= 256 && vertex_count <= 256);
  ------------------
  |  Branch (1019:2): [True: 2.06k, False: 0]
  |  Branch (1019:2): [True: 2.06k, False: 0]
  |  Branch (1019:2): [True: 2.06k, False: 0]
  ------------------
 1020|       |
 1021|       |	// layout must match encoding
 1022|  2.06k|	size_t codes_size = (triangle_count + 1) / 2;
 1023|  2.06k|	size_t ctrl_size = (vertex_count + 3) / 4;
 1024|  2.06k|	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
  ------------------
  |  Branch (1024:20): [True: 362, False: 1.69k]
  ------------------
 1025|       |
 1026|  2.06k|	if (buffer_size < codes_size + ctrl_size + gap_size)
  ------------------
  |  Branch (1026:6): [True: 1.04k, False: 1.02k]
  ------------------
 1027|  1.04k|		return -2;
 1028|       |
 1029|  1.02k|	const unsigned char* end = buffer + buffer_size;
 1030|  1.02k|	const unsigned char* codes = end - codes_size;
 1031|  1.02k|	const unsigned char* ctrl = codes - ctrl_size;
 1032|  1.02k|	const unsigned char* data = buffer;
 1033|       |
 1034|       |	// gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely
 1035|  1.02k|	const unsigned char* bound = ctrl - gap_size;
 1036|  1.02k|	assert(bound >= buffer && bound + 16 <= buffer + buffer_size);
  ------------------
  |  Branch (1036:2): [True: 1.02k, False: 0]
  |  Branch (1036:2): [True: 1.02k, False: 0]
  |  Branch (1036:2): [True: 1.02k, False: 0]
  ------------------
 1037|       |
 1038|  1.02k|#if defined(SIMD_FALLBACK)
 1039|  1.02k|	return (gDecodeTablesInitialized ? decodeMeshletSimd<1> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
  ------------------
  |  Branch (1039:10): [True: 1.02k, False: 0]
  ------------------
 1040|       |#elif defined(SIMD_SSE) || defined(SIMD_NEON)
 1041|       |	return decodeMeshletSimd<1>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
 1042|       |#else
 1043|       |	return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
 1044|       |#endif
 1045|  1.02k|}
meshletcodec.cpp:_ZN7meshoptL17decodeBuildTablesEv:
  398|      2|{
  399|      2|#define NEXT(var, ec) \
  400|      2|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  401|      2|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  402|      2|	extra += (ec), nextoff += 1 - (ec)
  403|       |
  404|       |	// check for SSE4.1 support if we have a fallback path
  405|      2|#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
  406|      2|	int cpuinfo[4] = {};
  407|       |#ifdef _MSC_VER
  408|       |	__cpuid(cpuinfo, 1);
  409|       |#else
  410|      2|	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
  411|      2|#endif
  412|       |	// bit 19 = SSE4.1
  413|      2|	if ((cpuinfo[2] & (1 << 19)) == 0)
  ------------------
  |  Branch (413:6): [True: 0, False: 2]
  ------------------
  414|      0|		return false;
  415|      2|#endif
  416|       |
  417|       |	// fill triangle decoding tables for each combination of two triangle codes
  418|    514|	for (int code = 0; code < 256; ++code)
  ------------------
  |  Branch (418:21): [True: 512, False: 2]
  ------------------
  419|    512|	{
  420|    512|		unsigned char shuf[16] = {};
  421|    512|		unsigned char next[16] = {};
  422|    512|		int extra = 0;
  423|    512|		int nextoff = 0;
  424|       |
  425|       |		// state 0..5 will be refilled every iteration, so we ignore that
  426|       |		// state 6..8 will always contain the last decoded triangle because every triangle shifts fifo equally, so we can decode it independently
  427|    512|		shuf[6] = 12;
  428|    512|		shuf[7] = 13;
  429|    512|		shuf[8] = 14;
  430|       |
  431|       |		// state 15 will contain next (potentially incremented a few times)
  432|    512|		shuf[15] = 15;
  433|       |
  434|       |		// state 9..11 will contain the first decoded triangle (tri0), which can refer to extra/next and the original triangle history
  435|       |		// state 12..14 will contain the second decoded triangle (tri1); when decoding edge reuse, we need to handle edge 0/1 specially as it was just decoded earlier
  436|  1.53k|		for (int k = 0; k < 2; ++k)
  ------------------
  |  Branch (436:19): [True: 1.02k, False: 512]
  ------------------
  437|  1.02k|		{
  438|  1.02k|			int tri = (code >> (k * 4)) & 0xf;
  439|       |
  440|  1.02k|			if (tri < 12)
  ------------------
  |  Branch (440:8): [True: 768, False: 256]
  ------------------
  441|    768|			{
  442|    768|				if (k == 1 && tri / 4 == 0)
  ------------------
  |  Branch (442:9): [True: 384, False: 384]
  |  Branch (442:19): [True: 128, False: 256]
  ------------------
  443|    128|				{
  444|       |					// we need to decode one of two edges from the triangle we just decoded earlier
  445|       |					// for that we simply need to copy shuf/next values for the two decoded indices
  446|    128|					shuf[9 + k * 3] = shuf[9 + ((tri & 2) ? 2 : 0)];
  ------------------
  |  Branch (446:34): [True: 64, False: 64]
  ------------------
  447|    128|					next[9 + k * 3] = next[9 + ((tri & 2) ? 2 : 0)];
  ------------------
  |  Branch (447:34): [True: 64, False: 64]
  ------------------
  448|       |
  449|    128|					shuf[10 + k * 3] = shuf[9 + ((tri & 2) ? 1 : 2)];
  ------------------
  |  Branch (449:35): [True: 64, False: 64]
  ------------------
  450|    128|					next[10 + k * 3] = next[9 + ((tri & 2) ? 1 : 2)];
  ------------------
  |  Branch (450:35): [True: 64, False: 64]
  ------------------
  451|    128|				}
  452|    640|				else
  453|    640|				{
  454|       |					// reuse: edge comes from the history based on edge index
  455|       |					// note: we reuse with an offset because last triangle in the original history was consumed by tri0
  456|    640|					int trioff = 6 + k * 3 + (2 - tri / 4) * 3;
  457|       |
  458|       |					// edge cb or ac
  459|    640|					shuf[9 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 2 : 0));
  ------------------
  |  Branch (459:50): [True: 320, False: 320]
  ------------------
  460|    640|					shuf[10 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 1 : 2));
  ------------------
  |  Branch (460:51): [True: 320, False: 320]
  ------------------
  461|    640|				}
  462|       |
  463|       |				// third vertex is either next or comes from extra
  464|    768|				NEXT(11 + k * 3, tri & 1);
  ------------------
  |  |  400|    768|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  |  |  ------------------
  |  |  |  Branch (400:14): [True: 384, False: 384]
  |  |  ------------------
  |  |  401|    768|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  |  |  ------------------
  |  |  |  Branch (401:14): [True: 384, False: 384]
  |  |  ------------------
  |  |  402|    768|	extra += (ec), nextoff += 1 - (ec)
  ------------------
  465|    768|			}
  466|    256|			else
  467|    256|			{
  468|       |				// restart: three vertices, each comes from next or extra
  469|    256|				int fea = tri > 12;
  470|    256|				int feb = tri > 13;
  471|    256|				int fec = tri > 14;
  472|       |
  473|    256|				NEXT(9 + k * 3, fea);
  ------------------
  |  |  400|    256|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  |  |  ------------------
  |  |  |  Branch (400:14): [True: 192, False: 64]
  |  |  ------------------
  |  |  401|    256|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  |  |  ------------------
  |  |  |  Branch (401:14): [True: 192, False: 64]
  |  |  ------------------
  |  |  402|    256|	extra += (ec), nextoff += 1 - (ec)
  ------------------
  474|    256|				NEXT(10 + k * 3, feb);
  ------------------
  |  |  400|    256|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  |  |  ------------------
  |  |  |  Branch (400:14): [True: 128, False: 128]
  |  |  ------------------
  |  |  401|    256|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  |  |  ------------------
  |  |  |  Branch (401:14): [True: 128, False: 128]
  |  |  ------------------
  |  |  402|    256|	extra += (ec), nextoff += 1 - (ec)
  ------------------
  475|    256|				NEXT(11 + k * 3, fec);
  ------------------
  |  |  400|    256|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  |  |  ------------------
  |  |  |  Branch (400:14): [True: 64, False: 192]
  |  |  ------------------
  |  |  401|    256|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  |  |  ------------------
  |  |  |  Branch (401:14): [True: 64, False: 192]
  |  |  ------------------
  |  |  402|    256|	extra += (ec), nextoff += 1 - (ec)
  ------------------
  476|    256|			}
  477|  1.02k|		}
  478|       |
  479|       |		// next needs to advance
  480|    512|		next[15] = (unsigned char)nextoff;
  481|       |
  482|       |		// next[0..8] = 0 trivially (never written to); next[9] must also be 0 because nextoff is 0 initially
  483|       |		// shuf[0..5] is not used, which allows us to pack next[10..15] + shuf[6..15] into a single 16-byte entry
  484|    512|		assert(next[9] == 0);
  ------------------
  |  Branch (484:3): [True: 512, False: 0]
  ------------------
  485|    512|		memcpy(&kDecodeTableMasks[code][0], &next[10], 6);
  486|    512|		memcpy(&kDecodeTableMasks[code][6], &shuf[6], 10);
  487|    512|		kDecodeTableExtra[code] = (unsigned char)extra;
  488|    512|	}
  489|       |
  490|       |	// fill vertex decoding tables for each combination of four vertex references
  491|    514|	for (unsigned int i = 0; i < 256; ++i)
  ------------------
  |  Branch (491:27): [True: 512, False: 2]
  ------------------
  492|    512|	{
  493|    512|		unsigned char shuf[16] = {};
  494|    512|		int offset = 0;
  495|       |
  496|  2.56k|		for (int k = 0; k < 4; ++k)
  ------------------
  |  Branch (496:19): [True: 2.04k, False: 512]
  ------------------
  497|  2.04k|		{
  498|  2.04k|			int code = ((i >> k) & 1) | ((i >> (k + 3)) & 2);
  499|  2.04k|			int length = i == 0xff ? 4 : code; // 0/1/2/3 bytes, or all 4 bytes if code==0xff
  ------------------
  |  Branch (499:17): [True: 8, False: 2.04k]
  ------------------
  500|       |
  501|  2.04k|			shuf[k * 4 + 0] = (length > 0) ? (unsigned char)(offset + 0) : 0x80;
  ------------------
  |  Branch (501:22): [True: 1.53k, False: 512]
  ------------------
  502|  2.04k|			shuf[k * 4 + 1] = (length > 1) ? (unsigned char)(offset + 1) : 0x80;
  ------------------
  |  Branch (502:22): [True: 1.02k, False: 1.02k]
  ------------------
  503|  2.04k|			shuf[k * 4 + 2] = (length > 2) ? (unsigned char)(offset + 2) : 0x80;
  ------------------
  |  Branch (503:22): [True: 512, False: 1.53k]
  ------------------
  504|  2.04k|			shuf[k * 4 + 3] = (length > 3) ? (unsigned char)(offset + 3) : 0x80;
  ------------------
  |  Branch (504:22): [True: 8, False: 2.04k]
  ------------------
  505|       |
  506|  2.04k|			offset += length;
  507|  2.04k|		}
  508|       |
  509|    512|		memcpy(kDecodeTableVerts[i], shuf, sizeof(shuf));
  510|    512|		kDecodeTableLength[i] = (unsigned char)offset;
  511|    512|	}
  512|       |
  513|      2|	return true;
  514|       |
  515|      2|#undef NEXT
  516|      2|}
meshletcodec.cpp:_ZN7meshoptL15encodeTrianglesEPhS0_PKhm:
  109|  4.14k|{
  110|  4.14k|	EdgeFifo8 edgefifo;
  111|  4.14k|	memset(edgefifo, -1, sizeof(edgefifo));
  112|       |
  113|  4.14k|	size_t edgefifooffset = 0;
  114|       |
  115|  4.14k|	unsigned int next = 0;
  116|       |
  117|       |	// 4-bit triangle codes give us 16 options that we use as follows:
  118|       |	// 3*2 edge reuse (2 edges * 3 last triangles) * 2 next/explicit = 12 options
  119|       |	// 4 remaining options = next bits; 000, 001, 011, 111.
  120|       |	// triangles are rotated to make next bits line up.
  121|  4.14k|	memset(codes, 0, (triangle_count + 1) / 2);
  122|       |
  123|  4.14k|	static const int rotations[] = {0, 1, 2, 0, 1};
  124|       |
  125|  4.14k|	unsigned char* start = extra;
  126|       |
  127|   179k|	for (size_t i = 0; i < triangle_count; ++i)
  ------------------
  |  Branch (127:21): [True: 175k, False: 4.14k]
  ------------------
  128|   175k|	{
  129|       |#if TRACE > 1
  130|       |		unsigned int last = next;
  131|       |#endif
  132|       |
  133|   175k|		int fer = getEdgeFifo8(edgefifo, triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2], edgefifooffset);
  134|       |
  135|   175k|		if (fer >= 0 && (fer >> 2) < 6)
  ------------------
  |  Branch (135:7): [True: 87.5k, False: 87.8k]
  |  Branch (135:19): [True: 84.0k, False: 3.50k]
  ------------------
  136|  84.0k|		{
  137|       |			// note: getEdgeFifo8 implicitly rotates triangles by matching a/b to existing edge
  138|  84.0k|			const int* order = rotations + (fer & 3);
  139|       |
  140|  84.0k|			unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]];
  141|       |
  142|  84.0k|			int fec = (c == next) ? (next++, 0) : 1;
  ------------------
  |  Branch (142:14): [True: 712, False: 83.3k]
  ------------------
  143|       |
  144|       |#if TRACE > 1
  145|       |			printf("%3d+ | %3d %3d %3d | edge: e%d c%d\n", last, a, b, c, fer >> 2, fec);
  146|       |#endif
  147|       |
  148|  84.0k|			unsigned int code = (fer >> 2) * 2 + fec;
  149|       |
  150|  84.0k|			codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4));
  151|       |
  152|  84.0k|			if (fec)
  ------------------
  |  Branch (152:8): [True: 83.3k, False: 712]
  ------------------
  153|  83.3k|				*extra++ = (unsigned char)c;
  154|       |
  155|  84.0k|			pushEdgeFifo8(edgefifo, c, b, edgefifooffset);
  156|  84.0k|			pushEdgeFifo8(edgefifo, a, c, edgefifooffset);
  157|  84.0k|		}
  158|  91.3k|		else
  159|  91.3k|		{
  160|       |			// rotate triangles to minimize the need for extra vertices
  161|  91.3k|			int rotation = rotateTriangle(triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2]);
  162|  91.3k|			const int* order = rotations + rotation;
  163|       |
  164|  91.3k|			unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]];
  165|       |
  166|       |			// fe must be continuous: once a vertex is encoded with next, further vertices must also be encoded with next
  167|  91.3k|			int fea = (a == next && b == next + 1 && c == next + 2) ? (next++, 0) : 1;
  ------------------
  |  Branch (167:15): [True: 8.03k, False: 83.3k]
  |  Branch (167:28): [True: 3.42k, False: 4.60k]
  |  Branch (167:45): [True: 2.96k, False: 466]
  ------------------
  168|  91.3k|			int feb = (b == next && c == next + 1) ? (next++, 0) : 1;
  ------------------
  |  Branch (168:15): [True: 8.26k, False: 83.0k]
  |  Branch (168:28): [True: 3.20k, False: 5.05k]
  ------------------
  169|  91.3k|			int fec = (c == next) ? (next++, 0) : 1;
  ------------------
  |  Branch (169:14): [True: 4.19k, False: 87.1k]
  ------------------
  170|       |
  171|  91.3k|			assert(fea == 1 || feb == 0);
  ------------------
  |  Branch (171:4): [True: 88.3k, False: 2.96k]
  |  Branch (171:4): [True: 2.96k, False: 0]
  |  Branch (171:4): [True: 91.3k, False: 0]
  ------------------
  172|  91.3k|			assert(feb == 1 || fec == 0);
  ------------------
  |  Branch (172:4): [True: 88.1k, False: 3.20k]
  |  Branch (172:4): [True: 3.20k, False: 0]
  |  Branch (172:4): [True: 91.3k, False: 0]
  ------------------
  173|       |
  174|       |#if TRACE > 1
  175|       |			printf("%3d+ | %3d %3d %3d | restart: %d%d%d\n", last, a, b, c, fea, feb, fec);
  176|       |#endif
  177|       |
  178|  91.3k|			unsigned int code = 12 + (fea + feb + fec);
  179|       |
  180|  91.3k|			codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4));
  181|       |
  182|  91.3k|			if (fea)
  ------------------
  |  Branch (182:8): [True: 88.3k, False: 2.96k]
  ------------------
  183|  88.3k|				*extra++ = (unsigned char)a;
  184|  91.3k|			if (feb)
  ------------------
  |  Branch (184:8): [True: 88.1k, False: 3.20k]
  ------------------
  185|  88.1k|				*extra++ = (unsigned char)b;
  186|  91.3k|			if (fec)
  ------------------
  |  Branch (186:8): [True: 87.1k, False: 4.19k]
  ------------------
  187|  87.1k|				*extra++ = (unsigned char)c;
  188|       |
  189|  91.3k|			pushEdgeFifo8(edgefifo, c, b, edgefifooffset);
  190|  91.3k|			pushEdgeFifo8(edgefifo, a, c, edgefifooffset);
  191|  91.3k|		}
  192|   175k|	}
  193|       |
  194|  4.14k|	return extra - start;
  195|  4.14k|}
meshletcodec.cpp:_ZN7meshoptL12getEdgeFifo8EPA2_jjjjm:
   82|   175k|{
   83|   941k|	for (int i = 0; i < 8; ++i)
  ------------------
  |  Branch (83:18): [True: 853k, False: 87.8k]
  ------------------
   84|   853k|	{
   85|   853k|		size_t index = (offset - 1 - i) & 7;
   86|       |
   87|   853k|		unsigned int e0 = fifo[index][0];
   88|   853k|		unsigned int e1 = fifo[index][1];
   89|       |
   90|   853k|		if (e0 == a && e1 == b)
  ------------------
  |  Branch (90:7): [True: 135k, False: 718k]
  |  Branch (90:18): [True: 71.1k, False: 64.1k]
  ------------------
   91|  71.1k|			return (i << 2) | 0;
   92|   782k|		if (e0 == b && e1 == c)
  ------------------
  |  Branch (92:7): [True: 55.9k, False: 726k]
  |  Branch (92:18): [True: 8.81k, False: 47.1k]
  ------------------
   93|  8.81k|			return (i << 2) | 1;
   94|   773k|		if (e0 == c && e1 == a)
  ------------------
  |  Branch (94:7): [True: 53.2k, False: 720k]
  |  Branch (94:18): [True: 7.57k, False: 45.6k]
  ------------------
   95|  7.57k|			return (i << 2) | 2;
   96|   773k|	}
   97|       |
   98|  87.8k|	return -1;
   99|   175k|}
meshletcodec.cpp:_ZN7meshoptL13pushEdgeFifo8EPA2_jjjRm:
  102|   350k|{
  103|   350k|	fifo[offset][0] = a;
  104|   350k|	fifo[offset][1] = b;
  105|   350k|	offset = (offset + 1) & 7;
  106|   350k|}
meshletcodec.cpp:_ZN7meshoptL14rotateTriangleEjjj:
   77|  91.3k|{
   78|  91.3k|	return (a > b && a > c) ? 1 : (b > c ? 2 : 0);
  ------------------
  |  Branch (78:10): [True: 38.8k, False: 52.4k]
  |  Branch (78:19): [True: 27.2k, False: 11.6k]
  |  Branch (78:33): [True: 24.9k, False: 39.1k]
  ------------------
   79|  91.3k|}
meshletcodec.cpp:_ZN7meshoptL14encodeVerticesEPhS0_PKjm:
  198|  4.14k|{
  199|       |	// grouped varint, 2 bit per value to indicate 0/1/2/3 byte deltas, with per-group 4-byte fallback
  200|  4.14k|	memset(ctrl, 0, (vertex_count + 3) / 4);
  201|       |
  202|  4.14k|	unsigned char* start = data;
  203|       |
  204|  4.14k|	unsigned int last = ~0u;
  205|       |
  206|  45.1k|	for (size_t i = 0; i < vertex_count; i += 4)
  ------------------
  |  Branch (206:21): [True: 40.9k, False: 4.14k]
  ------------------
  207|  40.9k|	{
  208|  40.9k|		unsigned int gv[4] = {};
  209|       |
  210|   202k|		for (int k = 0; k < 4 && i + k < vertex_count; ++k)
  ------------------
  |  Branch (210:19): [True: 162k, False: 39.9k]
  |  Branch (210:28): [True: 161k, False: 1.06k]
  ------------------
  211|   161k|		{
  212|   161k|			unsigned int d = vertices[i + k] - last - 1;
  213|   161k|			unsigned int v = (d << 1) ^ (int(d) >> 31);
  214|       |
  215|   161k|			gv[k] = v;
  216|   161k|			last = vertices[i + k];
  217|   161k|		}
  218|       |
  219|       |		// if any value needs 4 bytes, or if *all* values need 3 bytes, we use 4 bytes for all values
  220|       |		// this allows us to encode most 3-byte deltas with 3 bytes which saves space overall
  221|  40.9k|		bool use4 = (gv[0] | gv[1] | gv[2] | gv[3]) > 0xffffff || (gv[0] > 0xffff && gv[1] > 0xffff && gv[2] > 0xffff && gv[3] > 0xffff);
  ------------------
  |  Branch (221:15): [True: 28.1k, False: 12.8k]
  |  Branch (221:62): [True: 1.23k, False: 11.6k]
  |  Branch (221:80): [True: 755, False: 477]
  |  Branch (221:98): [True: 358, False: 397]
  |  Branch (221:116): [True: 261, False: 97]
  ------------------
  222|       |
  223|   204k|		for (int k = 0; k < 4; ++k)
  ------------------
  |  Branch (223:19): [True: 163k, False: 40.9k]
  ------------------
  224|   163k|		{
  225|   163k|			unsigned int v = gv[k];
  226|       |
  227|       |			// 0/1/2/3 bytes per value, or all 4 values use 4 bytes
  228|   163k|			int code = use4 ? 3 : (v == 0 ? 0 : (v < 256 ? 1 : (v < 65536 ? 2 : 3)));
  ------------------
  |  Branch (228:15): [True: 113k, False: 50.2k]
  |  Branch (228:27): [True: 1.46k, False: 48.8k]
  |  Branch (228:41): [True: 43.6k, False: 5.18k]
  |  Branch (228:56): [True: 2.46k, False: 2.72k]
  ------------------
  229|       |
  230|   163k|			if (code > 0)
  ------------------
  |  Branch (230:8): [True: 162k, False: 1.46k]
  ------------------
  231|   162k|				*data++ = (unsigned char)(v & 0xff);
  232|   163k|			if (code > 1)
  ------------------
  |  Branch (232:8): [True: 118k, False: 45.0k]
  ------------------
  233|   118k|				*data++ = (unsigned char)((v >> 8) & 0xff);
  234|   163k|			if (code > 2)
  ------------------
  |  Branch (234:8): [True: 116k, False: 47.5k]
  ------------------
  235|   116k|				*data++ = (unsigned char)((v >> 16) & 0xff);
  236|   163k|			if (use4)
  ------------------
  |  Branch (236:8): [True: 113k, False: 50.2k]
  ------------------
  237|   113k|				*data++ = (unsigned char)((v >> 24) & 0xff);
  238|       |
  239|       |			// split low and high bits into two nibbles for better packing
  240|   163k|			ctrl[i / 4] |= ((code & 1) << k) | ((code >> 1) << (k + 4));
  241|   163k|		}
  242|  40.9k|	}
  243|       |
  244|  4.14k|	return data - start;
  245|  4.14k|}
meshletcodec.cpp:_ZN7meshoptL17decodeMeshletSimdILi0EEEiPvS1_PKhS3_S3_S3_mmmm:
  865|  12.3k|{
  866|  12.3k|	assert(gDecodeTablesInitialized);
  ------------------
  |  Branch (866:2): [True: 12.3k, False: 0]
  ------------------
  867|  12.3k|	(void)gDecodeTablesInitialized;
  868|       |
  869|  12.3k|#ifdef __clang__
  870|       |	// data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null
  871|  12.3k|	__builtin_assume(data);
  872|  12.3k|#endif
  873|       |
  874|       |	// decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4)
  875|       |	// raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0
  876|  12.3k|	if (vertex_size == 4 || Raw)
  ------------------
  |  Branch (876:6): [True: 8.26k, False: 4.11k]
  |  Branch (876:26): [Folded, False: 0]
  ------------------
  877|  8.26k|		data = decodeVerticesSimd(static_cast<unsigned int*>(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count);
  ------------------
  |  Branch (877:86): [Folded, False: 8.26k]
  ------------------
  878|  4.11k|	else
  879|  4.11k|		data = decodeVerticesSimd(static_cast<unsigned short*>(vertices), ctrl, data, bound, vertex_count);
  880|  12.3k|	if (!data)
  ------------------
  |  Branch (880:6): [True: 792, False: 11.5k]
  ------------------
  881|    792|		return -2;
  882|       |
  883|       |	// decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4)
  884|       |	// raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0
  885|  11.5k|	if (triangle_size == 4 || Raw)
  ------------------
  |  Branch (885:6): [True: 3.71k, False: 7.86k]
  |  Branch (885:28): [Folded, False: 0]
  ------------------
  886|  3.71k|		data = decodeTrianglesSimd(static_cast<unsigned int*>(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count);
  ------------------
  |  Branch (886:89): [Folded, False: 3.71k]
  ------------------
  887|  7.86k|	else
  888|  7.86k|		data = decodeTrianglesSimd(static_cast<unsigned char*>(triangles), codes, data, bound, triangle_count);
  889|  11.5k|	if (!data)
  ------------------
  |  Branch (889:6): [True: 298, False: 11.2k]
  ------------------
  890|    298|		return -2;
  891|       |
  892|  11.2k|	return (data == bound) ? 0 : -3;
  ------------------
  |  Branch (892:9): [True: 8.36k, False: 2.91k]
  ------------------
  893|  11.5k|}
meshletcodec.cpp:_ZN7meshoptL18decodeVerticesSimdEPjPKhS2_S2_m:
  750|  9.28k|{
  751|  9.28k|#if defined(SIMD_SSE)
  752|  9.28k|	__m128i last = _mm_set1_epi32(-1);
  753|       |#elif defined(SIMD_NEON)
  754|       |	uint32x4_t last = vdupq_n_u32(~0u);
  755|       |#endif
  756|       |
  757|  9.28k|	size_t groups = vertex_count / 4;
  758|       |
  759|       |	// process all complete groups
  760|   131k|	for (size_t i = 0; i < groups; ++i)
  ------------------
  |  Branch (760:21): [True: 122k, False: 8.71k]
  ------------------
  761|   122k|	{
  762|   122k|		unsigned char code = *ctrl++;
  763|   122k|		if (data > bound)
  ------------------
  |  Branch (763:7): [True: 572, False: 121k]
  ------------------
  764|    572|			return NULL;
  765|       |
  766|   121k|		last = decodeVertexGroup(last, code, data);
  767|       |
  768|   121k|#if defined(SIMD_SSE)
  769|   121k|		_mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last);
  770|       |#elif defined(SIMD_NEON)
  771|       |		vst1q_u32(&vertices[i * 4], last);
  772|       |#endif
  773|   121k|	}
  774|       |
  775|       |	// process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements
  776|  8.71k|	if (vertex_count & 3)
  ------------------
  |  Branch (776:6): [True: 2.52k, False: 6.18k]
  ------------------
  777|  2.52k|	{
  778|  2.52k|		unsigned char code = *ctrl++;
  779|       |
  780|  2.52k|		if (data > bound)
  ------------------
  |  Branch (780:7): [True: 22, False: 2.50k]
  ------------------
  781|     22|			return NULL;
  782|       |
  783|  2.50k|		last = decodeVertexGroup(last, code, data);
  784|       |
  785|  2.50k|		unsigned int* tail = &vertices[vertex_count & ~3u];
  786|       |
  787|  2.50k|#if defined(SIMD_SSE)
  788|  2.50k|		tail[0] = _mm_cvtsi128_si32(last);
  789|  2.50k|		if ((vertex_count & 3) > 1)
  ------------------
  |  Branch (789:7): [True: 1.10k, False: 1.40k]
  ------------------
  790|  1.10k|			tail[1] = _mm_extract_epi32(last, 1);
  791|  2.50k|		if ((vertex_count & 3) > 2)
  ------------------
  |  Branch (791:7): [True: 377, False: 2.12k]
  ------------------
  792|    377|			tail[2] = _mm_extract_epi32(last, 2);
  793|       |#elif defined(SIMD_NEON)
  794|       |		vst1q_lane_u32(&tail[0], last, 0);
  795|       |		if ((vertex_count & 3) > 1)
  796|       |			vst1q_lane_u32(&tail[1], last, 1);
  797|       |		if ((vertex_count & 3) > 2)
  798|       |			vst1q_lane_u32(&tail[2], last, 2);
  799|       |#endif
  800|  2.50k|	}
  801|       |
  802|  8.68k|	return data;
  803|  8.71k|}
_ZN7meshopt17decodeVertexGroupEDv2_xhRPKh:
  540|   220k|{
  541|   220k|	__m128i word = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
  542|   220k|	__m128i shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeTableVerts[code]));
  543|       |
  544|   220k|	__m128i v = _mm_shuffle_epi8(word, shuf);
  545|       |
  546|       |	// unzigzag+1
  547|   220k|	__m128i xl = _mm_sub_epi32(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi32(1)));
  548|   220k|	__m128i xr = _mm_srli_epi32(v, 1);
  549|   220k|	__m128i x = _mm_add_epi32(_mm_xor_si128(xl, xr), _mm_set1_epi32(1));
  550|       |
  551|       |	// prefix sum
  552|   220k|	x = _mm_add_epi32(x, _mm_slli_si128(x, 8));
  553|   220k|	x = _mm_add_epi32(x, _mm_slli_si128(x, 4));
  554|   220k|	x = _mm_add_epi32(x, _mm_shuffle_epi32(last, 0xff));
  555|       |
  556|   220k|	data += kDecodeTableLength[code];
  557|       |
  558|   220k|	return x;
  559|   220k|}
meshletcodec.cpp:_ZN7meshoptL18decodeVerticesSimdEPtPKhS2_S2_m:
  807|  4.11k|{
  808|  4.11k|#if defined(SIMD_SSE)
  809|  4.11k|	__m128i repack = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0);
  810|  4.11k|	__m128i last = _mm_set1_epi32(-1);
  811|       |#elif defined(SIMD_NEON)
  812|       |	uint32x4_t last = vdupq_n_u32(~0u);
  813|       |#endif
  814|       |
  815|       |	// because the output buffer is guaranteed to have 32-bit aligned size available, we can simplify tail processing
  816|       |	// if the number of vertices mod 4 is 3, we'd normally need to write 8+6 bytes, but we can instead overwrite up to 2 bytes in the main loop
  817|  4.11k|	size_t groups = (vertex_count + 1) / 4;
  818|       |
  819|       |	// process all complete groups
  820|  98.5k|	for (size_t i = 0; i < groups; ++i)
  ------------------
  |  Branch (820:21): [True: 94.7k, False: 3.74k]
  ------------------
  821|  94.7k|	{
  822|  94.7k|		unsigned char code = *ctrl++;
  823|       |
  824|  94.7k|		if (data > bound)
  ------------------
  |  Branch (824:7): [True: 374, False: 94.3k]
  ------------------
  825|    374|			return NULL;
  826|       |
  827|  94.3k|		last = decodeVertexGroup(last, code, data);
  828|       |
  829|  94.3k|#if defined(SIMD_SSE)
  830|  94.3k|		__m128i r = _mm_shuffle_epi8(last, repack);
  831|  94.3k|		_mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r);
  832|       |#elif defined(SIMD_NEON)
  833|       |		uint16x4_t r = vmovn_u32(last);
  834|       |		vst1_u16(&vertices[i * 4], r);
  835|       |#endif
  836|  94.3k|	}
  837|       |
  838|       |	// process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element
  839|  3.74k|	if (groups * 4 < vertex_count)
  ------------------
  |  Branch (839:6): [True: 2.15k, False: 1.58k]
  ------------------
  840|  2.15k|	{
  841|  2.15k|		unsigned char code = *ctrl++;
  842|       |
  843|  2.15k|		if (data > bound)
  ------------------
  |  Branch (843:7): [True: 22, False: 2.12k]
  ------------------
  844|     22|			return NULL;
  845|       |
  846|  2.12k|		last = decodeVertexGroup(last, code, data);
  847|       |
  848|  2.12k|		unsigned short* tail = &vertices[vertex_count & ~3u];
  849|       |
  850|  2.12k|#if defined(SIMD_SSE)
  851|  2.12k|		__m128i r = _mm_shufflelo_epi16(last, 8);
  852|  2.12k|		*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
  853|       |#elif defined(SIMD_NEON)
  854|       |		uint16x4_t r = vmovn_u32(last);
  855|       |		vst1_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpret_u32_u16(r), 0);
  856|       |#endif
  857|  2.12k|	}
  858|       |
  859|  3.71k|	return data;
  860|  3.74k|}
meshletcodec.cpp:_ZN7meshoptL19decodeTrianglesSimdEPjPKhS2_S2_m:
  615|  4.54k|{
  616|  4.54k|#if defined(SIMD_SSE)
  617|  4.54k|	__m128i repack = _mm_setr_epi8(9, 10, 11, -1, 12, 13, 14, -1, 0, 0, 0, 0, 0, 0, 0, 0);
  618|  4.54k|	__m128i state = _mm_setzero_si128();
  619|       |#elif defined(SIMD_NEON)
  620|       |	uint8x8_t repack = vcreate_u8(0xff0e0d0cff0b0a09ull);
  621|       |	uint8x16_t state = vdupq_n_u8(0);
  622|       |#endif
  623|       |
  624|  4.54k|	size_t groups = triangle_count / 2;
  625|       |
  626|       |	// process all complete groups
  627|   195k|	for (size_t i = 0; i < groups; ++i)
  ------------------
  |  Branch (627:21): [True: 191k, False: 4.34k]
  ------------------
  628|   191k|	{
  629|   191k|		unsigned char code = *codes++;
  630|       |
  631|   191k|		if (extra > bound)
  ------------------
  |  Branch (631:7): [True: 199, False: 190k]
  ------------------
  632|    199|			return NULL;
  633|       |
  634|   190k|		state = decodeTriangleGroup(state, code, extra);
  635|       |
  636|       |		// write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
  637|   190k|#if defined(SIMD_SSE)
  638|   190k|		__m128i r = _mm_shuffle_epi8(state, repack);
  639|   190k|		_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r);
  640|       |#elif defined(SIMD_NEON)
  641|       |		uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
  642|       |		vst1_u32(&triangles[i * 2], r);
  643|       |#endif
  644|   190k|	}
  645|       |
  646|       |	// process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element
  647|  4.34k|	if (triangle_count & 1)
  ------------------
  |  Branch (647:6): [True: 1.59k, False: 2.74k]
  ------------------
  648|  1.59k|	{
  649|  1.59k|		unsigned char code = *codes++;
  650|       |
  651|  1.59k|		if (extra > bound)
  ------------------
  |  Branch (651:7): [True: 32, False: 1.56k]
  ------------------
  652|     32|			return NULL;
  653|       |
  654|  1.56k|		state = decodeTriangleGroup(state, code, extra);
  655|       |
  656|  1.56k|		unsigned int* tail = &triangles[triangle_count & ~1u];
  657|       |
  658|  1.56k|#if defined(SIMD_SSE)
  659|  1.56k|		__m128i r = _mm_shuffle_epi8(state, repack);
  660|  1.56k|		*tail = unsigned(_mm_cvtsi128_si32(r));
  661|       |#elif defined(SIMD_NEON)
  662|       |		uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
  663|       |		vst1_lane_u32(tail, r, 0);
  664|       |#endif
  665|  1.56k|	}
  666|       |
  667|  4.30k|	return extra;
  668|  4.34k|}
_ZN7meshopt19decodeTriangleGroupEDv2_xhRPKh:
  524|   353k|{
  525|   353k|	__m128i shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeTableMasks[code]));
  526|   353k|	__m128i next = _mm_slli_si128(shuf, 10);
  527|       |
  528|       |	// patch first 6 bytes with current extra and roll state forward
  529|   353k|	__m128i ext = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(extra));
  530|   353k|	state = _mm_blend_epi16(state, ext, 7);
  531|   353k|	state = _mm_add_epi8(_mm_shuffle_epi8(state, shuf), next);
  532|       |
  533|   353k|	extra += kDecodeTableExtra[code];
  534|       |
  535|   353k|	return state;
  536|   353k|}
meshletcodec.cpp:_ZN7meshoptL19decodeTrianglesSimdEPhPKhS2_S2_m:
  672|  7.86k|{
  673|  7.86k|#if defined(SIMD_SSE)
  674|  7.86k|	__m128i state = _mm_setzero_si128();
  675|       |#elif defined(SIMD_NEON)
  676|       |	uint8x16_t state = vdupq_n_u8(0);
  677|       |#endif
  678|       |
  679|       |	// because the output buffer is guaranteed to have 32-bit aligned size available, we can optimize writes and tail processing
  680|       |	// instead of processing triangles 2 at a time, we process 2 *pairs* at a time (12-byte write) followed by a tail pair, if present
  681|       |	// if the number of triangles mod 4 is 3, we'd normally need to write 12k+9 bytes, but we can instead overwrite up to 3 bytes in the main loop
  682|  7.86k|	size_t groups = (triangle_count + 1) / 4;
  683|       |
  684|       |	// process all complete groups
  685|  85.5k|	for (size_t i = 0; i < groups; ++i)
  ------------------
  |  Branch (685:21): [True: 77.7k, False: 7.76k]
  ------------------
  686|  77.7k|	{
  687|  77.7k|		unsigned char code0 = *codes++;
  688|  77.7k|		unsigned char code1 = *codes++;
  689|       |
  690|       |		// each triangle pair reads <=6 bytes from extra, so two pairs need <=12 bytes and gap guarantees 16 byte of overread
  691|  77.7k|		if (extra > bound)
  ------------------
  |  Branch (691:7): [True: 106, False: 77.6k]
  ------------------
  692|    106|			return NULL;
  693|       |
  694|  77.6k|		state = decodeTriangleGroup(state, code0, extra);
  695|       |
  696|       |		// write first decoded triangle and first index of second decoded triangle
  697|  77.6k|#if defined(SIMD_SSE)
  698|  77.6k|		__m128i r0 = _mm_srli_si128(state, 9);
  699|  77.6k|		*reinterpret_cast<unaligned_int*>(&triangles[i * 12]) = _mm_cvtsi128_si32(r0);
  700|       |#elif defined(SIMD_NEON)
  701|       |		uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9);
  702|       |		vst1q_lane_u32(reinterpret_cast<unsigned int*>(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0);
  703|       |#endif
  704|       |
  705|  77.6k|		state = decodeTriangleGroup(state, code1, extra);
  706|       |
  707|       |		// write last two indices of second decoded triangle that we didn't write above plus two new ones
  708|       |		// note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7
  709|  77.6k|#if defined(SIMD_SSE)
  710|  77.6k|		__m128i r1 = _mm_srli_si128(state, 7);
  711|  77.6k|		_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1);
  712|       |#elif defined(SIMD_NEON)
  713|       |		uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7);
  714|       |		vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1));
  715|       |#endif
  716|  77.6k|	}
  717|       |
  718|       |	// process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements
  719|  7.76k|	if (groups * 4 < triangle_count)
  ------------------
  |  Branch (719:6): [True: 6.13k, False: 1.63k]
  ------------------
  720|  6.13k|	{
  721|  6.13k|		unsigned char code = *codes++;
  722|       |
  723|  6.13k|		if (extra > bound)
  ------------------
  |  Branch (723:7): [True: 38, False: 6.09k]
  ------------------
  724|     38|			return NULL;
  725|       |
  726|  6.09k|		state = decodeTriangleGroup(state, code, extra);
  727|       |
  728|  6.09k|		unsigned char* tail = &triangles[(triangle_count & ~3u) * 3];
  729|       |
  730|  6.09k|#if defined(SIMD_SSE)
  731|  6.09k|		__m128i r = _mm_srli_si128(state, 9);
  732|       |
  733|  6.09k|		*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
  734|  6.09k|		if ((triangle_count & 3) > 1)
  ------------------
  |  Branch (734:7): [True: 776, False: 5.31k]
  ------------------
  735|    776|			*reinterpret_cast<unaligned_int*>(tail + 4) = _mm_extract_epi32(r, 1);
  736|       |#elif defined(SIMD_NEON)
  737|       |		uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9);
  738|       |
  739|       |		vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);
  740|       |		if ((triangle_count & 3) > 1)
  741|       |			vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail + 4), vreinterpretq_u32_u8(r), 1);
  742|       |#endif
  743|  6.09k|	}
  744|       |
  745|  7.72k|	return extra;
  746|  7.76k|}
meshletcodec.cpp:_ZN7meshoptL17decodeMeshletSimdILi1EEEiPvS1_PKhS3_S3_S3_mmmm:
  865|  1.02k|{
  866|  1.02k|	assert(gDecodeTablesInitialized);
  ------------------
  |  Branch (866:2): [True: 1.02k, False: 0]
  ------------------
  867|  1.02k|	(void)gDecodeTablesInitialized;
  868|       |
  869|  1.02k|#ifdef __clang__
  870|       |	// data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null
  871|  1.02k|	__builtin_assume(data);
  872|  1.02k|#endif
  873|       |
  874|       |	// decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4)
  875|       |	// raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0
  876|  1.02k|	if (vertex_size == 4 || Raw)
  ------------------
  |  Branch (876:6): [True: 1.02k, False: 0]
  |  Branch (876:26): [True: 0, Folded]
  ------------------
  877|  1.02k|		data = decodeVerticesSimd(static_cast<unsigned int*>(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count);
  ------------------
  |  Branch (877:86): [True: 1.02k, Folded]
  ------------------
  878|      0|	else
  879|      0|		data = decodeVerticesSimd(static_cast<unsigned short*>(vertices), ctrl, data, bound, vertex_count);
  880|  1.02k|	if (!data)
  ------------------
  |  Branch (880:6): [True: 198, False: 822]
  ------------------
  881|    198|		return -2;
  882|       |
  883|       |	// decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4)
  884|       |	// raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0
  885|    822|	if (triangle_size == 4 || Raw)
  ------------------
  |  Branch (885:6): [True: 822, False: 0]
  |  Branch (885:28): [True: 0, Folded]
  ------------------
  886|    822|		data = decodeTrianglesSimd(static_cast<unsigned int*>(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count);
  ------------------
  |  Branch (886:89): [True: 822, Folded]
  ------------------
  887|      0|	else
  888|      0|		data = decodeTrianglesSimd(static_cast<unsigned char*>(triangles), codes, data, bound, triangle_count);
  889|    822|	if (!data)
  ------------------
  |  Branch (889:6): [True: 77, False: 745]
  ------------------
  890|     77|		return -2;
  891|       |
  892|    745|	return (data == bound) ? 0 : -3;
  ------------------
  |  Branch (892:9): [True: 18, False: 727]
  ------------------
  893|    822|}

_Z21meshopt_decodeMeshletIjjEiPT_mPT0_mPKhm:
 1456|  2.07k|{
 1457|  2.07k|	char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1];
 1458|  2.07k|	(void)types_valid;
 1459|       |
 1460|  2.07k|	return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size);
  ------------------
  |  Branch (1460:93): [Folded, False: 2.07k]
  ------------------
 1461|  2.07k|}
_Z21meshopt_decodeMeshletIjhEiPT_mPT0_mPKhm:
 1456|  4.14k|{
 1457|  4.14k|	char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1];
 1458|  4.14k|	(void)types_valid;
 1459|       |
 1460|  4.14k|	return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size);
  ------------------
  |  Branch (1460:93): [True: 4.14k, Folded]
  ------------------
 1461|  4.14k|}
_Z21meshopt_decodeMeshletIthEiPT_mPT0_mPKhm:
 1456|  2.07k|{
 1457|  2.07k|	char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1];
 1458|  2.07k|	(void)types_valid;
 1459|       |
 1460|  2.07k|	return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size);
  ------------------
  |  Branch (1460:93): [True: 2.07k, Folded]
  ------------------
 1461|  2.07k|}

meshopt_encodeVertexBufferLevel:
 1616|  16.5k|{
 1617|  16.5k|	using namespace meshopt;
 1618|       |
 1619|  16.5k|	assert(vertex_size > 0 && vertex_size <= 256);
  ------------------
  |  Branch (1619:2): [True: 16.5k, False: 0]
  |  Branch (1619:2): [True: 16.5k, False: 0]
  |  Branch (1619:2): [True: 16.5k, False: 0]
  ------------------
 1620|  16.5k|	assert(vertex_size % 4 == 0);
  ------------------
  |  Branch (1620:2): [True: 16.5k, False: 0]
  ------------------
 1621|  16.5k|	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
  ------------------
  |  Branch (1621:2): [True: 16.5k, False: 0]
  |  Branch (1621:2): [True: 16.5k, False: 0]
  |  Branch (1621:2): [True: 16.5k, False: 0]
  ------------------
 1622|  16.5k|	assert(version < 0 || unsigned(version) <= kDecodeVertexVersion);
  ------------------
  |  Branch (1622:2): [True: 16.5k, False: 0]
  |  Branch (1622:2): [True: 0, False: 0]
  |  Branch (1622:2): [True: 16.5k, False: 0]
  ------------------
 1623|       |
 1624|  16.5k|	version = version < 0 ? gEncodeVertexVersion : version;
  ------------------
  |  Branch (1624:12): [True: 16.5k, False: 0]
  ------------------
 1625|       |
 1626|       |#if TRACE
 1627|       |	memset(vertexstats, 0, sizeof(vertexstats));
 1628|       |#endif
 1629|       |
 1630|  16.5k|	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
 1631|       |
 1632|  16.5k|	unsigned char* data = buffer;
 1633|  16.5k|	unsigned char* data_end = buffer + buffer_size;
 1634|       |
 1635|  16.5k|	if (size_t(data_end - data) < 1)
  ------------------
  |  Branch (1635:6): [True: 0, False: 16.5k]
  ------------------
 1636|      0|		return 0;
 1637|       |
 1638|  16.5k|	*data++ = (unsigned char)(kVertexHeader | version);
 1639|       |
 1640|  16.5k|	unsigned char first_vertex[256] = {};
 1641|  16.5k|	if (vertex_count > 0)
  ------------------
  |  Branch (1641:6): [True: 12.8k, False: 3.70k]
  ------------------
 1642|  12.8k|		memcpy(first_vertex, vertex_data, vertex_size);
 1643|       |
 1644|  16.5k|	unsigned char last_vertex[256] = {};
 1645|  16.5k|	memcpy(last_vertex, first_vertex, vertex_size);
 1646|       |
 1647|  16.5k|	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 1648|       |
 1649|  16.5k|	unsigned char channels[64] = {};
 1650|  16.5k|	if (version != 0 && level > 1 && vertex_count > 1)
  ------------------
  |  Branch (1650:6): [True: 12.7k, False: 3.88k]
  |  Branch (1650:22): [True: 6.17k, False: 6.52k]
  |  Branch (1650:35): [True: 4.22k, False: 1.95k]
  ------------------
 1651|  22.0k|		for (size_t k = 0; k < vertex_size; k += 4)
  ------------------
  |  Branch (1651:22): [True: 17.8k, False: 4.22k]
  ------------------
 1652|  17.8k|		{
 1653|  17.8k|			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
  ------------------
  |  Branch (1653:14): [True: 15.5k, False: 2.31k]
  ------------------
 1654|  17.8k|			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);
  ------------------
  |  Branch (1654:137): [True: 15.5k, False: 2.31k]
  ------------------
 1655|       |
 1656|  17.8k|			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
  ------------------
  |  Branch (1656:4): [True: 2.24k, False: 0]
  |  Branch (1656:4): [True: 2.24k, False: 0]
  |  Branch (1656:4): [True: 15.6k, False: 2.24k]
  |  Branch (1656:4): [True: 17.8k, False: 0]
  ------------------
 1657|  17.8k|			channels[k / 4] = (unsigned char)channel;
 1658|  17.8k|		}
 1659|       |
 1660|  16.5k|	size_t vertex_offset = 0;
 1661|       |
 1662|   334k|	while (vertex_offset < vertex_count)
  ------------------
  |  Branch (1662:9): [True: 318k, False: 16.5k]
  ------------------
 1663|   318k|	{
 1664|   318k|		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
  ------------------
  |  Branch (1664:23): [True: 305k, False: 12.8k]
  ------------------
 1665|       |
 1666|   318k|		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
 1667|   318k|		if (!data)
  ------------------
  |  Branch (1667:7): [True: 0, False: 318k]
  ------------------
 1668|      0|			return 0;
 1669|       |
 1670|   318k|		vertex_offset += block_size;
 1671|   318k|	}
 1672|       |
 1673|  16.5k|	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
  ------------------
  |  Branch (1673:36): [True: 3.88k, False: 12.7k]
  ------------------
 1674|  16.5k|	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
  ------------------
  |  Branch (1674:25): [True: 3.88k, False: 12.7k]
  ------------------
 1675|  16.5k|	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
  ------------------
  |  Branch (1675:25): [True: 9.26k, False: 7.32k]
  ------------------
 1676|       |
 1677|  16.5k|	if (size_t(data_end - data) < tail_size_pad)
  ------------------
  |  Branch (1677:6): [True: 0, False: 16.5k]
  ------------------
 1678|      0|		return 0;
 1679|       |
 1680|  16.5k|	if (tail_size < tail_size_pad)
  ------------------
  |  Branch (1680:6): [True: 9.26k, False: 7.32k]
  ------------------
 1681|  9.26k|	{
 1682|  9.26k|		memset(data, 0, tail_size_pad - tail_size);
 1683|  9.26k|		data += tail_size_pad - tail_size;
 1684|  9.26k|	}
 1685|       |
 1686|  16.5k|	memcpy(data, first_vertex, vertex_size);
 1687|  16.5k|	data += vertex_size;
 1688|       |
 1689|  16.5k|	if (version != 0)
  ------------------
  |  Branch (1689:6): [True: 12.7k, False: 3.88k]
  ------------------
 1690|  12.7k|	{
 1691|  12.7k|		memcpy(data, channels, vertex_size / 4);
 1692|  12.7k|		data += vertex_size / 4;
 1693|  12.7k|	}
 1694|       |
 1695|  16.5k|	assert(data >= buffer + tail_size);
  ------------------
  |  Branch (1695:2): [True: 16.5k, False: 0]
  ------------------
 1696|  16.5k|	assert(data <= buffer + buffer_size);
  ------------------
  |  Branch (1696:2): [True: 16.5k, False: 0]
  ------------------
 1697|       |
 1698|       |#if TRACE
 1699|       |	size_t total_size = data - buffer;
 1700|       |
 1701|       |	for (size_t k = 0; k < vertex_size; ++k)
 1702|       |	{
 1703|       |		const Stats& vsk = vertexstats[k];
 1704|       |
 1705|       |		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
 1706|       |
 1707|       |		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
 1708|       |		double total_kr = total_k ? 1.0 / double(total_k) : 0;
 1709|       |
 1710|       |		if (version != 0)
 1711|       |		{
 1712|       |			int channel = channels[k / 4];
 1713|       |
 1714|       |			if ((channel & 3) == 2 && k % 4 == 0)
 1715|       |				printf(" | ^%d", channel >> 4);
 1716|       |			else
 1717|       |				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
 1718|       |		}
 1719|       |
 1720|       |		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
 1721|       |		    double(vsk.header) * total_kr * 100,
 1722|       |		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
 1723|       |		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);
 1724|       |
 1725|       |		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];
 1726|       |
 1727|       |		if (total_ctrl)
 1728|       |		{
 1729|       |			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
 1730|       |			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
 1731|       |			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
 1732|       |		}
 1733|       |
 1734|       |		if (level >= 3)
 1735|       |			printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
 1736|       |			    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
 1737|       |			    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
 1738|       |			    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
 1739|       |			    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
 1740|       |
 1741|       |		printf("\n");
 1742|       |	}
 1743|       |#endif
 1744|       |
 1745|  16.5k|	return data - buffer;
 1746|  16.5k|}
meshopt_encodeVertexBufferBound:
 1754|  8.29k|{
 1755|  8.29k|	using namespace meshopt;
 1756|       |
 1757|  8.29k|	assert(vertex_size > 0 && vertex_size <= 256);
  ------------------
  |  Branch (1757:2): [True: 8.29k, False: 0]
  |  Branch (1757:2): [True: 8.29k, False: 0]
  |  Branch (1757:2): [True: 8.29k, False: 0]
  ------------------
 1758|  8.29k|	assert(vertex_size % 4 == 0);
  ------------------
  |  Branch (1758:2): [True: 8.29k, False: 0]
  ------------------
 1759|       |
 1760|  8.29k|	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 1761|  8.29k|	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
 1762|       |
 1763|  8.29k|	size_t vertex_block_control_size = vertex_size / 4;
 1764|  8.29k|	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
 1765|  8.29k|	size_t vertex_block_data_size = vertex_block_size;
 1766|       |
 1767|  8.29k|	size_t tail_size = vertex_size + (vertex_size / 4);
 1768|  8.29k|	size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
  ------------------
  |  Branch (1768:25): [True: 8.29k, Folded]
  ------------------
 1769|  8.29k|	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
  ------------------
  |  Branch (1769:25): [True: 6.22k, False: 2.07k]
  ------------------
 1770|  8.29k|	assert(tail_size_pad >= kByteGroupDecodeLimit);
  ------------------
  |  Branch (1770:2): [True: 8.29k, False: 0]
  ------------------
 1771|       |
 1772|  8.29k|	return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
 1773|  8.29k|}
meshopt_encodeVertexVersion:
 1776|  2.07k|{
 1777|  2.07k|	assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion));
  ------------------
  |  Branch (1777:2): [True: 2.07k, False: 0]
  ------------------
 1778|       |
 1779|  2.07k|	meshopt::gEncodeVertexVersion = version;
 1780|  2.07k|}
meshopt_decodeVertexBuffer:
 1800|  16.5k|{
 1801|  16.5k|	using namespace meshopt;
 1802|       |
 1803|  16.5k|	assert(vertex_size > 0 && vertex_size <= 256);
  ------------------
  |  Branch (1803:2): [True: 16.5k, False: 0]
  |  Branch (1803:2): [True: 16.5k, False: 0]
  |  Branch (1803:2): [True: 16.5k, False: 0]
  ------------------
 1804|  16.5k|	assert(vertex_size % 4 == 0);
  ------------------
  |  Branch (1804:2): [True: 16.5k, False: 0]
  ------------------
 1805|       |
 1806|  16.5k|	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;
 1807|       |
 1808|  16.5k|#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 1809|  16.5k|	const unsigned int cpumask = (1 << 9) | (1 << 23); // SSSE3+POPCNT
 1810|  16.5k|	decode = (cpuid & cpumask) == cpumask ? decodeVertexBlockSimd : decodeVertexBlock;
  ------------------
  |  Branch (1810:11): [True: 16.5k, False: 0]
  ------------------
 1811|       |#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
 1812|       |	decode = decodeVertexBlockSimd;
 1813|       |#else
 1814|       |	decode = decodeVertexBlock;
 1815|       |#endif
 1816|       |
 1817|  16.5k|#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 1818|  16.5k|	assert(gDecodeBytesGroupInitialized);
  ------------------
  |  Branch (1818:2): [True: 16.5k, False: 0]
  ------------------
 1819|  16.5k|	(void)gDecodeBytesGroupInitialized;
 1820|  16.5k|#endif
 1821|       |
 1822|  16.5k|	unsigned char* vertex_data = static_cast<unsigned char*>(destination);
 1823|       |
 1824|  16.5k|	const unsigned char* data = buffer;
 1825|  16.5k|	const unsigned char* data_end = buffer + buffer_size;
 1826|       |
 1827|  16.5k|	if (size_t(data_end - data) < 1)
  ------------------
  |  Branch (1827:6): [True: 0, False: 16.5k]
  ------------------
 1828|      0|		return -2;
 1829|       |
 1830|  16.5k|	unsigned char data_header = *data++;
 1831|       |
 1832|  16.5k|	if ((data_header & 0xf0) != kVertexHeader)
  ------------------
  |  Branch (1832:6): [True: 6.88k, False: 9.70k]
  ------------------
 1833|  6.88k|		return -1;
 1834|       |
 1835|  9.70k|	int version = data_header & 0x0f;
 1836|  9.70k|	if (version > kDecodeVertexVersion)
  ------------------
  |  Branch (1836:6): [True: 188, False: 9.52k]
  ------------------
 1837|    188|		return -1;
 1838|       |
 1839|  9.52k|	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
  ------------------
  |  Branch (1839:36): [True: 2.22k, False: 7.29k]
  ------------------
 1840|  9.52k|	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
  ------------------
  |  Branch (1840:25): [True: 2.22k, False: 7.29k]
  ------------------
 1841|  9.52k|	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
  ------------------
  |  Branch (1841:25): [True: 5.31k, False: 4.20k]
  ------------------
 1842|       |
 1843|  9.52k|	if (size_t(data_end - data) < tail_size_pad)
  ------------------
  |  Branch (1843:6): [True: 214, False: 9.30k]
  ------------------
 1844|    214|		return -2;
 1845|       |
 1846|  9.30k|	const unsigned char* tail = data_end - tail_size;
 1847|       |
 1848|  9.30k|	unsigned char last_vertex[256];
 1849|  9.30k|	memcpy(last_vertex, tail, vertex_size);
 1850|       |
 1851|  9.30k|	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;
  ------------------
  |  Branch (1851:34): [True: 2.18k, False: 7.12k]
  ------------------
 1852|       |
 1853|  9.30k|	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 1854|       |
 1855|  9.30k|	size_t vertex_offset = 0;
 1856|       |
 1857|   168k|	while (vertex_offset < vertex_count)
  ------------------
  |  Branch (1857:9): [True: 160k, False: 8.75k]
  ------------------
 1858|   160k|	{
 1859|   160k|		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
  ------------------
  |  Branch (1859:23): [True: 152k, False: 7.45k]
  ------------------
 1860|       |
 1861|   160k|		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
 1862|   160k|		if (!data)
  ------------------
  |  Branch (1862:7): [True: 548, False: 159k]
  ------------------
 1863|    548|			return -2;
 1864|       |
 1865|   159k|		vertex_offset += block_size;
 1866|   159k|	}
 1867|       |
 1868|  8.75k|	if (size_t(data_end - data) != tail_size_pad)
  ------------------
  |  Branch (1868:6): [True: 459, False: 8.29k]
  ------------------
 1869|    459|		return -3;
 1870|       |
 1871|  8.29k|	return 0;
 1872|  8.75k|}
vertexcodec.cpp:_ZN7meshoptL27decodeBytesGroupBuildTablesEv:
  792|      2|{
  793|    514|	for (int mask = 0; mask < 256; ++mask)
  ------------------
  |  Branch (793:21): [True: 512, False: 2]
  ------------------
  794|    512|	{
  795|    512|		unsigned char shuffle[8];
  796|    512|		unsigned char count = 0;
  797|       |
  798|  4.60k|		for (int i = 0; i < 8; ++i)
  ------------------
  |  Branch (798:19): [True: 4.09k, False: 512]
  ------------------
  799|  4.09k|		{
  800|  4.09k|			int maski = (mask >> i) & 1;
  801|  4.09k|			shuffle[i] = maski ? count : 0x80;
  ------------------
  |  Branch (801:17): [True: 2.04k, False: 2.04k]
  ------------------
  802|  4.09k|			count += (unsigned char)(maski);
  803|  4.09k|		}
  804|       |
  805|    512|		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
  806|    512|		kDecodeBytesGroupCount[mask] = count;
  807|    512|	}
  808|       |
  809|      2|	return true;
  810|      2|}
vertexcodec.cpp:_ZN7meshoptL14getCpuFeaturesEv:
 1600|      2|{
 1601|      2|	int cpuinfo[4] = {};
 1602|       |#ifdef _MSC_VER
 1603|       |	__cpuid(cpuinfo, 1);
 1604|       |#else
 1605|       |	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
 1606|      2|#endif
 1607|      2|	return cpuinfo[2];
 1608|      2|}
vertexcodec.cpp:_ZN7meshoptL18getVertexBlockSizeEm:
  141|  34.1k|{
  142|       |	// make sure the entire block fits into the scratch buffer and is aligned to byte group size
  143|       |	// note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
  144|  34.1k|	size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
  145|       |
  146|  34.1k|	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
  ------------------
  |  Branch (146:9): [True: 0, False: 34.1k]
  ------------------
  147|  34.1k|}
vertexcodec.cpp:_ZN7meshoptL14estimateRotateEPKhmmmm:
  370|  15.5k|{
  371|  15.5k|	size_t sizes[8] = {};
  372|       |
  373|  15.5k|	const unsigned char* vertex = vertex_data + k;
  374|  15.5k|	unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
  375|       |
  376|  4.10M|	for (size_t i = 0; i < vertex_count; i += group_size)
  ------------------
  |  Branch (376:21): [True: 4.09M, False: 15.5k]
  ------------------
  377|  4.09M|	{
  378|  4.09M|		unsigned int bitg = 0;
  379|       |
  380|       |		// calculate bit consistency mask for the group
  381|  69.4M|		for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
  ------------------
  |  Branch (381:22): [True: 65.3M, False: 4.07M]
  |  Branch (381:40): [True: 65.3M, False: 13.4k]
  ------------------
  382|  65.3M|		{
  383|  65.3M|			unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
  384|  65.3M|			unsigned int d = v ^ last;
  385|       |
  386|  65.3M|			bitg |= d;
  387|  65.3M|			last = v;
  388|  65.3M|			vertex += vertex_size;
  389|  65.3M|		}
  390|       |
  391|       |#if TRACE
  392|       |		for (int j = 0; j < 32; ++j)
  393|       |			vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
  394|       |#endif
  395|       |
  396|  36.8M|		for (int j = 0; j < 8; ++j)
  ------------------
  |  Branch (396:19): [True: 32.7M, False: 4.09M]
  ------------------
  397|  32.7M|		{
  398|  32.7M|			unsigned int bitr = rotate(bitg, j);
  399|       |
  400|  32.7M|			sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
  401|  32.7M|			sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
  402|  32.7M|		}
  403|  4.09M|	}
  404|       |
  405|  15.5k|	int best_rot = 0;
  406|   124k|	for (int rot = 1; rot < 8; ++rot)
  ------------------
  |  Branch (406:20): [True: 108k, False: 15.5k]
  ------------------
  407|   108k|		best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot;
  ------------------
  |  Branch (407:14): [True: 6.76k, False: 102k]
  ------------------
  408|       |
  409|  15.5k|	return best_rot;
  410|  15.5k|}
_ZN7meshopt6rotateEji:
  150|   162M|{
  151|   162M|	return (v << r) | (v >> ((32 - r) & 31));
  152|   162M|}
vertexcodec.cpp:_ZN7meshoptL12estimateBitsEh:
  365|   130M|{
  366|   130M|	return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8;
  ------------------
  |  Branch (366:9): [True: 48.8M, False: 82.1M]
  |  Branch (366:20): [True: 46.9M, False: 1.87M]
  |  Branch (366:30): [True: 45.2M, False: 1.67M]
  ------------------
  367|   130M|}
vertexcodec.cpp:_ZN7meshoptL15estimateChannelEPKhmmmmmii:
  413|  17.8k|{
  414|  17.8k|	unsigned char block[kVertexBlockMaxSize];
  415|  17.8k|	assert(vertex_block_size <= kVertexBlockMaxSize);
  ------------------
  |  Branch (415:2): [True: 17.8k, False: 0]
  ------------------
  416|       |
  417|  17.8k|	unsigned char last_vertex[256] = {};
  418|       |
  419|  17.8k|	size_t sizes[3] = {};
  420|  17.8k|	assert(max_channel <= 3);
  ------------------
  |  Branch (420:2): [True: 17.8k, False: 0]
  ------------------
  421|       |
  422|   151k|	for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip)
  ------------------
  |  Branch (422:21): [True: 133k, False: 17.8k]
  ------------------
  423|   133k|	{
  424|   133k|		size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i;
  ------------------
  |  Branch (424:23): [True: 120k, False: 13.4k]
  ------------------
  425|   133k|		size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
  426|       |
  427|   133k|		memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size);
  ------------------
  |  Branch (427:38): [True: 17.8k, False: 115k]
  ------------------
  428|       |
  429|       |		// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
  430|   133k|		if (block_size < block_size_aligned)
  ------------------
  |  Branch (430:7): [True: 11.9k, False: 121k]
  ------------------
  431|  11.9k|			memset(block + block_size, 0, block_size_aligned - block_size);
  432|       |
  433|   498k|		for (int channel = 0; channel < max_channel; ++channel)
  ------------------
  |  Branch (433:25): [True: 365k, False: 133k]
  ------------------
  434|  1.82M|			for (size_t j = 0; j < 4; ++j)
  ------------------
  |  Branch (434:23): [True: 1.46M, False: 365k]
  ------------------
  435|  1.46M|			{
  436|  1.46M|				encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4));
  437|       |
  438|  22.9M|				for (size_t ig = 0; ig < block_size; ig += kByteGroupSize)
  ------------------
  |  Branch (438:25): [True: 21.4M, False: 1.46M]
  ------------------
  439|  21.4M|				{
  440|       |					// to maximize encoding performance we only evaluate 1/2/4/8 bit groups
  441|  21.4M|					size_t size1 = encodeBytesGroupMeasure(block + ig, 1);
  442|  21.4M|					size_t size2 = encodeBytesGroupMeasure(block + ig, 2);
  443|  21.4M|					size_t size4 = encodeBytesGroupMeasure(block + ig, 4);
  444|  21.4M|					size_t size8 = encodeBytesGroupMeasure(block + ig, 8);
  445|       |
  446|  21.4M|					size_t best_size = size1 < size2 ? size1 : size2;
  ------------------
  |  Branch (446:25): [True: 20.3M, False: 1.11M]
  ------------------
  447|  21.4M|					best_size = best_size < size4 ? best_size : size4;
  ------------------
  |  Branch (447:18): [True: 21.4M, False: 44.0k]
  ------------------
  448|  21.4M|					best_size = best_size < size8 ? best_size : size8;
  ------------------
  |  Branch (448:18): [True: 11.9M, False: 9.52M]
  ------------------
  449|       |
  450|  21.4M|					sizes[channel] += best_size;
  451|  21.4M|				}
  452|  1.46M|			}
  453|   133k|	}
  454|       |
  455|  17.8k|	int best_channel = 0;
  456|  51.2k|	for (int channel = 1; channel < max_channel; ++channel)
  ------------------
  |  Branch (456:24): [True: 33.4k, False: 17.8k]
  ------------------
  457|  33.4k|		best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel;
  ------------------
  |  Branch (457:18): [True: 4.94k, False: 28.4k]
  ------------------
  458|       |
  459|  17.8k|	return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
  ------------------
  |  Branch (459:9): [True: 2.24k, False: 15.6k]
  ------------------
  460|  17.8k|}
vertexcodec.cpp:_ZN7meshoptL12encodeDeltasEPhPKhmmS2_mi:
  350|  4.83M|{
  351|  4.83M|	switch (channel & 3)
  352|  4.83M|	{
  353|  3.62M|	case 0:
  ------------------
  |  Branch (353:2): [True: 3.62M, False: 1.21M]
  ------------------
  354|  3.62M|		return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
  355|   662k|	case 1:
  ------------------
  |  Branch (355:2): [True: 662k, False: 4.17M]
  ------------------
  356|   662k|		return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
  357|   549k|	case 2:
  ------------------
  |  Branch (357:2): [True: 549k, False: 4.28M]
  ------------------
  358|   549k|		return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
  359|      0|	default:
  ------------------
  |  Branch (359:2): [True: 0, False: 4.83M]
  ------------------
  360|       |		assert(!"Unsupported channel encoding"); // unreachable
  ------------------
  |  Branch (360:3): [Folded, False: 0]
  ------------------
  361|  4.83M|	}
  362|  4.83M|}
vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1IhLb0EEEvPhPKhmmS3_mi:
  325|  3.62M|{
  326|  3.62M|	size_t k0 = k & ~(sizeof(T) - 1);
  327|  3.62M|	int ks = (k & (sizeof(T) - 1)) * 8;
  328|       |
  329|  3.62M|	T p = last_vertex[k0];
  330|  3.62M|	for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (330:21): [True: 0, False: 3.62M]
  ------------------
  331|      0|		p |= T(last_vertex[k0 + j]) << (j * 8);
  332|       |
  333|  3.62M|	const unsigned char* vertex = vertex_data + k0;
  334|       |
  335|   877M|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (335:21): [True: 874M, False: 3.62M]
  ------------------
  336|   874M|	{
  337|   874M|		T v = vertex[0];
  338|   874M|		for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (338:22): [True: 0, False: 874M]
  ------------------
  339|      0|			v |= vertex[j] << (j * 8);
  340|       |
  341|   874M|		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
  ------------------
  |  Branch (341:9): [Folded, False: 874M]
  ------------------
  342|       |
  343|   874M|		buffer[i] = (unsigned char)(d >> ks);
  344|   874M|		p = v;
  345|   874M|		vertex += vertex_size;
  346|   874M|	}
  347|  3.62M|}
_ZN7meshopt6zigzagIhEET_S1_:
  156|   874M|{
  157|   874M|	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
  158|   874M|}
vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1ItLb0EEEvPhPKhmmS3_mi:
  325|   662k|{
  326|   662k|	size_t k0 = k & ~(sizeof(T) - 1);
  327|   662k|	int ks = (k & (sizeof(T) - 1)) * 8;
  328|       |
  329|   662k|	T p = last_vertex[k0];
  330|  1.32M|	for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (330:21): [True: 662k, False: 662k]
  ------------------
  331|   662k|		p |= T(last_vertex[k0 + j]) << (j * 8);
  332|       |
  333|   662k|	const unsigned char* vertex = vertex_data + k0;
  334|       |
  335|   157M|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (335:21): [True: 156M, False: 662k]
  ------------------
  336|   156M|	{
  337|   156M|		T v = vertex[0];
  338|   313M|		for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (338:22): [True: 156M, False: 156M]
  ------------------
  339|   156M|			v |= vertex[j] << (j * 8);
  340|       |
  341|   156M|		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
  ------------------
  |  Branch (341:9): [Folded, False: 156M]
  ------------------
  342|       |
  343|   156M|		buffer[i] = (unsigned char)(d >> ks);
  344|   156M|		p = v;
  345|   156M|		vertex += vertex_size;
  346|   156M|	}
  347|   662k|}
_ZN7meshopt6zigzagItEET_S1_:
  156|   156M|{
  157|   156M|	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
  158|   156M|}
vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1IjLb1EEEvPhPKhmmS3_mi:
  325|   549k|{
  326|   549k|	size_t k0 = k & ~(sizeof(T) - 1);
  327|   549k|	int ks = (k & (sizeof(T) - 1)) * 8;
  328|       |
  329|   549k|	T p = last_vertex[k0];
  330|  2.19M|	for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (330:21): [True: 1.64M, False: 549k]
  ------------------
  331|  1.64M|		p |= T(last_vertex[k0 + j]) << (j * 8);
  332|       |
  333|   549k|	const unsigned char* vertex = vertex_data + k0;
  334|       |
  335|   130M|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (335:21): [True: 129M, False: 549k]
  ------------------
  336|   129M|	{
  337|   129M|		T v = vertex[0];
  338|   518M|		for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (338:22): [True: 388M, False: 129M]
  ------------------
  339|   388M|			v |= vertex[j] << (j * 8);
  340|       |
  341|   129M|		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
  ------------------
  |  Branch (341:9): [True: 129M, Folded]
  ------------------
  342|       |
  343|   129M|		buffer[i] = (unsigned char)(d >> ks);
  344|   129M|		p = v;
  345|   129M|		vertex += vertex_size;
  346|   129M|	}
  347|   549k|}
vertexcodec.cpp:_ZN7meshoptL23encodeBytesGroupMeasureEPKhi:
  191|   277M|{
  192|   277M|	assert(bits >= 0 && bits <= 8);
  ------------------
  |  Branch (192:2): [True: 277M, False: 0]
  |  Branch (192:2): [True: 277M, False: 0]
  |  Branch (192:2): [True: 277M, False: 0]
  ------------------
  193|       |
  194|   277M|	if (bits == 0)
  ------------------
  |  Branch (194:6): [True: 30.4M, False: 247M]
  ------------------
  195|  30.4M|		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
  ------------------
  |  Branch (195:10): [True: 11.2M, False: 19.1M]
  ------------------
  196|       |
  197|   247M|	if (bits == 8)
  ------------------
  |  Branch (197:6): [True: 60.2M, False: 186M]
  ------------------
  198|  60.2M|		return kByteGroupSize;
  199|       |
  200|   186M|	size_t result = kByteGroupSize * bits / 8;
  201|       |
  202|   186M|	unsigned char sentinel = (1 << bits) - 1;
  203|       |
  204|  3.17G|	for (size_t i = 0; i < kByteGroupSize; ++i)
  ------------------
  |  Branch (204:21): [True: 2.98G, False: 186M]
  ------------------
  205|  2.98G|		result += buffer[i] >= sentinel;
  206|       |
  207|   186M|	return result;
  208|   247M|}
vertexcodec.cpp:_ZN7meshoptL20encodeBytesGroupZeroEPKh:
  181|  48.8M|{
  182|  48.8M|	assert(kByteGroupSize == sizeof(unsigned long long) * 2);
  ------------------
  |  Branch (182:2): [True: 48.8M, Folded]
  ------------------
  183|       |
  184|  48.8M|	unsigned long long v[2];
  185|  48.8M|	memcpy(v, buffer, sizeof(v));
  186|       |
  187|  48.8M|	return (v[0] | v[1]) == 0;
  188|  48.8M|}
vertexcodec.cpp:_ZN7meshoptL17encodeVertexBlockEPhS0_PKhmmS0_S2_ii:
  510|   318k|{
  511|   318k|	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
  ------------------
  |  Branch (511:2): [True: 318k, False: 0]
  |  Branch (511:2): [True: 318k, False: 0]
  |  Branch (511:2): [True: 318k, False: 0]
  ------------------
  512|   318k|	assert(vertex_size % 4 == 0);
  ------------------
  |  Branch (512:2): [True: 318k, False: 0]
  ------------------
  513|       |
  514|   318k|	unsigned char buffer[kVertexBlockMaxSize];
  515|   318k|	assert(sizeof(buffer) % kByteGroupSize == 0);
  ------------------
  |  Branch (515:2): [True: 318k, Folded]
  ------------------
  516|       |
  517|   318k|	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
  518|       |
  519|       |	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
  520|   318k|	memset(buffer, 0, sizeof(buffer));
  521|       |
  522|   318k|	size_t control_size = version == 0 ? 0 : vertex_size / 4;
  ------------------
  |  Branch (522:24): [True: 45.4k, False: 272k]
  ------------------
  523|   318k|	if (size_t(data_end - data) < control_size)
  ------------------
  |  Branch (523:6): [True: 0, False: 318k]
  ------------------
  524|      0|		return NULL;
  525|       |
  526|   318k|	unsigned char* control = data;
  527|   318k|	data += control_size;
  528|       |
  529|   318k|	memset(control, 0, control_size);
  530|       |
  531|  3.69M|	for (size_t k = 0; k < vertex_size; ++k)
  ------------------
  |  Branch (531:21): [True: 3.37M, False: 318k]
  ------------------
  532|  3.37M|	{
  533|  3.37M|		encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]);
  ------------------
  |  Branch (533:80): [True: 493k, False: 2.88M]
  ------------------
  534|       |
  535|       |#if TRACE
  536|       |		const unsigned char* olddata = data;
  537|       |		bytestats = &vertexstats[k];
  538|       |#endif
  539|       |
  540|  3.37M|		int ctrl = 0;
  541|       |
  542|  3.37M|		if (version != 0)
  ------------------
  |  Branch (542:7): [True: 2.88M, False: 493k]
  ------------------
  543|  2.88M|		{
  544|  2.88M|			ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level);
  545|       |
  546|  2.88M|			assert(unsigned(ctrl) < 4);
  ------------------
  |  Branch (546:4): [True: 2.88M, False: 0]
  ------------------
  547|  2.88M|			control[k / 4] |= ctrl << ((k % 4) * 2);
  548|       |
  549|       |#if TRACE
  550|       |			vertexstats[k].ctrl[ctrl]++;
  551|       |#endif
  552|  2.88M|		}
  553|       |
  554|  3.37M|		if (ctrl == 3)
  ------------------
  |  Branch (554:7): [True: 774k, False: 2.60M]
  ------------------
  555|   774k|		{
  556|       |			// literal encoding
  557|   774k|			if (size_t(data_end - data) < vertex_count)
  ------------------
  |  Branch (557:8): [True: 0, False: 774k]
  ------------------
  558|      0|				return NULL;
  559|       |
  560|   774k|			memcpy(data, buffer, vertex_count);
  561|   774k|			data += vertex_count;
  562|   774k|		}
  563|  2.60M|		else if (ctrl != 2) // non-zero encoding
  ------------------
  |  Branch (563:12): [True: 1.60M, False: 996k]
  ------------------
  564|  1.60M|		{
  565|  1.60M|			data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
  ------------------
  |  Branch (565:69): [True: 493k, False: 1.11M]
  ------------------
  566|  1.60M|			if (!data)
  ------------------
  |  Branch (566:8): [True: 0, False: 1.60M]
  ------------------
  567|      0|				return NULL;
  568|  1.60M|		}
  569|       |
  570|       |#if TRACE
  571|       |		bytestats = NULL;
  572|       |		vertexstats[k].size += data - olddata;
  573|       |#endif
  574|  3.37M|	}
  575|       |
  576|   318k|	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
  577|       |
  578|   318k|	return data;
  579|   318k|}
vertexcodec.cpp:_ZN7meshoptL15estimateControlEPKhmmi:
  472|  2.88M|{
  473|  2.88M|	if (estimateControlZero(buffer, vertex_count_aligned))
  ------------------
  |  Branch (473:6): [True: 996k, False: 1.88M]
  ------------------
  474|   996k|		return 2; // zero encoding
  475|       |
  476|  1.88M|	if (level == 0)
  ------------------
  |  Branch (476:6): [True: 649k, False: 1.23M]
  ------------------
  477|   649k|		return 1; // 1248 encoding in level 0 for encoding speed
  478|       |
  479|       |	// round number of groups to 4 to get number of header bytes
  480|  1.23M|	size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4;
  481|       |
  482|  1.23M|	size_t est_bytes0 = header_size, est_bytes1 = header_size;
  483|       |
  484|  20.1M|	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
  ------------------
  |  Branch (484:21): [True: 18.9M, False: 1.23M]
  ------------------
  485|  18.9M|	{
  486|       |		// assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance
  487|  18.9M|		size_t size0 = encodeBytesGroupMeasure(buffer + i, 0);
  488|  18.9M|		size_t size1 = encodeBytesGroupMeasure(buffer + i, 1);
  489|  18.9M|		size_t size2 = encodeBytesGroupMeasure(buffer + i, 2);
  490|  18.9M|		size_t size4 = encodeBytesGroupMeasure(buffer + i, 4);
  491|  18.9M|		size_t size8 = encodeBytesGroupMeasure(buffer + i, 8);
  492|       |
  493|       |		// both control modes have access to 1/2/4 bit encoding
  494|  18.9M|		size_t size12 = size1 < size2 ? size1 : size2;
  ------------------
  |  Branch (494:19): [True: 16.9M, False: 1.96M]
  ------------------
  495|  18.9M|		size_t size124 = size12 < size4 ? size12 : size4;
  ------------------
  |  Branch (495:20): [True: 18.8M, False: 70.4k]
  ------------------
  496|       |
  497|       |		// each control mode has access to 0/8 bit encoding respectively
  498|  18.9M|		est_bytes0 += size124 < size0 ? size124 : size0;
  ------------------
  |  Branch (498:17): [True: 15.5M, False: 3.35M]
  ------------------
  499|  18.9M|		est_bytes1 += size124 < size8 ? size124 : size8;
  ------------------
  |  Branch (499:17): [True: 6.54M, False: 12.3M]
  ------------------
  500|  18.9M|	}
  501|       |
  502|       |	// pick shortest control entry but prefer literal encoding
  503|  1.23M|	if (est_bytes0 < vertex_count || est_bytes1 < vertex_count)
  ------------------
  |  Branch (503:6): [True: 441k, False: 797k]
  |  Branch (503:35): [True: 22.8k, False: 774k]
  ------------------
  504|   464k|		return est_bytes0 < est_bytes1 ? 0 : 1;
  ------------------
  |  Branch (504:10): [True: 277k, False: 186k]
  ------------------
  505|   774k|	else
  506|   774k|		return 3; // literal encoding
  507|  1.23M|}
vertexcodec.cpp:_ZN7meshoptL19estimateControlZeroEPKhm:
  463|  2.88M|{
  464|  19.4M|	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
  ------------------
  |  Branch (464:21): [True: 18.4M, False: 996k]
  ------------------
  465|  18.4M|		if (!encodeBytesGroupZero(buffer + i))
  ------------------
  |  Branch (465:7): [True: 1.88M, False: 16.5M]
  ------------------
  466|  1.88M|			return false;
  467|       |
  468|   996k|	return true;
  469|  2.88M|}
vertexcodec.cpp:_ZN7meshoptL11encodeBytesEPhS0_PKhmPKi:
  264|  1.60M|{
  265|  1.60M|	assert(buffer_size % kByteGroupSize == 0);
  ------------------
  |  Branch (265:2): [True: 1.60M, False: 0]
  ------------------
  266|       |
  267|  1.60M|	unsigned char* header = data;
  268|       |
  269|       |	// round number of groups to 4 to get number of header bytes
  270|  1.60M|	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
  271|       |
  272|  1.60M|	if (size_t(data_end - data) < header_size)
  ------------------
  |  Branch (272:6): [True: 0, False: 1.60M]
  ------------------
  273|      0|		return NULL;
  274|       |
  275|  1.60M|	data += header_size;
  276|       |
  277|  1.60M|	memset(header, 0, header_size);
  278|       |
  279|  1.60M|	int last_bits = -1;
  280|       |
  281|  25.9M|	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
  ------------------
  |  Branch (281:21): [True: 24.3M, False: 1.60M]
  ------------------
  282|  24.3M|	{
  283|  24.3M|		if (size_t(data_end - data) < kByteGroupDecodeLimit)
  ------------------
  |  Branch (283:7): [True: 0, False: 24.3M]
  ------------------
  284|      0|			return NULL;
  285|       |
  286|  24.3M|		int best_bitk = 3;
  287|  24.3M|		size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);
  288|       |
  289|  97.2M|		for (int bitk = 0; bitk < 3; ++bitk)
  ------------------
  |  Branch (289:22): [True: 72.9M, False: 24.3M]
  ------------------
  290|  72.9M|		{
  291|  72.9M|			size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);
  292|       |
  293|       |			// favor consistent bit selection across groups, but never replace literals
  294|  72.9M|			if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8))
  ------------------
  |  Branch (294:8): [True: 15.1M, False: 57.7M]
  |  Branch (294:29): [True: 1.21M, False: 56.5M]
  |  Branch (294:50): [True: 297k, False: 916k]
  |  Branch (294:77): [True: 48.5k, False: 249k]
  ------------------
  295|  15.2M|			{
  296|  15.2M|				best_bitk = bitk;
  297|  15.2M|				best_size = size;
  298|  15.2M|			}
  299|  72.9M|		}
  300|       |
  301|  24.3M|		size_t header_offset = i / kByteGroupSize;
  302|  24.3M|		header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);
  303|       |
  304|  24.3M|		int best_bits = bits[best_bitk];
  305|  24.3M|		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
  306|       |
  307|  24.3M|		assert(data + best_size == next);
  ------------------
  |  Branch (307:3): [True: 24.3M, False: 0]
  ------------------
  308|  24.3M|		data = next;
  309|  24.3M|		last_bits = best_bits;
  310|       |
  311|       |#if TRACE
  312|       |		bytestats->bitg[best_bits] += best_size;
  313|       |#endif
  314|  24.3M|	}
  315|       |
  316|       |#if TRACE
  317|       |	bytestats->header += header_size;
  318|       |#endif
  319|       |
  320|  1.60M|	return data;
  321|  1.60M|}
vertexcodec.cpp:_ZN7meshoptL16encodeBytesGroupEPhPKhi:
  211|  24.3M|{
  212|  24.3M|	assert(bits >= 0 && bits <= 8);
  ------------------
  |  Branch (212:2): [True: 24.3M, False: 0]
  |  Branch (212:2): [True: 24.3M, False: 0]
  |  Branch (212:2): [True: 24.3M, False: 0]
  ------------------
  213|  24.3M|	assert(kByteGroupSize % 8 == 0);
  ------------------
  |  Branch (213:2): [True: 24.3M, Folded]
  ------------------
  214|       |
  215|  24.3M|	if (bits == 0)
  ------------------
  |  Branch (215:6): [True: 7.92M, False: 16.3M]
  ------------------
  216|  7.92M|		return data;
  217|       |
  218|  16.3M|	if (bits == 8)
  ------------------
  |  Branch (218:6): [True: 10.5M, False: 5.83M]
  ------------------
  219|  10.5M|	{
  220|  10.5M|		memcpy(data, buffer, kByteGroupSize);
  221|  10.5M|		return data + kByteGroupSize;
  222|  10.5M|	}
  223|       |
  224|  5.83M|	size_t byte_size = 8 / bits;
  225|  5.83M|	assert(kByteGroupSize % byte_size == 0);
  ------------------
  |  Branch (225:2): [True: 5.83M, False: 0]
  ------------------
  226|       |
  227|       |	// fixed portion: bits bits for each value
  228|       |	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
  229|  5.83M|	unsigned char sentinel = (1 << bits) - 1;
  230|       |
  231|  22.3M|	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
  ------------------
  |  Branch (231:21): [True: 16.4M, False: 5.83M]
  ------------------
  232|  16.4M|	{
  233|  16.4M|		unsigned char byte = 0;
  234|       |
  235|   109M|		for (size_t k = 0; k < byte_size; ++k)
  ------------------
  |  Branch (235:22): [True: 93.3M, False: 16.4M]
  ------------------
  236|  93.3M|		{
  237|  93.3M|			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
  ------------------
  |  Branch (237:24): [True: 32.6M, False: 60.7M]
  ------------------
  238|       |
  239|  93.3M|			byte <<= bits;
  240|  93.3M|			byte |= enc;
  241|  93.3M|		}
  242|       |
  243|       |		// encode 1-bit groups in reverse bit order
  244|       |		// this makes them faster to decode alongside other groups
  245|  16.4M|		if (bits == 1)
  ------------------
  |  Branch (245:7): [True: 7.18M, False: 9.29M]
  ------------------
  246|  7.18M|			byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
  247|       |
  248|  16.4M|		*data++ = byte;
  249|  16.4M|	}
  250|       |
  251|  99.1M|	for (size_t i = 0; i < kByteGroupSize; ++i)
  ------------------
  |  Branch (251:21): [True: 93.3M, False: 5.83M]
  ------------------
  252|  93.3M|	{
  253|  93.3M|		unsigned char v = buffer[i];
  254|       |
  255|       |		// branchless append of out-of-range values
  256|  93.3M|		*data = v;
  257|  93.3M|		data += v >= sentinel;
  258|  93.3M|	}
  259|       |
  260|  5.83M|	return data;
  261|  5.83M|}
vertexcodec.cpp:_ZN7meshoptL21decodeVertexBlockSimdEPKhS1_PhmmS2_S1_i:
 1515|   160k|{
 1516|   160k|	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
  ------------------
  |  Branch (1516:2): [True: 160k, False: 0]
  |  Branch (1516:2): [True: 160k, False: 0]
  |  Branch (1516:2): [True: 160k, False: 0]
  ------------------
 1517|       |
 1518|   160k|	unsigned char buffer[kVertexBlockMaxSize * 4];
 1519|   160k|	unsigned char transposed[kVertexBlockSizeBytes];
 1520|       |
 1521|   160k|	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
 1522|       |
 1523|       |	// we can decode directly into the output buffer if vertex count is aligned to 16 (delta decode works 16 vertices at a time)
 1524|       |	// this uses strided writes and also reads the last vertex once, which is bad for performance for write-combined memory so we only enable this if configured
 1525|       |#ifdef MESHOPTIMIZER_VERTEXCODEC_ZEROCOPY
 1526|       |	unsigned char* target = vertex_count == vertex_count_aligned ? vertex_data : transposed;
 1527|       |#else
 1528|   160k|	unsigned char* target = transposed;
 1529|   160k|#endif
 1530|       |
 1531|   160k|	size_t control_size = version == 0 ? 0 : vertex_size / 4;
  ------------------
  |  Branch (1531:24): [True: 22.9k, False: 137k]
  ------------------
 1532|   160k|	if (size_t(data_end - data) < control_size)
  ------------------
  |  Branch (1532:6): [True: 0, False: 160k]
  ------------------
 1533|      0|		return NULL;
 1534|       |
 1535|   160k|	const unsigned char* control = data;
 1536|   160k|	data += control_size;
 1537|       |
 1538|   584k|	for (size_t k = 0; k < vertex_size; k += 4)
  ------------------
  |  Branch (1538:21): [True: 425k, False: 159k]
  ------------------
 1539|   425k|	{
 1540|   425k|		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
  ------------------
  |  Branch (1540:29): [True: 62.3k, False: 362k]
  ------------------
 1541|       |
 1542|  2.12M|		for (size_t j = 0; j < 4; ++j)
  ------------------
  |  Branch (1542:22): [True: 1.70M, False: 424k]
  ------------------
 1543|  1.70M|		{
 1544|  1.70M|			int ctrl = (ctrl_byte >> (j * 2)) & 3;
 1545|       |
 1546|  1.70M|			if (ctrl == 3)
  ------------------
  |  Branch (1546:8): [True: 388k, False: 1.31M]
  ------------------
 1547|   388k|			{
 1548|       |				// literal encoding; safe to over-copy due to tail
 1549|   388k|				if (size_t(data_end - data) < vertex_count_aligned)
  ------------------
  |  Branch (1549:9): [True: 72, False: 388k]
  ------------------
 1550|     72|					return NULL;
 1551|       |
 1552|   388k|				memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
 1553|   388k|				data += vertex_count;
 1554|   388k|			}
 1555|  1.31M|			else if (ctrl == 2)
  ------------------
  |  Branch (1555:13): [True: 501k, False: 810k]
  ------------------
 1556|   501k|			{
 1557|       |				// zero encoding
 1558|   501k|				memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
 1559|   501k|			}
 1560|   810k|			else
 1561|   810k|			{
 1562|       |				// for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
 1563|   810k|				int hshift = version == 0 ? 0 : 4 + ctrl;
  ------------------
  |  Branch (1563:18): [True: 249k, False: 561k]
  ------------------
 1564|       |
 1565|   810k|				data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
 1566|   810k|				if (!data)
  ------------------
  |  Branch (1566:9): [True: 307, False: 810k]
  ------------------
 1567|    307|					return NULL;
 1568|   810k|			}
 1569|  1.70M|		}
 1570|       |
 1571|   424k|		int channel = version == 0 ? 0 : channels[k / 4];
  ------------------
  |  Branch (1571:17): [True: 62.2k, False: 362k]
  ------------------
 1572|       |
 1573|   424k|		switch (channel & 3)
 1574|   424k|		{
 1575|   388k|		case 0:
  ------------------
  |  Branch (1575:3): [True: 388k, False: 36.6k]
  ------------------
 1576|   388k|			decodeDeltas4Simd<0>(buffer, target + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
 1577|   388k|			break;
 1578|  16.2k|		case 1:
  ------------------
  |  Branch (1578:3): [True: 16.2k, False: 408k]
  ------------------
 1579|  16.2k|			decodeDeltas4Simd<1>(buffer, target + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
 1580|  16.2k|			break;
 1581|  20.2k|		case 2:
  ------------------
  |  Branch (1581:3): [True: 20.2k, False: 404k]
  ------------------
 1582|  20.2k|			decodeDeltas4Simd<2>(buffer, target + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
 1583|  20.2k|			break;
 1584|    169|		default:
  ------------------
  |  Branch (1584:3): [True: 169, False: 424k]
  ------------------
 1585|    169|			return NULL; // invalid channel type
 1586|   424k|		}
 1587|   424k|	}
 1588|       |
 1589|   159k|	if (target == transposed)
  ------------------
  |  Branch (1589:6): [True: 159k, False: 0]
  ------------------
 1590|   159k|		memcpy(vertex_data, transposed, vertex_count * vertex_size);
 1591|       |
 1592|   159k|	memcpy(last_vertex, &target[vertex_size * (vertex_count - 1)], vertex_size);
 1593|       |
 1594|   159k|	return data;
 1595|   160k|}
vertexcodec.cpp:_ZN7meshoptL15decodeBytesSimdEPKhS1_Phmi:
 1370|   810k|{
 1371|   810k|	assert(buffer_size % kByteGroupSize == 0);
  ------------------
  |  Branch (1371:2): [True: 810k, False: 0]
  ------------------
 1372|   810k|	assert(kByteGroupSize == 16);
  ------------------
  |  Branch (1372:2): [True: 810k, Folded]
  ------------------
 1373|       |
 1374|       |	// round number of groups to 4 to get number of header bytes
 1375|   810k|	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
 1376|   810k|	if (size_t(data_end - data) < header_size)
  ------------------
  |  Branch (1376:6): [True: 13, False: 810k]
  ------------------
 1377|     13|		return NULL;
 1378|       |
 1379|   810k|	const unsigned char* header = data;
 1380|   810k|	data += header_size;
 1381|       |
 1382|   810k|	size_t i = 0;
 1383|       |
 1384|       |	// fast-path: process 4 groups at a time, do a shared bounds check
 1385|  3.82M|	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
  ------------------
  |  Branch (1385:9): [True: 3.02M, False: 805k]
  |  Branch (1385:50): [True: 3.01M, False: 4.59k]
  ------------------
 1386|  3.01M|	{
 1387|  3.01M|		size_t header_offset = i / kByteGroupSize;
 1388|  3.01M|		unsigned char header_byte = header[header_offset / 4];
 1389|       |
 1390|  3.01M|#if defined(SIMD_SSE) || defined(SIMD_AVX)
 1391|       |		// very-fast-path: for consecutive 4 groups that are all 0-bit (v0/0, v1/0/0000) or 8-bit (v0/3333, v1/1/3333),
 1392|       |		// the branchless decoders are slower than branching over the decoding of 4 groups and issuing a few load/store ops
 1393|  3.01M|		if (hshift != 5 && header_byte == 0)
  ------------------
  |  Branch (1393:7): [True: 1.42M, False: 1.58M]
  |  Branch (1393:22): [True: 835k, False: 590k]
  ------------------
 1394|   835k|		{
 1395|   835k|			memset(buffer + i, 0, kByteGroupSize * 4);
 1396|   835k|			continue;
 1397|   835k|		}
 1398|  2.18M|		else if (hshift != 4 && header_byte == 255)
  ------------------
  |  Branch (1398:12): [True: 1.91M, False: 263k]
  |  Branch (1398:27): [True: 1.20M, False: 709k]
  ------------------
 1399|  1.20M|		{
 1400|  1.20M|			memcpy(buffer + i, data, kByteGroupSize * 4);
 1401|  1.20M|			data += kByteGroupSize * 4;
 1402|  1.20M|			continue;
 1403|  1.20M|		}
 1404|   973k|#endif
 1405|       |
 1406|   973k|		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
 1407|   973k|		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
 1408|   973k|		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
 1409|   973k|		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
 1410|   973k|	}
 1411|       |
 1412|       |	// slow-path: process remaining groups
 1413|   927k|	for (; i < buffer_size; i += kByteGroupSize)
  ------------------
  |  Branch (1413:9): [True: 117k, False: 810k]
  ------------------
 1414|   117k|	{
 1415|   117k|		if (size_t(data_end - data) < kByteGroupDecodeLimit)
  ------------------
  |  Branch (1415:7): [True: 294, False: 117k]
  ------------------
 1416|    294|			return NULL;
 1417|       |
 1418|   117k|		size_t header_offset = i / kByteGroupSize;
 1419|   117k|		unsigned char header_byte = header[header_offset / 4];
 1420|       |
 1421|   117k|		data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
 1422|   117k|	}
 1423|       |
 1424|   810k|	return data;
 1425|   810k|}
vertexcodec.cpp:_ZN7meshoptL20decodeBytesGroupSimdEPKhPhi:
  831|  4.01M|{
  832|       |	// 0 for 1-bit, 1 for 2-bit, 2 for 4-bit, 3 for 8-bit, and 4 for 0-bit as it makes some of the uses easier
  833|  4.01M|	static const int hbtn[9] = {4, 1, 2, 3, 4, 0, 1, 2, 3};
  834|       |
  835|  4.01M|	int n = hbtn[hbits];
  836|       |
  837|  4.01M|#ifdef SIMD_LATENCYOPT
  838|  4.01M|	unsigned long long data64;
  839|  4.01M|	memcpy(&data64, data, 8);
  840|  4.01M|	data64 &= data64 >> n;
  841|  4.01M|	data64 &= data64 >> (n >> 1);
  842|       |
  843|       |	// mask out one bit per group that is set if all group bits were 1
  844|  4.01M|	static const unsigned long long lanes[9] = {0, 0x55555555, 0x1111111111111111ull, 0, 0, 0xffff, 0x55555555, 0x1111111111111111ull, 0};
  845|  4.01M|	int datacnt = int(_mm_popcnt_u64(data64 & lanes[hbits]));
  846|  4.01M|#endif
  847|       |
  848|       |	// for 8-bit groups, instead of loading the bytes through 'data', we load them through 'skip' as they are easier to preserve
  849|       |	// for 0-bit groups, the load results get discarded because mask is always 0; in both cases the shift wraps to zero
  850|  4.01M|	const unsigned char* skip = data + ((2 << n) & 15);
  851|       |
  852|  4.01M|	__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
  853|  4.01M|	__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
  854|       |
  855|       |	// unpack 1, 2 or 4-bit values: shuffle replicates each source byte into both halves of a 16-bit lane
  856|       |	// mulhi extracts even and odd fields into the low byte; the results are interleaved back with shift/or
  857|  4.01M|	__m128i selw = _mm_shuffle_epi8(selb, _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeBytesGroupConfig[hbits][1])));
  858|  4.01M|	__m128i sel0 = _mm_mulhi_epu16(selw, _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeBytesGroupConfig[hbits][2])));
  859|  4.01M|	__m128i sel1 = _mm_mulhi_epu16(selw, _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeBytesGroupConfig[hbits][3])));
  860|  4.01M|	__m128i seli = _mm_or_si128(sel0, _mm_slli_epi16(sel1, 8));
  861|       |
  862|       |	// the interleaved fields are masked by the bit count (special handling: for 0/8-bit values, mul produces 0)
  863|  4.01M|	__m128i sent = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeBytesGroupConfig[hbits][0]));
  864|  4.01M|	__m128i sel = _mm_and_si128(seli, sent);
  865|       |
  866|       |	// compare sel to sentinel; returns 0 for 0-bit (mul produces 0, sent is 1), 1 for 8-bit (mul produces 0, sent is 0)
  867|  4.01M|	__m128i mask = _mm_cmpeq_epi8(sel, sent);
  868|  4.01M|	int mask16 = _mm_movemask_epi8(mask);
  869|  4.01M|	unsigned char mask0 = (unsigned char)(mask16 & 255);
  870|  4.01M|	unsigned char mask1 = (unsigned char)(mask16 >> 8);
  871|       |
  872|       |	// decode shuffle mask from two halves; second half needs to be shifted by popcount(mask0)
  873|  4.01M|	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
  874|  4.01M|	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
  875|       |
  876|       |	// each lane of mask is 0x00 or 0xff; sad yields 255*popcount(mask0) in low word => low byte is -popcount(mask0)
  877|  4.01M|	__m128i npops = _mm_sad_epu8(mask, _mm_setzero_si128());
  878|  4.01M|	__m128i sm1r = _mm_sub_epi8(sm1, _mm_shuffle_epi8(npops, _mm_setzero_si128()));
  879|  4.01M|	__m128i shuf = _mm_unpacklo_epi64(sm0, sm1r);
  880|       |
  881|       |	// expand rest via shuffle mask and combine with sel; shuffle mask zeroes out bytes that are replaced by sel
  882|  4.01M|	__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
  883|       |
  884|  4.01M|	_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  885|       |
  886|  4.01M|#ifdef SIMD_LATENCYOPT
  887|       |	// datacnt is 0 for 8-bit groups so we can't use skip to advance; 0-bit groups wrap the shift to zero
  888|  4.01M|	return data + ((2 << n) & 31) + datacnt;
  889|       |#else
  890|       |	return skip + _mm_popcnt_u32(mask16);
  891|       |#endif
  892|  4.01M|}
vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi0EEEvPKhPhmmS3_i:
 1430|   388k|{
 1431|   388k|#if defined(SIMD_SSE) || defined(SIMD_AVX)
 1432|   388k|#define TEMP __m128i
 1433|   388k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
 1434|   388k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
 1435|   388k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
 1436|   388k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
 1437|   388k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
 1438|   388k|#endif
 1439|       |
 1440|       |#ifdef SIMD_NEON
 1441|       |#define TEMP uint8x8_t
 1442|       |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
 1443|       |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
 1444|       |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
 1445|       |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
 1446|       |#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
 1447|       |#endif
 1448|       |
 1449|       |#ifdef SIMD_WASM
 1450|       |#define TEMP v128_t
 1451|       |#define PREP() v128_t pi = wasm_v128_load(last_vertex)
 1452|       |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
 1453|       |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
 1454|       |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
 1455|       |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
 1456|       |#endif
 1457|       |
 1458|   388k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
 1459|       |
 1460|   388k|	PREP();
  ------------------
  |  | 1433|   388k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
  ------------------
 1461|       |
 1462|   388k|	unsigned char* savep = transposed;
 1463|       |
 1464|  6.25M|	for (size_t j = 0; j < vertex_count_aligned; j += 16)
  ------------------
  |  Branch (1464:21): [True: 5.87M, False: 388k]
  ------------------
 1465|  5.87M|	{
 1466|  5.87M|		LOAD(0);
  ------------------
  |  | 1434|  5.87M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1467|  5.87M|		LOAD(1);
  ------------------
  |  | 1434|  5.87M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1468|  5.87M|		LOAD(2);
  ------------------
  |  | 1434|  5.87M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1469|  5.87M|		LOAD(3);
  ------------------
  |  | 1434|  5.87M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1470|       |
 1471|  5.87M|		transpose8(r0, r1, r2, r3);
 1472|       |
 1473|  5.87M|		TEMP t0, t1, t2, t3;
  ------------------
  |  | 1432|  5.87M|#define TEMP __m128i
  ------------------
 1474|  5.87M|		TEMP npi = pi;
  ------------------
  |  | 1432|  5.87M|#define TEMP __m128i
  ------------------
 1475|       |
 1476|  5.87M|		UNZR(0);
  ------------------
  |  | 1458|  5.87M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [True: 5.87M, Folded]
  |  |  |  Branch (1458:58): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1477|  5.87M|		GRP4(0);
  ------------------
  |  | 1435|  5.87M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1478|  5.87M|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1479|  5.87M|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1480|       |
 1481|  5.87M|		UNZR(1);
  ------------------
  |  | 1458|  5.87M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [True: 5.87M, Folded]
  |  |  |  Branch (1458:58): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1482|  5.87M|		GRP4(1);
  ------------------
  |  | 1435|  5.87M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1483|  5.87M|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1484|  5.87M|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1485|       |
 1486|  5.87M|		UNZR(2);
  ------------------
  |  | 1458|  5.87M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [True: 5.87M, Folded]
  |  |  |  Branch (1458:58): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1487|  5.87M|		GRP4(2);
  ------------------
  |  | 1435|  5.87M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1488|  5.87M|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1489|  5.87M|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1490|       |
 1491|  5.87M|		UNZR(3);
  ------------------
  |  | 1458|  5.87M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [True: 5.87M, Folded]
  |  |  |  Branch (1458:58): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1492|  5.87M|		GRP4(3);
  ------------------
  |  | 1435|  5.87M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1493|  5.87M|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|  5.87M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [True: 5.87M, Folded]
  |  |  |  Branch (1436:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1494|  5.87M|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|  5.87M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1495|       |
 1496|       |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
 1497|       |		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
 1498|       |		pi = rebase<Channel>(npi, r0, r1, r2, r3);
 1499|       |#else
 1500|  5.87M|		(void)npi;
 1501|  5.87M|#endif
 1502|       |
 1503|  5.87M|#undef UNZR
 1504|  5.87M|#undef TEMP
 1505|  5.87M|#undef PREP
 1506|  5.87M|#undef LOAD
 1507|  5.87M|#undef GRP4
 1508|  5.87M|#undef FIXD
 1509|  5.87M|#undef SAVE
 1510|  5.87M|	}
 1511|   388k|}
_ZN7meshopt10transpose8ERDv2_xS1_S1_S1_:
 1219|  6.42M|{
 1220|  6.42M|	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
 1221|  6.42M|	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
 1222|  6.42M|	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
 1223|  6.42M|	__m128i t3 = _mm_unpackhi_epi8(x2, x3);
 1224|       |
 1225|  6.42M|	x0 = _mm_unpacklo_epi16(t0, t2);
 1226|  6.42M|	x1 = _mm_unpackhi_epi16(t0, t2);
 1227|  6.42M|	x2 = _mm_unpacklo_epi16(t1, t3);
 1228|  6.42M|	x3 = _mm_unpackhi_epi16(t1, t3);
 1229|  6.42M|}
_ZN7meshopt9unzigzag8EDv2_x:
 1233|  23.4M|{
 1234|  23.4M|	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
 1235|  23.4M|	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
 1236|       |
 1237|  23.4M|	return _mm_xor_si128(xl, xr);
 1238|  23.4M|}
vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi1EEEvPKhPhmmS3_i:
 1430|  16.2k|{
 1431|  16.2k|#if defined(SIMD_SSE) || defined(SIMD_AVX)
 1432|  16.2k|#define TEMP __m128i
 1433|  16.2k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
 1434|  16.2k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
 1435|  16.2k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
 1436|  16.2k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
 1437|  16.2k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
 1438|  16.2k|#endif
 1439|       |
 1440|       |#ifdef SIMD_NEON
 1441|       |#define TEMP uint8x8_t
 1442|       |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
 1443|       |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
 1444|       |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
 1445|       |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
 1446|       |#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
 1447|       |#endif
 1448|       |
 1449|       |#ifdef SIMD_WASM
 1450|       |#define TEMP v128_t
 1451|       |#define PREP() v128_t pi = wasm_v128_load(last_vertex)
 1452|       |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
 1453|       |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
 1454|       |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
 1455|       |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
 1456|       |#endif
 1457|       |
 1458|  16.2k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
 1459|       |
 1460|  16.2k|	PREP();
  ------------------
  |  | 1433|  16.2k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
  ------------------
 1461|       |
 1462|  16.2k|	unsigned char* savep = transposed;
 1463|       |
 1464|   258k|	for (size_t j = 0; j < vertex_count_aligned; j += 16)
  ------------------
  |  Branch (1464:21): [True: 241k, False: 16.2k]
  ------------------
 1465|   241k|	{
 1466|   241k|		LOAD(0);
  ------------------
  |  | 1434|   241k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1467|   241k|		LOAD(1);
  ------------------
  |  | 1434|   241k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1468|   241k|		LOAD(2);
  ------------------
  |  | 1434|   241k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1469|   241k|		LOAD(3);
  ------------------
  |  | 1434|   241k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1470|       |
 1471|   241k|		transpose8(r0, r1, r2, r3);
 1472|       |
 1473|   241k|		TEMP t0, t1, t2, t3;
  ------------------
  |  | 1432|   241k|#define TEMP __m128i
  ------------------
 1474|   241k|		TEMP npi = pi;
  ------------------
  |  | 1432|   241k|#define TEMP __m128i
  ------------------
 1475|       |
 1476|   241k|		UNZR(0);
  ------------------
  |  | 1458|   241k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [Folded, False: 241k]
  |  |  |  Branch (1458:58): [True: 241k, Folded]
  |  |  ------------------
  ------------------
 1477|   241k|		GRP4(0);
  ------------------
  |  | 1435|   241k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1478|   241k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
 1479|   241k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1480|       |
 1481|   241k|		UNZR(1);
  ------------------
  |  | 1458|   241k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [Folded, False: 241k]
  |  |  |  Branch (1458:58): [True: 241k, Folded]
  |  |  ------------------
  ------------------
 1482|   241k|		GRP4(1);
  ------------------
  |  | 1435|   241k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1483|   241k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
 1484|   241k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1485|       |
 1486|   241k|		UNZR(2);
  ------------------
  |  | 1458|   241k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [Folded, False: 241k]
  |  |  |  Branch (1458:58): [True: 241k, Folded]
  |  |  ------------------
  ------------------
 1487|   241k|		GRP4(2);
  ------------------
  |  | 1435|   241k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1488|   241k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
 1489|   241k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1490|       |
 1491|   241k|		UNZR(3);
  ------------------
  |  | 1458|   241k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [Folded, False: 241k]
  |  |  |  Branch (1458:58): [True: 241k, Folded]
  |  |  ------------------
  ------------------
 1492|   241k|		GRP4(3);
  ------------------
  |  | 1435|   241k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1493|   241k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   241k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 241k]
  |  |  |  Branch (1436:70): [True: 241k, Folded]
  |  |  ------------------
  ------------------
 1494|   241k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   241k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1495|       |
 1496|       |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
 1497|       |		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
 1498|       |		pi = rebase<Channel>(npi, r0, r1, r2, r3);
 1499|       |#else
 1500|   241k|		(void)npi;
 1501|   241k|#endif
 1502|       |
 1503|   241k|#undef UNZR
 1504|   241k|#undef TEMP
 1505|   241k|#undef PREP
 1506|   241k|#undef LOAD
 1507|   241k|#undef GRP4
 1508|   241k|#undef FIXD
 1509|   241k|#undef SAVE
 1510|   241k|	}
 1511|  16.2k|}
_ZN7meshopt10unzigzag16EDv2_x:
 1242|   967k|{
 1243|   967k|	__m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1)));
 1244|   967k|	__m128i xr = _mm_srli_epi16(v, 1);
 1245|       |
 1246|   967k|	return _mm_xor_si128(xl, xr);
 1247|   967k|}
vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi2EEEvPKhPhmmS3_i:
 1430|  20.2k|{
 1431|  20.2k|#if defined(SIMD_SSE) || defined(SIMD_AVX)
 1432|  20.2k|#define TEMP __m128i
 1433|  20.2k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
 1434|  20.2k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
 1435|  20.2k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
 1436|  20.2k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
 1437|  20.2k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
 1438|  20.2k|#endif
 1439|       |
 1440|       |#ifdef SIMD_NEON
 1441|       |#define TEMP uint8x8_t
 1442|       |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
 1443|       |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
 1444|       |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
 1445|       |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
 1446|       |#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
 1447|       |#endif
 1448|       |
 1449|       |#ifdef SIMD_WASM
 1450|       |#define TEMP v128_t
 1451|       |#define PREP() v128_t pi = wasm_v128_load(last_vertex)
 1452|       |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
 1453|       |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
 1454|       |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
 1455|       |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
 1456|       |#endif
 1457|       |
 1458|  20.2k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
 1459|       |
 1460|  20.2k|	PREP();
  ------------------
  |  | 1433|  20.2k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
  ------------------
 1461|       |
 1462|  20.2k|	unsigned char* savep = transposed;
 1463|       |
 1464|   330k|	for (size_t j = 0; j < vertex_count_aligned; j += 16)
  ------------------
  |  Branch (1464:21): [True: 309k, False: 20.2k]
  ------------------
 1465|   309k|	{
 1466|   309k|		LOAD(0);
  ------------------
  |  | 1434|   309k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1467|   309k|		LOAD(1);
  ------------------
  |  | 1434|   309k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1468|   309k|		LOAD(2);
  ------------------
  |  | 1434|   309k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1469|   309k|		LOAD(3);
  ------------------
  |  | 1434|   309k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1470|       |
 1471|   309k|		transpose8(r0, r1, r2, r3);
 1472|       |
 1473|   309k|		TEMP t0, t1, t2, t3;
  ------------------
  |  | 1432|   309k|#define TEMP __m128i
  ------------------
 1474|   309k|		TEMP npi = pi;
  ------------------
  |  | 1432|   309k|#define TEMP __m128i
  ------------------
 1475|       |
 1476|   309k|		UNZR(0);
  ------------------
  |  | 1458|   309k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [Folded, False: 309k]
  |  |  |  Branch (1458:58): [Folded, False: 309k]
  |  |  ------------------
  ------------------
 1477|   309k|		GRP4(0);
  ------------------
  |  | 1435|   309k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1478|   309k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
 1479|   309k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1480|       |
 1481|   309k|		UNZR(1);
  ------------------
  |  | 1458|   309k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [Folded, False: 309k]
  |  |  |  Branch (1458:58): [Folded, False: 309k]
  |  |  ------------------
  ------------------
 1482|   309k|		GRP4(1);
  ------------------
  |  | 1435|   309k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1483|   309k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
 1484|   309k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1485|       |
 1486|   309k|		UNZR(2);
  ------------------
  |  | 1458|   309k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [Folded, False: 309k]
  |  |  |  Branch (1458:58): [Folded, False: 309k]
  |  |  ------------------
  ------------------
 1487|   309k|		GRP4(2);
  ------------------
  |  | 1435|   309k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1488|   309k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
 1489|   309k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1490|       |
 1491|   309k|		UNZR(3);
  ------------------
  |  | 1458|   309k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1458:24): [Folded, False: 309k]
  |  |  |  Branch (1458:58): [Folded, False: 309k]
  |  |  ------------------
  ------------------
 1492|   309k|		GRP4(3);
  ------------------
  |  | 1435|   309k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1493|   309k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1436|   309k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1436:29): [Folded, False: 309k]
  |  |  |  Branch (1436:70): [Folded, False: 309k]
  |  |  ------------------
  ------------------
 1494|   309k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1437|   309k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1495|       |
 1496|       |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
 1497|       |		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
 1498|       |		pi = rebase<Channel>(npi, r0, r1, r2, r3);
 1499|       |#else
 1500|   309k|		(void)npi;
 1501|   309k|#endif
 1502|       |
 1503|   309k|#undef UNZR
 1504|   309k|#undef TEMP
 1505|   309k|#undef PREP
 1506|   309k|#undef LOAD
 1507|   309k|#undef GRP4
 1508|   309k|#undef FIXD
 1509|   309k|#undef SAVE
 1510|   309k|	}
 1511|  20.2k|}
_ZN7meshopt8rotate32EDv2_xi:
 1251|  1.23M|{
 1252|  1.23M|	return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r));
 1253|  1.23M|}

_Z11fuzzDecoderPKhmmPFiPvmmS0_mE:
    8|  16.5k|{
    9|  16.5k|	size_t count = 66; // must be divisible by 3 for decodeIndexBuffer; should be >=64 to cover large vertex blocks
   10|       |
   11|  16.5k|	void* destination = malloc(count * stride);
   12|  16.5k|	assert(destination);
  ------------------
  |  Branch (12:2): [True: 16.5k, False: 0]
  ------------------
   13|       |
   14|  16.5k|	int rc = decode(destination, count, stride, reinterpret_cast<const unsigned char*>(data), size);
   15|  16.5k|	(void)rc;
   16|       |
   17|  16.5k|	free(destination);
   18|  16.5k|}
_Z13fuzzRoundtripPKhmmi:
   21|  8.29k|{
   22|  8.29k|	size_t count = size / stride;
   23|       |
   24|  8.29k|	size_t bound = meshopt_encodeVertexBufferBound(count, stride);
   25|  8.29k|	void* encoded = malloc(bound);
   26|  8.29k|	void* decoded = malloc(count * stride);
   27|  8.29k|	assert(encoded && decoded);
  ------------------
  |  Branch (27:2): [True: 8.29k, False: 0]
  |  Branch (27:2): [True: 8.29k, False: 0]
  |  Branch (27:2): [True: 8.29k, False: 0]
  ------------------
   28|       |
   29|  8.29k|	size_t res = meshopt_encodeVertexBufferLevel(static_cast<unsigned char*>(encoded), bound, data, count, stride, level, -1);
   30|  8.29k|	assert(res > 0 && res <= bound);
  ------------------
  |  Branch (30:2): [True: 8.29k, False: 0]
  |  Branch (30:2): [True: 8.29k, False: 0]
  |  Branch (30:2): [True: 8.29k, False: 0]
  ------------------
   31|       |
   32|       |	// encode again at the boundary to check for memory safety
   33|       |	// this should produce the same output because encoder is deterministic
   34|  8.29k|	size_t rese = meshopt_encodeVertexBufferLevel(static_cast<unsigned char*>(encoded) + bound - res, res, data, count, stride, level, -1);
   35|  8.29k|	assert(rese == res);
  ------------------
  |  Branch (35:2): [True: 8.29k, False: 0]
  ------------------
   36|       |
   37|  8.29k|	int rc = meshopt_decodeVertexBuffer(decoded, count, stride, static_cast<unsigned char*>(encoded) + bound - res, res);
   38|  8.29k|	assert(rc == 0);
  ------------------
  |  Branch (38:2): [True: 8.29k, False: 0]
  ------------------
   39|       |
   40|  8.29k|	assert(memcmp(data, decoded, count * stride) == 0);
  ------------------
  |  Branch (40:2): [True: 8.29k, False: 0]
  ------------------
   41|       |
   42|  8.29k|	free(decoded);
   43|  8.29k|	free(encoded);
   44|  8.29k|}
_Z5alignmm:
   47|  12.3k|{
   48|  12.3k|	return (value + alignment - 1) & ~(alignment - 1);
   49|  12.3k|}
_Z17fuzzDecodeMeshletmmPKhm:
   52|  2.06k|{
   53|       |	// raw decoding: allowed to write align(count, 4) elements
   54|  2.06k|	unsigned int rt[256];
   55|  2.06k|	unsigned int rv[256];
   56|  2.06k|	meshopt_decodeMeshletRaw(rv + 256 - align(vertex_count, 4), vertex_count, rt + 256 - align(triangle_count, 4), triangle_count, data, size);
   57|       |
   58|       |	// regular decoding: allowed to write align(count * size, 4) bytes
   59|       |	// with variations for 3-byte triangles and 2-byte vertex references
   60|  2.06k|	unsigned short rsv[256];
   61|  2.06k|	unsigned char rbt[256 * 3];
   62|       |
   63|  2.06k|	meshopt_decodeMeshlet(rv + 256 - vertex_count, vertex_count, 4, rt + 256 - triangle_count, triangle_count, 4, data, size);
   64|  2.06k|	meshopt_decodeMeshlet(rsv + 256 - align(vertex_count, 2), vertex_count, 2, rt + 256 - triangle_count, triangle_count, 4, data, size);
   65|  2.06k|	meshopt_decodeMeshlet(rv + 256 - vertex_count, vertex_count, 4, rbt + 256 * 3 - align(triangle_count * 3, 4), triangle_count, 3, data, size);
   66|  2.06k|	meshopt_decodeMeshlet(rsv + 256 - align(vertex_count, 2), vertex_count, 2, rbt + 256 * 3 - align(triangle_count * 3, 4), triangle_count, 3, data, size);
   67|  2.06k|}
_Z20fuzzRoundtripMeshletPKhm:
   70|  2.07k|{
   71|  2.07k|	size_t triangle_count = size / 3;
   72|  2.07k|	if (triangle_count > 256)
  ------------------
  |  Branch (72:6): [True: 512, False: 1.56k]
  ------------------
   73|    512|		triangle_count = 256;
   74|       |
   75|  2.07k|	unsigned char buf[4096];
   76|  2.07k|	size_t enc = meshopt_encodeMeshlet(buf, sizeof(buf), NULL, 0, reinterpret_cast<const unsigned char*>(data), triangle_count);
   77|  2.07k|	assert(enc > 0);
  ------------------
  |  Branch (77:2): [True: 2.07k, False: 0]
  ------------------
   78|  2.07k|	assert(enc <= meshopt_encodeMeshletBound(0, triangle_count));
  ------------------
  |  Branch (78:2): [True: 2.07k, False: 0]
  ------------------
   79|       |
   80|  2.07k|	unsigned int rt4[256];
   81|  2.07k|	int rc4 = meshopt_decodeMeshlet(static_cast<unsigned int*>(NULL), 0, rt4, triangle_count, buf, enc);
   82|  2.07k|	assert(rc4 == 0);
  ------------------
  |  Branch (82:2): [True: 2.07k, False: 0]
  ------------------
   83|       |
   84|   175k|	for (size_t i = 0; i < triangle_count; ++i)
  ------------------
  |  Branch (84:21): [True: 173k, False: 2.07k]
  ------------------
   85|   173k|	{
   86|   173k|		unsigned char a = data[i * 3 + 0], b = data[i * 3 + 1], c = data[i * 3 + 2];
   87|       |
   88|   173k|		unsigned int abc = (a << 0) | (b << 8) | (c << 16);
   89|   173k|		unsigned int bca = (b << 0) | (c << 8) | (a << 16);
   90|   173k|		unsigned int cba = (c << 0) | (a << 8) | (b << 16);
   91|       |
   92|   173k|		unsigned int tri = rt4[i];
   93|       |
   94|   173k|		assert(tri == abc || tri == bca || tri == cba);
  ------------------
  |  Branch (94:3): [True: 106k, False: 66.9k]
  |  Branch (94:3): [True: 35.1k, False: 31.8k]
  |  Branch (94:3): [True: 31.8k, False: 0]
  |  Branch (94:3): [True: 173k, False: 0]
  ------------------
   95|   173k|	}
   96|       |
   97|  2.07k|	unsigned char rt3[256 * 3];
   98|  2.07k|	int rc3 = meshopt_decodeMeshlet(static_cast<unsigned int*>(NULL), 0, rt3, triangle_count, buf, enc);
   99|  2.07k|	assert(rc3 == 0);
  ------------------
  |  Branch (99:2): [True: 2.07k, False: 0]
  ------------------
  100|       |
  101|   175k|	for (size_t i = 0; i < triangle_count; ++i)
  ------------------
  |  Branch (101:21): [True: 173k, False: 2.07k]
  ------------------
  102|   173k|	{
  103|   173k|		unsigned char a = data[i * 3 + 0], b = data[i * 3 + 1], c = data[i * 3 + 2];
  104|       |
  105|   173k|		unsigned int abc = (a << 0) | (b << 8) | (c << 16);
  106|   173k|		unsigned int bca = (b << 0) | (c << 8) | (a << 16);
  107|   173k|		unsigned int cba = (c << 0) | (a << 8) | (b << 16);
  108|       |
  109|   173k|		unsigned int tri = rt3[i * 3 + 0] | (rt3[i * 3 + 1] << 8) | (rt3[i * 3 + 2] << 16);
  110|       |
  111|       |		assert(tri == abc || tri == bca || tri == cba);
  ------------------
  |  Branch (111:3): [True: 106k, False: 66.9k]
  |  Branch (111:3): [True: 35.1k, False: 31.8k]
  |  Branch (111:3): [True: 31.8k, False: 0]
  |  Branch (111:3): [True: 173k, False: 0]
  ------------------
  112|   173k|	}
  113|  2.07k|}
_Z21fuzzRoundtripMeshletVPKhm:
  116|  2.07k|{
  117|  2.07k|	size_t vertex_count = size / 4;
  118|  2.07k|	if (vertex_count > 256)
  ------------------
  |  Branch (118:6): [True: 460, False: 1.61k]
  ------------------
  119|    460|		vertex_count = 256;
  120|       |
  121|  2.07k|	unsigned char tri[4] = {0, 1, 2};
  122|       |
  123|  2.07k|	unsigned char buf[4096];
  124|  2.07k|	size_t enc = meshopt_encodeMeshlet(buf, sizeof(buf), reinterpret_cast<const uint32_t*>(data), vertex_count, tri, 1);
  125|  2.07k|	assert(enc > 0);
  ------------------
  |  Branch (125:2): [True: 2.07k, False: 0]
  ------------------
  126|  2.07k|	assert(enc <= meshopt_encodeMeshletBound(vertex_count, 1));
  ------------------
  |  Branch (126:2): [True: 2.07k, False: 0]
  ------------------
  127|       |
  128|  2.07k|	unsigned int rv4[256];
  129|  2.07k|	int rc4 = meshopt_decodeMeshlet(rv4, vertex_count, tri, 1, buf, enc);
  130|  2.07k|	assert(rc4 == 0);
  ------------------
  |  Branch (130:2): [True: 2.07k, False: 0]
  ------------------
  131|       |
  132|   163k|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (132:21): [True: 161k, False: 2.07k]
  ------------------
  133|   161k|		assert(rv4[i] == reinterpret_cast<const uint32_t*>(data)[i]);
  ------------------
  |  Branch (133:3): [True: 161k, False: 0]
  ------------------
  134|       |
  135|  2.07k|	unsigned short rv2[256];
  136|  2.07k|	int rc2 = meshopt_decodeMeshlet(rv2, vertex_count, tri, 1, buf, enc);
  137|  2.07k|	assert(rc2 == 0);
  ------------------
  |  Branch (137:2): [True: 2.07k, False: 0]
  ------------------
  138|       |
  139|   163k|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (139:21): [True: 161k, False: 2.07k]
  ------------------
  140|       |		assert(rv2[i] == uint16_t(reinterpret_cast<const uint32_t*>(data)[i]));
  ------------------
  |  Branch (140:3): [True: 161k, False: 0]
  ------------------
  141|  2.07k|}
LLVMFuzzerTestOneInput:
  144|  2.07k|{
  145|       |	// decodeIndexBuffer supports 2 and 4-byte indices
  146|  2.07k|	fuzzDecoder(data, size, 2, meshopt_decodeIndexBuffer);
  147|  2.07k|	fuzzDecoder(data, size, 4, meshopt_decodeIndexBuffer);
  148|       |
  149|       |	// decodeIndexSequence supports 2 and 4-byte indices
  150|  2.07k|	fuzzDecoder(data, size, 2, meshopt_decodeIndexSequence);
  151|  2.07k|	fuzzDecoder(data, size, 4, meshopt_decodeIndexSequence);
  152|       |
  153|       |	// decodeVertexBuffer supports any strides divisible by 4 in 4-256 interval
  154|       |	// it's a waste of time to check all of them, so we'll just check a few with different alignment mod 16
  155|  2.07k|	fuzzDecoder(data, size, 4, meshopt_decodeVertexBuffer);
  156|  2.07k|	fuzzDecoder(data, size, 16, meshopt_decodeVertexBuffer);
  157|  2.07k|	fuzzDecoder(data, size, 24, meshopt_decodeVertexBuffer);
  158|  2.07k|	fuzzDecoder(data, size, 32, meshopt_decodeVertexBuffer);
  159|       |
  160|       |	// encodeVertexBuffer/decodeVertexBuffer should roundtrip for any stride, check a few with different alignment mod 16
  161|       |	// this also checks memory safety properties of the encoder
  162|       |	// to conserve time, we only check one version/level combination, biased towards version 1
  163|  2.07k|	uint8_t data0 = size > 0 ? data[0] : 0;
  ------------------
  |  Branch (163:18): [True: 2.07k, False: 0]
  ------------------
  164|  2.07k|	int level = data0 % 5;
  165|       |
  166|  2.07k|	meshopt_encodeVertexVersion(level < 4 ? 1 : 0);
  ------------------
  |  Branch (166:30): [True: 1.58k, False: 486]
  ------------------
  167|       |
  168|  2.07k|	fuzzRoundtrip(data, size, 4, level);
  169|  2.07k|	fuzzRoundtrip(data, size, 16, level);
  170|  2.07k|	fuzzRoundtrip(data, size, 24, level);
  171|  2.07k|	fuzzRoundtrip(data, size, 32, level);
  172|       |
  173|       |	// validate that decodeMeshlet works on untrusted data and is memory safe within documented limits
  174|  2.07k|	if (size > 2)
  ------------------
  |  Branch (174:6): [True: 2.06k, False: 13]
  ------------------
  175|  2.06k|		fuzzDecodeMeshlet(data[0] + 1, data[1] + 1, reinterpret_cast<const unsigned char*>(data + 2), size - 2);
  176|       |
  177|       |	// validate that index data roundtrips in meshlet encoding modulo rotation
  178|  2.07k|	fuzzRoundtripMeshlet(data, size);
  179|       |
  180|       |	// validate that vertex data roundtrips in meshlet encoding
  181|  2.07k|	fuzzRoundtripMeshletV(data, size);
  182|       |
  183|  2.07k|	return 0;
  184|  2.07k|}