meshopt_decodeIndexBuffer:
  379|  4.51k|{
  380|  4.51k|	using namespace meshopt;
  381|       |
  382|  4.51k|	assert(index_count % 3 == 0);
  ------------------
  |  Branch (382:2): [True: 4.51k, False: 0]
  ------------------
  383|  4.51k|	assert(index_size == 2 || index_size == 4);
  ------------------
  |  Branch (383:2): [True: 2.25k, False: 2.25k]
  |  Branch (383:2): [True: 2.25k, False: 0]
  |  Branch (383:2): [True: 4.51k, False: 0]
  ------------------
  384|       |
  385|       |	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
  386|  4.51k|	if (buffer_size < 1 + index_count / 3 + 16)
  ------------------
  |  Branch (386:6): [True: 1.79k, False: 2.71k]
  ------------------
  387|  1.79k|		return -2;
  388|       |
  389|  2.71k|	if ((buffer[0] & 0xf0) != kIndexHeader)
  ------------------
  |  Branch (389:6): [True: 1.96k, False: 752]
  ------------------
  390|  1.96k|		return -1;
  391|       |
  392|    752|	int version = buffer[0] & 0x0f;
  393|    752|	if (version > kDecodeIndexVersion)
  ------------------
  |  Branch (393:6): [True: 80, False: 672]
  ------------------
  394|     80|		return -1;
  395|       |
  396|    672|	EdgeFifo edgefifo;
  397|    672|	memset(edgefifo, -1, sizeof(edgefifo));
  398|       |
  399|    672|	VertexFifo vertexfifo;
  400|    672|	memset(vertexfifo, -1, sizeof(vertexfifo));
  401|       |
  402|    672|	size_t edgefifooffset = 0;
  403|    672|	size_t vertexfifooffset = 0;
  404|       |
  405|    672|	unsigned int next = 0;
  406|    672|	unsigned int last = 0;
  407|       |
  408|    672|	int fecmax = version >= 1 ? 13 : 15;
  ------------------
  |  Branch (408:15): [True: 164, False: 508]
  ------------------
  409|       |
  410|       |	// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
  411|    672|	const unsigned char* code = buffer + 1;
  412|    672|	const unsigned char* data = code + index_count / 3;
  413|    672|	const unsigned char* data_safe_end = buffer + buffer_size - 16;
  414|       |
  415|    672|	const unsigned char* codeaux_table = data_safe_end;
  416|       |
  417|  11.8k|	for (size_t i = 0; i < index_count; i += 3)
  ------------------
  |  Branch (417:21): [True: 11.4k, False: 368]
  ------------------
  418|  11.4k|	{
  419|       |		// make sure we have enough data to read for a triangle
  420|       |		// each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
  421|       |		// after this we can be sure we can read without extra bounds checks
  422|  11.4k|		if (data > data_safe_end)
  ------------------
  |  Branch (422:7): [True: 304, False: 11.1k]
  ------------------
  423|    304|			return -2;
  424|       |
  425|  11.1k|		unsigned char codetri = *code++;
  426|       |
  427|  11.1k|		if (codetri < 0xf0)
  ------------------
  |  Branch (427:7): [True: 6.75k, False: 4.40k]
  ------------------
  428|  6.75k|		{
  429|  6.75k|			int fe = codetri >> 4;
  430|       |
  431|       |			// fifo reads are wrapped around 16 entry buffer
  432|  6.75k|			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
  433|  6.75k|			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
  434|  6.75k|			unsigned int c = 0;
  435|       |
  436|  6.75k|			int fec = codetri & 15;
  437|       |
  438|       |			// note: this is the most common path in the entire decoder
  439|       |			// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
  440|  6.75k|			if (fec < fecmax)
  ------------------
  |  Branch (440:8): [True: 5.60k, False: 1.15k]
  ------------------
  441|  5.60k|			{
  442|       |				// fifo reads are wrapped around 16 entry buffer
  443|  5.60k|				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
  444|  5.60k|				c = (fec == 0) ? next : cf;
  ------------------
  |  Branch (444:9): [True: 2.39k, False: 3.20k]
  ------------------
  445|       |
  446|  5.60k|				int fec0 = fec == 0;
  447|  5.60k|				next += fec0;
  448|       |
  449|       |				// push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  450|  5.60k|				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
  451|  5.60k|			}
  452|  1.15k|			else
  453|  1.15k|			{
  454|       |				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
  455|       |				// note that we need to update the last index since free indices are delta-encoded
  456|  1.15k|				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
  ------------------
  |  Branch (456:16): [True: 318, False: 834]
  ------------------
  457|       |
  458|       |				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  459|  1.15k|				pushVertexFifo(vertexfifo, c, vertexfifooffset);
  460|  1.15k|			}
  461|       |
  462|       |			// push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  463|  6.75k|			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
  464|  6.75k|			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
  465|       |
  466|       |			// output triangle
  467|  6.75k|			writeTriangle(destination, i, index_size, a, b, c);
  468|  6.75k|		}
  469|  4.40k|		else
  470|  4.40k|		{
  471|       |			// fast path: read codeaux from the table
  472|  4.40k|			if (codetri < 0xfe)
  ------------------
  |  Branch (472:8): [True: 1.36k, False: 3.04k]
  ------------------
  473|  1.36k|			{
  474|  1.36k|				unsigned char codeaux = codeaux_table[codetri & 15];
  475|       |
  476|       |				// note: table can't contain feb/fec=15
  477|  1.36k|				int feb = codeaux >> 4;
  478|  1.36k|				int fec = codeaux & 15;
  479|       |
  480|       |				// fifo reads are wrapped around 16 entry buffer
  481|       |				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
  482|  1.36k|				unsigned int a = next++;
  483|       |
  484|  1.36k|				unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15];
  485|  1.36k|				unsigned int b = (feb == 0) ? next : bf;
  ------------------
  |  Branch (485:22): [True: 492, False: 872]
  ------------------
  486|       |
  487|  1.36k|				int feb0 = feb == 0;
  488|  1.36k|				next += feb0;
  489|       |
  490|  1.36k|				unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
  491|  1.36k|				unsigned int c = (fec == 0) ? next : cf;
  ------------------
  |  Branch (491:22): [True: 440, False: 924]
  ------------------
  492|       |
  493|  1.36k|				int fec0 = fec == 0;
  494|  1.36k|				next += fec0;
  495|       |
  496|       |				// output triangle
  497|  1.36k|				writeTriangle(destination, i, index_size, a, b, c);
  498|       |
  499|       |				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  500|  1.36k|				pushVertexFifo(vertexfifo, a, vertexfifooffset);
  501|  1.36k|				pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
  502|  1.36k|				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
  503|       |
  504|  1.36k|				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
  505|  1.36k|				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
  506|  1.36k|				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
  507|  1.36k|			}
  508|  3.04k|			else
  509|  3.04k|			{
  510|       |				// slow path: read a full byte for codeaux instead of using a table lookup
  511|  3.04k|				unsigned char codeaux = *data++;
  512|       |
  513|  3.04k|				int fea = codetri == 0xfe ? 0 : 15;
  ------------------
  |  Branch (513:15): [True: 1.11k, False: 1.93k]
  ------------------
  514|  3.04k|				int feb = codeaux >> 4;
  515|  3.04k|				int fec = codeaux & 15;
  516|       |
  517|       |				// reset: codeaux is 0 but encoded as not-a-table
  518|  3.04k|				if (codeaux == 0)
  ------------------
  |  Branch (518:9): [True: 520, False: 2.52k]
  ------------------
  519|    520|					next = 0;
  520|       |
  521|       |				// fifo reads are wrapped around 16 entry buffer
  522|       |				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
  523|  3.04k|				unsigned int a = (fea == 0) ? next++ : 0;
  ------------------
  |  Branch (523:22): [True: 1.11k, False: 1.93k]
  ------------------
  524|  3.04k|				unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
  ------------------
  |  Branch (524:22): [True: 656, False: 2.38k]
  ------------------
  525|  3.04k|				unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];
  ------------------
  |  Branch (525:22): [True: 684, False: 2.36k]
  ------------------
  526|       |
  527|       |				// note that we need to update the last index since free indices are delta-encoded
  528|  3.04k|				if (fea == 15)
  ------------------
  |  Branch (528:9): [True: 1.93k, False: 1.11k]
  ------------------
  529|  1.93k|					last = a = decodeIndex(data, last);
  530|       |
  531|  3.04k|				if (feb == 15)
  ------------------
  |  Branch (531:9): [True: 970, False: 2.07k]
  ------------------
  532|    970|					last = b = decodeIndex(data, last);
  533|       |
  534|  3.04k|				if (fec == 15)
  ------------------
  |  Branch (534:9): [True: 936, False: 2.10k]
  ------------------
  535|    936|					last = c = decodeIndex(data, last);
  536|       |
  537|       |				// output triangle
  538|  3.04k|				writeTriangle(destination, i, index_size, a, b, c);
  539|       |
  540|       |				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
  541|  3.04k|				pushVertexFifo(vertexfifo, a, vertexfifooffset);
  542|  3.04k|				pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
  543|  3.04k|				pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));
  544|       |
  545|  3.04k|				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
  546|  3.04k|				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
  547|  3.04k|				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
  548|  3.04k|			}
  549|  4.40k|		}
  550|  11.1k|	}
  551|       |
  552|       |	// we should've read all data bytes and stopped at the boundary between data and codeaux table
  553|    368|	if (data != data_safe_end)
  ------------------
  |  Branch (553:6): [True: 348, False: 20]
  ------------------
  554|    348|		return -3;
  555|       |
  556|     20|	return 0;
  557|    368|}
meshopt_decodeIndexSequence:
  629|  4.51k|{
  630|  4.51k|	using namespace meshopt;
  631|       |
  632|       |	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
  633|  4.51k|	if (buffer_size < 1 + index_count + 4)
  ------------------
  |  Branch (633:6): [True: 2.39k, False: 2.11k]
  ------------------
  634|  2.39k|		return -2;
  635|       |
  636|  2.11k|	if ((buffer[0] & 0xf0) != kSequenceHeader)
  ------------------
  |  Branch (636:6): [True: 1.79k, False: 318]
  ------------------
  637|  1.79k|		return -1;
  638|       |
  639|    318|	int version = buffer[0] & 0x0f;
  640|    318|	if (version > kDecodeIndexVersion)
  ------------------
  |  Branch (640:6): [True: 20, False: 298]
  ------------------
  641|     20|		return -1;
  642|       |
  643|    298|	const unsigned char* data = buffer + 1;
  644|    298|	const unsigned char* data_safe_end = buffer + buffer_size - 4;
  645|       |
  646|    298|	unsigned int last[2] = {};
  647|       |
  648|  18.3k|	for (size_t i = 0; i < index_count; ++i)
  ------------------
  |  Branch (648:21): [True: 18.0k, False: 234]
  ------------------
  649|  18.0k|	{
  650|       |		// make sure we have enough data to read
  651|       |		// each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
  652|       |		// after this we can be sure we can read without extra bounds checks
  653|  18.0k|		if (data >= data_safe_end)
  ------------------
  |  Branch (653:7): [True: 64, False: 18.0k]
  ------------------
  654|     64|			return -2;
  655|       |
  656|  18.0k|		unsigned int v = decodeVByte(data);
  657|       |
  658|       |		// decode the index of the last baseline
  659|  18.0k|		unsigned int current = v & 1;
  660|  18.0k|		v >>= 1;
  661|       |
  662|       |		// reconstruct index as a delta
  663|  18.0k|		unsigned int d = (v >> 1) ^ -int(v & 1);
  664|  18.0k|		unsigned int index = last[current] + d;
  665|       |
  666|       |		// update last for the next iteration that uses it
  667|  18.0k|		last[current] = index;
  668|       |
  669|  18.0k|		if (index_size == 2)
  ------------------
  |  Branch (669:7): [True: 9.01k, False: 9.01k]
  ------------------
  670|  9.01k|		{
  671|  9.01k|			static_cast<unsigned short*>(destination)[i] = (unsigned short)(index);
  672|  9.01k|		}
  673|  9.01k|		else
  674|  9.01k|		{
  675|  9.01k|			static_cast<unsigned int*>(destination)[i] = index;
  676|  9.01k|		}
  677|  18.0k|	}
  678|       |
  679|       |	// we should've read all data bytes and stopped at the boundary between data and tail
  680|    234|	if (data != data_safe_end)
  ------------------
  |  Branch (680:6): [True: 232, False: 2]
  ------------------
  681|    232|		return -3;
  682|       |
  683|      2|	return 0;
  684|    234|}
indexcodec.cpp:_ZN7meshoptL14pushVertexFifoEPjjRmi:
   75|  19.9k|{
   76|  19.9k|	fifo[offset] = v;
   77|  19.9k|	offset = (offset + cond) & 15;
   78|  19.9k|}
indexcodec.cpp:_ZN7meshoptL12pushEdgeFifoEPA2_jjjRm:
   55|  26.7k|{
   56|  26.7k|	fifo[offset][0] = a;
   57|  26.7k|	fifo[offset][1] = b;
   58|  26.7k|	offset = (offset + 1) & 15;
   59|  26.7k|}
indexcodec.cpp:_ZN7meshoptL11decodeIndexERPKhj:
  125|  4.67k|{
  126|  4.67k|	unsigned int v = decodeVByte(data);
  127|  4.67k|	unsigned int d = (v >> 1) ^ -int(v & 1);
  128|       |
  129|  4.67k|	return last + d;
  130|  4.67k|}
indexcodec.cpp:_ZN7meshoptL13writeTriangleEPvmmjjj:
  142|  11.1k|{
  143|  11.1k|	if (index_size == 2)
  ------------------
  |  Branch (143:6): [True: 5.58k, False: 5.58k]
  ------------------
  144|  5.58k|	{
  145|  5.58k|		static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a);
  146|  5.58k|		static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b);
  147|  5.58k|		static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c);
  148|  5.58k|	}
  149|  5.58k|	else
  150|  5.58k|	{
  151|  5.58k|		static_cast<unsigned int*>(destination)[offset + 0] = a;
  152|  5.58k|		static_cast<unsigned int*>(destination)[offset + 1] = b;
  153|  5.58k|		static_cast<unsigned int*>(destination)[offset + 2] = c;
  154|  5.58k|	}
  155|  11.1k|}
indexcodec.cpp:_ZN7meshoptL11decodeVByteERPKh:
   91|  22.6k|{
   92|  22.6k|	unsigned char lead = *data++;
   93|       |
   94|       |	// fast path: single byte
   95|  22.6k|	if (lead < 128)
  ------------------
  |  Branch (95:6): [True: 13.5k, False: 9.19k]
  ------------------
   96|  13.5k|		return lead;
   97|       |
   98|       |	// slow path: up to 4 extra bytes
   99|       |	// note that this loop always terminates, which is important for malformed data
  100|  9.19k|	unsigned int result = lead & 127;
  101|  9.19k|	unsigned int shift = 7;
  102|       |
  103|  31.4k|	for (int i = 0; i < 4; ++i)
  ------------------
  |  Branch (103:18): [True: 26.8k, False: 4.66k]
  ------------------
  104|  26.8k|	{
  105|  26.8k|		unsigned char group = *data++;
  106|  26.8k|		result |= unsigned(group & 127) << shift;
  107|  26.8k|		shift += 7;
  108|       |
  109|  26.8k|		if (group < 128)
  ------------------
  |  Branch (109:7): [True: 4.53k, False: 22.2k]
  ------------------
  110|  4.53k|			break;
  111|  26.8k|	}
  112|       |
  113|  9.19k|	return result;
  114|  22.6k|}

meshopt_encodeMeshletBound:
  899|  4.51k|{
  900|  4.51k|	size_t codes_size = (max_triangles + 1) / 2;
  901|  4.51k|	size_t extra_size = max_triangles * 3;
  902|       |
  903|  4.51k|	size_t ctrl_size = (max_vertices + 3) / 4;
  904|  4.51k|	size_t data_size = (max_vertices + 3) / 4 * 16; // worst case: 16 bytes per vertex group
  905|       |
  906|  4.51k|	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
  ------------------
  |  Branch (906:20): [True: 2.82k, False: 1.68k]
  ------------------
  907|       |
  908|  4.51k|	return codes_size + extra_size + ctrl_size + data_size + gap_size;
  909|  4.51k|}
meshopt_encodeMeshlet:
  912|  4.51k|{
  913|  4.51k|	using namespace meshopt;
  914|       |
  915|  4.51k|	assert(triangle_count <= 256 && vertex_count <= 256);
  ------------------
  |  Branch (915:2): [True: 4.51k, False: 0]
  |  Branch (915:2): [True: 4.51k, False: 0]
  |  Branch (915:2): [True: 4.51k, False: 0]
  ------------------
  916|       |
  917|       |	// 4 bits per triangle + up to three bytes of extra data
  918|  4.51k|	unsigned char codes[256 / 2];
  919|  4.51k|	unsigned char extra[256 * 3];
  920|  4.51k|	size_t codes_size = (triangle_count + 1) / 2;
  921|  4.51k|	size_t extra_size = encodeTriangles(codes, extra, triangles, triangle_count);
  922|  4.51k|	assert(extra_size <= sizeof(extra));
  ------------------
  |  Branch (922:2): [True: 4.51k, False: 0]
  ------------------
  923|       |
  924|       |	// 2 bits per vertex + up to 4 bytes of actual data
  925|  4.51k|	unsigned char ctrl[256 / 4];
  926|  4.51k|	unsigned char data[256 * 4];
  927|  4.51k|	size_t ctrl_size = (vertex_count + 3) / 4;
  928|  4.51k|	size_t data_size = encodeVertices(ctrl, data, vertices, vertex_count);
  929|  4.51k|	assert(data_size <= sizeof(data));
  ------------------
  |  Branch (929:2): [True: 4.51k, False: 0]
  ------------------
  930|       |
  931|       |	// we need to ensure that up to 16 bytes after extra+data are available for SIMD decoding
  932|       |	// to minimize overhead, we place fixed-size codes+control at the end of the buffer
  933|  4.51k|	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
  ------------------
  |  Branch (933:20): [True: 2.82k, False: 1.68k]
  ------------------
  934|       |
  935|  4.51k|	size_t result = codes_size + extra_size + ctrl_size + data_size + gap_size;
  936|       |
  937|  4.51k|	if (result > buffer_size)
  ------------------
  |  Branch (937:6): [True: 0, False: 4.51k]
  ------------------
  938|      0|		return 0;
  939|       |
  940|       |	// variable-size data first
  941|  4.51k|	memcpy(buffer, data, data_size);
  942|  4.51k|	buffer += data_size;
  943|  4.51k|	memcpy(buffer, extra, extra_size);
  944|  4.51k|	buffer += extra_size;
  945|       |
  946|       |	// gap (for accelerated decoding) separates variable-size and fixed-size data
  947|  4.51k|	memset(buffer, 0, gap_size);
  948|  4.51k|	buffer += gap_size;
  949|       |
  950|       |	// fixed-size data last; it can be located from buffer end during decoding
  951|  4.51k|	memcpy(buffer, ctrl, ctrl_size);
  952|  4.51k|	buffer += ctrl_size;
  953|  4.51k|	memcpy(buffer, codes, codes_size);
  954|  4.51k|	buffer += codes_size;
  955|       |
  956|       |#if TRACE > 1
  957|       |	printf("extra:");
  958|       |	for (size_t i = 0; i < extra_size; ++i)
  959|       |		printf(" %d", extra[i]);
  960|       |	printf("\n");
  961|       |
  962|       |	unsigned int minv = ~0u;
  963|       |	for (size_t i = 0; i < vertex_count; ++i)
  964|       |		minv = minv < vertices[i] ? minv : vertices[i];
  965|       |
  966|       |	printf("vertices: [%d+]", minv);
  967|       |	for (size_t i = 0; i < vertex_count; ++i)
  968|       |		printf(" %d", vertices[i] - minv);
  969|       |	printf("\n");
  970|       |#endif
  971|       |
  972|       |#if TRACE
  973|       |	printf("stats: %d vertices, %d triangles => %d bytes (triangles: %d codes, %d extra; vertices: %d control, %d data; %d gap)\n",
  974|       |	    int(vertex_count), int(triangle_count), int(result),
  975|       |	    int(codes_size), int(extra_size), int(ctrl_size), int(data_size), int(gap_size));
  976|       |#endif
  977|       |
  978|  4.51k|	return result;
  979|  4.51k|}
meshopt_decodeMeshlet:
  982|  17.9k|{
  983|  17.9k|	using namespace meshopt;
  984|       |
  985|  17.9k|	assert(triangle_count <= 256 && vertex_count <= 256);
  ------------------
  |  Branch (985:2): [True: 17.9k, False: 0]
  |  Branch (985:2): [True: 17.9k, False: 0]
  |  Branch (985:2): [True: 17.9k, False: 0]
  ------------------
  986|  17.9k|	assert(vertex_size == 4 || vertex_size == 2);
  ------------------
  |  Branch (986:2): [True: 11.2k, False: 6.73k]
  |  Branch (986:2): [True: 6.73k, False: 0]
  |  Branch (986:2): [True: 17.9k, False: 0]
  ------------------
  987|  17.9k|	assert(triangle_size == 4 || triangle_size == 3);
  ------------------
  |  Branch (987:2): [True: 6.73k, False: 11.2k]
  |  Branch (987:2): [True: 11.2k, False: 0]
  |  Branch (987:2): [True: 17.9k, False: 0]
  ------------------
  988|       |
  989|       |	// layout must match encoding
  990|  17.9k|	size_t codes_size = (triangle_count + 1) / 2;
  991|  17.9k|	size_t ctrl_size = (vertex_count + 3) / 4;
  992|  17.9k|	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
  ------------------
  |  Branch (992:20): [True: 7.10k, False: 10.8k]
  ------------------
  993|       |
  994|  17.9k|	if (buffer_size < codes_size + ctrl_size + gap_size)
  ------------------
  |  Branch (994:6): [True: 4.29k, False: 13.6k]
  ------------------
  995|  4.29k|		return -2;
  996|       |
  997|  13.6k|	const unsigned char* end = buffer + buffer_size;
  998|  13.6k|	const unsigned char* codes = end - codes_size;
  999|  13.6k|	const unsigned char* ctrl = codes - ctrl_size;
 1000|  13.6k|	const unsigned char* data = buffer;
 1001|       |
 1002|       |	// gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely
 1003|  13.6k|	const unsigned char* bound = ctrl - gap_size;
 1004|  13.6k|	assert(bound >= buffer && bound + 16 <= buffer + buffer_size);
  ------------------
  |  Branch (1004:2): [True: 13.6k, False: 0]
  |  Branch (1004:2): [True: 13.6k, False: 0]
  |  Branch (1004:2): [True: 13.6k, False: 0]
  ------------------
 1005|       |
 1006|  13.6k|#if defined(SIMD_FALLBACK)
 1007|  13.6k|	return (gDecodeTablesInitialized ? decodeMeshletSimd<0> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
  ------------------
  |  Branch (1007:10): [True: 13.6k, False: 0]
  ------------------
 1008|       |#elif defined(SIMD_SSE) || defined(SIMD_NEON)
 1009|       |	return decodeMeshletSimd<0>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
 1010|       |#else
 1011|       |	return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, vertex_size, triangle_size);
 1012|       |#endif
 1013|  13.6k|}
meshopt_decodeMeshletRaw:
 1016|  2.24k|{
 1017|  2.24k|	using namespace meshopt;
 1018|       |
 1019|  2.24k|	assert(triangle_count <= 256 && vertex_count <= 256);
  ------------------
  |  Branch (1019:2): [True: 2.24k, False: 0]
  |  Branch (1019:2): [True: 2.24k, False: 0]
  |  Branch (1019:2): [True: 2.24k, False: 0]
  ------------------
 1020|       |
 1021|       |	// layout must match encoding
 1022|  2.24k|	size_t codes_size = (triangle_count + 1) / 2;
 1023|  2.24k|	size_t ctrl_size = (vertex_count + 3) / 4;
 1024|  2.24k|	size_t gap_size = (codes_size + ctrl_size < 16) ? 16 - (codes_size + ctrl_size) : 0;
  ------------------
  |  Branch (1024:20): [True: 363, False: 1.87k]
  ------------------
 1025|       |
 1026|  2.24k|	if (buffer_size < codes_size + ctrl_size + gap_size)
  ------------------
  |  Branch (1026:6): [True: 1.07k, False: 1.16k]
  ------------------
 1027|  1.07k|		return -2;
 1028|       |
 1029|  1.16k|	const unsigned char* end = buffer + buffer_size;
 1030|  1.16k|	const unsigned char* codes = end - codes_size;
 1031|  1.16k|	const unsigned char* ctrl = codes - ctrl_size;
 1032|  1.16k|	const unsigned char* data = buffer;
 1033|       |
 1034|       |	// gap ensures we have at least 16 bytes available after bound; this allows SIMD decoders to over-read safely
 1035|  1.16k|	const unsigned char* bound = ctrl - gap_size;
 1036|  1.16k|	assert(bound >= buffer && bound + 16 <= buffer + buffer_size);
  ------------------
  |  Branch (1036:2): [True: 1.16k, False: 0]
  |  Branch (1036:2): [True: 1.16k, False: 0]
  |  Branch (1036:2): [True: 1.16k, False: 0]
  ------------------
 1037|       |
 1038|  1.16k|#if defined(SIMD_FALLBACK)
 1039|  1.16k|	return (gDecodeTablesInitialized ? decodeMeshletSimd<1> : decodeMeshlet)(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
  ------------------
  |  Branch (1039:10): [True: 1.16k, False: 0]
  ------------------
 1040|       |#elif defined(SIMD_SSE) || defined(SIMD_NEON)
 1041|       |	return decodeMeshletSimd<1>(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
 1042|       |#else
 1043|       |	return decodeMeshlet(vertices, triangles, codes, ctrl, data, bound, vertex_count, triangle_count, 4, 4);
 1044|       |#endif
 1045|  1.16k|}
meshletcodec.cpp:_ZN7meshoptL17decodeBuildTablesEv:
  398|      2|{
  399|      2|#define NEXT(var, ec) \
  400|      2|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  401|      2|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  402|      2|	extra += (ec), nextoff += 1 - (ec)
  403|       |
  404|       |	// check for SSE4.1 support if we have a fallback path
  405|      2|#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
  406|      2|	int cpuinfo[4] = {};
  407|       |#ifdef _MSC_VER
  408|       |	__cpuid(cpuinfo, 1);
  409|       |#else
  410|      2|	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
  411|      2|#endif
  412|       |	// bit 19 = SSE4.1
  413|      2|	if ((cpuinfo[2] & (1 << 19)) == 0)
  ------------------
  |  Branch (413:6): [True: 0, False: 2]
  ------------------
  414|      0|		return false;
  415|      2|#endif
  416|       |
  417|       |	// fill triangle decoding tables for each combination of two triangle codes
  418|    514|	for (int code = 0; code < 256; ++code)
  ------------------
  |  Branch (418:21): [True: 512, False: 2]
  ------------------
  419|    512|	{
  420|    512|		unsigned char shuf[16] = {};
  421|    512|		unsigned char next[16] = {};
  422|    512|		int extra = 0;
  423|    512|		int nextoff = 0;
  424|       |
  425|       |		// state 0..5 will be refilled every iteration, so we ignore that
  426|       |		// state 6..8 will always contain the last decoded triangle because every triangle shifts fifo equally, so we can decode it independently
  427|    512|		shuf[6] = 12;
  428|    512|		shuf[7] = 13;
  429|    512|		shuf[8] = 14;
  430|       |
  431|       |		// state 15 will contain next (potentially incremented a few times)
  432|    512|		shuf[15] = 15;
  433|       |
  434|       |		// state 9..11 will contain the first decoded triangle (tri0), which can refer to extra/next and the original triangle history
  435|       |		// state 12..14 will contain the second decoded triangle (tri1); when decoding edge reuse, we need to handle edge 0/1 specially as it was just decoded earlier
  436|  1.53k|		for (int k = 0; k < 2; ++k)
  ------------------
  |  Branch (436:19): [True: 1.02k, False: 512]
  ------------------
  437|  1.02k|		{
  438|  1.02k|			int tri = (code >> (k * 4)) & 0xf;
  439|       |
  440|  1.02k|			if (tri < 12)
  ------------------
  |  Branch (440:8): [True: 768, False: 256]
  ------------------
  441|    768|			{
  442|    768|				if (k == 1 && tri / 4 == 0)
  ------------------
  |  Branch (442:9): [True: 384, False: 384]
  |  Branch (442:19): [True: 128, False: 256]
  ------------------
  443|    128|				{
  444|       |					// we need to decode one of two edges from the triangle we just decoded earlier
  445|       |					// for that we simply need to copy shuf/next values for the two decoded indices
  446|    128|					shuf[9 + k * 3] = shuf[9 + ((tri & 2) ? 2 : 0)];
  ------------------
  |  Branch (446:34): [True: 64, False: 64]
  ------------------
  447|    128|					next[9 + k * 3] = next[9 + ((tri & 2) ? 2 : 0)];
  ------------------
  |  Branch (447:34): [True: 64, False: 64]
  ------------------
  448|       |
  449|    128|					shuf[10 + k * 3] = shuf[9 + ((tri & 2) ? 1 : 2)];
  ------------------
  |  Branch (449:35): [True: 64, False: 64]
  ------------------
  450|    128|					next[10 + k * 3] = next[9 + ((tri & 2) ? 1 : 2)];
  ------------------
  |  Branch (450:35): [True: 64, False: 64]
  ------------------
  451|    128|				}
  452|    640|				else
  453|    640|				{
  454|       |					// reuse: edge comes from the history based on edge index
  455|       |					// note: we reuse with an offset because last triangle in the original history was consumed by tri0
  456|    640|					int trioff = 6 + k * 3 + (2 - tri / 4) * 3;
  457|       |
  458|       |					// edge cb or ac
  459|    640|					shuf[9 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 2 : 0));
  ------------------
  |  Branch (459:50): [True: 320, False: 320]
  ------------------
  460|    640|					shuf[10 + k * 3] = (unsigned char)(trioff + ((tri & 2) ? 1 : 2));
  ------------------
  |  Branch (460:51): [True: 320, False: 320]
  ------------------
  461|    640|				}
  462|       |
  463|       |				// third vertex is either next or comes from extra
  464|    768|				NEXT(11 + k * 3, tri & 1);
  ------------------
  |  |  400|    768|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  |  |  ------------------
  |  |  |  Branch (400:14): [True: 384, False: 384]
  |  |  ------------------
  |  |  401|    768|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  |  |  ------------------
  |  |  |  Branch (401:14): [True: 384, False: 384]
  |  |  ------------------
  |  |  402|    768|	extra += (ec), nextoff += 1 - (ec)
  ------------------
  465|    768|			}
  466|    256|			else
  467|    256|			{
  468|       |				// restart: three vertices, each comes from next or extra
  469|    256|				int fea = tri > 12;
  470|    256|				int feb = tri > 13;
  471|    256|				int fec = tri > 14;
  472|       |
  473|    256|				NEXT(9 + k * 3, fea);
  ------------------
  |  |  400|    256|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  |  |  ------------------
  |  |  |  Branch (400:14): [True: 192, False: 64]
  |  |  ------------------
  |  |  401|    256|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  |  |  ------------------
  |  |  |  Branch (401:14): [True: 192, False: 64]
  |  |  ------------------
  |  |  402|    256|	extra += (ec), nextoff += 1 - (ec)
  ------------------
  474|    256|				NEXT(10 + k * 3, feb);
  ------------------
  |  |  400|    256|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  |  |  ------------------
  |  |  |  Branch (400:14): [True: 128, False: 128]
  |  |  ------------------
  |  |  401|    256|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  |  |  ------------------
  |  |  |  Branch (401:14): [True: 128, False: 128]
  |  |  ------------------
  |  |  402|    256|	extra += (ec), nextoff += 1 - (ec)
  ------------------
  475|    256|				NEXT(11 + k * 3, fec);
  ------------------
  |  |  400|    256|	shuf[var] = (ec) ? (unsigned char)extra : 15; \
  |  |  ------------------
  |  |  |  Branch (400:14): [True: 64, False: 192]
  |  |  ------------------
  |  |  401|    256|	next[var] = (ec) ? 0 : (unsigned char)nextoff; \
  |  |  ------------------
  |  |  |  Branch (401:14): [True: 64, False: 192]
  |  |  ------------------
  |  |  402|    256|	extra += (ec), nextoff += 1 - (ec)
  ------------------
  476|    256|			}
  477|  1.02k|		}
  478|       |
  479|       |		// next needs to advance
  480|    512|		next[15] = (unsigned char)nextoff;
  481|       |
  482|       |		// next[0..8] = 0 trivially (never written to); next[9] must also be 0 because nextoff is 0 initially
  483|       |		// shuf[0..5] is not used, which allows us to pack next[10..15] + shuf[6..15] into a single 16-byte entry
  484|    512|		assert(next[9] == 0);
  ------------------
  |  Branch (484:3): [True: 512, False: 0]
  ------------------
  485|    512|		memcpy(&kDecodeTableMasks[code][0], &next[10], 6);
  486|    512|		memcpy(&kDecodeTableMasks[code][6], &shuf[6], 10);
  487|    512|		kDecodeTableExtra[code] = (unsigned char)extra;
  488|    512|	}
  489|       |
  490|       |	// fill vertex decoding tables for each combination of four vertex references
  491|    514|	for (unsigned int i = 0; i < 256; ++i)
  ------------------
  |  Branch (491:27): [True: 512, False: 2]
  ------------------
  492|    512|	{
  493|    512|		unsigned char shuf[16] = {};
  494|    512|		int offset = 0;
  495|       |
  496|  2.56k|		for (int k = 0; k < 4; ++k)
  ------------------
  |  Branch (496:19): [True: 2.04k, False: 512]
  ------------------
  497|  2.04k|		{
  498|  2.04k|			int code = ((i >> k) & 1) | ((i >> (k + 3)) & 2);
  499|  2.04k|			int length = i == 0xff ? 4 : code; // 0/1/2/3 bytes, or all 4 bytes if code==0xff
  ------------------
  |  Branch (499:17): [True: 8, False: 2.04k]
  ------------------
  500|       |
  501|  2.04k|			shuf[k * 4 + 0] = (length > 0) ? (unsigned char)(offset + 0) : 0x80;
  ------------------
  |  Branch (501:22): [True: 1.53k, False: 512]
  ------------------
  502|  2.04k|			shuf[k * 4 + 1] = (length > 1) ? (unsigned char)(offset + 1) : 0x80;
  ------------------
  |  Branch (502:22): [True: 1.02k, False: 1.02k]
  ------------------
  503|  2.04k|			shuf[k * 4 + 2] = (length > 2) ? (unsigned char)(offset + 2) : 0x80;
  ------------------
  |  Branch (503:22): [True: 512, False: 1.53k]
  ------------------
  504|  2.04k|			shuf[k * 4 + 3] = (length > 3) ? (unsigned char)(offset + 3) : 0x80;
  ------------------
  |  Branch (504:22): [True: 8, False: 2.04k]
  ------------------
  505|       |
  506|  2.04k|			offset += length;
  507|  2.04k|		}
  508|       |
  509|    512|		memcpy(kDecodeTableVerts[i], shuf, sizeof(shuf));
  510|    512|		kDecodeTableLength[i] = (unsigned char)offset;
  511|    512|	}
  512|       |
  513|      2|	return true;
  514|       |
  515|      2|#undef NEXT
  516|      2|}
meshletcodec.cpp:_ZN7meshoptL15encodeTrianglesEPhS0_PKhm:
  109|  4.51k|{
  110|  4.51k|	EdgeFifo8 edgefifo;
  111|  4.51k|	memset(edgefifo, -1, sizeof(edgefifo));
  112|       |
  113|  4.51k|	size_t edgefifooffset = 0;
  114|       |
  115|  4.51k|	unsigned int next = 0;
  116|       |
  117|       |	// 4-bit triangle codes give us 16 options that we use as follows:
  118|       |	// 3*2 edge reuse (2 edges * 3 last triangles) * 2 next/explicit = 12 options
  119|       |	// 4 remaining options = next bits; 000, 001, 011, 111.
  120|       |	// triangles are rotated to make next bits line up.
  121|  4.51k|	memset(codes, 0, (triangle_count + 1) / 2);
  122|       |
  123|  4.51k|	static const int rotations[] = {0, 1, 2, 0, 1};
  124|       |
  125|  4.51k|	unsigned char* start = extra;
  126|       |
  127|   188k|	for (size_t i = 0; i < triangle_count; ++i)
  ------------------
  |  Branch (127:21): [True: 184k, False: 4.51k]
  ------------------
  128|   184k|	{
  129|       |#if TRACE > 1
  130|       |		unsigned int last = next;
  131|       |#endif
  132|       |
  133|   184k|		int fer = getEdgeFifo8(edgefifo, triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2], edgefifooffset);
  134|       |
  135|   184k|		if (fer >= 0 && (fer >> 2) < 6)
  ------------------
  |  Branch (135:7): [True: 95.4k, False: 88.6k]
  |  Branch (135:19): [True: 92.0k, False: 3.41k]
  ------------------
  136|  92.0k|		{
  137|       |			// note: getEdgeFifo8 implicitly rotates triangles by matching a/b to existing edge
  138|  92.0k|			const int* order = rotations + (fer & 3);
  139|       |
  140|  92.0k|			unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]];
  141|       |
  142|  92.0k|			int fec = (c == next) ? (next++, 0) : 1;
  ------------------
  |  Branch (142:14): [True: 830, False: 91.1k]
  ------------------
  143|       |
  144|       |#if TRACE > 1
  145|       |			printf("%3d+ | %3d %3d %3d | edge: e%d c%d\n", last, a, b, c, fer >> 2, fec);
  146|       |#endif
  147|       |
  148|  92.0k|			unsigned int code = (fer >> 2) * 2 + fec;
  149|       |
  150|  92.0k|			codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4));
  151|       |
  152|  92.0k|			if (fec)
  ------------------
  |  Branch (152:8): [True: 91.1k, False: 830]
  ------------------
  153|  91.1k|				*extra++ = (unsigned char)c;
  154|       |
  155|  92.0k|			pushEdgeFifo8(edgefifo, c, b, edgefifooffset);
  156|  92.0k|			pushEdgeFifo8(edgefifo, a, c, edgefifooffset);
  157|  92.0k|		}
  158|  92.0k|		else
  159|  92.0k|		{
  160|       |			// rotate triangles to minimize the need for extra vertices
  161|  92.0k|			int rotation = rotateTriangle(triangles[i * 3 + 0], triangles[i * 3 + 1], triangles[i * 3 + 2]);
  162|  92.0k|			const int* order = rotations + rotation;
  163|       |
  164|  92.0k|			unsigned int a = triangles[i * 3 + order[0]], b = triangles[i * 3 + order[1]], c = triangles[i * 3 + order[2]];
  165|       |
  166|       |			// fe must be continuous: once a vertex is encoded with next, further vertices must also be encoded with next
  167|  92.0k|			int fea = (a == next && b == next + 1 && c == next + 2) ? (next++, 0) : 1;
  ------------------
  |  Branch (167:15): [True: 8.73k, False: 83.2k]
  |  Branch (167:28): [True: 3.80k, False: 4.93k]
  |  Branch (167:45): [True: 3.32k, False: 478]
  ------------------
  168|  92.0k|			int feb = (b == next && c == next + 1) ? (next++, 0) : 1;
  ------------------
  |  Branch (168:15): [True: 8.77k, False: 83.2k]
  |  Branch (168:28): [True: 3.58k, False: 5.18k]
  ------------------
  169|  92.0k|			int fec = (c == next) ? (next++, 0) : 1;
  ------------------
  |  Branch (169:14): [True: 4.76k, False: 87.2k]
  ------------------
  170|       |
  171|  92.0k|			assert(fea == 1 || feb == 0);
  ------------------
  |  Branch (171:4): [True: 88.7k, False: 3.32k]
  |  Branch (171:4): [True: 3.32k, False: 0]
  |  Branch (171:4): [True: 92.0k, False: 0]
  ------------------
  172|  92.0k|			assert(feb == 1 || fec == 0);
  ------------------
  |  Branch (172:4): [True: 88.4k, False: 3.58k]
  |  Branch (172:4): [True: 3.58k, False: 0]
  |  Branch (172:4): [True: 92.0k, False: 0]
  ------------------
  173|       |
  174|       |#if TRACE > 1
  175|       |			printf("%3d+ | %3d %3d %3d | restart: %d%d%d\n", last, a, b, c, fea, feb, fec);
  176|       |#endif
  177|       |
  178|  92.0k|			unsigned int code = 12 + (fea + feb + fec);
  179|       |
  180|  92.0k|			codes[i / 2] |= (unsigned char)(code << ((i & 1) * 4));
  181|       |
  182|  92.0k|			if (fea)
  ------------------
  |  Branch (182:8): [True: 88.7k, False: 3.32k]
  ------------------
  183|  88.7k|				*extra++ = (unsigned char)a;
  184|  92.0k|			if (feb)
  ------------------
  |  Branch (184:8): [True: 88.4k, False: 3.58k]
  ------------------
  185|  88.4k|				*extra++ = (unsigned char)b;
  186|  92.0k|			if (fec)
  ------------------
  |  Branch (186:8): [True: 87.2k, False: 4.76k]
  ------------------
  187|  87.2k|				*extra++ = (unsigned char)c;
  188|       |
  189|  92.0k|			pushEdgeFifo8(edgefifo, c, b, edgefifooffset);
  190|  92.0k|			pushEdgeFifo8(edgefifo, a, c, edgefifooffset);
  191|  92.0k|		}
  192|   184k|	}
  193|       |
  194|  4.51k|	return extra - start;
  195|  4.51k|}
meshletcodec.cpp:_ZN7meshoptL12getEdgeFifo8EPA2_jjjjm:
   82|   184k|{
   83|   957k|	for (int i = 0; i < 8; ++i)
  ------------------
  |  Branch (83:18): [True: 869k, False: 88.6k]
  ------------------
   84|   869k|	{
   85|   869k|		size_t index = (offset - 1 - i) & 7;
   86|       |
   87|   869k|		unsigned int e0 = fifo[index][0];
   88|   869k|		unsigned int e1 = fifo[index][1];
   89|       |
   90|   869k|		if (e0 == a && e1 == b)
  ------------------
  |  Branch (90:7): [True: 144k, False: 725k]
  |  Branch (90:18): [True: 78.4k, False: 65.8k]
  ------------------
   91|  78.4k|			return (i << 2) | 0;
   92|   790k|		if (e0 == b && e1 == c)
  ------------------
  |  Branch (92:7): [True: 55.5k, False: 735k]
  |  Branch (92:18): [True: 9.38k, False: 46.1k]
  ------------------
   93|  9.38k|			return (i << 2) | 1;
   94|   781k|		if (e0 == c && e1 == a)
  ------------------
  |  Branch (94:7): [True: 54.4k, False: 727k]
  |  Branch (94:18): [True: 7.59k, False: 46.8k]
  ------------------
   95|  7.59k|			return (i << 2) | 2;
   96|   781k|	}
   97|       |
   98|  88.6k|	return -1;
   99|   184k|}
meshletcodec.cpp:_ZN7meshoptL13pushEdgeFifo8EPA2_jjjRm:
  102|   368k|{
  103|   368k|	fifo[offset][0] = a;
  104|   368k|	fifo[offset][1] = b;
  105|   368k|	offset = (offset + 1) & 7;
  106|   368k|}
meshletcodec.cpp:_ZN7meshoptL14rotateTriangleEjjj:
   77|  92.0k|{
   78|  92.0k|	return (a > b && a > c) ? 1 : (b > c ? 2 : 0);
  ------------------
  |  Branch (78:10): [True: 38.3k, False: 53.6k]
  |  Branch (78:19): [True: 26.6k, False: 11.6k]
  |  Branch (78:33): [True: 25.2k, False: 40.1k]
  ------------------
   79|  92.0k|}
meshletcodec.cpp:_ZN7meshoptL14encodeVerticesEPhS0_PKjm:
  198|  4.51k|{
  199|       |	// grouped varint, 2 bit per value to indicate 0/1/2/3 byte deltas, with per-group 4-byte fallback
  200|  4.51k|	memset(ctrl, 0, (vertex_count + 3) / 4);
  201|       |
  202|  4.51k|	unsigned char* start = data;
  203|       |
  204|  4.51k|	unsigned int last = ~0u;
  205|       |
  206|  47.2k|	for (size_t i = 0; i < vertex_count; i += 4)
  ------------------
  |  Branch (206:21): [True: 42.7k, False: 4.51k]
  ------------------
  207|  42.7k|	{
  208|  42.7k|		unsigned int gv[4] = {};
  209|       |
  210|   211k|		for (int k = 0; k < 4 && i + k < vertex_count; ++k)
  ------------------
  |  Branch (210:19): [True: 169k, False: 41.5k]
  |  Branch (210:28): [True: 168k, False: 1.19k]
  ------------------
  211|   168k|		{
  212|   168k|			unsigned int d = vertices[i + k] - last - 1;
  213|   168k|			unsigned int v = (d << 1) ^ (int(d) >> 31);
  214|       |
  215|   168k|			gv[k] = v;
  216|   168k|			last = vertices[i + k];
  217|   168k|		}
  218|       |
  219|       |		// if any value needs 4 bytes, or if *all* values need 3 bytes, we use 4 bytes for all values
  220|       |		// this allows us to encode most 3-byte deltas with 3 bytes which saves space overall
  221|  42.7k|		bool use4 = (gv[0] | gv[1] | gv[2] | gv[3]) > 0xffffff || (gv[0] > 0xffff && gv[1] > 0xffff && gv[2] > 0xffff && gv[3] > 0xffff);
  ------------------
  |  Branch (221:15): [True: 28.7k, False: 13.9k]
  |  Branch (221:62): [True: 1.34k, False: 12.6k]
  |  Branch (221:80): [True: 800, False: 542]
  |  Branch (221:98): [True: 390, False: 410]
  |  Branch (221:116): [True: 251, False: 139]
  ------------------
  222|       |
  223|   213k|		for (int k = 0; k < 4; ++k)
  ------------------
  |  Branch (223:19): [True: 170k, False: 42.7k]
  ------------------
  224|   170k|		{
  225|   170k|			unsigned int v = gv[k];
  226|       |
  227|       |			// 0/1/2/3 bytes per value, or all 4 values use 4 bytes
  228|   170k|			int code = use4 ? 3 : (v == 0 ? 0 : (v < 256 ? 1 : (v < 65536 ? 2 : 3)));
  ------------------
  |  Branch (228:15): [True: 116k, False: 54.7k]
  |  Branch (228:27): [True: 1.58k, False: 53.2k]
  |  Branch (228:41): [True: 47.6k, False: 5.52k]
  |  Branch (228:56): [True: 2.44k, False: 3.07k]
  ------------------
  229|       |
  230|   170k|			if (code > 0)
  ------------------
  |  Branch (230:8): [True: 169k, False: 1.58k]
  ------------------
  231|   169k|				*data++ = (unsigned char)(v & 0xff);
  232|   170k|			if (code > 1)
  ------------------
  |  Branch (232:8): [True: 121k, False: 49.2k]
  ------------------
  233|   121k|				*data++ = (unsigned char)((v >> 8) & 0xff);
  234|   170k|			if (code > 2)
  ------------------
  |  Branch (234:8): [True: 119k, False: 51.7k]
  ------------------
  235|   119k|				*data++ = (unsigned char)((v >> 16) & 0xff);
  236|   170k|			if (use4)
  ------------------
  |  Branch (236:8): [True: 116k, False: 54.7k]
  ------------------
  237|   116k|				*data++ = (unsigned char)((v >> 24) & 0xff);
  238|       |
  239|       |			// split low and high bits into two nibbles for better packing
  240|   170k|			ctrl[i / 4] |= ((code & 1) << k) | ((code >> 1) << (k + 4));
  241|   170k|		}
  242|  42.7k|	}
  243|       |
  244|  4.51k|	return data - start;
  245|  4.51k|}
meshletcodec.cpp:_ZN7meshoptL17decodeMeshletSimdILi0EEEiPvS1_PKhS3_S3_S3_mmmm:
  865|  13.6k|{
  866|  13.6k|	assert(gDecodeTablesInitialized);
  ------------------
  |  Branch (866:2): [True: 13.6k, False: 0]
  ------------------
  867|  13.6k|	(void)gDecodeTablesInitialized;
  868|       |
  869|  13.6k|#ifdef __clang__
  870|       |	// data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null
  871|  13.6k|	__builtin_assume(data);
  872|  13.6k|#endif
  873|       |
  874|       |	// decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4)
  875|       |	// raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0
  876|  13.6k|	if (vertex_size == 4 || Raw)
  ------------------
  |  Branch (876:6): [True: 9.10k, False: 4.59k]
  |  Branch (876:26): [Folded, False: 0]
  ------------------
  877|  9.10k|		data = decodeVerticesSimd(static_cast<unsigned int*>(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count);
  ------------------
  |  Branch (877:86): [Folded, False: 9.10k]
  ------------------
  878|  4.59k|	else
  879|  4.59k|		data = decodeVerticesSimd(static_cast<unsigned short*>(vertices), ctrl, data, bound, vertex_count);
  880|  13.6k|	if (!data)
  ------------------
  |  Branch (880:6): [True: 1.34k, False: 12.3k]
  ------------------
  881|  1.34k|		return -2;
  882|       |
  883|       |	// decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4)
  884|       |	// raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0
  885|  12.3k|	if (triangle_size == 4 || Raw)
  ------------------
  |  Branch (885:6): [True: 3.91k, False: 8.42k]
  |  Branch (885:28): [Folded, False: 0]
  ------------------
  886|  3.91k|		data = decodeTrianglesSimd(static_cast<unsigned int*>(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count);
  ------------------
  |  Branch (886:89): [Folded, False: 3.91k]
  ------------------
  887|  8.42k|	else
  888|  8.42k|		data = decodeTrianglesSimd(static_cast<unsigned char*>(triangles), codes, data, bound, triangle_count);
  889|  12.3k|	if (!data)
  ------------------
  |  Branch (889:6): [True: 278, False: 12.0k]
  ------------------
  890|    278|		return -2;
  891|       |
  892|  12.0k|	return (data == bound) ? 0 : -3;
  ------------------
  |  Branch (892:9): [True: 9.06k, False: 2.99k]
  ------------------
  893|  12.3k|}
meshletcodec.cpp:_ZN7meshoptL18decodeVerticesSimdEPjPKhS2_S2_m:
  750|  10.2k|{
  751|  10.2k|#if defined(SIMD_SSE)
  752|  10.2k|	__m128i last = _mm_set1_epi32(-1);
  753|       |#elif defined(SIMD_NEON)
  754|       |	uint32x4_t last = vdupq_n_u32(~0u);
  755|       |#endif
  756|       |
  757|  10.2k|	size_t groups = vertex_count / 4;
  758|       |
  759|       |	// process all complete groups
  760|   141k|	for (size_t i = 0; i < groups; ++i)
  ------------------
  |  Branch (760:21): [True: 131k, False: 9.27k]
  ------------------
  761|   131k|	{
  762|   131k|		unsigned char code = *ctrl++;
  763|   131k|		if (data > bound)
  ------------------
  |  Branch (763:7): [True: 997, False: 130k]
  ------------------
  764|    997|			return NULL;
  765|       |
  766|   130k|		last = decodeVertexGroup(last, code, data);
  767|       |
  768|   130k|#if defined(SIMD_SSE)
  769|   130k|		_mm_storeu_si128(reinterpret_cast<__m128i*>(&vertices[i * 4]), last);
  770|       |#elif defined(SIMD_NEON)
  771|       |		vst1q_u32(&vertices[i * 4], last);
  772|       |#endif
  773|   130k|	}
  774|       |
  775|       |	// process a 1-3 vertex tail; to maintain the memory safety guarantee we have to write individual elements
  776|  9.27k|	if (vertex_count & 3)
  ------------------
  |  Branch (776:6): [True: 2.66k, False: 6.61k]
  ------------------
  777|  2.66k|	{
  778|  2.66k|		unsigned char code = *ctrl++;
  779|       |
  780|  2.66k|		if (data > bound)
  ------------------
  |  Branch (780:7): [True: 14, False: 2.64k]
  ------------------
  781|     14|			return NULL;
  782|       |
  783|  2.64k|		last = decodeVertexGroup(last, code, data);
  784|       |
  785|  2.64k|		unsigned int* tail = &vertices[vertex_count & ~3u];
  786|       |
  787|  2.64k|#if defined(SIMD_SSE)
  788|  2.64k|		tail[0] = _mm_cvtsi128_si32(last);
  789|  2.64k|		if ((vertex_count & 3) > 1)
  ------------------
  |  Branch (789:7): [True: 1.19k, False: 1.45k]
  ------------------
  790|  1.19k|			tail[1] = _mm_extract_epi32(last, 1);
  791|  2.64k|		if ((vertex_count & 3) > 2)
  ------------------
  |  Branch (791:7): [True: 418, False: 2.22k]
  ------------------
  792|    418|			tail[2] = _mm_extract_epi32(last, 2);
  793|       |#elif defined(SIMD_NEON)
  794|       |		vst1q_lane_u32(&tail[0], last, 0);
  795|       |		if ((vertex_count & 3) > 1)
  796|       |			vst1q_lane_u32(&tail[1], last, 1);
  797|       |		if ((vertex_count & 3) > 2)
  798|       |			vst1q_lane_u32(&tail[2], last, 2);
  799|       |#endif
  800|  2.64k|	}
  801|       |
  802|  9.25k|	return data;
  803|  9.27k|}
_ZN7meshopt17decodeVertexGroupEDv2_xhRPKh:
  540|   236k|{
  541|   236k|	__m128i word = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
  542|   236k|	__m128i shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeTableVerts[code]));
  543|       |
  544|   236k|	__m128i v = _mm_shuffle_epi8(word, shuf);
  545|       |
  546|       |	// unzigzag+1
  547|   236k|	__m128i xl = _mm_sub_epi32(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi32(1)));
  548|   236k|	__m128i xr = _mm_srli_epi32(v, 1);
  549|   236k|	__m128i x = _mm_add_epi32(_mm_xor_si128(xl, xr), _mm_set1_epi32(1));
  550|       |
  551|       |	// prefix sum
  552|   236k|	x = _mm_add_epi32(x, _mm_slli_si128(x, 8));
  553|   236k|	x = _mm_add_epi32(x, _mm_slli_si128(x, 4));
  554|   236k|	x = _mm_add_epi32(x, _mm_shuffle_epi32(last, 0xff));
  555|       |
  556|   236k|	data += kDecodeTableLength[code];
  557|       |
  558|   236k|	return x;
  559|   236k|}
meshletcodec.cpp:_ZN7meshoptL18decodeVerticesSimdEPtPKhS2_S2_m:
  807|  4.59k|{
  808|  4.59k|#if defined(SIMD_SSE)
  809|  4.59k|	__m128i repack = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0);
  810|  4.59k|	__m128i last = _mm_set1_epi32(-1);
  811|       |#elif defined(SIMD_NEON)
  812|       |	uint32x4_t last = vdupq_n_u32(~0u);
  813|       |#endif
  814|       |
  815|       |	// because the output buffer is guaranteed to have 32-bit aligned size available, we can simplify tail processing
  816|       |	// if the number of vertices mod 4 is 3, we'd normally need to write 8+6 bytes, but we can instead overwrite up to 2 bytes in the main loop
  817|  4.59k|	size_t groups = (vertex_count + 1) / 4;
  818|       |
  819|       |	// process all complete groups
  820|   105k|	for (size_t i = 0; i < groups; ++i)
  ------------------
  |  Branch (820:21): [True: 101k, False: 3.93k]
  ------------------
  821|   101k|	{
  822|   101k|		unsigned char code = *ctrl++;
  823|       |
  824|   101k|		if (data > bound)
  ------------------
  |  Branch (824:7): [True: 660, False: 100k]
  ------------------
  825|    660|			return NULL;
  826|       |
  827|   100k|		last = decodeVertexGroup(last, code, data);
  828|       |
  829|   100k|#if defined(SIMD_SSE)
  830|   100k|		__m128i r = _mm_shuffle_epi8(last, repack);
  831|   100k|		_mm_storel_epi64(reinterpret_cast<__m128i*>(&vertices[i * 4]), r);
  832|       |#elif defined(SIMD_NEON)
  833|       |		uint16x4_t r = vmovn_u32(last);
  834|       |		vst1_u16(&vertices[i * 4], r);
  835|       |#endif
  836|   100k|	}
  837|       |
  838|       |	// process a 1-2 vertex tail; to maintain the memory safety guarantee we have to write a 32-bit element
  839|  3.93k|	if (groups * 4 < vertex_count)
  ------------------
  |  Branch (839:6): [True: 2.24k, False: 1.68k]
  ------------------
  840|  2.24k|	{
  841|  2.24k|		unsigned char code = *ctrl++;
  842|       |
  843|  2.24k|		if (data > bound)
  ------------------
  |  Branch (843:7): [True: 14, False: 2.22k]
  ------------------
  844|     14|			return NULL;
  845|       |
  846|  2.22k|		last = decodeVertexGroup(last, code, data);
  847|       |
  848|  2.22k|		unsigned short* tail = &vertices[vertex_count & ~3u];
  849|       |
  850|  2.22k|#if defined(SIMD_SSE)
  851|  2.22k|		__m128i r = _mm_shufflelo_epi16(last, 8);
  852|  2.22k|		*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
  853|       |#elif defined(SIMD_NEON)
  854|       |		uint16x4_t r = vmovn_u32(last);
  855|       |		vst1_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpret_u32_u16(r), 0);
  856|       |#endif
  857|  2.22k|	}
  858|       |
  859|  3.91k|	return data;
  860|  3.93k|}
meshletcodec.cpp:_ZN7meshoptL19decodeTrianglesSimdEPjPKhS2_S2_m:
  615|  4.74k|{
  616|  4.74k|#if defined(SIMD_SSE)
  617|  4.74k|	__m128i repack = _mm_setr_epi8(9, 10, 11, -1, 12, 13, 14, -1, 0, 0, 0, 0, 0, 0, 0, 0);
  618|  4.74k|	__m128i state = _mm_setzero_si128();
  619|       |#elif defined(SIMD_NEON)
  620|       |	uint8x8_t repack = vcreate_u8(0xff0e0d0cff0b0a09ull);
  621|       |	uint8x16_t state = vdupq_n_u8(0);
  622|       |#endif
  623|       |
  624|  4.74k|	size_t groups = triangle_count / 2;
  625|       |
  626|       |	// process all complete groups
  627|   202k|	for (size_t i = 0; i < groups; ++i)
  ------------------
  |  Branch (627:21): [True: 198k, False: 4.57k]
  ------------------
  628|   198k|	{
  629|   198k|		unsigned char code = *codes++;
  630|       |
  631|   198k|		if (extra > bound)
  ------------------
  |  Branch (631:7): [True: 173, False: 198k]
  ------------------
  632|    173|			return NULL;
  633|       |
  634|   198k|		state = decodeTriangleGroup(state, code, extra);
  635|       |
  636|       |		// write 6 bytes of new triangle data into output, formatted as 8 bytes with 0 padding
  637|   198k|#if defined(SIMD_SSE)
  638|   198k|		__m128i r = _mm_shuffle_epi8(state, repack);
  639|   198k|		_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 2]), r);
  640|       |#elif defined(SIMD_NEON)
  641|       |		uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
  642|       |		vst1_u32(&triangles[i * 2], r);
  643|       |#endif
  644|   198k|	}
  645|       |
  646|       |	// process a 1 triangle tail; to maintain the memory safety guarantee we have to write a 32-bit element
  647|  4.57k|	if (triangle_count & 1)
  ------------------
  |  Branch (647:6): [True: 1.76k, False: 2.81k]
  ------------------
  648|  1.76k|	{
  649|  1.76k|		unsigned char code = *codes++;
  650|       |
  651|  1.76k|		if (extra > bound)
  ------------------
  |  Branch (651:7): [True: 40, False: 1.72k]
  ------------------
  652|     40|			return NULL;
  653|       |
  654|  1.72k|		state = decodeTriangleGroup(state, code, extra);
  655|       |
  656|  1.72k|		unsigned int* tail = &triangles[triangle_count & ~1u];
  657|       |
  658|  1.72k|#if defined(SIMD_SSE)
  659|  1.72k|		__m128i r = _mm_shuffle_epi8(state, repack);
  660|  1.72k|		*tail = unsigned(_mm_cvtsi128_si32(r));
  661|       |#elif defined(SIMD_NEON)
  662|       |		uint32x2_t r = vreinterpret_u32_u8(vqtbl1_u8(state, repack));
  663|       |		vst1_lane_u32(tail, r, 0);
  664|       |#endif
  665|  1.72k|	}
  666|       |
  667|  4.53k|	return extra;
  668|  4.57k|}
_ZN7meshopt19decodeTriangleGroupEDv2_xhRPKh:
  524|   368k|{
  525|   368k|	__m128i shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kDecodeTableMasks[code]));
  526|   368k|	__m128i next = _mm_slli_si128(shuf, 10);
  527|       |
  528|       |	// patch first 6 bytes with current extra and roll state forward
  529|   368k|	__m128i ext = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(extra));
  530|   368k|	state = _mm_blend_epi16(state, ext, 7);
  531|   368k|	state = _mm_add_epi8(_mm_shuffle_epi8(state, shuf), next);
  532|       |
  533|   368k|	extra += kDecodeTableExtra[code];
  534|       |
  535|   368k|	return state;
  536|   368k|}
meshletcodec.cpp:_ZN7meshoptL19decodeTrianglesSimdEPhPKhS2_S2_m:
  672|  8.42k|{
  673|  8.42k|#if defined(SIMD_SSE)
  674|  8.42k|	__m128i state = _mm_setzero_si128();
  675|       |#elif defined(SIMD_NEON)
  676|       |	uint8x16_t state = vdupq_n_u8(0);
  677|       |#endif
  678|       |
  679|       |	// because the output buffer is guaranteed to have 32-bit aligned size available, we can optimize writes and tail processing
  680|       |	// instead of processing triangles 2 at a time, we process 2 *pairs* at a time (12-byte write) followed by a tail pair, if present
  681|       |	// if the number of triangles mod 4 is 3, we'd normally need to write 12k+9 bytes, but we can instead overwrite up to 3 bytes in the main loop
  682|  8.42k|	size_t groups = (triangle_count + 1) / 4;
  683|       |
  684|       |	// process all complete groups
  685|  89.2k|	for (size_t i = 0; i < groups; ++i)
  ------------------
  |  Branch (685:21): [True: 80.9k, False: 8.33k]
  ------------------
  686|  80.9k|	{
  687|  80.9k|		unsigned char code0 = *codes++;
  688|  80.9k|		unsigned char code1 = *codes++;
  689|       |
  690|       |		// each triangle pair reads <=6 bytes from extra, so two pairs need <=12 bytes and gap guarantees 16 byte of overread
  691|  80.9k|		if (extra > bound)
  ------------------
  |  Branch (691:7): [True: 90, False: 80.8k]
  ------------------
  692|     90|			return NULL;
  693|       |
  694|  80.8k|		state = decodeTriangleGroup(state, code0, extra);
  695|       |
  696|       |		// write first decoded triangle and first index of second decoded triangle
  697|  80.8k|#if defined(SIMD_SSE)
  698|  80.8k|		__m128i r0 = _mm_srli_si128(state, 9);
  699|  80.8k|		*reinterpret_cast<unaligned_int*>(&triangles[i * 12]) = _mm_cvtsi128_si32(r0);
  700|       |#elif defined(SIMD_NEON)
  701|       |		uint8x16_t r0 = vextq_u8(state, vdupq_n_u8(0), 9);
  702|       |		vst1q_lane_u32(reinterpret_cast<unsigned int*>(&triangles[i * 12]), vreinterpretq_u32_u8(r0), 0);
  703|       |#endif
  704|       |
  705|  80.8k|		state = decodeTriangleGroup(state, code1, extra);
  706|       |
  707|       |		// write last two indices of second decoded triangle that we didn't write above plus two new ones
  708|       |		// note that the second decoded triangle has shifted down to 6-8 bytes, hence shift by 7
  709|  80.8k|#if defined(SIMD_SSE)
  710|  80.8k|		__m128i r1 = _mm_srli_si128(state, 7);
  711|  80.8k|		_mm_storel_epi64(reinterpret_cast<__m128i*>(&triangles[i * 12 + 4]), r1);
  712|       |#elif defined(SIMD_NEON)
  713|       |		uint8x16_t r1 = vextq_u8(state, vdupq_n_u8(0), 7);
  714|       |		vst1_u8(&triangles[i * 12 + 4], vget_low_u8(r1));
  715|       |#endif
  716|  80.8k|	}
  717|       |
  718|       |	// process a 1-2 triangle tail; to maintain the memory safety guarantee we have to write 1-2 32-bit elements
  719|  8.33k|	if (groups * 4 < triangle_count)
  ------------------
  |  Branch (719:6): [True: 6.61k, False: 1.71k]
  ------------------
  720|  6.61k|	{
  721|  6.61k|		unsigned char code = *codes++;
  722|       |
  723|  6.61k|		if (extra > bound)
  ------------------
  |  Branch (723:7): [True: 46, False: 6.57k]
  ------------------
  724|     46|			return NULL;
  725|       |
  726|  6.57k|		state = decodeTriangleGroup(state, code, extra);
  727|       |
  728|  6.57k|		unsigned char* tail = &triangles[(triangle_count & ~3u) * 3];
  729|       |
  730|  6.57k|#if defined(SIMD_SSE)
  731|  6.57k|		__m128i r = _mm_srli_si128(state, 9);
  732|       |
  733|  6.57k|		*reinterpret_cast<unaligned_int*>(tail) = _mm_cvtsi128_si32(r);
  734|  6.57k|		if ((triangle_count & 3) > 1)
  ------------------
  |  Branch (734:7): [True: 787, False: 5.78k]
  ------------------
  735|    787|			*reinterpret_cast<unaligned_int*>(tail + 4) = _mm_extract_epi32(r, 1);
  736|       |#elif defined(SIMD_NEON)
  737|       |		uint8x16_t r = vextq_u8(state, vdupq_n_u8(0), 9);
  738|       |
  739|       |		vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail), vreinterpretq_u32_u8(r), 0);
  740|       |		if ((triangle_count & 3) > 1)
  741|       |			vst1q_lane_u32(reinterpret_cast<unsigned int*>(tail + 4), vreinterpretq_u32_u8(r), 1);
  742|       |#endif
  743|  6.57k|	}
  744|       |
  745|  8.29k|	return extra;
  746|  8.33k|}
meshletcodec.cpp:_ZN7meshoptL17decodeMeshletSimdILi1EEEiPvS1_PKhS3_S3_S3_mmmm:
  865|  1.16k|{
  866|  1.16k|	assert(gDecodeTablesInitialized);
  ------------------
  |  Branch (866:2): [True: 1.16k, False: 0]
  ------------------
  867|  1.16k|	(void)gDecodeTablesInitialized;
  868|       |
  869|  1.16k|#ifdef __clang__
  870|       |	// data is guaranteed to be non-null initially; if decode loops never hit bounds errors, it remains non-null
  871|  1.16k|	__builtin_assume(data);
  872|  1.16k|#endif
  873|       |
  874|       |	// decodes 4 vertices at a time with tail processing; writes up to align(vertex_size * vertex_count, 4)
  875|       |	// raw decoding skips tail processing by rounding up vertex count; it's safe because output buffer is guaranteed to have extra space, and tail control data is 0
  876|  1.16k|	if (vertex_size == 4 || Raw)
  ------------------
  |  Branch (876:6): [True: 1.16k, False: 0]
  |  Branch (876:26): [True: 0, Folded]
  ------------------
  877|  1.16k|		data = decodeVerticesSimd(static_cast<unsigned int*>(vertices), ctrl, data, bound, Raw ? (vertex_count + 3) & ~3 : vertex_count);
  ------------------
  |  Branch (877:86): [True: 1.16k, Folded]
  ------------------
  878|      0|	else
  879|      0|		data = decodeVerticesSimd(static_cast<unsigned short*>(vertices), ctrl, data, bound, vertex_count);
  880|  1.16k|	if (!data)
  ------------------
  |  Branch (880:6): [True: 337, False: 831]
  ------------------
  881|    337|		return -2;
  882|       |
  883|       |	// decodes 2/4 triangles at a time with tail processing; writes up to align(triangle_size * triangle_count, 4)
  884|       |	// raw decoding skips tail processing by rounding up triangle count; it's safe because output buffer is guaranteed to have extra space, and tail code data is 0
  885|    831|	if (triangle_size == 4 || Raw)
  ------------------
  |  Branch (885:6): [True: 831, False: 0]
  |  Branch (885:28): [True: 0, Folded]
  ------------------
  886|    831|		data = decodeTrianglesSimd(static_cast<unsigned int*>(triangles), codes, data, bound, Raw ? (triangle_count + 1) & ~1 : triangle_count);
  ------------------
  |  Branch (886:89): [True: 831, Folded]
  ------------------
  887|      0|	else
  888|      0|		data = decodeTrianglesSimd(static_cast<unsigned char*>(triangles), codes, data, bound, triangle_count);
  889|    831|	if (!data)
  ------------------
  |  Branch (889:6): [True: 71, False: 760]
  ------------------
  890|     71|		return -2;
  891|       |
  892|    760|	return (data == bound) ? 0 : -3;
  ------------------
  |  Branch (892:9): [True: 12, False: 748]
  ------------------
  893|    831|}

_Z21meshopt_decodeMeshletIjjEiPT_mPT0_mPKhm:
 1406|  2.25k|{
 1407|  2.25k|	char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1];
 1408|  2.25k|	(void)types_valid;
 1409|       |
 1410|  2.25k|	return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size);
  ------------------
  |  Branch (1410:93): [Folded, False: 2.25k]
  ------------------
 1411|  2.25k|}
_Z21meshopt_decodeMeshletIjhEiPT_mPT0_mPKhm:
 1406|  4.51k|{
 1407|  4.51k|	char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1];
 1408|  4.51k|	(void)types_valid;
 1409|       |
 1410|  4.51k|	return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size);
  ------------------
  |  Branch (1410:93): [True: 4.51k, Folded]
  ------------------
 1411|  4.51k|}
_Z21meshopt_decodeMeshletIthEiPT_mPT0_mPKhm:
 1406|  2.25k|{
 1407|  2.25k|	char types_valid[(sizeof(V) == 2 || sizeof(V) == 4) && (sizeof(T) == 1 || sizeof(T) == 4) ? 1 : -1];
 1408|  2.25k|	(void)types_valid;
 1409|       |
 1410|  2.25k|	return meshopt_decodeMeshlet(vertices, vertex_count, sizeof(V), triangles, triangle_count, sizeof(T) == 1 ? 3 : 4, buffer, buffer_size);
  ------------------
  |  Branch (1410:93): [True: 2.25k, Folded]
  ------------------
 1411|  2.25k|}

meshopt_encodeVertexBufferLevel:
 1647|  18.0k|{
 1648|  18.0k|	using namespace meshopt;
 1649|       |
 1650|  18.0k|	assert(vertex_size > 0 && vertex_size <= 256);
  ------------------
  |  Branch (1650:2): [True: 18.0k, False: 0]
  |  Branch (1650:2): [True: 18.0k, False: 0]
  |  Branch (1650:2): [True: 18.0k, False: 0]
  ------------------
 1651|  18.0k|	assert(vertex_size % 4 == 0);
  ------------------
  |  Branch (1651:2): [True: 18.0k, False: 0]
  ------------------
 1652|  18.0k|	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
  ------------------
  |  Branch (1652:2): [True: 18.0k, False: 0]
  |  Branch (1652:2): [True: 18.0k, False: 0]
  |  Branch (1652:2): [True: 18.0k, False: 0]
  ------------------
 1653|  18.0k|	assert(version < 0 || unsigned(version) <= kDecodeVertexVersion);
  ------------------
  |  Branch (1653:2): [True: 18.0k, False: 0]
  |  Branch (1653:2): [True: 0, False: 0]
  |  Branch (1653:2): [True: 18.0k, False: 0]
  ------------------
 1654|       |
 1655|  18.0k|	version = version < 0 ? gEncodeVertexVersion : version;
  ------------------
  |  Branch (1655:12): [True: 18.0k, False: 0]
  ------------------
 1656|       |
 1657|       |#if TRACE
 1658|       |	memset(vertexstats, 0, sizeof(vertexstats));
 1659|       |#endif
 1660|       |
 1661|  18.0k|	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
 1662|       |
 1663|  18.0k|	unsigned char* data = buffer;
 1664|  18.0k|	unsigned char* data_end = buffer + buffer_size;
 1665|       |
 1666|  18.0k|	if (size_t(data_end - data) < 1)
  ------------------
  |  Branch (1666:6): [True: 0, False: 18.0k]
  ------------------
 1667|      0|		return 0;
 1668|       |
 1669|  18.0k|	*data++ = (unsigned char)(kVertexHeader | version);
 1670|       |
 1671|  18.0k|	unsigned char first_vertex[256] = {};
 1672|  18.0k|	if (vertex_count > 0)
  ------------------
  |  Branch (1672:6): [True: 14.2k, False: 3.74k]
  ------------------
 1673|  14.2k|		memcpy(first_vertex, vertex_data, vertex_size);
 1674|       |
 1675|  18.0k|	unsigned char last_vertex[256] = {};
 1676|  18.0k|	memcpy(last_vertex, first_vertex, vertex_size);
 1677|       |
 1678|  18.0k|	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 1679|       |
 1680|  18.0k|	unsigned char channels[64] = {};
 1681|  18.0k|	if (version != 0 && level > 1 && vertex_count > 1)
  ------------------
  |  Branch (1681:6): [True: 14.0k, False: 3.94k]
  |  Branch (1681:22): [True: 6.16k, False: 7.92k]
  |  Branch (1681:35): [True: 4.08k, False: 2.08k]
  ------------------
 1682|  21.2k|		for (size_t k = 0; k < vertex_size; k += 4)
  ------------------
  |  Branch (1682:22): [True: 17.1k, False: 4.08k]
  ------------------
 1683|  17.1k|		{
 1684|  17.1k|			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
  ------------------
  |  Branch (1684:14): [True: 14.9k, False: 2.24k]
  ------------------
 1685|  17.1k|			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);
  ------------------
  |  Branch (1685:137): [True: 14.9k, False: 2.24k]
  ------------------
 1686|       |
 1687|  17.1k|			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
  ------------------
  |  Branch (1687:4): [True: 2.35k, False: 0]
  |  Branch (1687:4): [True: 2.35k, False: 0]
  |  Branch (1687:4): [True: 14.8k, False: 2.35k]
  |  Branch (1687:4): [True: 17.1k, False: 0]
  ------------------
 1688|  17.1k|			channels[k / 4] = (unsigned char)channel;
 1689|  17.1k|		}
 1690|       |
 1691|  18.0k|	size_t vertex_offset = 0;
 1692|       |
 1693|   329k|	while (vertex_offset < vertex_count)
  ------------------
  |  Branch (1693:9): [True: 311k, False: 18.0k]
  ------------------
 1694|   311k|	{
 1695|   311k|		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
  ------------------
  |  Branch (1695:23): [True: 297k, False: 14.2k]
  ------------------
 1696|       |
 1697|   311k|		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
 1698|   311k|		if (!data)
  ------------------
  |  Branch (1698:7): [True: 0, False: 311k]
  ------------------
 1699|      0|			return 0;
 1700|       |
 1701|   311k|		vertex_offset += block_size;
 1702|   311k|	}
 1703|       |
 1704|  18.0k|	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
  ------------------
  |  Branch (1704:36): [True: 3.94k, False: 14.0k]
  ------------------
 1705|  18.0k|	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
  ------------------
  |  Branch (1705:25): [True: 3.94k, False: 14.0k]
  ------------------
 1706|  18.0k|	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
  ------------------
  |  Branch (1706:25): [True: 10.0k, False: 8.03k]
  ------------------
 1707|       |
 1708|  18.0k|	if (size_t(data_end - data) < tail_size_pad)
  ------------------
  |  Branch (1708:6): [True: 0, False: 18.0k]
  ------------------
 1709|      0|		return 0;
 1710|       |
 1711|  18.0k|	if (tail_size < tail_size_pad)
  ------------------
  |  Branch (1711:6): [True: 10.0k, False: 8.03k]
  ------------------
 1712|  10.0k|	{
 1713|  10.0k|		memset(data, 0, tail_size_pad - tail_size);
 1714|  10.0k|		data += tail_size_pad - tail_size;
 1715|  10.0k|	}
 1716|       |
 1717|  18.0k|	memcpy(data, first_vertex, vertex_size);
 1718|  18.0k|	data += vertex_size;
 1719|       |
 1720|  18.0k|	if (version != 0)
  ------------------
  |  Branch (1720:6): [True: 14.0k, False: 3.94k]
  ------------------
 1721|  14.0k|	{
 1722|  14.0k|		memcpy(data, channels, vertex_size / 4);
 1723|  14.0k|		data += vertex_size / 4;
 1724|  14.0k|	}
 1725|       |
 1726|  18.0k|	assert(data >= buffer + tail_size);
  ------------------
  |  Branch (1726:2): [True: 18.0k, False: 0]
  ------------------
 1727|  18.0k|	assert(data <= buffer + buffer_size);
  ------------------
  |  Branch (1727:2): [True: 18.0k, False: 0]
  ------------------
 1728|       |
 1729|       |#if TRACE
 1730|       |	size_t total_size = data - buffer;
 1731|       |
 1732|       |	for (size_t k = 0; k < vertex_size; ++k)
 1733|       |	{
 1734|       |		const Stats& vsk = vertexstats[k];
 1735|       |
 1736|       |		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
 1737|       |
 1738|       |		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
 1739|       |		double total_kr = total_k ? 1.0 / double(total_k) : 0;
 1740|       |
 1741|       |		if (version != 0)
 1742|       |		{
 1743|       |			int channel = channels[k / 4];
 1744|       |
 1745|       |			if ((channel & 3) == 2 && k % 4 == 0)
 1746|       |				printf(" | ^%d", channel >> 4);
 1747|       |			else
 1748|       |				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
 1749|       |		}
 1750|       |
 1751|       |		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
 1752|       |		    double(vsk.header) * total_kr * 100,
 1753|       |		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
 1754|       |		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);
 1755|       |
 1756|       |		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];
 1757|       |
 1758|       |		if (total_ctrl)
 1759|       |		{
 1760|       |			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
 1761|       |			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
 1762|       |			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
 1763|       |		}
 1764|       |
 1765|       |		if (level >= 3)
 1766|       |			printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
 1767|       |			    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
 1768|       |			    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
 1769|       |			    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
 1770|       |			    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
 1771|       |
 1772|       |		printf("\n");
 1773|       |	}
 1774|       |#endif
 1775|       |
 1776|  18.0k|	return data - buffer;
 1777|  18.0k|}
meshopt_encodeVertexBufferBound:
 1785|  9.02k|{
 1786|  9.02k|	using namespace meshopt;
 1787|       |
 1788|  9.02k|	assert(vertex_size > 0 && vertex_size <= 256);
  ------------------
  |  Branch (1788:2): [True: 9.02k, False: 0]
  |  Branch (1788:2): [True: 9.02k, False: 0]
  |  Branch (1788:2): [True: 9.02k, False: 0]
  ------------------
 1789|  9.02k|	assert(vertex_size % 4 == 0);
  ------------------
  |  Branch (1789:2): [True: 9.02k, False: 0]
  ------------------
 1790|       |
 1791|  9.02k|	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 1792|  9.02k|	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
 1793|       |
 1794|  9.02k|	size_t vertex_block_control_size = vertex_size / 4;
 1795|  9.02k|	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
 1796|  9.02k|	size_t vertex_block_data_size = vertex_block_size;
 1797|       |
 1798|  9.02k|	size_t tail_size = vertex_size + (vertex_size / 4);
 1799|  9.02k|	size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
  ------------------
  |  Branch (1799:25): [True: 9.02k, Folded]
  ------------------
 1800|  9.02k|	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
  ------------------
  |  Branch (1800:25): [True: 6.76k, False: 2.25k]
  ------------------
 1801|  9.02k|	assert(tail_size_pad >= kByteGroupDecodeLimit);
  ------------------
  |  Branch (1801:2): [True: 9.02k, False: 0]
  ------------------
 1802|       |
 1803|  9.02k|	return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
 1804|  9.02k|}
meshopt_encodeVertexVersion:
 1807|  2.25k|{
 1808|  2.25k|	assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion));
  ------------------
  |  Branch (1808:2): [True: 2.25k, False: 0]
  ------------------
 1809|       |
 1810|  2.25k|	meshopt::gEncodeVertexVersion = version;
 1811|  2.25k|}
meshopt_decodeVertexBuffer:
 1831|  18.0k|{
 1832|  18.0k|	using namespace meshopt;
 1833|       |
 1834|  18.0k|	assert(vertex_size > 0 && vertex_size <= 256);
  ------------------
  |  Branch (1834:2): [True: 18.0k, False: 0]
  |  Branch (1834:2): [True: 18.0k, False: 0]
  |  Branch (1834:2): [True: 18.0k, False: 0]
  ------------------
 1835|  18.0k|	assert(vertex_size % 4 == 0);
  ------------------
  |  Branch (1835:2): [True: 18.0k, False: 0]
  ------------------
 1836|       |
 1837|  18.0k|	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;
 1838|       |
 1839|  18.0k|#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 1840|  18.0k|	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
  ------------------
  |  Branch (1840:11): [True: 18.0k, False: 0]
  ------------------
 1841|       |#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
 1842|       |	decode = decodeVertexBlockSimd;
 1843|       |#else
 1844|       |	decode = decodeVertexBlock;
 1845|       |#endif
 1846|       |
 1847|  18.0k|#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 1848|  18.0k|	assert(gDecodeBytesGroupInitialized);
  ------------------
  |  Branch (1848:2): [True: 18.0k, False: 0]
  ------------------
 1849|  18.0k|	(void)gDecodeBytesGroupInitialized;
 1850|  18.0k|#endif
 1851|       |
 1852|  18.0k|	unsigned char* vertex_data = static_cast<unsigned char*>(destination);
 1853|       |
 1854|  18.0k|	const unsigned char* data = buffer;
 1855|  18.0k|	const unsigned char* data_end = buffer + buffer_size;
 1856|       |
 1857|  18.0k|	if (size_t(data_end - data) < 1)
  ------------------
  |  Branch (1857:6): [True: 0, False: 18.0k]
  ------------------
 1858|      0|		return -2;
 1859|       |
 1860|  18.0k|	unsigned char data_header = *data++;
 1861|       |
 1862|  18.0k|	if ((data_header & 0xf0) != kVertexHeader)
  ------------------
  |  Branch (1862:6): [True: 6.99k, False: 11.0k]
  ------------------
 1863|  6.99k|		return -1;
 1864|       |
 1865|  11.0k|	int version = data_header & 0x0f;
 1866|  11.0k|	if (version > kDecodeVertexVersion)
  ------------------
  |  Branch (1866:6): [True: 172, False: 10.8k]
  ------------------
 1867|    172|		return -1;
 1868|       |
 1869|  10.8k|	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
  ------------------
  |  Branch (1869:36): [True: 2.54k, False: 8.33k]
  ------------------
 1870|  10.8k|	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
  ------------------
  |  Branch (1870:25): [True: 2.54k, False: 8.33k]
  ------------------
 1871|  10.8k|	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
  ------------------
  |  Branch (1871:25): [True: 6.07k, False: 4.80k]
  ------------------
 1872|       |
 1873|  10.8k|	if (size_t(data_end - data) < tail_size_pad)
  ------------------
  |  Branch (1873:6): [True: 219, False: 10.6k]
  ------------------
 1874|    219|		return -2;
 1875|       |
 1876|  10.6k|	const unsigned char* tail = data_end - tail_size;
 1877|       |
 1878|  10.6k|	unsigned char last_vertex[256];
 1879|  10.6k|	memcpy(last_vertex, tail, vertex_size);
 1880|       |
 1881|  10.6k|	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;
  ------------------
  |  Branch (1881:34): [True: 2.49k, False: 8.15k]
  ------------------
 1882|       |
 1883|  10.6k|	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 1884|       |
 1885|  10.6k|	size_t vertex_offset = 0;
 1886|       |
 1887|   166k|	while (vertex_offset < vertex_count)
  ------------------
  |  Branch (1887:9): [True: 157k, False: 9.63k]
  ------------------
 1888|   157k|	{
 1889|   157k|		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
  ------------------
  |  Branch (1889:23): [True: 148k, False: 8.78k]
  ------------------
 1890|       |
 1891|   157k|		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
 1892|   157k|		if (!data)
  ------------------
  |  Branch (1892:7): [True: 1.01k, False: 156k]
  ------------------
 1893|  1.01k|			return -2;
 1894|       |
 1895|   156k|		vertex_offset += block_size;
 1896|   156k|	}
 1897|       |
 1898|  9.63k|	if (size_t(data_end - data) != tail_size_pad)
  ------------------
  |  Branch (1898:6): [True: 616, False: 9.02k]
  ------------------
 1899|    616|		return -3;
 1900|       |
 1901|  9.02k|	return 0;
 1902|  9.63k|}
vertexcodec.cpp:_ZN7meshoptL27decodeBytesGroupBuildTablesEv:
  783|      2|{
  784|    514|	for (int mask = 0; mask < 256; ++mask)
  ------------------
  |  Branch (784:21): [True: 512, False: 2]
  ------------------
  785|    512|	{
  786|    512|		unsigned char shuffle[8];
  787|    512|		unsigned char count = 0;
  788|       |
  789|  4.60k|		for (int i = 0; i < 8; ++i)
  ------------------
  |  Branch (789:19): [True: 4.09k, False: 512]
  ------------------
  790|  4.09k|		{
  791|  4.09k|			int maski = (mask >> i) & 1;
  792|  4.09k|			shuffle[i] = maski ? count : 0x80;
  ------------------
  |  Branch (792:17): [True: 2.04k, False: 2.04k]
  ------------------
  793|  4.09k|			count += (unsigned char)(maski);
  794|  4.09k|		}
  795|       |
  796|    512|		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
  797|    512|		kDecodeBytesGroupCount[mask] = count;
  798|    512|	}
  799|       |
  800|      2|	return true;
  801|      2|}
vertexcodec.cpp:_ZN7meshoptL14getCpuFeaturesEv:
 1631|      2|{
 1632|      2|	int cpuinfo[4] = {};
 1633|       |#ifdef _MSC_VER
 1634|       |	__cpuid(cpuinfo, 1);
 1635|       |#else
 1636|       |	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
 1637|      2|#endif
 1638|      2|	return cpuinfo[2];
 1639|      2|}
vertexcodec.cpp:_ZN7meshoptL18getVertexBlockSizeEm:
  141|  37.7k|{
  142|       |	// make sure the entire block fits into the scratch buffer and is aligned to byte group size
  143|       |	// note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
  144|  37.7k|	size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
  145|       |
  146|  37.7k|	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
  ------------------
  |  Branch (146:9): [True: 0, False: 37.7k]
  ------------------
  147|  37.7k|}
vertexcodec.cpp:_ZN7meshoptL14estimateRotateEPKhmmmm:
  370|  14.9k|{
  371|  14.9k|	size_t sizes[8] = {};
  372|       |
  373|  14.9k|	const unsigned char* vertex = vertex_data + k;
  374|  14.9k|	unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
  375|       |
  376|  3.89M|	for (size_t i = 0; i < vertex_count; i += group_size)
  ------------------
  |  Branch (376:21): [True: 3.88M, False: 14.9k]
  ------------------
  377|  3.88M|	{
  378|  3.88M|		unsigned int bitg = 0;
  379|       |
  380|       |		// calculate bit consistency mask for the group
  381|  65.9M|		for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
  ------------------
  |  Branch (381:22): [True: 62.0M, False: 3.87M]
  |  Branch (381:40): [True: 62.0M, False: 12.7k]
  ------------------
  382|  62.0M|		{
  383|  62.0M|			unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
  384|  62.0M|			unsigned int d = v ^ last;
  385|       |
  386|  62.0M|			bitg |= d;
  387|  62.0M|			last = v;
  388|  62.0M|			vertex += vertex_size;
  389|  62.0M|		}
  390|       |
  391|       |#if TRACE
  392|       |		for (int j = 0; j < 32; ++j)
  393|       |			vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
  394|       |#endif
  395|       |
  396|  34.9M|		for (int j = 0; j < 8; ++j)
  ------------------
  |  Branch (396:19): [True: 31.0M, False: 3.88M]
  ------------------
  397|  31.0M|		{
  398|  31.0M|			unsigned int bitr = rotate(bitg, j);
  399|       |
  400|  31.0M|			sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
  401|  31.0M|			sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
  402|  31.0M|		}
  403|  3.88M|	}
  404|       |
  405|  14.9k|	int best_rot = 0;
  406|   119k|	for (int rot = 1; rot < 8; ++rot)
  ------------------
  |  Branch (406:20): [True: 104k, False: 14.9k]
  ------------------
  407|   104k|		best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot;
  ------------------
  |  Branch (407:14): [True: 6.76k, False: 97.6k]
  ------------------
  408|       |
  409|  14.9k|	return best_rot;
  410|  14.9k|}
_ZN7meshopt6rotateEji:
  150|   165M|{
  151|   165M|	return (v << r) | (v >> ((32 - r) & 31));
  152|   165M|}
vertexcodec.cpp:_ZN7meshoptL12estimateBitsEh:
  365|   124M|{
  366|   124M|	return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8;
  ------------------
  |  Branch (366:9): [True: 43.9M, False: 80.2M]
  |  Branch (366:20): [True: 42.1M, False: 1.83M]
  |  Branch (366:30): [True: 40.5M, False: 1.59M]
  ------------------
  367|   124M|}
vertexcodec.cpp:_ZN7meshoptL15estimateChannelEPKhmmmmmii:
  413|  17.1k|{
  414|  17.1k|	unsigned char block[kVertexBlockMaxSize];
  415|  17.1k|	assert(vertex_block_size <= kVertexBlockMaxSize);
  ------------------
  |  Branch (415:2): [True: 17.1k, False: 0]
  ------------------
  416|       |
  417|  17.1k|	unsigned char last_vertex[256] = {};
  418|       |
  419|  17.1k|	size_t sizes[3] = {};
  420|  17.1k|	assert(max_channel <= 3);
  ------------------
  |  Branch (420:2): [True: 17.1k, False: 0]
  ------------------
  421|       |
  422|   148k|	for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip)
  ------------------
  |  Branch (422:21): [True: 130k, False: 17.1k]
  ------------------
  423|   130k|	{
  424|   130k|		size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i;
  ------------------
  |  Branch (424:23): [True: 118k, False: 12.7k]
  ------------------
  425|   130k|		size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
  426|       |
  427|   130k|		memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size);
  ------------------
  |  Branch (427:38): [True: 17.1k, False: 113k]
  ------------------
  428|       |
  429|       |		// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
  430|   130k|		if (block_size < block_size_aligned)
  ------------------
  |  Branch (430:7): [True: 11.1k, False: 119k]
  ------------------
  431|  11.1k|			memset(block + block_size, 0, block_size_aligned - block_size);
  432|       |
  433|   485k|		for (int channel = 0; channel < max_channel; ++channel)
  ------------------
  |  Branch (433:25): [True: 354k, False: 130k]
  ------------------
  434|  1.77M|			for (size_t j = 0; j < 4; ++j)
  ------------------
  |  Branch (434:23): [True: 1.41M, False: 354k]
  ------------------
  435|  1.41M|			{
  436|  1.41M|				encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4));
  437|       |
  438|  22.3M|				for (size_t ig = 0; ig < block_size; ig += kByteGroupSize)
  ------------------
  |  Branch (438:25): [True: 20.9M, False: 1.41M]
  ------------------
  439|  20.9M|				{
  440|       |					// to maximize encoding performance we only evaluate 1/2/4/8 bit groups
  441|  20.9M|					size_t size1 = encodeBytesGroupMeasure(block + ig, 1);
  442|  20.9M|					size_t size2 = encodeBytesGroupMeasure(block + ig, 2);
  443|  20.9M|					size_t size4 = encodeBytesGroupMeasure(block + ig, 4);
  444|  20.9M|					size_t size8 = encodeBytesGroupMeasure(block + ig, 8);
  445|       |
  446|  20.9M|					size_t best_size = size1 < size2 ? size1 : size2;
  ------------------
  |  Branch (446:25): [True: 19.5M, False: 1.36M]
  ------------------
  447|  20.9M|					best_size = best_size < size4 ? best_size : size4;
  ------------------
  |  Branch (447:18): [True: 20.7M, False: 149k]
  ------------------
  448|  20.9M|					best_size = best_size < size8 ? best_size : size8;
  ------------------
  |  Branch (448:18): [True: 12.3M, False: 8.50M]
  ------------------
  449|       |
  450|  20.9M|					sizes[channel] += best_size;
  451|  20.9M|				}
  452|  1.41M|			}
  453|   130k|	}
  454|       |
  455|  17.1k|	int best_channel = 0;
  456|  49.2k|	for (int channel = 1; channel < max_channel; ++channel)
  ------------------
  |  Branch (456:24): [True: 32.0k, False: 17.1k]
  ------------------
  457|  32.0k|		best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel;
  ------------------
  |  Branch (457:18): [True: 4.98k, False: 27.0k]
  ------------------
  458|       |
  459|  17.1k|	return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
  ------------------
  |  Branch (459:9): [True: 2.35k, False: 14.8k]
  ------------------
  460|  17.1k|}
vertexcodec.cpp:_ZN7meshoptL12encodeDeltasEPhPKhmmS2_mi:
  350|  4.73M|{
  351|  4.73M|	switch (channel & 3)
  352|  4.73M|	{
  353|  3.52M|	case 0:
  ------------------
  |  Branch (353:2): [True: 3.52M, False: 1.21M]
  ------------------
  354|  3.52M|		return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
  355|   646k|	case 1:
  ------------------
  |  Branch (355:2): [True: 646k, False: 4.08M]
  ------------------
  356|   646k|		return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
  357|   565k|	case 2:
  ------------------
  |  Branch (357:2): [True: 565k, False: 4.16M]
  ------------------
  358|   565k|		return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
  359|      0|	default:
  ------------------
  |  Branch (359:2): [True: 0, False: 4.73M]
  ------------------
  360|       |		assert(!"Unsupported channel encoding"); // unreachable
  ------------------
  |  Branch (360:3): [Folded, False: 0]
  ------------------
  361|  4.73M|	}
  362|  4.73M|}
vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1IhLb0EEEvPhPKhmmS3_mi:
  325|  3.52M|{
  326|  3.52M|	size_t k0 = k & ~(sizeof(T) - 1);
  327|  3.52M|	int ks = (k & (sizeof(T) - 1)) * 8;
  328|       |
  329|  3.52M|	T p = last_vertex[k0];
  330|  3.52M|	for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (330:21): [True: 0, False: 3.52M]
  ------------------
  331|      0|		p |= T(last_vertex[k0 + j]) << (j * 8);
  332|       |
  333|  3.52M|	const unsigned char* vertex = vertex_data + k0;
  334|       |
  335|   845M|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (335:21): [True: 841M, False: 3.52M]
  ------------------
  336|   841M|	{
  337|   841M|		T v = vertex[0];
  338|   841M|		for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (338:22): [True: 0, False: 841M]
  ------------------
  339|      0|			v |= vertex[j] << (j * 8);
  340|       |
  341|   841M|		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
  ------------------
  |  Branch (341:9): [Folded, False: 841M]
  ------------------
  342|       |
  343|   841M|		buffer[i] = (unsigned char)(d >> ks);
  344|   841M|		p = v;
  345|   841M|		vertex += vertex_size;
  346|   841M|	}
  347|  3.52M|}
_ZN7meshopt6zigzagIhEET_S1_:
  156|   841M|{
  157|   841M|	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
  158|   841M|}
vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1ItLb0EEEvPhPKhmmS3_mi:
  325|   646k|{
  326|   646k|	size_t k0 = k & ~(sizeof(T) - 1);
  327|   646k|	int ks = (k & (sizeof(T) - 1)) * 8;
  328|       |
  329|   646k|	T p = last_vertex[k0];
  330|  1.29M|	for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (330:21): [True: 646k, False: 646k]
  ------------------
  331|   646k|		p |= T(last_vertex[k0 + j]) << (j * 8);
  332|       |
  333|   646k|	const unsigned char* vertex = vertex_data + k0;
  334|       |
  335|   153M|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (335:21): [True: 153M, False: 646k]
  ------------------
  336|   153M|	{
  337|   153M|		T v = vertex[0];
  338|   306M|		for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (338:22): [True: 153M, False: 153M]
  ------------------
  339|   153M|			v |= vertex[j] << (j * 8);
  340|       |
  341|   153M|		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
  ------------------
  |  Branch (341:9): [Folded, False: 153M]
  ------------------
  342|       |
  343|   153M|		buffer[i] = (unsigned char)(d >> ks);
  344|   153M|		p = v;
  345|   153M|		vertex += vertex_size;
  346|   153M|	}
  347|   646k|}
_ZN7meshopt6zigzagItEET_S1_:
  156|   153M|{
  157|   153M|	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
  158|   153M|}
vertexcodec.cpp:_ZN7meshoptL13encodeDeltas1IjLb1EEEvPhPKhmmS3_mi:
  325|   565k|{
  326|   565k|	size_t k0 = k & ~(sizeof(T) - 1);
  327|   565k|	int ks = (k & (sizeof(T) - 1)) * 8;
  328|       |
  329|   565k|	T p = last_vertex[k0];
  330|  2.26M|	for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (330:21): [True: 1.69M, False: 565k]
  ------------------
  331|  1.69M|		p |= T(last_vertex[k0 + j]) << (j * 8);
  332|       |
  333|   565k|	const unsigned char* vertex = vertex_data + k0;
  334|       |
  335|   134M|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (335:21): [True: 134M, False: 565k]
  ------------------
  336|   134M|	{
  337|   134M|		T v = vertex[0];
  338|   537M|		for (size_t j = 1; j < sizeof(T); ++j)
  ------------------
  |  Branch (338:22): [True: 403M, False: 134M]
  ------------------
  339|   403M|			v |= vertex[j] << (j * 8);
  340|       |
  341|   134M|		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
  ------------------
  |  Branch (341:9): [True: 134M, Folded]
  ------------------
  342|       |
  343|   134M|		buffer[i] = (unsigned char)(d >> ks);
  344|   134M|		p = v;
  345|   134M|		vertex += vertex_size;
  346|   134M|	}
  347|   565k|}
vertexcodec.cpp:_ZN7meshoptL23encodeBytesGroupMeasureEPKhi:
  191|   257M|{
  192|   257M|	assert(bits >= 0 && bits <= 8);
  ------------------
  |  Branch (192:2): [True: 257M, False: 0]
  |  Branch (192:2): [True: 257M, False: 0]
  |  Branch (192:2): [True: 257M, False: 0]
  ------------------
  193|       |
  194|   257M|	if (bits == 0)
  ------------------
  |  Branch (194:6): [True: 25.0M, False: 232M]
  ------------------
  195|  25.0M|		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
  ------------------
  |  Branch (195:10): [True: 7.07M, False: 17.9M]
  ------------------
  196|       |
  197|   232M|	if (bits == 8)
  ------------------
  |  Branch (197:6): [True: 56.3M, False: 176M]
  ------------------
  198|  56.3M|		return kByteGroupSize;
  199|       |
  200|   176M|	size_t result = kByteGroupSize * bits / 8;
  201|       |
  202|   176M|	unsigned char sentinel = (1 << bits) - 1;
  203|       |
  204|  2.99G|	for (size_t i = 0; i < kByteGroupSize; ++i)
  ------------------
  |  Branch (204:21): [True: 2.81G, False: 176M]
  ------------------
  205|  2.81G|		result += buffer[i] >= sentinel;
  206|       |
  207|   176M|	return result;
  208|   232M|}
vertexcodec.cpp:_ZN7meshoptL20encodeBytesGroupZeroEPKh:
  181|  46.2M|{
  182|  46.2M|	assert(kByteGroupSize == sizeof(unsigned long long) * 2);
  ------------------
  |  Branch (182:2): [True: 46.2M, Folded]
  ------------------
  183|       |
  184|  46.2M|	unsigned long long v[2];
  185|  46.2M|	memcpy(v, buffer, sizeof(v));
  186|       |
  187|  46.2M|	return (v[0] | v[1]) == 0;
  188|  46.2M|}
vertexcodec.cpp:_ZN7meshoptL17encodeVertexBlockEPhS0_PKhmmS0_S2_ii:
  510|   311k|{
  511|   311k|	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
  ------------------
  |  Branch (511:2): [True: 311k, False: 0]
  |  Branch (511:2): [True: 311k, False: 0]
  |  Branch (511:2): [True: 311k, False: 0]
  ------------------
  512|   311k|	assert(vertex_size % 4 == 0);
  ------------------
  |  Branch (512:2): [True: 311k, False: 0]
  ------------------
  513|       |
  514|   311k|	unsigned char buffer[kVertexBlockMaxSize];
  515|   311k|	assert(sizeof(buffer) % kByteGroupSize == 0);
  ------------------
  |  Branch (515:2): [True: 311k, Folded]
  ------------------
  516|       |
  517|   311k|	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
  518|       |
  519|       |	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
  520|   311k|	memset(buffer, 0, sizeof(buffer));
  521|       |
  522|   311k|	size_t control_size = version == 0 ? 0 : vertex_size / 4;
  ------------------
  |  Branch (522:24): [True: 27.9k, False: 283k]
  ------------------
  523|   311k|	if (size_t(data_end - data) < control_size)
  ------------------
  |  Branch (523:6): [True: 0, False: 311k]
  ------------------
  524|      0|		return NULL;
  525|       |
  526|   311k|	unsigned char* control = data;
  527|   311k|	data += control_size;
  528|       |
  529|   311k|	memset(control, 0, control_size);
  530|       |
  531|  3.62M|	for (size_t k = 0; k < vertex_size; ++k)
  ------------------
  |  Branch (531:21): [True: 3.31M, False: 311k]
  ------------------
  532|  3.31M|	{
  533|  3.31M|		encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]);
  ------------------
  |  Branch (533:80): [True: 311k, False: 3.00M]
  ------------------
  534|       |
  535|       |#if TRACE
  536|       |		const unsigned char* olddata = data;
  537|       |		bytestats = &vertexstats[k];
  538|       |#endif
  539|       |
  540|  3.31M|		int ctrl = 0;
  541|       |
  542|  3.31M|		if (version != 0)
  ------------------
  |  Branch (542:7): [True: 3.00M, False: 311k]
  ------------------
  543|  3.00M|		{
  544|  3.00M|			ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level);
  545|       |
  546|  3.00M|			assert(unsigned(ctrl) < 4);
  ------------------
  |  Branch (546:4): [True: 3.00M, False: 0]
  ------------------
  547|  3.00M|			control[k / 4] |= ctrl << ((k % 4) * 2);
  548|       |
  549|       |#if TRACE
  550|       |			vertexstats[k].ctrl[ctrl]++;
  551|       |#endif
  552|  3.00M|		}
  553|       |
  554|  3.31M|		if (ctrl == 3)
  ------------------
  |  Branch (554:7): [True: 648k, False: 2.66M]
  ------------------
  555|   648k|		{
  556|       |			// literal encoding
  557|   648k|			if (size_t(data_end - data) < vertex_count)
  ------------------
  |  Branch (557:8): [True: 0, False: 648k]
  ------------------
  558|      0|				return NULL;
  559|       |
  560|   648k|			memcpy(data, buffer, vertex_count);
  561|   648k|			data += vertex_count;
  562|   648k|		}
  563|  2.66M|		else if (ctrl != 2) // non-zero encoding
  ------------------
  |  Branch (563:12): [True: 1.47M, False: 1.19M]
  ------------------
  564|  1.47M|		{
  565|  1.47M|			data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
  ------------------
  |  Branch (565:69): [True: 311k, False: 1.16M]
  ------------------
  566|  1.47M|			if (!data)
  ------------------
  |  Branch (566:8): [True: 0, False: 1.47M]
  ------------------
  567|      0|				return NULL;
  568|  1.47M|		}
  569|       |
  570|       |#if TRACE
  571|       |		bytestats = NULL;
  572|       |		vertexstats[k].size += data - olddata;
  573|       |#endif
  574|  3.31M|	}
  575|       |
  576|   311k|	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
  577|       |
  578|   311k|	return data;
  579|   311k|}
vertexcodec.cpp:_ZN7meshoptL15estimateControlEPKhmmi:
  472|  3.00M|{
  473|  3.00M|	if (estimateControlZero(buffer, vertex_count_aligned))
  ------------------
  |  Branch (473:6): [True: 1.19M, False: 1.81M]
  ------------------
  474|  1.19M|		return 2; // zero encoding
  475|       |
  476|  1.81M|	if (level == 0)
  ------------------
  |  Branch (476:6): [True: 677k, False: 1.13M]
  ------------------
  477|   677k|		return 1; // 1248 encoding in level 0 for encoding speed
  478|       |
  479|       |	// round number of groups to 4 to get number of header bytes
  480|  1.13M|	size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4;
  481|       |
  482|  1.13M|	size_t est_bytes0 = header_size, est_bytes1 = header_size;
  483|       |
  484|  18.2M|	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
  ------------------
  |  Branch (484:21): [True: 17.1M, False: 1.13M]
  ------------------
  485|  17.1M|	{
  486|       |		// assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance
  487|  17.1M|		size_t size0 = encodeBytesGroupMeasure(buffer + i, 0);
  488|  17.1M|		size_t size1 = encodeBytesGroupMeasure(buffer + i, 1);
  489|  17.1M|		size_t size2 = encodeBytesGroupMeasure(buffer + i, 2);
  490|  17.1M|		size_t size4 = encodeBytesGroupMeasure(buffer + i, 4);
  491|  17.1M|		size_t size8 = encodeBytesGroupMeasure(buffer + i, 8);
  492|       |
  493|       |		// both control modes have access to 1/2/4 bit encoding
  494|  17.1M|		size_t size12 = size1 < size2 ? size1 : size2;
  ------------------
  |  Branch (494:19): [True: 14.9M, False: 2.15M]
  ------------------
  495|  17.1M|		size_t size124 = size12 < size4 ? size12 : size4;
  ------------------
  |  Branch (495:20): [True: 16.9M, False: 184k]
  ------------------
  496|       |
  497|       |		// each control mode has access to 0/8 bit encoding respectively
  498|  17.1M|		est_bytes0 += size124 < size0 ? size124 : size0;
  ------------------
  |  Branch (498:17): [True: 14.3M, False: 2.80M]
  ------------------
  499|  17.1M|		est_bytes1 += size124 < size8 ? size124 : size8;
  ------------------
  |  Branch (499:17): [True: 6.42M, False: 10.7M]
  ------------------
  500|  17.1M|	}
  501|       |
  502|       |	// pick shortest control entry but prefer literal encoding
  503|  1.13M|	if (est_bytes0 < vertex_count || est_bytes1 < vertex_count)
  ------------------
  |  Branch (503:6): [True: 445k, False: 691k]
  |  Branch (503:35): [True: 43.1k, False: 648k]
  ------------------
  504|   488k|		return est_bytes0 < est_bytes1 ? 0 : 1;
  ------------------
  |  Branch (504:10): [True: 235k, False: 252k]
  ------------------
  505|   648k|	else
  506|   648k|		return 3; // literal encoding
  507|  1.13M|}
vertexcodec.cpp:_ZN7meshoptL19estimateControlZeroEPKhm:
  463|  3.00M|{
  464|  22.4M|	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
  ------------------
  |  Branch (464:21): [True: 21.2M, False: 1.19M]
  ------------------
  465|  21.2M|		if (!encodeBytesGroupZero(buffer + i))
  ------------------
  |  Branch (465:7): [True: 1.81M, False: 19.3M]
  ------------------
  466|  1.81M|			return false;
  467|       |
  468|  1.19M|	return true;
  469|  3.00M|}
vertexcodec.cpp:_ZN7meshoptL11encodeBytesEPhS0_PKhmPKi:
  264|  1.47M|{
  265|  1.47M|	assert(buffer_size % kByteGroupSize == 0);
  ------------------
  |  Branch (265:2): [True: 1.47M, False: 0]
  ------------------
  266|       |
  267|  1.47M|	unsigned char* header = data;
  268|       |
  269|       |	// round number of groups to 4 to get number of header bytes
  270|  1.47M|	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
  271|       |
  272|  1.47M|	if (size_t(data_end - data) < header_size)
  ------------------
  |  Branch (272:6): [True: 0, False: 1.47M]
  ------------------
  273|      0|		return NULL;
  274|       |
  275|  1.47M|	data += header_size;
  276|       |
  277|  1.47M|	memset(header, 0, header_size);
  278|       |
  279|  1.47M|	int last_bits = -1;
  280|       |
  281|  23.5M|	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
  ------------------
  |  Branch (281:21): [True: 22.0M, False: 1.47M]
  ------------------
  282|  22.0M|	{
  283|  22.0M|		if (size_t(data_end - data) < kByteGroupDecodeLimit)
  ------------------
  |  Branch (283:7): [True: 0, False: 22.0M]
  ------------------
  284|      0|			return NULL;
  285|       |
  286|  22.0M|		int best_bitk = 3;
  287|  22.0M|		size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);
  288|       |
  289|  88.1M|		for (int bitk = 0; bitk < 3; ++bitk)
  ------------------
  |  Branch (289:22): [True: 66.1M, False: 22.0M]
  ------------------
  290|  66.1M|		{
  291|  66.1M|			size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);
  292|       |
  293|       |			// favor consistent bit selection across groups, but never replace literals
  294|  66.1M|			if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8))
  ------------------
  |  Branch (294:8): [True: 11.1M, False: 55.0M]
  |  Branch (294:29): [True: 1.34M, False: 53.6M]
  |  Branch (294:50): [True: 270k, False: 1.06M]
  |  Branch (294:77): [True: 56.4k, False: 213k]
  ------------------
  295|  11.1M|			{
  296|  11.1M|				best_bitk = bitk;
  297|  11.1M|				best_size = size;
  298|  11.1M|			}
  299|  66.1M|		}
  300|       |
  301|  22.0M|		size_t header_offset = i / kByteGroupSize;
  302|  22.0M|		header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);
  303|       |
  304|  22.0M|		int best_bits = bits[best_bitk];
  305|  22.0M|		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
  306|       |
  307|  22.0M|		assert(data + best_size == next);
  ------------------
  |  Branch (307:3): [True: 22.0M, False: 0]
  ------------------
  308|  22.0M|		data = next;
  309|  22.0M|		last_bits = best_bits;
  310|       |
  311|       |#if TRACE
  312|       |		bytestats->bitg[best_bits] += best_size;
  313|       |#endif
  314|  22.0M|	}
  315|       |
  316|       |#if TRACE
  317|       |	bytestats->header += header_size;
  318|       |#endif
  319|       |
  320|  1.47M|	return data;
  321|  1.47M|}
vertexcodec.cpp:_ZN7meshoptL16encodeBytesGroupEPhPKhi:
  211|  22.0M|{
  212|  22.0M|	assert(bits >= 0 && bits <= 8);
  ------------------
  |  Branch (212:2): [True: 22.0M, False: 0]
  |  Branch (212:2): [True: 22.0M, False: 0]
  |  Branch (212:2): [True: 22.0M, False: 0]
  ------------------
  213|  22.0M|	assert(kByteGroupSize % 8 == 0);
  ------------------
  |  Branch (213:2): [True: 22.0M, Folded]
  ------------------
  214|       |
  215|  22.0M|	if (bits == 0)
  ------------------
  |  Branch (215:6): [True: 4.27M, False: 17.7M]
  ------------------
  216|  4.27M|		return data;
  217|       |
  218|  17.7M|	if (bits == 8)
  ------------------
  |  Branch (218:6): [True: 12.3M, False: 5.40M]
  ------------------
  219|  12.3M|	{
  220|  12.3M|		memcpy(data, buffer, kByteGroupSize);
  221|  12.3M|		return data + kByteGroupSize;
  222|  12.3M|	}
  223|       |
  224|  5.40M|	size_t byte_size = 8 / bits;
  225|  5.40M|	assert(kByteGroupSize % byte_size == 0);
  ------------------
  |  Branch (225:2): [True: 5.40M, False: 0]
  ------------------
  226|       |
  227|       |	// fixed portion: bits bits for each value
  228|       |	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
  229|  5.40M|	unsigned char sentinel = (1 << bits) - 1;
  230|       |
  231|  20.7M|	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
  ------------------
  |  Branch (231:21): [True: 15.3M, False: 5.40M]
  ------------------
  232|  15.3M|	{
  233|  15.3M|		unsigned char byte = 0;
  234|       |
  235|   101M|		for (size_t k = 0; k < byte_size; ++k)
  ------------------
  |  Branch (235:22): [True: 86.4M, False: 15.3M]
  ------------------
  236|  86.4M|		{
  237|  86.4M|			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
  ------------------
  |  Branch (237:24): [True: 28.3M, False: 58.1M]
  ------------------
  238|       |
  239|  86.4M|			byte <<= bits;
  240|  86.4M|			byte |= enc;
  241|  86.4M|		}
  242|       |
  243|       |		// encode 1-bit groups in reverse bit order
  244|       |		// this makes them faster to decode alongside other groups
  245|  15.3M|		if (bits == 1)
  ------------------
  |  Branch (245:7): [True: 6.68M, False: 8.68M]
  ------------------
  246|  6.68M|			byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
  247|       |
  248|  15.3M|		*data++ = byte;
  249|  15.3M|	}
  250|       |
  251|  91.8M|	for (size_t i = 0; i < kByteGroupSize; ++i)
  ------------------
  |  Branch (251:21): [True: 86.4M, False: 5.40M]
  ------------------
  252|  86.4M|	{
  253|  86.4M|		unsigned char v = buffer[i];
  254|       |
  255|       |		// branchless append of out-of-range values
  256|  86.4M|		*data = v;
  257|  86.4M|		data += v >= sentinel;
  258|  86.4M|	}
  259|       |
  260|  5.40M|	return data;
  261|  5.40M|}
vertexcodec.cpp:_ZN7meshoptL21decodeVertexBlockSimdEPKhS1_PhmmS2_S1_i:
 1555|   157k|{
 1556|   157k|	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
  ------------------
  |  Branch (1556:2): [True: 157k, False: 0]
  |  Branch (1556:2): [True: 157k, False: 0]
  |  Branch (1556:2): [True: 157k, False: 0]
  ------------------
 1557|       |
 1558|   157k|	unsigned char buffer[kVertexBlockMaxSize * 4];
 1559|   157k|	unsigned char transposed[kVertexBlockSizeBytes];
 1560|       |
 1561|   157k|	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
 1562|       |
 1563|   157k|	size_t control_size = version == 0 ? 0 : vertex_size / 4;
  ------------------
  |  Branch (1563:24): [True: 14.4k, False: 142k]
  ------------------
 1564|   157k|	if (size_t(data_end - data) < control_size)
  ------------------
  |  Branch (1564:6): [True: 0, False: 157k]
  ------------------
 1565|      0|		return NULL;
 1566|       |
 1567|   157k|	const unsigned char* control = data;
 1568|   157k|	data += control_size;
 1569|       |
 1570|   575k|	for (size_t k = 0; k < vertex_size; k += 4)
  ------------------
  |  Branch (1570:21): [True: 418k, False: 156k]
  ------------------
 1571|   418k|	{
 1572|   418k|		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
  ------------------
  |  Branch (1572:29): [True: 40.4k, False: 378k]
  ------------------
 1573|       |
 1574|  2.09M|		for (size_t j = 0; j < 4; ++j)
  ------------------
  |  Branch (1574:22): [True: 1.67M, False: 418k]
  ------------------
 1575|  1.67M|		{
 1576|  1.67M|			int ctrl = (ctrl_byte >> (j * 2)) & 3;
 1577|       |
 1578|  1.67M|			if (ctrl == 3)
  ------------------
  |  Branch (1578:8): [True: 325k, False: 1.34M]
  ------------------
 1579|   325k|			{
 1580|       |				// literal encoding; safe to over-copy due to tail
 1581|   325k|				if (size_t(data_end - data) < vertex_count_aligned)
  ------------------
  |  Branch (1581:9): [True: 104, False: 325k]
  ------------------
 1582|    104|					return NULL;
 1583|       |
 1584|   325k|				memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
 1585|   325k|				data += vertex_count;
 1586|   325k|			}
 1587|  1.34M|			else if (ctrl == 2)
  ------------------
  |  Branch (1587:13): [True: 598k, False: 750k]
  ------------------
 1588|   598k|			{
 1589|       |				// zero encoding
 1590|   598k|				memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
 1591|   598k|			}
 1592|   750k|			else
 1593|   750k|			{
 1594|       |				// for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
 1595|   750k|				int hshift = version == 0 ? 0 : 4 + ctrl;
  ------------------
  |  Branch (1595:18): [True: 161k, False: 589k]
  ------------------
 1596|       |
 1597|   750k|				data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
 1598|   750k|				if (!data)
  ------------------
  |  Branch (1598:9): [True: 728, False: 750k]
  ------------------
 1599|    728|					return NULL;
 1600|   750k|			}
 1601|  1.67M|		}
 1602|       |
 1603|   418k|		int channel = version == 0 ? 0 : channels[k / 4];
  ------------------
  |  Branch (1603:17): [True: 40.1k, False: 377k]
  ------------------
 1604|       |
 1605|   418k|		switch (channel & 3)
 1606|   418k|		{
 1607|   377k|		case 0:
  ------------------
  |  Branch (1607:3): [True: 377k, False: 40.8k]
  ------------------
 1608|   377k|			decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
 1609|   377k|			break;
 1610|  15.8k|		case 1:
  ------------------
  |  Branch (1610:3): [True: 15.8k, False: 402k]
  ------------------
 1611|  15.8k|			decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
 1612|  15.8k|			break;
 1613|  24.8k|		case 2:
  ------------------
  |  Branch (1613:3): [True: 24.8k, False: 393k]
  ------------------
 1614|  24.8k|			decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
 1615|  24.8k|			break;
 1616|    182|		default:
  ------------------
  |  Branch (1616:3): [True: 182, False: 417k]
  ------------------
 1617|    182|			return NULL; // invalid channel type
 1618|   418k|		}
 1619|   418k|	}
 1620|       |
 1621|   156k|	memcpy(vertex_data, transposed, vertex_count * vertex_size);
 1622|       |
 1623|   156k|	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
 1624|       |
 1625|   156k|	return data;
 1626|   157k|}
vertexcodec.cpp:_ZN7meshoptL15decodeBytesSimdEPKhS1_Phmi:
 1426|   750k|{
 1427|   750k|	assert(buffer_size % kByteGroupSize == 0);
  ------------------
  |  Branch (1427:2): [True: 750k, False: 0]
  ------------------
 1428|   750k|	assert(kByteGroupSize == 16);
  ------------------
  |  Branch (1428:2): [True: 750k, Folded]
  ------------------
 1429|       |
 1430|       |	// round number of groups to 4 to get number of header bytes
 1431|   750k|	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
 1432|   750k|	if (size_t(data_end - data) < header_size)
  ------------------
  |  Branch (1432:6): [True: 20, False: 750k]
  ------------------
 1433|     20|		return NULL;
 1434|       |
 1435|   750k|	const unsigned char* header = data;
 1436|   750k|	data += header_size;
 1437|       |
 1438|   750k|	size_t i = 0;
 1439|       |
 1440|       |	// fast-path: process 4 groups at a time, do a shared bounds check
 1441|  3.48M|	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
  ------------------
  |  Branch (1441:9): [True: 2.74M, False: 744k]
  |  Branch (1441:50): [True: 2.73M, False: 5.99k]
  ------------------
 1442|  2.73M|	{
 1443|  2.73M|		size_t header_offset = i / kByteGroupSize;
 1444|  2.73M|		unsigned char header_byte = header[header_offset / 4];
 1445|       |
 1446|  2.73M|		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
 1447|  2.73M|		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
 1448|  2.73M|		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
 1449|  2.73M|		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
 1450|  2.73M|	}
 1451|       |
 1452|       |	// slow-path: process remaining groups
 1453|   883k|	for (; i < buffer_size; i += kByteGroupSize)
  ------------------
  |  Branch (1453:9): [True: 133k, False: 750k]
  ------------------
 1454|   133k|	{
 1455|   133k|		if (size_t(data_end - data) < kByteGroupDecodeLimit)
  ------------------
  |  Branch (1455:7): [True: 708, False: 133k]
  ------------------
 1456|    708|			return NULL;
 1457|       |
 1458|   133k|		size_t header_offset = i / kByteGroupSize;
 1459|   133k|		unsigned char header_byte = header[header_offset / 4];
 1460|       |
 1461|   133k|		data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
 1462|   133k|	}
 1463|       |
 1464|   750k|	return data;
 1465|   750k|}
_ZN7meshopt20decodeBytesGroupSimdEPKhPhi:
  827|  11.0M|{
  828|  11.0M|	switch (hbits)
  829|  11.0M|	{
  830|   782k|	case 0:
  ------------------
  |  Branch (830:2): [True: 782k, False: 10.2M]
  ------------------
  831|  2.17M|	case 4:
  ------------------
  |  Branch (831:2): [True: 1.38M, False: 9.68M]
  ------------------
  832|  2.17M|	{
  833|  2.17M|		__m128i result = _mm_setzero_si128();
  834|       |
  835|  2.17M|		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  836|       |
  837|  2.17M|		return data;
  838|   782k|	}
  839|       |
  840|   129k|	case 1:
  ------------------
  |  Branch (840:2): [True: 129k, False: 10.9M]
  ------------------
  841|   982k|	case 6:
  ------------------
  |  Branch (841:2): [True: 852k, False: 10.2M]
  ------------------
  842|   982k|	{
  843|   982k|#ifdef SIMD_LATENCYOPT
  844|   982k|		unsigned int data32;
  845|   982k|		memcpy(&data32, data, 4);
  846|   982k|		data32 &= data32 >> 1;
  847|       |
  848|       |		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
  849|   982k|		unsigned long long data64 = ((unsigned long long)data32 << 30) | data32;
  850|       |
  851|       |		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
  852|   982k|		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
  853|   982k|#endif
  854|       |
  855|   982k|		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
  856|   982k|		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
  857|       |
  858|   982k|		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
  859|   982k|		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
  860|   982k|		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));
  861|       |
  862|   982k|		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
  863|   982k|		int mask16 = _mm_movemask_epi8(mask);
  864|   982k|		unsigned char mask0 = (unsigned char)(mask16 & 255);
  865|   982k|		unsigned char mask1 = (unsigned char)(mask16 >> 8);
  866|       |
  867|   982k|		__m128i shuf = decodeShuffleMask(mask0, mask1);
  868|   982k|		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
  869|       |
  870|   982k|		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  871|       |
  872|   982k|#ifdef SIMD_LATENCYOPT
  873|   982k|		return data + 4 + datacnt;
  874|       |#else
  875|       |		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  876|       |#endif
  877|   129k|	}
  878|       |
  879|  7.40k|	case 2:
  ------------------
  |  Branch (879:2): [True: 7.40k, False: 11.0M]
  ------------------
  880|  59.4k|	case 7:
  ------------------
  |  Branch (880:2): [True: 52.0k, False: 11.0M]
  ------------------
  881|  59.4k|	{
  882|  59.4k|#ifdef SIMD_LATENCYOPT
  883|  59.4k|		unsigned long long data64;
  884|  59.4k|		memcpy(&data64, data, 8);
  885|  59.4k|		data64 &= data64 >> 1;
  886|  59.4k|		data64 &= data64 >> 2;
  887|       |
  888|       |		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
  889|  59.4k|		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
  890|  59.4k|#endif
  891|       |
  892|  59.4k|		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
  893|  59.4k|		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
  894|       |
  895|  59.4k|		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
  896|  59.4k|		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));
  897|       |
  898|  59.4k|		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
  899|  59.4k|		int mask16 = _mm_movemask_epi8(mask);
  900|  59.4k|		unsigned char mask0 = (unsigned char)(mask16 & 255);
  901|  59.4k|		unsigned char mask1 = (unsigned char)(mask16 >> 8);
  902|       |
  903|  59.4k|		__m128i shuf = decodeShuffleMask(mask0, mask1);
  904|  59.4k|		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
  905|       |
  906|  59.4k|		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  907|       |
  908|  59.4k|#ifdef SIMD_LATENCYOPT
  909|  59.4k|		return data + 8 + datacnt;
  910|       |#else
  911|       |		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  912|       |#endif
  913|  7.40k|	}
  914|       |
  915|  1.21M|	case 3:
  ------------------
  |  Branch (915:2): [True: 1.21M, False: 9.86M]
  ------------------
  916|  6.18M|	case 8:
  ------------------
  |  Branch (916:2): [True: 4.97M, False: 6.10M]
  ------------------
  917|  6.18M|	{
  918|  6.18M|		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
  919|       |
  920|  6.18M|		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  921|       |
  922|  6.18M|		return data + 16;
  923|  1.21M|	}
  924|       |
  925|  1.67M|	case 5:
  ------------------
  |  Branch (925:2): [True: 1.67M, False: 9.40M]
  ------------------
  926|  1.67M|	{
  927|  1.67M|		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 2));
  928|       |
  929|  1.67M|		unsigned char mask0 = data[0];
  930|  1.67M|		unsigned char mask1 = data[1];
  931|       |
  932|  1.67M|		__m128i shuf = decodeShuffleMask(mask0, mask1);
  933|  1.67M|		__m128i result = _mm_shuffle_epi8(rest, shuf);
  934|       |
  935|  1.67M|		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
  936|       |
  937|  1.67M|		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
  938|  1.21M|	}
  939|       |
  940|      0|	default:
  ------------------
  |  Branch (940:2): [True: 0, False: 11.0M]
  ------------------
  941|      0|		SIMD_UNREACHABLE(); // unreachable
  ------------------
  |  |   65|      0|#define SIMD_UNREACHABLE() __builtin_unreachable()
  ------------------
  942|  11.0M|	}
  943|  11.0M|}
_ZN7meshopt17decodeShuffleMaskEhh:
  809|  2.71M|{
  810|  2.71M|	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
  811|  2.71M|	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
  812|  2.71M|	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);
  813|       |
  814|  2.71M|	__m128i sm1r = _mm_add_epi8(sm1, sm1off);
  815|       |
  816|  2.71M|	return _mm_unpacklo_epi64(sm0, sm1r);
  817|  2.71M|}
vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi0EEEvPKhPhmmS3_i:
 1470|   377k|{
 1471|   377k|#if defined(SIMD_SSE) || defined(SIMD_AVX)
 1472|   377k|#define TEMP __m128i
 1473|   377k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
 1474|   377k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
 1475|   377k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
 1476|   377k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
 1477|   377k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
 1478|   377k|#endif
 1479|       |
 1480|       |#ifdef SIMD_NEON
 1481|       |#define TEMP uint8x8_t
 1482|       |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
 1483|       |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
 1484|       |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
 1485|       |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
 1486|       |#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
 1487|       |#endif
 1488|       |
 1489|       |#ifdef SIMD_WASM
 1490|       |#define TEMP v128_t
 1491|       |#define PREP() v128_t pi = wasm_v128_load(last_vertex)
 1492|       |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
 1493|       |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
 1494|       |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
 1495|       |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
 1496|       |#endif
 1497|       |
 1498|   377k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
 1499|       |
 1500|   377k|	PREP();
  ------------------
  |  | 1473|   377k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
  ------------------
 1501|       |
 1502|   377k|	unsigned char* savep = transposed;
 1503|       |
 1504|  6.01M|	for (size_t j = 0; j < vertex_count_aligned; j += 16)
  ------------------
  |  Branch (1504:21): [True: 5.63M, False: 377k]
  ------------------
 1505|  5.63M|	{
 1506|  5.63M|		LOAD(0);
  ------------------
  |  | 1474|  5.63M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1507|  5.63M|		LOAD(1);
  ------------------
  |  | 1474|  5.63M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1508|  5.63M|		LOAD(2);
  ------------------
  |  | 1474|  5.63M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1509|  5.63M|		LOAD(3);
  ------------------
  |  | 1474|  5.63M|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1510|       |
 1511|  5.63M|		transpose8(r0, r1, r2, r3);
 1512|       |
 1513|  5.63M|		TEMP t0, t1, t2, t3;
  ------------------
  |  | 1472|  5.63M|#define TEMP __m128i
  ------------------
 1514|  5.63M|		TEMP npi = pi;
  ------------------
  |  | 1472|  5.63M|#define TEMP __m128i
  ------------------
 1515|       |
 1516|  5.63M|		UNZR(0);
  ------------------
  |  | 1498|  5.63M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [True: 5.63M, Folded]
  |  |  |  Branch (1498:58): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1517|  5.63M|		GRP4(0);
  ------------------
  |  | 1475|  5.63M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1518|  5.63M|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1519|  5.63M|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1520|       |
 1521|  5.63M|		UNZR(1);
  ------------------
  |  | 1498|  5.63M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [True: 5.63M, Folded]
  |  |  |  Branch (1498:58): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1522|  5.63M|		GRP4(1);
  ------------------
  |  | 1475|  5.63M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1523|  5.63M|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1524|  5.63M|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1525|       |
 1526|  5.63M|		UNZR(2);
  ------------------
  |  | 1498|  5.63M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [True: 5.63M, Folded]
  |  |  |  Branch (1498:58): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1527|  5.63M|		GRP4(2);
  ------------------
  |  | 1475|  5.63M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1528|  5.63M|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1529|  5.63M|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1530|       |
 1531|  5.63M|		UNZR(3);
  ------------------
  |  | 1498|  5.63M|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [True: 5.63M, Folded]
  |  |  |  Branch (1498:58): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1532|  5.63M|		GRP4(3);
  ------------------
  |  | 1475|  5.63M|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1533|  5.63M|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|  5.63M|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [True: 5.63M, Folded]
  |  |  |  Branch (1476:70): [Folded, False: 0]
  |  |  ------------------
  ------------------
 1534|  5.63M|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|  5.63M|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1535|       |
 1536|       |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
 1537|       |		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
 1538|       |		pi = rebase<Channel>(npi, r0, r1, r2, r3);
 1539|       |#else
 1540|  5.63M|		(void)npi;
 1541|  5.63M|#endif
 1542|       |
 1543|  5.63M|#undef UNZR
 1544|  5.63M|#undef TEMP
 1545|  5.63M|#undef PREP
 1546|  5.63M|#undef LOAD
 1547|  5.63M|#undef GRP4
 1548|  5.63M|#undef FIXD
 1549|  5.63M|#undef SAVE
 1550|  5.63M|	}
 1551|   377k|}
_ZN7meshopt10transpose8ERDv2_xS1_S1_S1_:
 1275|  6.25M|{
 1276|  6.25M|	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
 1277|  6.25M|	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
 1278|  6.25M|	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
 1279|  6.25M|	__m128i t3 = _mm_unpackhi_epi8(x2, x3);
 1280|       |
 1281|  6.25M|	x0 = _mm_unpacklo_epi16(t0, t2);
 1282|  6.25M|	x1 = _mm_unpackhi_epi16(t0, t2);
 1283|  6.25M|	x2 = _mm_unpacklo_epi16(t1, t3);
 1284|  6.25M|	x3 = _mm_unpackhi_epi16(t1, t3);
 1285|  6.25M|}
_ZN7meshopt9unzigzag8EDv2_x:
 1289|  22.5M|{
 1290|  22.5M|	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
 1291|  22.5M|	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
 1292|       |
 1293|  22.5M|	return _mm_xor_si128(xl, xr);
 1294|  22.5M|}
vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi1EEEvPKhPhmmS3_i:
 1470|  15.8k|{
 1471|  15.8k|#if defined(SIMD_SSE) || defined(SIMD_AVX)
 1472|  15.8k|#define TEMP __m128i
 1473|  15.8k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
 1474|  15.8k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
 1475|  15.8k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
 1476|  15.8k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
 1477|  15.8k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
 1478|  15.8k|#endif
 1479|       |
 1480|       |#ifdef SIMD_NEON
 1481|       |#define TEMP uint8x8_t
 1482|       |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
 1483|       |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
 1484|       |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
 1485|       |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
 1486|       |#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
 1487|       |#endif
 1488|       |
 1489|       |#ifdef SIMD_WASM
 1490|       |#define TEMP v128_t
 1491|       |#define PREP() v128_t pi = wasm_v128_load(last_vertex)
 1492|       |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
 1493|       |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
 1494|       |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
 1495|       |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
 1496|       |#endif
 1497|       |
 1498|  15.8k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
 1499|       |
 1500|  15.8k|	PREP();
  ------------------
  |  | 1473|  15.8k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
  ------------------
 1501|       |
 1502|  15.8k|	unsigned char* savep = transposed;
 1503|       |
 1504|   249k|	for (size_t j = 0; j < vertex_count_aligned; j += 16)
  ------------------
  |  Branch (1504:21): [True: 233k, False: 15.8k]
  ------------------
 1505|   233k|	{
 1506|   233k|		LOAD(0);
  ------------------
  |  | 1474|   233k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1507|   233k|		LOAD(1);
  ------------------
  |  | 1474|   233k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1508|   233k|		LOAD(2);
  ------------------
  |  | 1474|   233k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1509|   233k|		LOAD(3);
  ------------------
  |  | 1474|   233k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1510|       |
 1511|   233k|		transpose8(r0, r1, r2, r3);
 1512|       |
 1513|   233k|		TEMP t0, t1, t2, t3;
  ------------------
  |  | 1472|   233k|#define TEMP __m128i
  ------------------
 1514|   233k|		TEMP npi = pi;
  ------------------
  |  | 1472|   233k|#define TEMP __m128i
  ------------------
 1515|       |
 1516|   233k|		UNZR(0);
  ------------------
  |  | 1498|   233k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [Folded, False: 233k]
  |  |  |  Branch (1498:58): [True: 233k, Folded]
  |  |  ------------------
  ------------------
 1517|   233k|		GRP4(0);
  ------------------
  |  | 1475|   233k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1518|   233k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
 1519|   233k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1520|       |
 1521|   233k|		UNZR(1);
  ------------------
  |  | 1498|   233k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [Folded, False: 233k]
  |  |  |  Branch (1498:58): [True: 233k, Folded]
  |  |  ------------------
  ------------------
 1522|   233k|		GRP4(1);
  ------------------
  |  | 1475|   233k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1523|   233k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
 1524|   233k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1525|       |
 1526|   233k|		UNZR(2);
  ------------------
  |  | 1498|   233k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [Folded, False: 233k]
  |  |  |  Branch (1498:58): [True: 233k, Folded]
  |  |  ------------------
  ------------------
 1527|   233k|		GRP4(2);
  ------------------
  |  | 1475|   233k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1528|   233k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
 1529|   233k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1530|       |
 1531|   233k|		UNZR(3);
  ------------------
  |  | 1498|   233k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [Folded, False: 233k]
  |  |  |  Branch (1498:58): [True: 233k, Folded]
  |  |  ------------------
  ------------------
 1532|   233k|		GRP4(3);
  ------------------
  |  | 1475|   233k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1533|   233k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   233k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 233k]
  |  |  |  Branch (1476:70): [True: 233k, Folded]
  |  |  ------------------
  ------------------
 1534|   233k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   233k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1535|       |
 1536|       |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
 1537|       |		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
 1538|       |		pi = rebase<Channel>(npi, r0, r1, r2, r3);
 1539|       |#else
 1540|   233k|		(void)npi;
 1541|   233k|#endif
 1542|       |
 1543|   233k|#undef UNZR
 1544|   233k|#undef TEMP
 1545|   233k|#undef PREP
 1546|   233k|#undef LOAD
 1547|   233k|#undef GRP4
 1548|   233k|#undef FIXD
 1549|   233k|#undef SAVE
 1550|   233k|	}
 1551|  15.8k|}
_ZN7meshopt10unzigzag16EDv2_x:
 1298|   934k|{
 1299|   934k|	__m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1)));
 1300|   934k|	__m128i xr = _mm_srli_epi16(v, 1);
 1301|       |
 1302|   934k|	return _mm_xor_si128(xl, xr);
 1303|   934k|}
vertexcodec.cpp:_ZN7meshoptL17decodeDeltas4SimdILi2EEEvPKhPhmmS3_i:
 1470|  24.8k|{
 1471|  24.8k|#if defined(SIMD_SSE) || defined(SIMD_AVX)
 1472|  24.8k|#define TEMP __m128i
 1473|  24.8k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
 1474|  24.8k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
 1475|  24.8k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
 1476|  24.8k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
 1477|  24.8k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
 1478|  24.8k|#endif
 1479|       |
 1480|       |#ifdef SIMD_NEON
 1481|       |#define TEMP uint8x8_t
 1482|       |#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
 1483|       |#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
 1484|       |#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
 1485|       |#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
 1486|       |#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
 1487|       |#endif
 1488|       |
 1489|       |#ifdef SIMD_WASM
 1490|       |#define TEMP v128_t
 1491|       |#define PREP() v128_t pi = wasm_v128_load(last_vertex)
 1492|       |#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
 1493|       |#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
 1494|       |#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
 1495|       |#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
 1496|       |#endif
 1497|       |
 1498|  24.8k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
 1499|       |
 1500|  24.8k|	PREP();
  ------------------
  |  | 1473|  24.8k|#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
  ------------------
 1501|       |
 1502|  24.8k|	unsigned char* savep = transposed;
 1503|       |
 1504|   407k|	for (size_t j = 0; j < vertex_count_aligned; j += 16)
  ------------------
  |  Branch (1504:21): [True: 382k, False: 24.8k]
  ------------------
 1505|   382k|	{
 1506|   382k|		LOAD(0);
  ------------------
  |  | 1474|   382k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1507|   382k|		LOAD(1);
  ------------------
  |  | 1474|   382k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1508|   382k|		LOAD(2);
  ------------------
  |  | 1474|   382k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1509|   382k|		LOAD(3);
  ------------------
  |  | 1474|   382k|#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
  ------------------
 1510|       |
 1511|   382k|		transpose8(r0, r1, r2, r3);
 1512|       |
 1513|   382k|		TEMP t0, t1, t2, t3;
  ------------------
  |  | 1472|   382k|#define TEMP __m128i
  ------------------
 1514|   382k|		TEMP npi = pi;
  ------------------
  |  | 1472|   382k|#define TEMP __m128i
  ------------------
 1515|       |
 1516|   382k|		UNZR(0);
  ------------------
  |  | 1498|   382k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [Folded, False: 382k]
  |  |  |  Branch (1498:58): [Folded, False: 382k]
  |  |  ------------------
  ------------------
 1517|   382k|		GRP4(0);
  ------------------
  |  | 1475|   382k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1518|   382k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
 1519|   382k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1520|       |
 1521|   382k|		UNZR(1);
  ------------------
  |  | 1498|   382k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [Folded, False: 382k]
  |  |  |  Branch (1498:58): [Folded, False: 382k]
  |  |  ------------------
  ------------------
 1522|   382k|		GRP4(1);
  ------------------
  |  | 1475|   382k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1523|   382k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
 1524|   382k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1525|       |
 1526|   382k|		UNZR(2);
  ------------------
  |  | 1498|   382k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [Folded, False: 382k]
  |  |  |  Branch (1498:58): [Folded, False: 382k]
  |  |  ------------------
  ------------------
 1527|   382k|		GRP4(2);
  ------------------
  |  | 1475|   382k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1528|   382k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
 1529|   382k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1530|       |
 1531|   382k|		UNZR(3);
  ------------------
  |  | 1498|   382k|#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
  |  |  ------------------
  |  |  |  Branch (1498:24): [Folded, False: 382k]
  |  |  |  Branch (1498:58): [Folded, False: 382k]
  |  |  ------------------
  ------------------
 1532|   382k|		GRP4(3);
  ------------------
  |  | 1475|   382k|#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
  ------------------
 1533|   382k|		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
              		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
  ------------------
  |  | 1476|   382k|#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
  |  |  ------------------
  |  |  |  Branch (1476:29): [Folded, False: 382k]
  |  |  |  Branch (1476:70): [Folded, False: 382k]
  |  |  ------------------
  ------------------
 1534|   382k|		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
              		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
  ------------------
  |  | 1477|   382k|#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
  ------------------
 1535|       |
 1536|       |#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
 1537|       |		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
 1538|       |		pi = rebase<Channel>(npi, r0, r1, r2, r3);
 1539|       |#else
 1540|   382k|		(void)npi;
 1541|   382k|#endif
 1542|       |
 1543|   382k|#undef UNZR
 1544|   382k|#undef TEMP
 1545|   382k|#undef PREP
 1546|   382k|#undef LOAD
 1547|   382k|#undef GRP4
 1548|   382k|#undef FIXD
 1549|   382k|#undef SAVE
 1550|   382k|	}
 1551|  24.8k|}
_ZN7meshopt8rotate32EDv2_xi:
 1307|  1.53M|{
 1308|  1.53M|	return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r));
 1309|  1.53M|}

_Z11fuzzDecoderPKhmmPFiPvmmS0_mE:
    8|  18.0k|{
    9|  18.0k|	size_t count = 66; // must be divisible by 3 for decodeIndexBuffer; should be >=64 to cover large vertex blocks
   10|       |
   11|  18.0k|	void* destination = malloc(count * stride);
   12|  18.0k|	assert(destination);
  ------------------
  |  Branch (12:2): [True: 18.0k, False: 0]
  ------------------
   13|       |
   14|  18.0k|	int rc = decode(destination, count, stride, reinterpret_cast<const unsigned char*>(data), size);
   15|  18.0k|	(void)rc;
   16|       |
   17|  18.0k|	free(destination);
   18|  18.0k|}
_Z13fuzzRoundtripPKhmmi:
   21|  9.02k|{
   22|  9.02k|	size_t count = size / stride;
   23|       |
   24|  9.02k|	size_t bound = meshopt_encodeVertexBufferBound(count, stride);
   25|  9.02k|	void* encoded = malloc(bound);
   26|  9.02k|	void* decoded = malloc(count * stride);
   27|  9.02k|	assert(encoded && decoded);
  ------------------
  |  Branch (27:2): [True: 9.02k, False: 0]
  |  Branch (27:2): [True: 9.02k, False: 0]
  |  Branch (27:2): [True: 9.02k, False: 0]
  ------------------
   28|       |
   29|  9.02k|	size_t res = meshopt_encodeVertexBufferLevel(static_cast<unsigned char*>(encoded), bound, data, count, stride, level, -1);
   30|  9.02k|	assert(res > 0 && res <= bound);
  ------------------
  |  Branch (30:2): [True: 9.02k, False: 0]
  |  Branch (30:2): [True: 9.02k, False: 0]
  |  Branch (30:2): [True: 9.02k, False: 0]
  ------------------
   31|       |
   32|       |	// encode again at the boundary to check for memory safety
   33|       |	// this should produce the same output because encoder is deterministic
   34|  9.02k|	size_t rese = meshopt_encodeVertexBufferLevel(static_cast<unsigned char*>(encoded) + bound - res, res, data, count, stride, level, -1);
   35|  9.02k|	assert(rese == res);
  ------------------
  |  Branch (35:2): [True: 9.02k, False: 0]
  ------------------
   36|       |
   37|  9.02k|	int rc = meshopt_decodeVertexBuffer(decoded, count, stride, static_cast<unsigned char*>(encoded) + bound - res, res);
   38|  9.02k|	assert(rc == 0);
  ------------------
  |  Branch (38:2): [True: 9.02k, False: 0]
  ------------------
   39|       |
   40|  9.02k|	assert(memcmp(data, decoded, count * stride) == 0);
  ------------------
  |  Branch (40:2): [True: 9.02k, False: 0]
  ------------------
   41|       |
   42|  9.02k|	free(decoded);
   43|  9.02k|	free(encoded);
   44|  9.02k|}
_Z5alignmm:
   47|  13.4k|{
   48|  13.4k|	return (value + alignment - 1) & ~(alignment - 1);
   49|  13.4k|}
_Z17fuzzDecodeMeshletmmPKhm:
   52|  2.24k|{
   53|       |	// raw decoding: allowed to write align(count, 4) elements
   54|  2.24k|	unsigned int rt[256];
   55|  2.24k|	unsigned int rv[256];
   56|  2.24k|	meshopt_decodeMeshletRaw(rv + 256 - align(vertex_count, 4), vertex_count, rt + 256 - align(triangle_count, 4), triangle_count, data, size);
   57|       |
   58|       |	// regular decoding: allowed to write align(count * size, 4) bytes
   59|       |	// with variations for 3-byte triangles and 2-byte vertex references
   60|  2.24k|	unsigned short rsv[256];
   61|  2.24k|	unsigned char rbt[256 * 3];
   62|       |
   63|  2.24k|	meshopt_decodeMeshlet(rv + 256 - vertex_count, vertex_count, 4, rt + 256 - triangle_count, triangle_count, 4, data, size);
   64|  2.24k|	meshopt_decodeMeshlet(rsv + 256 - align(vertex_count, 2), vertex_count, 2, rt + 256 - triangle_count, triangle_count, 4, data, size);
   65|  2.24k|	meshopt_decodeMeshlet(rv + 256 - vertex_count, vertex_count, 4, rbt + 256 * 3 - align(triangle_count * 3, 4), triangle_count, 3, data, size);
   66|  2.24k|	meshopt_decodeMeshlet(rsv + 256 - align(vertex_count, 2), vertex_count, 2, rbt + 256 * 3 - align(triangle_count * 3, 4), triangle_count, 3, data, size);
   67|  2.24k|}
_Z20fuzzRoundtripMeshletPKhm:
   70|  2.25k|{
   71|  2.25k|	size_t triangle_count = size / 3;
   72|  2.25k|	if (triangle_count > 256)
  ------------------
  |  Branch (72:6): [True: 518, False: 1.73k]
  ------------------
   73|    518|		triangle_count = 256;
   74|       |
   75|  2.25k|	unsigned char buf[4096];
   76|  2.25k|	size_t enc = meshopt_encodeMeshlet(buf, sizeof(buf), NULL, 0, reinterpret_cast<const unsigned char*>(data), triangle_count);
   77|  2.25k|	assert(enc > 0);
  ------------------
  |  Branch (77:2): [True: 2.25k, False: 0]
  ------------------
   78|  2.25k|	assert(enc <= meshopt_encodeMeshletBound(0, triangle_count));
  ------------------
  |  Branch (78:2): [True: 2.25k, False: 0]
  ------------------
   79|       |
   80|  2.25k|	unsigned int rt4[256];
   81|  2.25k|	int rc4 = meshopt_decodeMeshlet(static_cast<unsigned int*>(NULL), 0, rt4, triangle_count, buf, enc);
   82|  2.25k|	assert(rc4 == 0);
  ------------------
  |  Branch (82:2): [True: 2.25k, False: 0]
  ------------------
   83|       |
   84|   184k|	for (size_t i = 0; i < triangle_count; ++i)
  ------------------
  |  Branch (84:21): [True: 181k, False: 2.25k]
  ------------------
   85|   181k|	{
   86|   181k|		unsigned char a = data[i * 3 + 0], b = data[i * 3 + 1], c = data[i * 3 + 2];
   87|       |
   88|   181k|		unsigned int abc = (a << 0) | (b << 8) | (c << 16);
   89|   181k|		unsigned int bca = (b << 0) | (c << 8) | (a << 16);
   90|   181k|		unsigned int cba = (c << 0) | (a << 8) | (b << 16);
   91|       |
   92|   181k|		unsigned int tri = rt4[i];
   93|       |
   94|   181k|		assert(tri == abc || tri == bca || tri == cba);
  ------------------
  |  Branch (94:3): [True: 114k, False: 67.3k]
  |  Branch (94:3): [True: 35.2k, False: 32.0k]
  |  Branch (94:3): [True: 32.0k, False: 0]
  |  Branch (94:3): [True: 181k, False: 0]
  ------------------
   95|   181k|	}
   96|       |
   97|  2.25k|	unsigned char rt3[256 * 3];
   98|  2.25k|	int rc3 = meshopt_decodeMeshlet(static_cast<unsigned int*>(NULL), 0, rt3, triangle_count, buf, enc);
   99|  2.25k|	assert(rc3 == 0);
  ------------------
  |  Branch (99:2): [True: 2.25k, False: 0]
  ------------------
  100|       |
  101|   184k|	for (size_t i = 0; i < triangle_count; ++i)
  ------------------
  |  Branch (101:21): [True: 181k, False: 2.25k]
  ------------------
  102|   181k|	{
  103|   181k|		unsigned char a = data[i * 3 + 0], b = data[i * 3 + 1], c = data[i * 3 + 2];
  104|       |
  105|   181k|		unsigned int abc = (a << 0) | (b << 8) | (c << 16);
  106|   181k|		unsigned int bca = (b << 0) | (c << 8) | (a << 16);
  107|   181k|		unsigned int cba = (c << 0) | (a << 8) | (b << 16);
  108|       |
  109|   181k|		unsigned int tri = rt3[i * 3 + 0] | (rt3[i * 3 + 1] << 8) | (rt3[i * 3 + 2] << 16);
  110|       |
  111|       |		assert(tri == abc || tri == bca || tri == cba);
  ------------------
  |  Branch (111:3): [True: 114k, False: 67.3k]
  |  Branch (111:3): [True: 35.2k, False: 32.0k]
  |  Branch (111:3): [True: 32.0k, False: 0]
  |  Branch (111:3): [True: 181k, False: 0]
  ------------------
  112|   181k|	}
  113|  2.25k|}
_Z21fuzzRoundtripMeshletVPKhm:
  116|  2.25k|{
  117|  2.25k|	size_t vertex_count = size / 4;
  118|  2.25k|	if (vertex_count > 256)
  ------------------
  |  Branch (118:6): [True: 468, False: 1.78k]
  ------------------
  119|    468|		vertex_count = 256;
  120|       |
  121|  2.25k|	unsigned char tri[4] = {0, 1, 2};
  122|       |
  123|  2.25k|	unsigned char buf[4096];
  124|  2.25k|	size_t enc = meshopt_encodeMeshlet(buf, sizeof(buf), reinterpret_cast<const uint32_t*>(data), vertex_count, tri, 1);
  125|  2.25k|	assert(enc > 0);
  ------------------
  |  Branch (125:2): [True: 2.25k, False: 0]
  ------------------
  126|  2.25k|	assert(enc <= meshopt_encodeMeshletBound(vertex_count, 1));
  ------------------
  |  Branch (126:2): [True: 2.25k, False: 0]
  ------------------
  127|       |
  128|  2.25k|	unsigned int rv4[256];
  129|  2.25k|	int rc4 = meshopt_decodeMeshlet(rv4, vertex_count, tri, 1, buf, enc);
  130|  2.25k|	assert(rc4 == 0);
  ------------------
  |  Branch (130:2): [True: 2.25k, False: 0]
  ------------------
  131|       |
  132|   170k|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (132:21): [True: 168k, False: 2.25k]
  ------------------
  133|   168k|		assert(rv4[i] == reinterpret_cast<const uint32_t*>(data)[i]);
  ------------------
  |  Branch (133:3): [True: 168k, False: 0]
  ------------------
  134|       |
  135|  2.25k|	unsigned short rv2[256];
  136|  2.25k|	int rc2 = meshopt_decodeMeshlet(rv2, vertex_count, tri, 1, buf, enc);
  137|  2.25k|	assert(rc2 == 0);
  ------------------
  |  Branch (137:2): [True: 2.25k, False: 0]
  ------------------
  138|       |
  139|   170k|	for (size_t i = 0; i < vertex_count; ++i)
  ------------------
  |  Branch (139:21): [True: 168k, False: 2.25k]
  ------------------
  140|       |		assert(rv2[i] == uint16_t(reinterpret_cast<const uint32_t*>(data)[i]));
  ------------------
  |  Branch (140:3): [True: 168k, False: 0]
  ------------------
  141|  2.25k|}
LLVMFuzzerTestOneInput:
  144|  2.25k|{
  145|       |	// decodeIndexBuffer supports 2 and 4-byte indices
  146|  2.25k|	fuzzDecoder(data, size, 2, meshopt_decodeIndexBuffer);
  147|  2.25k|	fuzzDecoder(data, size, 4, meshopt_decodeIndexBuffer);
  148|       |
  149|       |	// decodeIndexSequence supports 2 and 4-byte indices
  150|  2.25k|	fuzzDecoder(data, size, 2, meshopt_decodeIndexSequence);
  151|  2.25k|	fuzzDecoder(data, size, 4, meshopt_decodeIndexSequence);
  152|       |
  153|       |	// decodeVertexBuffer supports any strides divisible by 4 in 4-256 interval
  154|       |	// it's a waste of time to check all of them, so we'll just check a few with different alignment mod 16
  155|  2.25k|	fuzzDecoder(data, size, 4, meshopt_decodeVertexBuffer);
  156|  2.25k|	fuzzDecoder(data, size, 16, meshopt_decodeVertexBuffer);
  157|  2.25k|	fuzzDecoder(data, size, 24, meshopt_decodeVertexBuffer);
  158|  2.25k|	fuzzDecoder(data, size, 32, meshopt_decodeVertexBuffer);
  159|       |
  160|       |	// encodeVertexBuffer/decodeVertexBuffer should roundtrip for any stride, check a few with different alignment mod 16
  161|       |	// this also checks memory safety properties of the encoder
  162|       |	// to conserve time, we only check one version/level combination, biased towards version 1
  163|  2.25k|	uint8_t data0 = size > 0 ? data[0] : 0;
  ------------------
  |  Branch (163:18): [True: 2.25k, False: 0]
  ------------------
  164|  2.25k|	int level = data0 % 5;
  165|       |
  166|  2.25k|	meshopt_encodeVertexVersion(level < 4 ? 1 : 0);
  ------------------
  |  Branch (166:30): [True: 1.76k, False: 493]
  ------------------
  167|       |
  168|  2.25k|	fuzzRoundtrip(data, size, 4, level);
  169|  2.25k|	fuzzRoundtrip(data, size, 16, level);
  170|  2.25k|	fuzzRoundtrip(data, size, 24, level);
  171|  2.25k|	fuzzRoundtrip(data, size, 32, level);
  172|       |
  173|       |	// validate that decodeMeshlet works on untrusted data and is memory safe within documented limits
  174|  2.25k|	if (size > 2)
  ------------------
  |  Branch (174:6): [True: 2.24k, False: 14]
  ------------------
  175|  2.24k|		fuzzDecodeMeshlet(data[0] + 1, data[1] + 1, reinterpret_cast<const unsigned char*>(data + 2), size - 2);
  176|       |
  177|       |	// validate that index data roundtrips in meshlet encoding modulo rotation
  178|  2.25k|	fuzzRoundtripMeshlet(data, size);
  179|       |
  180|       |	// validate that vertex data roundtrips in meshlet encoding
  181|  2.25k|	fuzzRoundtripMeshletV(data, size);
  182|       |
  183|  2.25k|	return 0;
  184|  2.25k|}