/src/duckdb/third_party/utf8proc/utf8proc_wrapper.cpp

Source (jump to first uncovered line)
#include "utf8proc_wrapper.hpp"
#include "utf8proc.hpp"

using namespace std;

namespace duckdb {

// This function efficiently checks if a string is valid UTF8.
// It was originally written by Sjoerd Mullender.

// Here is the table that makes it work:

// B    = Number of Bytes in UTF8 encoding
// C_MIN  = First Unicode code point
// C_MAX  = Last Unicode code point
// B1     = First Byte Prefix

//  B C_MIN   C_MAX   B1
//  1 U+000000  U+00007F    0xxxxxxx
//  2 U+000080  U+0007FF    110xxxxx
//  3 U+000800  U+00FFFF    1110xxxx
//  4 U+010000  U+10FFFF    11110xxx

// Records the outcome of a failed validation through the optional out-parameters.
//   invalid_reason - if non-null, receives `reason`
//   invalid_pos    - if non-null, receives `pos`
// Either pointer may be null, in which case that piece of information is simply not reported.
static void AssignInvalidUTF8Reason(UnicodeInvalidReason *invalid_reason, size_t *invalid_pos, size_t pos, UnicodeInvalidReason reason) {
  if (invalid_pos != nullptr) {
    *invalid_pos = pos;
  }
  if (invalid_reason != nullptr) {
    *invalid_reason = reason;
  }
}

// Classifies the bytes s[0 .. len) as ASCII, UNICODE (contains well-formed multi-byte sequences)
// or INVALID.
//
//   s              - input bytes; never read at or beyond index `len`
//   len            - number of bytes to inspect
//   invalid_reason - optional; written only when INVALID is returned
//   invalid_pos    - optional; written only when INVALID is returned
//
// Rejections:
//   * an embedded '\0'                         -> NULL_BYTE at the position of that byte
//   * a byte that should be a continuation
//     byte (10xxxxxx) but is not               -> BYTE_MISMATCH at the position of that byte
//   * a lead byte that is not 110xxxxx,
//     1110xxxx or 11110xxx                     -> BYTE_MISMATCH at the third byte following it
//   * a sequence cut off by the end of input   -> BYTE_MISMATCH at the position of its lead byte
//
// Only the byte-prefix structure is checked; code point ranges are not validated here.
UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
  // Lead-byte mask/pattern pairs for 2-, 3- and 4-byte sequences, in that order.
  static const int LEAD_MASK[3] = {0xE0, 0xF0, 0xF8};
  static const int LEAD_BITS[3] = {0xC0, 0xE0, 0xF0};

  UnicodeType type = UnicodeType::ASCII;
  for (size_t i = 0; i < len; i++) {
    const char c = s[i];
    if (c == '\0') {
      AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::NULL_BYTE);
      return UnicodeType::INVALID;
    }
    // 1 Byte / ASCII
    if ((c & 0x80) == 0) {
      continue;
    }
    type = UnicodeType::UNICODE;
    const size_t lead_pos = i;

    // Consume up to three continuation bytes; after each one, stop if the lead byte
    // announces a sequence of exactly that length.
    bool complete = false;
    for (int extra = 0; extra < 3 && !complete; extra++) {
      // Bounds check BEFORE touching the next byte: a sequence truncated by the end of
      // the buffer must not cause a read past s[len - 1].
      if (++i >= len) {
        AssignInvalidUTF8Reason(invalid_reason, invalid_pos, lead_pos, UnicodeInvalidReason::BYTE_MISMATCH);
        return UnicodeType::INVALID;
      }
      if ((s[i] & 0xC0) != 0x80) {
        AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
        return UnicodeType::INVALID;
      }
      complete = (c & LEAD_MASK[extra]) == LEAD_BITS[extra];
    }
    if (!complete) {
      // The lead byte matched none of the 2/3/4-byte prefixes (e.g. a stray continuation
      // byte or 11111xxx).
      AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
      return UnicodeType::INVALID;
    }
  }

  return type;
}


char* Utf8Proc::Normalize(const char *s, size_t len) {
  assert(s);
  assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID);
  return (char*) utf8proc_NFC((const utf8proc_uint8_t*) s, len);
}

bool Utf8Proc::IsValid(const char *s, size_t len) {
  return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID;
}

// Thin forwarder to utf8proc_next_grapheme(s, len, cpos); the result is returned unchanged.
// NOTE(review): presumably yields the byte offset of the grapheme boundary after `cpos` —
// confirm against utf8proc_next_grapheme.
size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) {
  const size_t next_pos = utf8proc_next_grapheme(s, len, cpos);
  return next_pos;
}

// Finds the position to step back to from byte offset `cpos`.
//
// For input that IsValid() accepts, the string is walked forward from offset 0 with
// NextGraphemeCluster(); the last position reached before the walk would arrive at or
// pass `cpos` is returned (0 when `cpos` is 0).
// For input that IsValid() rejects, the function falls back to stepping back one byte.
//
// Note: validation and the forward walk both scan from the start of the string on every call.
size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) {
  if (!Utf8Proc::IsValid(s, len)) {
    // Clamp at 0: `cpos - 1` on an unsigned 0 would wrap around to SIZE_MAX.
    return cpos > 0 ? cpos - 1 : 0;
  }
  size_t current_pos = 0;
  while(true) {
    size_t new_pos = NextGraphemeCluster(s, len, current_pos);
    // Stop when no forward progress is made (guards against looping forever) or when the
    // next boundary would reach/pass the requested position.
    if (new_pos <= current_pos || new_pos >= cpos) {
      return current_pos;
    }
    current_pos = new_pos;
  }
}

// Thin forwarder to utf8proc_codepoint_to_utf8(cp, sz, c); its result is returned as bool.
// NOTE(review): presumably encodes `cp` into the buffer `c` and reports the byte count in `sz`
// — confirm against utf8proc_codepoint_to_utf8, including the required size of `c`.
bool Utf8Proc::CodepointToUtf8(int cp, int &sz, char *c) {
  const bool encoded = utf8proc_codepoint_to_utf8(cp, sz, c);
  return encoded;
}

// Thin forwarder to utf8proc_codepoint_length(cp); the result is returned unchanged.
// NOTE(review): presumably the number of UTF-8 bytes needed for `cp` — confirm against
// utf8proc_codepoint_length, including what it returns for out-of-range code points.
int Utf8Proc::CodepointLength(int cp) {
  const int byte_count = utf8proc_codepoint_length(cp);
  return byte_count;
}

// Thin forwarder to utf8proc_codepoint(c, sz); the result is returned as int32_t.
// NOTE(review): presumably decodes the sequence starting at `c` and reports its byte length
// in `sz` — confirm against utf8proc_codepoint.
int32_t Utf8Proc::UTF8ToCodepoint(const char *c, int &sz) {
  const int32_t decoded = utf8proc_codepoint(c, sz);
  return decoded;
}

// Decodes the code point starting at s + pos and returns the `charwidth` field of its
// utf8proc property record.
// `len` is not consulted, so no bounds check is made on `pos`; the decoded byte length is
// discarded.
size_t Utf8Proc::RenderWidth(const char *s, size_t len, size_t pos) {
  int consumed_bytes = 0; // filled in by utf8proc_codepoint, not used afterwards
  const char *start = s + pos;
  auto cp = duckdb::utf8proc_codepoint(start, consumed_bytes);
  auto props = duckdb::utf8proc_get_property(cp);
  return props->charwidth;
}

}

Line	Count	Source (jump to first uncovered line)
1		#include "utf8proc_wrapper.hpp"
2		#include "utf8proc.hpp"
3
4		using namespace std;
5
6		namespace duckdb {
7
8		// This function efficiently checks if a string is valid UTF8.
9		// It was originally written by Sjoerd Mullender.
10
11		// Here is the table that makes it work:
12
13		// B = Number of Bytes in UTF8 encoding
14		// C_MIN = First Unicode code point
15		// C_MAX = Last Unicode code point
16		// B1 = First Byte Prefix
17
18		// B C_MIN C_MAX B1
19		// 1 U+000000 U+00007F 0xxxxxxx
20		// 2 U+000080 U+0007FF 110xxxxx
21		// 3 U+000800 U+00FFFF 1110xxxx
22		// 4 U+010000 U+10FFFF 11110xxx
23
24	3	static void AssignInvalidUTF8Reason(UnicodeInvalidReason *invalid_reason, size_t *invalid_pos, size_t pos, UnicodeInvalidReason reason) {
25	3	if (invalid_reason) {
26	0	*invalid_reason = reason;
27	0	}
28	3	if (invalid_pos) {
29	0	*invalid_pos = pos;
30	0	}
31	3	}
32
33	496	UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
34	496	UnicodeType type = UnicodeType::ASCII;
35	496	char c;
36	1.92k	for (size_t i = 0; i < len; i++) {
37	1.43k	c = s[i];
38	1.43k	if (c == '\0') {
39	1	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::NULL_BYTE);
40	1	return UnicodeType::INVALID;
41	1	}
42		// 1 Byte / ASCII
43	1.43k	if ((c & 0x80) == 0) {
44	1.42k	continue;
45	1.42k	}
46	2	type = UnicodeType::UNICODE;
47	2	if ((s[++i] & 0xC0) != 0x80) {
48	2	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
49	2	return UnicodeType::INVALID;
50	2	}
51	0	if ((c & 0xE0) == 0xC0) {
52	0	continue;
53	0	}
54	0	if ((s[++i] & 0xC0) != 0x80) {
55	0	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
56	0	return UnicodeType::INVALID;
57	0	}
58	0	if ((c & 0xF0) == 0xE0) {
59	0	continue;
60	0	}
61	0	if ((s[++i] & 0xC0) != 0x80) {
62	0	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
63	0	return UnicodeType::INVALID;
64	0	}
65	0	if ((c & 0xF8) == 0xF0) {
66	0	continue;
67	0	}
68	0	AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
69	0	return UnicodeType::INVALID;
70	0	}
71
72	493	return type;
73	496	}
74
75
76	0	char* Utf8Proc::Normalize(const char *s, size_t len) {
77	0	assert(s);
78	0	assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID);
79	0	return (char*) utf8proc_NFC((const utf8proc_uint8_t*) s, len);
80	0	}
81
82	4	bool Utf8Proc::IsValid(const char *s, size_t len) {
83	4	return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID;
84	4	}
85
86	4	size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) {
87	4	return utf8proc_next_grapheme(s, len, cpos);
88	4	}
89
90	0	size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) {
91	0	if (!Utf8Proc::IsValid(s, len)) {
92	0	return cpos - 1;
93	0	}
94	0	size_t current_pos = 0;
95	0	while(true) {
96	0	size_t new_pos = NextGraphemeCluster(s, len, current_pos);
97	0	if (new_pos <= current_pos || new_pos >= cpos) {
98	0	return current_pos;
99	0	}
100	0	current_pos = new_pos;
101	0	}
102	0	}
103
104	0	bool Utf8Proc::CodepointToUtf8(int cp, int &sz, char *c) {
105	0	return utf8proc_codepoint_to_utf8(cp, sz, c);
106	0	}
107
108	0	int Utf8Proc::CodepointLength(int cp) {
109	0	return utf8proc_codepoint_length(cp);
110	0	}
111
112	0	int32_t Utf8Proc::UTF8ToCodepoint(const char *c, int &sz) {
113	0	return utf8proc_codepoint(c, sz);
114	0	}
115
116	4	size_t Utf8Proc::RenderWidth(const char *s, size_t len, size_t pos) {
117	4	int sz;
118	4	auto codepoint = duckdb::utf8proc_codepoint(s + pos, sz);
119	4	auto properties = duckdb::utf8proc_get_property(codepoint);
120	4	return properties->charwidth;
121	4	}
122
123		}

Coverage Report

Created: 2022-08-24 06:40