Coverage Report

Created: 2022-08-24 06:40

/src/duckdb/third_party/utf8proc/utf8proc_wrapper.cpp
Line
Count
Source (jump to first uncovered line)
1
#include "utf8proc_wrapper.hpp"
2
#include "utf8proc.hpp"
3
4
using namespace std;
5
6
namespace duckdb {
7
8
// This function efficiently checks if a string is valid UTF8.
9
// It was originally written by Sjoerd Mullender.
10
11
// Here is the table that makes it work:
12
13
// B    = Number of Bytes in UTF8 encoding
14
// C_MIN  = First Unicode code point
15
// C_MAX  = Last Unicode code point
16
// B1     = First Byte Prefix
17
18
//  B C_MIN   C_MAX   B1
19
//  1 U+000000  U+00007F    0xxxxxxx
20
//  2 U+000080  U+0007FF    110xxxxx
21
//  3 U+000800  U+00FFFF    1110xxxx
22
//  4 U+010000  U+10FFFF    11110xxx
23
24
3
// Records why and where UTF-8 validation failed through the caller-supplied
// out-parameters. Either pointer may be null, in which case that piece of
// information is simply not written.
//
//   invalid_reason  optional destination for `reason`
//   invalid_pos     optional destination for `pos`
//   pos             byte offset to report
//   reason          failure category to report
static void AssignInvalidUTF8Reason(UnicodeInvalidReason *invalid_reason, size_t *invalid_pos, size_t pos, UnicodeInvalidReason reason) {
  if (invalid_pos != nullptr) {
    *invalid_pos = pos;
  }
  if (invalid_reason != nullptr) {
    *invalid_reason = reason;
  }
}
32
33
496
// Scans the first `len` bytes of `s` and classifies them.
//
// Returns UnicodeType::ASCII when every byte has its high bit clear,
// UnicodeType::UNICODE when at least one multi-byte sequence was seen and all
// sequences are structurally well-formed, and UnicodeType::INVALID otherwise.
//
// On INVALID, the optional out-parameters `invalid_reason` / `invalid_pos`
// (each may be null) receive the failure category and a byte offset:
//   - NULL_BYTE:     offset of the embedded '\0'.
//   - BYTE_MISMATCH: offset of the byte that is not a valid continuation byte,
//                    or of the lead byte when the sequence is cut off by the
//                    end of the buffer.
//
// NOTE: only the prefix bits of each byte are checked. Overlong encodings,
// surrogate code points and values above U+10FFFF are not rejected here.
UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
  UnicodeType type = UnicodeType::ASCII;
  for (size_t i = 0; i < len; i++) {
    const char c = s[i];
    if (c == '\0') {
      AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::NULL_BYTE);
      return UnicodeType::INVALID;
    }
    // 1 Byte / ASCII
    if ((c & 0x80) == 0) {
      continue;
    }
    type = UnicodeType::UNICODE;
    const size_t lead_pos = i;

    // Advances `i` to the next byte and verifies that it is a continuation
    // byte (10xxxxxx). The bounds check prevents reading past `s + len` when
    // a multi-byte sequence is truncated by the end of the input.
    auto consume_continuation = [&]() -> bool {
      if (i + 1 >= len) {
        AssignInvalidUTF8Reason(invalid_reason, invalid_pos, lead_pos, UnicodeInvalidReason::BYTE_MISMATCH);
        return false;
      }
      if ((s[++i] & 0xC0) != 0x80) {
        AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
        return false;
      }
      return true;
    };

    // 2 Bytes: 110xxxxx 10xxxxxx
    if (!consume_continuation()) {
      return UnicodeType::INVALID;
    }
    if ((c & 0xE0) == 0xC0) {
      continue;
    }
    // 3 Bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if (!consume_continuation()) {
      return UnicodeType::INVALID;
    }
    if ((c & 0xF0) == 0xE0) {
      continue;
    }
    // 4 Bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (!consume_continuation()) {
      return UnicodeType::INVALID;
    }
    if ((c & 0xF8) == 0xF0) {
      continue;
    }
    // The lead byte matches none of the valid prefixes (e.g. a stray
    // continuation byte or 11111xxx).
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
    return UnicodeType::INVALID;
  }

  return type;
}
74
75
76
0
char* Utf8Proc::Normalize(const char *s, size_t len) {
77
0
  assert(s);
78
0
  assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID);
79
0
  return (char*) utf8proc_NFC((const utf8proc_uint8_t*) s, len);
80
0
}
81
82
4
bool Utf8Proc::IsValid(const char *s, size_t len) {
83
4
  return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID;
84
4
}
85
86
4
size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) {
87
4
  return utf8proc_next_grapheme(s, len, cpos);
88
4
}
89
90
0
size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) {
91
0
  if (!Utf8Proc::IsValid(s, len)) {
92
0
    return cpos - 1;
93
0
  }
94
0
  size_t current_pos = 0;
95
0
  while(true) {
96
0
    size_t new_pos = NextGraphemeCluster(s, len, current_pos);
97
0
    if (new_pos <= current_pos || new_pos >= cpos) {
98
0
      return current_pos;
99
0
    }
100
0
    current_pos = new_pos;
101
0
  }
102
0
}
103
104
0
bool Utf8Proc::CodepointToUtf8(int cp, int &sz, char *c) {
105
0
  return utf8proc_codepoint_to_utf8(cp, sz, c);
106
0
}
107
108
0
int Utf8Proc::CodepointLength(int cp) {
109
0
  return utf8proc_codepoint_length(cp);
110
0
}
111
112
0
int32_t Utf8Proc::UTF8ToCodepoint(const char *c, int &sz) {
113
0
  return utf8proc_codepoint(c, sz);
114
0
}
115
116
4
size_t Utf8Proc::RenderWidth(const char *s, size_t len, size_t pos) {
117
4
    int sz;
118
4
    auto codepoint = duckdb::utf8proc_codepoint(s + pos, sz);
119
4
    auto properties = duckdb::utf8proc_get_property(codepoint);
120
4
    return properties->charwidth;
121
4
}
122
123
}