Coverage Report

Created: 2024-09-08 06:06

/src/simdjson/src/westmere.cpp
Line
Count
Source (jump to first uncovered line)
1
#ifndef SIMDJSON_SRC_WESTMERE_CPP
2
#define SIMDJSON_SRC_WESTMERE_CPP
3
4
#ifndef SIMDJSON_CONDITIONAL_INCLUDE
5
#include <base.h>
6
#endif // SIMDJSON_CONDITIONAL_INCLUDE
7
8
#include <simdjson/westmere.h>
9
#include <simdjson/westmere/implementation.h>
10
11
#include <simdjson/westmere/begin.h>
12
#include <generic/amalgamated.h>
13
#include <generic/stage1/amalgamated.h>
14
#include <generic/stage2/amalgamated.h>
15
16
//
17
// Stage 1
18
//
19
20
namespace simdjson {
21
namespace westmere {
22
23
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
24
  size_t capacity,
25
  size_t max_depth,
26
  std::unique_ptr<internal::dom_parser_implementation>& dst
27
10.0k
) const noexcept {
28
10.0k
  dst.reset( new (std::nothrow) dom_parser_implementation() );
29
10.0k
  if (!dst) { return MEMALLOC; }
30
10.0k
  if (auto err = dst->set_capacity(capacity))
31
0
    return err;
32
10.0k
  if (auto err = dst->set_max_depth(max_depth))
33
0
    return err;
34
10.0k
  return SUCCESS;
35
10.0k
}
36
37
namespace {
38
39
using namespace simd;
40
41
1.77M
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
42
  // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
43
  // we can't use the generic lookup_16.
44
1.77M
  auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
45
46
  // The 6 operators (:,[]{}) have these values:
47
  //
48
  // , 2C
49
  // : 3A
50
  // [ 5B
51
  // { 7B
52
  // ] 5D
53
  // } 7D
54
  //
55
  // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique.
56
  // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then
57
  // match it (against | 0x20).
58
  //
59
  // To prevent recognizing other characters, everything else gets compared with 0, which cannot
60
  // match due to the | 0x20.
61
  //
62
  // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
63
  // and :. This gets caught in stage 2, which checks the actual character to ensure the right
64
  // operators are in the right places.
65
1.77M
  const auto op_table = simd8<uint8_t>::repeat_16(
66
1.77M
    0, 0, 0, 0,
67
1.77M
    0, 0, 0, 0,
68
1.77M
    0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
69
1.77M
    ',', '}', 0, 0  // , = 2C, ] = 5D, } = 7D
70
1.77M
  );
71
72
  // We compute whitespace and op separately. If the code later only use one or the
73
  // other, given the fact that all functions are aggressively inlined, we can
74
  // hope that useless computations will be omitted. This is namely case when
75
  // minifying (we only need whitespace).
76
77
78
1.77M
  const uint64_t whitespace = in.eq({
79
1.77M
    _mm_shuffle_epi8(whitespace_table, in.chunks[0]),
80
1.77M
    _mm_shuffle_epi8(whitespace_table, in.chunks[1]),
81
1.77M
    _mm_shuffle_epi8(whitespace_table, in.chunks[2]),
82
1.77M
    _mm_shuffle_epi8(whitespace_table, in.chunks[3])
83
1.77M
  });
84
  // Turn [ and ] into { and }
85
1.77M
  const simd8x64<uint8_t> curlified{
86
1.77M
    in.chunks[0] | 0x20,
87
1.77M
    in.chunks[1] | 0x20,
88
1.77M
    in.chunks[2] | 0x20,
89
1.77M
    in.chunks[3] | 0x20
90
1.77M
  };
91
1.77M
  const uint64_t op = curlified.eq({
92
1.77M
    _mm_shuffle_epi8(op_table, in.chunks[0]),
93
1.77M
    _mm_shuffle_epi8(op_table, in.chunks[1]),
94
1.77M
    _mm_shuffle_epi8(op_table, in.chunks[2]),
95
1.77M
    _mm_shuffle_epi8(op_table, in.chunks[3])
96
1.77M
  });
97
1.77M
    return { whitespace, op };
98
1.77M
}
99
100
1.67M
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
101
1.67M
  return input.reduce_or().is_ascii();
102
1.67M
}
103
104
0
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
105
0
  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0
106
0
  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0
107
0
  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0
108
0
  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
109
0
  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
110
0
}
111
112
424k
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
113
424k
  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80
114
424k
  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80
115
424k
  return is_third_byte | is_fourth_byte;
116
424k
}
117
118
} // unnamed namespace
119
} // namespace westmere
120
} // namespace simdjson
121
122
//
123
// Stage 2
124
//
125
126
//
127
// Implementation-specific overrides
128
//
129
130
namespace simdjson {
131
namespace westmere {
132
133
683
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
134
683
  return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
135
683
}
136
137
10.0k
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
138
10.0k
  this->buf = _buf;
139
10.0k
  this->len = _len;
140
10.0k
  return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
141
10.0k
}
142
143
460
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
144
460
  return westmere::stage1::generic_validate_utf8(buf,len);
145
460
}
146
147
8.58k
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
148
8.58k
  return stage2::tape_builder::parse_document<false>(*this, _doc);
149
8.58k
}
150
151
0
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
152
0
  return stage2::tape_builder::parse_document<true>(*this, _doc);
153
0
}
154
155
0
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
156
0
  return westmere::stringparsing::parse_string(src, dst, replacement_char);
157
0
}
158
159
0
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
160
0
  return westmere::stringparsing::parse_wobbly_string(src, dst);
161
0
}
162
163
10.0k
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
164
10.0k
  auto error = stage1(_buf, _len, stage1_mode::regular);
165
10.0k
  if (error) { return error; }
166
8.58k
  return stage2(_doc);
167
10.0k
}
168
169
} // namespace westmere
170
} // namespace simdjson
171
172
#include <simdjson/westmere/end.h>
173
174
#endif // SIMDJSON_SRC_WESTMERE_CPP