Coverage Report

Created: 2023-11-27 06:04

/src/simdjson/src/westmere.cpp
Line
Count
Source (jump to first uncovered line)
1
#ifndef SIMDJSON_SRC_WESTMERE_CPP
2
#define SIMDJSON_SRC_WESTMERE_CPP
3
4
#ifndef SIMDJSON_CONDITIONAL_INCLUDE
5
#include <base.h>
6
#endif // SIMDJSON_CONDITIONAL_INCLUDE
7
8
#include <simdjson/westmere.h>
9
#include <simdjson/westmere/implementation.h>
10
11
#include <simdjson/westmere/begin.h>
12
#include <generic/amalgamated.h>
13
#include <generic/stage1/amalgamated.h>
14
#include <generic/stage2/amalgamated.h>
15
16
//
17
// Stage 1
18
//
19
20
namespace simdjson {
21
namespace westmere {
22
23
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
24
  size_t capacity,
25
  size_t max_depth,
26
  std::unique_ptr<internal::dom_parser_implementation>& dst
27
11.4k
) const noexcept {
28
11.4k
  dst.reset( new (std::nothrow) dom_parser_implementation() );
29
11.4k
  if (!dst) { return MEMALLOC; }
30
11.4k
  if (auto err = dst->set_capacity(capacity))
31
0
    return err;
32
11.4k
  if (auto err = dst->set_max_depth(max_depth))
33
0
    return err;
34
11.4k
  return SUCCESS;
35
11.4k
}
36
37
namespace {
38
39
using namespace simd;
40
41
2.15M
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
42
  // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
43
  // we can't use the generic lookup_16.
44
2.15M
  auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
45
46
  // The 6 operators (:,[]{}) have these values:
47
  //
48
  // , 2C
49
  // : 3A
50
  // [ 5B
51
  // { 7B
52
  // ] 5D
53
  // } 7D
54
  //
55
  // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character are unique.
56
  // We exploit this, using a simd 4-bit lookup to tell us which character to match against, and then
57
  // match it (against | 0x20).
58
  //
59
  // To prevent recognizing other characters, everything else gets compared with 0, which cannot
60
  // match due to the | 0x20.
61
  //
62
  // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
63
  // and :. This gets caught in stage 2, which checks the actual character to ensure the right
64
  // operators are in the right places.
65
2.15M
  const auto op_table = simd8<uint8_t>::repeat_16(
66
2.15M
    0, 0, 0, 0,
67
2.15M
    0, 0, 0, 0,
68
2.15M
    0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
69
2.15M
    ',', '}', 0, 0  // , = 2C, ] = 5D, } = 7D
70
2.15M
  );
71
72
  // We compute whitespace and op separately. If the code later only uses one or the
73
  // other, given the fact that all functions are aggressively inlined, we can
74
  // hope that useless computations will be omitted. This is notably the case when
75
  // minifying (we only need whitespace).
76
77
78
2.15M
  const uint64_t whitespace = in.eq({
79
2.15M
    _mm_shuffle_epi8(whitespace_table, in.chunks[0]),
80
2.15M
    _mm_shuffle_epi8(whitespace_table, in.chunks[1]),
81
2.15M
    _mm_shuffle_epi8(whitespace_table, in.chunks[2]),
82
2.15M
    _mm_shuffle_epi8(whitespace_table, in.chunks[3])
83
2.15M
  });
84
  // Turn [ and ] into { and }
85
2.15M
  const simd8x64<uint8_t> curlified{
86
2.15M
    in.chunks[0] | 0x20,
87
2.15M
    in.chunks[1] | 0x20,
88
2.15M
    in.chunks[2] | 0x20,
89
2.15M
    in.chunks[3] | 0x20
90
2.15M
  };
91
2.15M
  const uint64_t op = curlified.eq({
92
2.15M
    _mm_shuffle_epi8(op_table, in.chunks[0]),
93
2.15M
    _mm_shuffle_epi8(op_table, in.chunks[1]),
94
2.15M
    _mm_shuffle_epi8(op_table, in.chunks[2]),
95
2.15M
    _mm_shuffle_epi8(op_table, in.chunks[3])
96
2.15M
  });
97
2.15M
    return { whitespace, op };
98
2.15M
}
99
100
2.15M
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
101
2.15M
  return input.reduce_or().is_ascii();
102
2.15M
}
103
104
0
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
105
0
  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0
106
0
  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0
107
0
  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0
108
0
  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
109
0
  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
110
0
}
111
112
399k
simdjson_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
113
399k
  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0
114
399k
  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0
115
  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
116
399k
  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
117
399k
}
118
119
} // unnamed namespace
120
} // namespace westmere
121
} // namespace simdjson
122
123
//
124
// Stage 2
125
//
126
127
//
128
// Implementation-specific overrides
129
//
130
131
namespace simdjson {
132
namespace westmere {
133
134
660
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
135
660
  return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
136
660
}
137
138
11.4k
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
139
11.4k
  this->buf = _buf;
140
11.4k
  this->len = _len;
141
11.4k
  return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
142
11.4k
}
143
144
488
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
145
488
  return westmere::stage1::generic_validate_utf8(buf,len);
146
488
}
147
148
9.87k
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
149
9.87k
  return stage2::tape_builder::parse_document<false>(*this, _doc);
150
9.87k
}
151
152
0
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
153
0
  return stage2::tape_builder::parse_document<true>(*this, _doc);
154
0
}
155
156
0
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
157
0
  return westmere::stringparsing::parse_string(src, dst, replacement_char);
158
0
}
159
160
0
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
161
0
  return westmere::stringparsing::parse_wobbly_string(src, dst);
162
0
}
163
164
11.4k
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
165
11.4k
  auto error = stage1(_buf, _len, stage1_mode::regular);
166
11.4k
  if (error) { return error; }
167
9.87k
  return stage2(_doc);
168
11.4k
}
169
170
} // namespace westmere
171
} // namespace simdjson
172
173
#include <simdjson/westmere/end.h>
174
175
#endif // SIMDJSON_SRC_WESTMERE_CPP