/src/rocksdb/table/plain/plain_table_factory.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. |
2 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
3 | | // Use of this source code is governed by a BSD-style license that can be |
4 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
5 | | |
6 | | #pragma once |
7 | | |
8 | | #include <stdint.h> |
9 | | |
10 | | #include <memory> |
11 | | #include <string> |
12 | | |
13 | | #include "rocksdb/table.h" |
14 | | |
15 | | namespace ROCKSDB_NAMESPACE { |
16 | | |
17 | | struct EnvOptions; |
18 | | |
19 | | class Status; |
20 | | class RandomAccessFile; |
21 | | class WritableFile; |
22 | | class Table; |
23 | | class TableBuilder; |
24 | | |
25 | | // PlainTableFactory is the entry point to the PlainTable format of |
26 | | // SST files. It returns instances of PlainTableBuilder as the builder |
27 | | // class and PlainTableReader as the reader class, where the format is |
28 | | // actually implemented. |
29 | | // |
30 | | // The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. |
31 | | // Data is not organized in blocks, which allows fast access. Because of |
32 | | // the following downsides |
33 | | // 1. Data compression is not supported. |
34 | | // 2. Data is not checksummed. |
35 | | // it is not recommended to use this format on other types of file systems. |
36 | | // |
37 | | // PlainTable requires fixed length key, configured as a constructor |
38 | | // parameter of the factory class. Output file format: |
39 | | // +-------------+-----------------+ |
40 | | // | version | user_key_length | |
41 | | // +------------++------------+-----------------+ <= key1 offset |
42 | | // | encoded key1 | value_size | | |
43 | | // +------------+-------------+-------------+ | |
44 | | // | value1 | |
45 | | // | | |
46 | | // +--------------------------+-------------+---+ <= key2 offset |
47 | | // | encoded key2 | value_size | | |
48 | | // +------------+-------------+-------------+ | |
49 | | // | value2 | |
50 | | // | | |
51 | | // | ...... | |
52 | | // +-----------------+--------------------------+ |
53 | | // |
54 | | // When the key encoding type is kPlain, the key part is encoded as: |
55 | | // +------------+--------------------+ |
56 | | // | [key_size] | internal key | |
57 | | // +------------+--------------------+ |
58 | | // for the user_key_len = kPlainTableVariableLength case, |
59 | | // and simply: |
60 | | // +----------------------+ |
61 | | // | internal key | |
62 | | // +----------------------+ |
63 | | // for user_key_len != kPlainTableVariableLength case. |
64 | | // |
65 | | // If the key encoding type is kPrefix, keys are encoded in this format. |
66 | | // There are three ways to encode a key: |
67 | | // (1) Full Key |
68 | | // +---------------+---------------+-------------------+ |
69 | | // | Full Key Flag | Full Key Size | Full Internal Key | |
70 | | // +---------------+---------------+-------------------+ |
71 | | // which simply encodes a full key |
72 | | // |
73 | | // (2) A key sharing the same prefix as the previous key, where the previous |
74 | | // key is encoded in the format of (1). |
75 | | // +-------------+-------------+-------------+-------------+------------+ |
76 | | // | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix | |
77 | | // +-------------+-------------+-------------+-------------+------------+ |
78 | | // where Key Suffix is the suffix part of the key, including the internal bytes. |
79 | | // The actual key is constructed by concatenating the prefix part of the |
80 | | // previous key with the suffix part of the key here, using the sizes given here. |
81 | | // |
82 | | // (3) A key sharing the same prefix as the previous key, where the previous |
83 | | // key is encoded in the format of (2). |
84 | | // +-----------------+-----------------+------------------------+ |
85 | | // | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key | |
86 | | // +-----------------+-----------------+------------------------+ |
87 | | // The key will be constructed by concatenating the previous key's prefix (which is |
88 | | // also the prefix of the last key encoded in the format of (1)) and the |
89 | | // key suffix given here. |
90 | | // |
91 | | // For example, the following keys (prefix and suffix are separated by |
92 | | // spaces): |
93 | | // 0000 0001 |
94 | | // 0000 00021 |
95 | | // 0000 0002 |
96 | | // 00011 00 |
97 | | // 0002 0001 |
98 | | // Will be encoded like this: |
99 | | // FK 8 00000001 |
100 | | // PF 4 SF 5 00021 |
101 | | // SF 4 0002 |
102 | | // FK 7 0001100 |
103 | | // FK 8 00020001 |
104 | | // (where FK means full key flag, PF means prefix flag and SF means suffix flag) |
105 | | // |
106 | | // All those "key flag + key size" shown above are in this format: |
107 | | // The 8 bits of the first byte: |
108 | | // +----+----+----+----+----+----+----+----+ |
109 | | // | Type | Size | |
110 | | // +----+----+----+----+----+----+----+----+ |
111 | | // Type indicates: full key, prefix, or suffix. |
112 | | // The last 6 bits are for size. If the size bits are not all 1, it means the |
113 | | // size of the key. Otherwise, varint32 is read after this byte. This varint |
114 | | // value + 0x3F (the value of all 1) will be the key size. |
115 | | // |
116 | | // For example, full key with length 16 will be encoded as (binary): |
117 | | // 00 010000 |
118 | | // (00 means full key) |
119 | | // and a prefix with 100 bytes will be encoded as: |
120 | | // 01 111111 00100101 |
121 | | // (63) (37) |
122 | | // (01 means prefix) |
123 | | // |
124 | | // All the internal keys above (including kPlain and kPrefix) are encoded in |
125 | | // this format: |
126 | | // There are two types: |
127 | | // (1) normal internal key format |
128 | | // +----------- ...... -------------+----+---+---+---+---+---+---+---+ |
129 | | // | user key |type| sequence ID | |
130 | | // +----------- ..... --------------+----+---+---+---+---+---+---+---+ |
131 | | // (2) Special case for keys whose sequence ID is 0 and whose type is value |
132 | | // +----------- ...... -------------+----+ |
133 | | // | user key |0x80| |
134 | | // +----------- ..... --------------+----+ |
135 | | // To save 7 bytes for the special case where sequence ID = 0. |
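136 | | // NOTE(review): the diagram above shows 0x80, but kValueTypeSeqId0 in the class below is char(~0) (all bits set) - confirm which value is current. |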
137 | | // |
138 | | class PlainTableFactory : public TableFactory { |
139 | | public: |
140 | 0 | ~PlainTableFactory() {} |
141 | | // user_key_len is the length of the user key. If it is set to |
142 | | // kPlainTableVariableLength, keys have variable length. Otherwise, all |
143 | | // keys must have the fixed length given by this value. bloom_bits_per_key is |
144 | | // the number of bits used for the bloom filter per key. hash_table_ratio is |
145 | | // the desired utilization of the hash table used for prefix hashing: |
146 | | // hash_table_ratio = number of prefixes / #buckets in the hash table. |
147 | | // hash_table_ratio = 0 means skip the hash table and rely only on binary |
148 | | // search. |
149 | | // index_sparseness determines the index interval for keys |
150 | | // inside the same prefix. It is the maximum number of linear search steps |
151 | | // required after hash and binary search. |
152 | | // index_sparseness = 0 means an index entry for every key. |
153 | | // huge_page_tlb_size determines whether to allocate hash indexes from huge |
154 | | // page TLB and the page size if so; see Arena::AllocateAligned() comments. |
155 | | // NOTE(review): presumably these are all PlainTableOptions fields - confirm. |
156 | | explicit PlainTableFactory( |
157 | | const PlainTableOptions& _table_options = PlainTableOptions()); |
158 | | |
159 | | // Class name for CheckedCast; returns the same kPlainTableName() as Name(). |
160 | 0 | static const char* kClassName() { return kPlainTableName(); } |
161 | 0 | const char* Name() const override { return kPlainTableName(); } |
162 | | using TableFactory::NewTableReader; |
163 | | Status NewTableReader(const ReadOptions& ro, |
164 | | const TableReaderOptions& table_reader_options, |
165 | | std::unique_ptr<RandomAccessFileReader>&& file, |
166 | | uint64_t file_size, std::unique_ptr<TableReader>* table, |
167 | | bool prefetch_index_and_filter_in_cache) const override; |
168 | | |
169 | | TableBuilder* NewTableBuilder( |
170 | | const TableBuilderOptions& table_builder_options, |
171 | | WritableFileWriter* file) const override; |
172 | | |
173 | | std::string GetPrintableOptions() const override; |
174 | | static const char kValueTypeSeqId0 = char(~0); |
175 | | |
176 | 0 | std::unique_ptr<TableFactory> Clone() const override { |
177 | 0 | return std::make_unique<PlainTableFactory>(*this); |
178 | 0 | } |
179 | | |
180 | | private: |
181 | | PlainTableOptions table_options_; |
182 | | }; |
183 | | |
184 | | } // namespace ROCKSDB_NAMESPACE |