/src/xz/src/liblzma/simple/riscv.c
Line | Count | Source |
1 | | // SPDX-License-Identifier: 0BSD |
2 | | |
3 | | /////////////////////////////////////////////////////////////////////////////// |
4 | | // |
5 | | /// \file riscv.c |
6 | | /// \brief Filter for 32-bit/64-bit little/big endian RISC-V binaries |
7 | | /// |
8 | | /// This converts program counter relative addresses in function calls |
9 | | /// (JAL, AUIPC+JALR), address calculation of functions and global |
10 | | /// variables (AUIPC+ADDI), loads (AUIPC+load), and stores (AUIPC+store). |
11 | | /// |
12 | | /// For AUIPC+inst2 pairs, the paired instruction checking is fairly relaxed. |
13 | | /// The paired instruction opcode must only have its lowest two bits set, |
14 | | /// meaning it will convert any paired instruction that is not a 16-bit |
15 | | /// compressed instruction. This was shown to be enough to keep the number |
16 | | /// of false matches low while improving code size and speed. |
17 | | // |
18 | | // Authors: Lasse Collin |
19 | | // Jia Tan |
20 | | // |
21 | | // Special thanks: |
22 | | // |
23 | | // - Chien Wong <m@xv97.com> provided a few early versions of RISC-V |
24 | | // filter variants along with test files and benchmark results. |
25 | | // |
26 | | // - Igor Pavlov helped a lot in the filter design, getting it both |
27 | | // faster and smaller. The implementation here is still independently |
28 | | // written, not based on LZMA SDK. |
29 | | // |
30 | | /////////////////////////////////////////////////////////////////////////////// |
31 | | |
32 | | /* |
33 | | |
34 | | RISC-V filtering |
35 | | ================ |
36 | | |
37 | | RV32I and RV64I, possibly combined with extensions C, Zfh, F, D, |
38 | | and Q, are identical enough that the same filter works for both. |
39 | | |
40 | | The instruction encoding is always little endian, even on systems |
41 | | with big endian data access. Thus the same filter works for both |
42 | | endiannesses. |
43 | | |
44 | | The following instructions have program counter relative |
45 | | (pc-relative) behavior: |
46 | | |
47 | | JAL |
48 | | --- |
49 | | |
50 | | JAL is used for function calls (including tail calls) and |
51 | | unconditional jumps within functions. Jumps within functions |
52 | | aren't useful to filter because the absolute addresses often |
53 | | appear only once or at most a few times. Tail calls and jumps |
54 | | within functions look the same to a simple filter so neither |
55 | | are filtered, that is, JAL x0 is ignored (the ABI name of the |
56 | | register x0 is "zero"). |
57 | | |
58 | | Almost all calls store the return address to register x1 (ra) |
59 | | or x5 (t0). To reduce false matches when the filter is applied |
60 | | to non-code data, only the JAL instructions that use x1 or x5 |
61 | | are converted. JAL has pc-relative range of +/-1 MiB so longer |
62 | | calls and jumps need another method (AUIPC+JALR). |
63 | | |
64 | | C.J and C.JAL |
65 | | ------------- |
66 | | |
67 | | C.J and C.JAL have pc-relative range of +/-2 KiB. |
68 | | |
69 | | C.J is for tail calls and jumps within functions and isn't |
70 | | filtered for the reasons mentioned for JAL x0. |
71 | | |
72 | | C.JAL is an RV32C-only instruction. Its encoding overlaps with |
73 | | RV64C-only C.ADDIW which is a common instruction. So if filtering |
74 | | C.JAL was useful (it wasn't tested) then a separate filter would |
75 | | be needed for RV32 and RV64. Also, false positives would be a |
76 | | significant problem when the filter is applied to non-code data |
77 | | because C.JAL needs only five bits to match. Thus, this filter |
78 | | doesn't modify C.JAL instructions. |
79 | | |
80 | | BEQ, BNE, BLT, BGE, BLTU, BGEU, C.BEQZ, and C.BNEZ |
81 | | -------------------------------------------------- |
82 | | |
83 | | These are conditional branches with pc-relative range |
84 | | of +/-4 KiB (+/-256 B for C.*). The absolute addresses often |
85 | | appear only once and very short distances are the most common, |
86 | | so filtering these instructions would make compression worse. |
87 | | |
88 | | AUIPC with rd != x0 |
89 | | ------------------- |
90 | | |
91 | | AUIPC is paired with a second instruction (inst2) to do |
92 | | pc-relative jumps, calls, loads, stores, and for taking |
93 | | an address of a symbol. AUIPC has a 20-bit immediate and |
94 | | the possible inst2 choices have a 12-bit immediate. |
95 | | |
96 | | AUIPC stores pc + 20-bit signed immediate to a register. |
97 | | The immediate encodes a multiple of 4 KiB so AUIPC itself |
98 | | has a pc-relative range of +/-2 GiB. AUIPC does *NOT* set |
99 | | the lowest 12 bits of the result to zero! This means that |
100 | | the 12-bit immediate in inst2 cannot just include the lowest |
101 | | 12 bits of the absolute address as is; the immediate has to |
102 | | compensate for the lowest 12 bits that AUIPC copies from the |
103 | | program counter. This means that a good filter has to convert |
104 | | not only AUIPC but also the paired inst2. |
105 | | |
106 | | A strict filter would focus on filtering the following |
107 | | AUIPC+inst2 pairs: |
108 | | |
109 | | - AUIPC+JALR: Function calls, including tail calls. |
110 | | |
111 | | - AUIPC+ADDI: Calculating the address of a function |
112 | | or a global variable. |
113 | | |
114 | | - AUIPC+load/store from the base instruction sets |
115 | | (RV32I, RV64I) or from the floating point extensions |
116 | | Zfh, F, D, and Q: |
117 | | * RV32I: LB, LH, LW, LBU, LHU, SB, SH, SW |
118 | | * RV64I has also: LD, LWU, SD |
119 | | * Zfh: FLH, FSH |
120 | | * F: FLW, FSW |
121 | | * D: FLD, FSD |
122 | | * Q: FLQ, FSQ |
123 | | |
124 | | NOTE: AUIPC+inst2 can only be a pair if AUIPC's rd specifies |
125 | | the same register as inst2's rs1. |
126 | | |
127 | | Instead of strictly accepting only the above instructions as inst2, |
128 | | this filter uses a much simpler condition: the lowest two bits of |
129 | | inst2 must be set, that is, inst2 must not be a 16-bit compressed |
130 | | instruction. So this will accept all 32-bit and possible future |
131 | | extended instructions as a pair to AUIPC if the bits in AUIPC's |
132 | | rd [11:7] match the bits [19:15] in inst2 (the bits that I-type and |
133 | | S-type instructions use for rs1). Testing showed that this relaxed |
134 | | condition for inst2 did not consistently or significantly affect |
135 | | compression ratio but it reduced code size and improved speed. |
136 | | |
137 | | Additionally, the paired instruction is always treated as an I-type |
138 | | instruction. The S-type instructions used by stores (SB, SH, SW, |
139 | | etc.) place the lowest 5 bits of the immediate in a different |
140 | | location than I-type instructions. AUIPC+store pairs are less |
141 | | common than other pairs, and testing showed that the extra |
142 | | code required to handle S-type instructions was not worth the |
143 | | compression ratio gained. |
144 | | |
145 | | AUIPC+inst2 don't necessarily appear sequentially next to each |
146 | | other although very often they do. Especially AUIPC+JALR are |
147 | | sequential as that may allow instruction fusion in processors |
148 | | (and perhaps help branch prediction as a fused AUIPC+JALR is |
149 | | a direct branch while JALR alone is an indirect branch). |
150 | | |
151 | | Clang 16 can generate code where AUIPC+inst2 is split: |
152 | | |
153 | | - AUIPC is outside a loop and inst2 (load/store) is inside |
154 | | the loop. This way the AUIPC instruction needs to be |
155 | | executed only once. |
156 | | |
157 | | - Load-modify-store may have AUIPC for the load and the same |
158 | | AUIPC-result is used for the store too. This may get combined |
159 | | with AUIPC being outside the loop. |
160 | | |
161 | | - AUIPC is before a conditional branch and inst2 is hundreds |
162 | | of bytes away at the branch target. |
163 | | |
164 | | - Inner and outer pair: |
165 | | |
166 | | auipc a1,0x2f |
167 | | auipc a2,0x3d |
168 | | ld a2,-500(a2) |
169 | | addi a1,a1,-233 |
170 | | |
171 | | - Many split pairs with an untaken conditional branch between: |
172 | | |
173 | | auipc s9,0x1613 # Pair 1 |
174 | | auipc s4,0x1613 # Pair 2 |
175 | | auipc s6,0x1613 # Pair 3 |
176 | | auipc s10,0x1613 # Pair 4 |
177 | | beqz a5,a3baae |
178 | | ld a0,0(a6) |
179 | | ld a6,246(s9) # Pair 1 |
180 | | ld a1,250(s4) # Pair 2 |
181 | | ld a3,254(s6) # Pair 3 |
182 | | ld a4,258(s10) # Pair 4 |
183 | | |
184 | | It's not possible to find all split pairs in a filter like this. |
185 | | At least in 2024, simple sequential pairs are 99 % of AUIPC uses |
186 | | so filtering only such pairs gives good results and makes the |
187 | | filter simpler. However, it's possible that future compilers will |
188 | | produce different code where sequential pairs aren't as common. |
189 | | |
190 | | This filter doesn't convert AUIPC instructions alone because: |
191 | | |
192 | | (1) The conversion would be off-by-one (or off-by-4096) half the |
193 | | time because the lowest 12 bits from inst2 (inst2_imm12) |
194 | | aren't known. We only know that the absolute address is |
195 | | pc + AUIPC_imm20 + [-2048, +2047] but there is no way to |
196 | | know the exact 4096-byte multiple (or 4096 * n + 2048): |
197 | | there are always two possibilities because AUIPC copies |
198 | | the 12 lowest bits from pc instead of zeroing them. |
199 | | |
200 | | NOTE: The sign-extension of inst2_imm12 adds a tiny bit |
201 | | of extra complexity to AUIPC math in general but it's not |
202 | | the reason for this problem. The sign-extension only changes |
203 | | the relative position of the pc-relative 4096-byte window. |
204 | | |
205 | | (2) Matching AUIPC instruction alone requires only seven bits. |
206 | | When the filter is applied to non-code data, that leads |
207 | | to many false positives which make compression worse. |
208 | | As long as most AUIPC+inst2 pairs appear as two consecutive |
209 | | instructions, converting only such pairs gives better results. |
210 | | |
211 | | In assembly, AUIPC+inst2 tend to look like this: |
212 | | |
213 | | # Call: |
214 | | auipc ra, 0x12345 |
215 | | jalr ra, -42(ra) |
216 | | |
217 | | # Tail call: |
218 | | auipc t1, 0x12345 |
219 | | jalr zero, -42(t1) |
220 | | |
221 | | # Getting the absolute address: |
222 | | auipc a0, 0x12345 |
223 | | addi a0, a0, -42 |
224 | | |
225 | | # rd of inst2 isn't necessarily the same as rs1 even |
226 | | # in cases where there is no reason to preserve rs1. |
227 | | auipc a0, 0x12345 |
228 | | addi a1, a0, -42 |
229 | | |
230 | | As of 2024, 16-bit instructions from the C extension don't |
231 | | appear as inst2. The RISC-V psABI doesn't list AUIPC+C.* as |
232 | | a linker relaxation type explicitly but it's not disallowed |
233 | | either. Usefulness is limited as most of the time the lowest |
234 | | 12 bits won't fit in a C instruction. This filter doesn't |
235 | | support AUIPC+C.* combinations because this makes the filter |
236 | | simpler, there are no test files, and it hopefully will never |
237 | | be needed anyway. |
238 | | |
239 | | (Compare AUIPC to ARM64 where ADRP does set the lowest 12 bits |
240 | | to zero. The paired instruction has the lowest 12 bits of the |
241 | | absolute address as is in a zero-extended immediate. Thus the |
242 | | ARM64 filter doesn't need to care about the instructions that |
243 | | are paired with ADRP. An off-by-4096 issue can still occur if |
244 | | the code section isn't aligned with the filter's start offset. |
245 | | It's not a problem with standalone ELF files but Windows PE |
246 | | files need start_offset=3072 for best results. Also, a .tar |
247 | | stores files with 512-byte alignment so most of the time it |
248 | | won't be the best for ARM64.) |
249 | | |
250 | | AUIPC with rd == x0 |
251 | | ------------------- |
252 | | |
253 | | AUIPC instructions with rd=x0 are reserved for HINTs in the base |
254 | | instruction set. Such AUIPC instructions are never filtered. |
255 | | |
256 | | As of January 2024, it seems likely that AUIPC with rd=x0 will |
257 | | be used for landing pads (pseudoinstruction LPAD). LPAD is used |
258 | | to mark valid targets for indirect jumps (for JALR), for example, |
259 | | beginnings of functions. The 20-bit immediate in LPAD instruction |
260 | | is a label, not a pc-relative address. Thus it would be |
261 | | counterproductive to convert AUIPC instructions with rd=x0. |
262 | | |
263 | | Often the next instruction after LPAD won't have rs1=x0 and thus |
264 | | the filtering would be skipped for that reason alone. However, |
265 | | it's not good to rely on this. For example, consider a function |
266 | | that begins like this: |
267 | | |
268 | | int foo(int i) |
269 | | { |
270 | | if (i <= 234) { |
271 | | ... |
272 | | } |
273 | | |
274 | | A compiler may generate something like this: |
275 | | |
276 | | lpad 0x54321 |
277 | | li a5, 234 |
278 | | bgt a0, a5, .L2 |
279 | | |
280 | | Converting the pseudoinstructions to raw instructions: |
281 | | |
282 | | auipc x0, 0x54321 |
283 | | addi x15, x0, 234 |
284 | | blt x15, x10, .L2 |
285 | | |
286 | | In this case the filter would undesirably convert the AUIPC+ADDI |
287 | | pair if the filter didn't explicitly skip AUIPC instructions |
288 | | that have rd=x0. |
289 | | |
290 | | */ |
291 | | |
292 | | |
293 | | #include "simple_private.h" |
294 | | |
295 | | |
296 | | // This checks two conditions at once: |
297 | | // - AUIPC rd == inst2 rs1. |
298 | | // - inst2 opcode has the lowest two bits set. |
299 | | // |
300 | | // The 8 bit left shift aligns the rd of AUIPC with the rs1 of inst2. |
301 | | // By XORing the registers, any non-zero value in those bits indicates the |
302 | | // registers are not equal and thus not an AUIPC pair. Subtracting 3 from |
303 | | // inst2 will zero out the first two opcode bits only when they are set. |
304 | | // The mask tests if any of the register or opcode bits are set (and thus |
305 | | // not an AUIPC pair). |
306 | | // |
307 | | // Alternative expression: (((((auipc) << 8) ^ (inst2)) & 0xF8003) != 3) |
308 | | #define NOT_AUIPC_PAIR(auipc, inst2) \ |
309 | 0 | ((((auipc) << 8) ^ ((inst2) - 3)) & 0xF8003) |
310 | | |
311 | | // This macro checks multiple conditions: |
312 | | // (1) AUIPC rd [11:7] == x2 (special rd value). |
313 | | // (2) AUIPC bits 12 and 13 set (the lowest two opcode bits of packed inst2). |
314 | | // (3) inst2_rs1 doesn't equal x0 or x2 because the opposite |
315 | | // conversion is only done when |
316 | | // auipc_rd != x0 && |
317 | | // auipc_rd != x2 && |
318 | | // auipc_rd == inst2_rs1. |
319 | | // |
320 | | // The left-hand side takes care of (1) and (2). |
321 | | // (a) The lowest 7 bits are already known to be AUIPC so subtracting 0x17 |
322 | | // makes those bits zeros. |
323 | | // (b) If AUIPC rd equals x2, subtracting 0x100 makes bits [11:7] zeros. |
324 | | // If rd doesn't equal x2, then there will be at least one non-zero bit |
325 | | // and the next step (c) is irrelevant. |
326 | | // (c) If the lowest two opcode bits of the packed inst2 are set in [13:12], |
327 | | // then subtracting 0x3000 will make those bits zeros. Otherwise there |
328 | | // will be at least one non-zero bit. |
329 | | // |
330 | | // The shift by 18 removes the high bits from the final '>=' comparison and |
331 | | // ensures that any non-zero result will be larger than any possible result |
332 | | // from the right-hand side of the comparison. The cast ensures that the |
333 | | // left-hand side didn't get promoted to a larger type than uint32_t. |
334 | | // |
335 | | // On the right-hand side, inst2_rs1 & 0x1D will be non-zero as long as |
336 | | // inst2_rs1 is not x0 or x2. |
337 | | // |
338 | | // The final '>=' comparison will make the expression true if: |
339 | | // - The subtraction caused any bits to be set (special AUIPC rd value not |
340 | | // used or inst2 opcode bits not set). (non-zero >= non-zero or 0) |
341 | | // - The subtraction did not cause any bits to be set but inst2_rs1 was |
342 | | // x0 or x2. (0 >= 0) |
343 | | #define NOT_SPECIAL_AUIPC(auipc, inst2_rs1) \ |
344 | 0 | ((uint32_t)(((auipc) - 0x3117) << 18) >= ((inst2_rs1) & 0x1D)) |
345 | | |
346 | | |
347 | | // The encode and decode functions are split for this filter because of the |
348 | | // AUIPC+inst2 filtering. This filter design allows a decoder-only |
349 | | // implementation to be smaller than alternative designs. |
350 | | |
351 | | #ifdef HAVE_ENCODER_RISCV |
352 | | static size_t |
353 | | riscv_encode(void *simple lzma_attribute((__unused__)), |
354 | | uint32_t now_pos, |
355 | | bool is_encoder lzma_attribute((__unused__)), |
356 | | uint8_t *buffer, size_t size) |
357 | 0 | { |
358 | | // Avoid using i + 8 <= size in the loop condition. |
359 | | // |
360 | | // NOTE: If there is a JAL in the last six bytes of the stream, it |
361 | | // won't be converted. This is intentional to keep the code simpler. |
362 | 0 | if (size < 8) |
363 | 0 | return 0; |
364 | | |
365 | 0 | size -= 8; |
366 | |
|
367 | 0 | size_t i; |
368 | | |
369 | | // The loop is advanced by 2 bytes every iteration since the |
370 | | // instruction stream may include 16-bit instructions (C extension). |
371 | 0 | for (i = 0; i <= size; i += 2) { |
372 | 0 | uint32_t inst = buffer[i]; |
373 | |
|
374 | 0 | if (inst == 0xEF) { |
375 | | // JAL |
376 | 0 | const uint32_t b1 = buffer[i + 1]; |
377 | | |
378 | | // Only filter rd=x1(ra) and rd=x5(t0). |
379 | 0 | if ((b1 & 0x0D) != 0) |
380 | 0 | continue; |
381 | | |
382 | | // The 20-bit immediate is in four pieces. |
383 | | // The encoder stores it in big endian form |
384 | | // since it improves compression slightly. |
385 | 0 | const uint32_t b2 = buffer[i + 2]; |
386 | 0 | const uint32_t b3 = buffer[i + 3]; |
387 | 0 | const uint32_t pc = now_pos + (uint32_t)i; |
388 | | |
389 | | // The following chart shows the highest three bytes of JAL, focusing on |
390 | | // the 20-bit immediate field [31:12]. The first row of numbers is the |
391 | | // bit position in a 32-bit little endian instruction. The second row of |
392 | | // numbers shows the order of the immediate field in a J-type instruction. |
393 | | // The last row is the bit number in each byte. |
394 | | // |
395 | | // To determine the amount to shift each bit, subtract the value in |
396 | | // the last row from the value in the second last row. If the number |
397 | | // is positive, shift left. If negative, shift right. |
398 | | // |
399 | | // For example, at the rightmost side of the chart, the bit 4 in b1 is |
400 | | // the bit 12 of the address. Thus that bit needs to be shifted left |
401 | | // by 12 - 4 = 8 bits to put it in the right place in the addr variable. |
402 | | // |
403 | | // NOTE: The immediate of a J-type instruction holds bits [20:1] of |
404 | | // the address. The bit [0] is always 0 and not part of the immediate. |
405 | | // |
406 | | // | b3 | b2 | b1 | |
407 | | // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x | |
408 | | // | 20 10 9 8 7 6 5 4 | 3 2 1 11 19 18 17 16 | 15 14 13 12 x x x x | |
409 | | // | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 4 x x x x | |
410 | |
|
411 | 0 | uint32_t addr = ((b1 & 0xF0) << 8) |
412 | 0 | | ((b2 & 0x0F) << 16) |
413 | 0 | | ((b2 & 0x10) << 7) |
414 | 0 | | ((b2 & 0xE0) >> 4) |
415 | 0 | | ((b3 & 0x7F) << 4) |
416 | 0 | | ((b3 & 0x80) << 13); |
417 | |
|
418 | 0 | addr += pc; |
419 | |
|
420 | 0 | buffer[i + 1] = (uint8_t)((b1 & 0x0F) |
421 | 0 | | ((addr >> 13) & 0xF0)); |
422 | |
|
423 | 0 | buffer[i + 2] = (uint8_t)(addr >> 9); |
424 | 0 | buffer[i + 3] = (uint8_t)(addr >> 1); |
425 | | |
426 | | // The "-2" is included because the for-loop will |
427 | | // always increment by 2. In this case, we want to |
428 | | // skip an extra 2 bytes since we used 4 bytes |
429 | | // of input. |
430 | 0 | i += 4 - 2; |
431 | |
|
432 | 0 | } else if ((inst & 0x7F) == 0x17) { |
433 | | // AUIPC |
434 | 0 | inst |= (uint32_t)buffer[i + 1] << 8; |
435 | 0 | inst |= (uint32_t)buffer[i + 2] << 16; |
436 | 0 | inst |= (uint32_t)buffer[i + 3] << 24; |
437 | | |
438 | | // Branch based on AUIPC's rd. The bitmask test does |
439 | | // the same thing as this: |
440 | | // |
441 | | // const uint32_t auipc_rd = (inst >> 7) & 0x1F; |
442 | | // if (auipc_rd != 0 && auipc_rd != 2) { |
443 | 0 | if (inst & 0xE80) { |
444 | | // AUIPC's rd doesn't equal x0 or x2. |
445 | | |
446 | | // Check if AUIPC+inst2 are a pair. |
447 | 0 | uint32_t inst2 = read32le(buffer + i + 4); |
448 | |
|
449 | 0 | if (NOT_AUIPC_PAIR(inst, inst2)) { |
450 | | // The NOT_AUIPC_PAIR macro allows |
451 | | // a false AUIPC+AUIPC pair if the |
452 | | // bits [19:15] (where rs1 would be) |
453 | | // in the second AUIPC match the rd |
454 | | // of the first AUIPC. |
455 | | // |
456 | | // We must skip enough forward so |
457 | | // that the first two bytes of the |
458 | | // second AUIPC cannot get converted. |
459 | | // Such a conversion could make the |
460 | | // current pair become a valid pair |
461 | | // which would desync the decoder. |
462 | | // |
463 | | // Skipping six bytes is enough even |
464 | | // though the above condition looks |
465 | | // at the lowest four bits of the |
466 | | // buffer[i + 6] too. This is safe |
467 | | // because this filter never changes |
468 | | // those bits if a conversion at |
469 | | // that position is done. |
470 | 0 | i += 6 - 2; |
471 | 0 | continue; |
472 | 0 | } |
473 | | |
474 | | // Convert AUIPC+inst2 to a special format: |
475 | | // |
476 | | // - The lowest 7 bits [6:0] retain the |
477 | | // AUIPC opcode. |
478 | | // |
479 | | // - The rd [11:7] is set to x2(sp). x2 is |
480 | | // used as the stack pointer so AUIPC with |
481 | | // rd=x2 should be very rare in real-world |
482 | | // executables. |
483 | | // |
484 | | // - The remaining 20 bits [31:12] (that |
485 | | // normally hold the pc-relative immediate) |
486 | | // are used to store the lowest 20 bits of |
487 | | // inst2. That is, the 12-bit immediate of |
488 | | // inst2 is not included. |
489 | | // |
490 | | // - The location of the original inst2 is |
491 | | // used to store the 32-bit absolute |
492 | | // address in big endian format. Compared |
493 | | // to the 20+12-bit split encoding, this |
494 | | // results in a longer uninterrupted |
495 | | // sequence of identical common bytes |
496 | | // when the same address is referred |
497 | | // with different instruction pairs |
498 | | // (like AUIPC+LD vs. AUIPC+ADDI) or |
499 | | // when the occurrences of the same |
500 | | // pair use different registers. When |
501 | | // referring to adjacent memory locations |
502 | | // (like function calls that go via the |
503 | | // ELF PLT), in big endian order only the |
504 | | // last 1-2 bytes differ; in little endian |
505 | | // the differing 1-2 bytes would be in the |
506 | | // middle of the 8-byte sequence. |
507 | | // |
508 | | // When reversing the transformation, the |
509 | | // original rd of AUIPC can be restored |
510 | | // from inst2's rs1 as they are required to |
511 | | // be the same. |
512 | | |
513 | | // Arithmetic right shift makes sign extension |
514 | | // trivial but (1) it's implementation-defined |
515 | | // behavior (C99/C11/C23 6.5.7-p5) and so is |
516 | | // (2) casting unsigned to signed (6.3.1.3-p3). |
517 | | // |
518 | | // One can check for (1) with |
519 | | // |
520 | | // if ((-1 >> 1) == -1) ... |
521 | | // |
522 | | // but (2) has to be checked from the |
523 | | // compiler docs. GCC promises that (1) |
524 | | // and (2) behave in the common expected |
525 | | // way and thus |
526 | | // |
527 | | // addr += (uint32_t)( |
528 | | // (int32_t)inst2 >> 20); |
529 | | // |
530 | | // does the same as the code below. But since |
531 | | // the 100 % portable way is only a few bytes |
532 | | // bigger code and there is no real speed |
533 | | // difference, let's just use that, especially |
534 | | // since the decoder doesn't need this at all. |
535 | 0 | uint32_t addr = inst & 0xFFFFF000; |
536 | 0 | addr += (inst2 >> 20) |
537 | 0 | - ((inst2 >> 19) & 0x1000); |
538 | |
|
539 | 0 | addr += now_pos + (uint32_t)i; |
540 | | |
541 | | // Construct the first 32 bits: |
542 | | // [6:0] AUIPC opcode |
543 | | // [11:7] Special AUIPC rd = x2 |
544 | | // [31:12] The lowest 20 bits of inst2 |
545 | 0 | inst = 0x17 | (2 << 7) | (inst2 << 12); |
546 | |
|
547 | 0 | write32le(buffer + i, inst); |
548 | | |
549 | | // The second 32 bits store the absolute |
550 | | // address in big endian order. |
551 | 0 | write32be(buffer + i + 4, addr); |
552 | 0 | } else { |
553 | | // AUIPC's rd equals x0 or x2. |
554 | | // |
555 | | // x0 indicates a landing pad (LPAD). |
556 | | // It's always skipped. |
557 | | // |
558 | | // AUIPC with rd == x2 is used for the special |
559 | | // format as explained above. When the input |
560 | | // contains a byte sequence that matches the |
561 | | // special format, "fake" decoding must be |
562 | | // done to keep the filter bijective (that |
563 | | // is, safe to apply on arbitrary data). |
564 | | // |
565 | | // See the "x0 or x2" section in riscv_decode() |
566 | | // for how the "real" decoding is done. The |
567 | | // "fake" decoding is a simplified version |
568 | | // of "real" decoding with the following |
569 | | // differences (these reduce code size of |
570 | | // the decoder): |
571 | | // (1) The lowest 12 bits aren't sign-extended. |
572 | | // (2) No address conversion is done. |
573 | | // (3) Big endian format isn't used (the fake |
574 | | // address is in little endian order). |
575 | | |
576 | | // Check if inst matches the special format. |
577 | 0 | const uint32_t fake_rs1 = inst >> 27; |
578 | |
|
579 | 0 | if (NOT_SPECIAL_AUIPC(inst, fake_rs1)) { |
580 | 0 | i += 4 - 2; |
581 | 0 | continue; |
582 | 0 | } |
583 | | |
584 | 0 | const uint32_t fake_addr = |
585 | 0 | read32le(buffer + i + 4); |
586 | | |
587 | | // Construct the second 32 bits: |
588 | | // [19:0] Upper 20 bits from AUIPC |
589 | | // [31:20] The lowest 12 bits of fake_addr |
590 | 0 | const uint32_t fake_inst2 = (inst >> 12) |
591 | 0 | | (fake_addr << 20); |
592 | | |
593 | | // Construct new first 32 bits from: |
594 | | // [6:0] AUIPC opcode |
595 | | // [11:7] Fake AUIPC rd = fake_rs1 |
596 | | // [31:12] The highest 20 bits of fake_addr |
597 | 0 | inst = 0x17 | (fake_rs1 << 7) |
598 | 0 | | (fake_addr & 0xFFFFF000); |
599 | |
|
600 | 0 | write32le(buffer + i, inst); |
601 | 0 | write32le(buffer + i + 4, fake_inst2); |
602 | 0 | } |
603 | | |
604 | 0 | i += 8 - 2; |
605 | 0 | } |
606 | 0 | } |
607 | |
|
608 | 0 | return i; |
609 | 0 | } |
610 | | |
611 | | |
612 | | extern lzma_ret |
613 | | lzma_simple_riscv_encoder_init(lzma_next_coder *next, |
614 | | const lzma_allocator *allocator, |
615 | | const lzma_filter_info *filters) |
616 | 0 | { |
617 | 0 | return lzma_simple_coder_init(next, allocator, filters, |
618 | 0 | &riscv_encode, 0, 8, 2, true); |
619 | 0 | } |
620 | | |
621 | | |
622 | | extern LZMA_API(size_t) |
623 | | lzma_bcj_riscv_encode(uint32_t start_offset, uint8_t *buf, size_t size) |
624 | 0 | { |
625 | | // start_offset must be a multiple of two. |
626 | 0 | start_offset &= ~UINT32_C(1); |
627 | 0 | return riscv_encode(NULL, start_offset, true, buf, size); |
628 | 0 | } |
629 | | #endif |
630 | | |
631 | | |
632 | | #ifdef HAVE_DECODER_RISCV |
633 | | static size_t |
634 | | riscv_decode(void *simple lzma_attribute((__unused__)), |
635 | | uint32_t now_pos, |
636 | | bool is_encoder lzma_attribute((__unused__)), |
637 | | uint8_t *buffer, size_t size) |
638 | 0 | { |
639 | 0 | if (size < 8) |
640 | 0 | return 0; |
641 | | |
642 | 0 | size -= 8; |
643 | |
|
644 | 0 | size_t i; |
645 | 0 | for (i = 0; i <= size; i += 2) { |
646 | 0 | uint32_t inst = buffer[i]; |
647 | |
|
648 | 0 | if (inst == 0xEF) { |
649 | | // JAL |
650 | 0 | const uint32_t b1 = buffer[i + 1]; |
651 | | |
652 | | // Only filter rd=x1(ra) and rd=x5(t0). |
653 | 0 | if ((b1 & 0x0D) != 0) |
654 | 0 | continue; |
655 | | |
656 | 0 | const uint32_t b2 = buffer[i + 2]; |
657 | 0 | const uint32_t b3 = buffer[i + 3]; |
658 | 0 | const uint32_t pc = now_pos + (uint32_t)i; |
659 | | |
660 | | // | b3 | b2 | b1 | |
661 | | // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x | |
662 | | // | 20 10 9 8 7 6 5 4 | 3 2 1 11 19 18 17 16 | 15 14 13 12 x x x x | |
663 | | // | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 4 x x x x | |
664 | |
|
665 | 0 | uint32_t addr = ((b1 & 0xF0) << 13) |
666 | 0 | | (b2 << 9) | (b3 << 1); |
667 | |
|
668 | 0 | addr -= pc; |
669 | |
|
670 | 0 | buffer[i + 1] = (uint8_t)((b1 & 0x0F) |
671 | 0 | | ((addr >> 8) & 0xF0)); |
672 | |
|
673 | 0 | buffer[i + 2] = (uint8_t)(((addr >> 16) & 0x0F) |
674 | 0 | | ((addr >> 7) & 0x10) |
675 | 0 | | ((addr << 4) & 0xE0)); |
676 | |
|
677 | 0 | buffer[i + 3] = (uint8_t)(((addr >> 4) & 0x7F) |
678 | 0 | | ((addr >> 13) & 0x80)); |
679 | |
|
680 | 0 | i += 4 - 2; |
681 | |
|
682 | 0 | } else if ((inst & 0x7F) == 0x17) { |
683 | | // AUIPC |
684 | 0 | uint32_t inst2; |
685 | |
|
686 | 0 | inst |= (uint32_t)buffer[i + 1] << 8; |
687 | 0 | inst |= (uint32_t)buffer[i + 2] << 16; |
688 | 0 | inst |= (uint32_t)buffer[i + 3] << 24; |
689 | |
|
690 | 0 | if (inst & 0xE80) { |
691 | | // AUIPC's rd doesn't equal x0 or x2. |
692 | | |
693 | | // Check if it is a "fake" AUIPC+inst2 pair. |
694 | 0 | inst2 = read32le(buffer + i + 4); |
695 | |
|
696 | 0 | if (NOT_AUIPC_PAIR(inst, inst2)) { |
697 | 0 | i += 6 - 2; |
698 | 0 | continue; |
699 | 0 | } |
700 | | |
701 | | // Decode (or more like re-encode) the "fake" |
702 | | // pair. The "fake" format doesn't do |
703 | | // sign-extension, address conversion, or |
704 | | // use big endian. (The use of little endian |
705 | | // allows sharing the write32le() calls in |
706 | | // the decoder to reduce code size when |
707 | | // unaligned access isn't supported.) |
708 | 0 | uint32_t addr = inst & 0xFFFFF000; |
709 | 0 | addr += inst2 >> 20; |
710 | |
|
711 | 0 | inst = 0x17 | (2 << 7) | (inst2 << 12); |
712 | 0 | inst2 = addr; |
713 | 0 | } else { |
714 | | // AUIPC's rd equals x0 or x2. |
715 | | |
716 | | // Check if inst matches the special format |
717 | | // used by the encoder. |
718 | 0 | const uint32_t inst2_rs1 = inst >> 27; |
719 | |
|
720 | 0 | if (NOT_SPECIAL_AUIPC(inst, inst2_rs1)) { |
721 | 0 | i += 4 - 2; |
722 | 0 | continue; |
723 | 0 | } |
724 | | |
725 | | // Decode the "real" pair. |
726 | 0 | uint32_t addr = read32be(buffer + i + 4); |
727 | |
|
728 | 0 | addr -= now_pos + (uint32_t)i; |
729 | | |
730 | | // The second instruction: |
731 | | // - Get the lowest 20 bits from inst. |
732 | | // - Add the lowest 12 bits of the address |
733 | | // as the immediate field. |
734 | 0 | inst2 = (inst >> 12) | (addr << 20); |
735 | | |
736 | | // AUIPC: |
737 | | // - rd is the same as inst2_rs1. |
738 | | // - The sign extension of the lowest 12 bits |
739 | | // must be taken into account. |
740 | 0 | inst = 0x17 | (inst2_rs1 << 7) |
741 | 0 | | ((addr + 0x800) & 0xFFFFF000); |
742 | 0 | } |
743 | | |
744 | | // Both decoder branches write in little endian order. |
745 | 0 | write32le(buffer + i, inst); |
746 | 0 | write32le(buffer + i + 4, inst2); |
747 | |
|
748 | 0 | i += 8 - 2; |
749 | 0 | } |
750 | 0 | } |
751 | |
|
752 | 0 | return i; |
753 | 0 | } |
754 | | |
755 | | |
756 | | extern lzma_ret |
757 | | lzma_simple_riscv_decoder_init(lzma_next_coder *next, |
758 | | const lzma_allocator *allocator, |
759 | | const lzma_filter_info *filters) |
760 | 0 | { |
761 | 0 | return lzma_simple_coder_init(next, allocator, filters, |
762 | 0 | &riscv_decode, 0, 8, 2, false); |
763 | 0 | } |
764 | | |
765 | | |
766 | | extern LZMA_API(size_t) |
767 | | lzma_bcj_riscv_decode(uint32_t start_offset, uint8_t *buf, size_t size) |
768 | 0 | { |
769 | | // start_offset must be a multiple of two. |
770 | 0 | start_offset &= ~UINT32_C(1); |
771 | 0 | return riscv_decode(NULL, start_offset, false, buf, size); |
772 | 0 | } |
773 | | #endif |