Coverage Report

Created: 2026-02-14 06:52

/src/php-src/ext/opcache/jit/ir/ir_cfg.c
Line
Count
Source
1
/*
2
 * IR - Lightweight JIT Compilation Framework
3
 * (CFG - Control Flow Graph)
4
 * Copyright (C) 2022 Zend by Perforce.
5
 * Authors: Dmitry Stogov <dmitry@php.net>
6
 */
7
8
#include "ir.h"
9
#include "ir_private.h"
10
11
static int ir_remove_unreachable_blocks(ir_ctx *ctx);
12
13
IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
14
0
{
15
0
  ir_use_list *use_list = &ctx->use_lists[ref];
16
0
  ir_ref *p, use, n = use_list->count;
17
18
0
  if (n < 2) {
19
0
    if (n == 1) {
20
0
      use = ctx->use_edges[use_list->refs];
21
0
      IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
22
0
      ir_worklist_push(worklist, use);
23
0
    }
24
0
  } else {
25
0
    p = &ctx->use_edges[use_list->refs];
26
0
    if (n == 2) {
27
0
      use = *p;
28
0
      IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
29
0
      ir_worklist_push(worklist, use);
30
0
      use = *(p + 1);
31
0
      IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
32
0
      ir_worklist_push(worklist, use);
33
0
    } else {
34
0
      for (; n > 0; p++, n--) {
35
0
        use = *p;
36
0
        IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
37
0
        ir_worklist_push(worklist, use);
38
0
      }
39
0
    }
40
0
  }
41
0
}
42
43
IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
44
0
{
45
0
  ir_ref n, ref;
46
0
  const ir_ref *p;
47
48
0
  if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
49
0
    n = insn->inputs_count;
50
0
    for (p = insn->ops + 1; n > 0; p++, n--) {
51
0
      ref = *p;
52
0
      IR_ASSERT(ref);
53
0
      ir_worklist_push(worklist, ref);
54
0
    }
55
0
  } else if (insn->op != IR_START) {
56
0
    if (EXPECTED(insn->op1)) {
57
0
      ir_worklist_push(worklist, insn->op1);
58
0
    }
59
0
  }
60
0
}
61
62
void ir_reset_cfg(ir_ctx *ctx)
63
0
{
64
0
  ctx->cfg_blocks_count = 0;
65
0
  ctx->cfg_edges_count = 0;
66
0
  if (ctx->cfg_blocks) {
67
0
    ir_mem_free(ctx->cfg_blocks);
68
0
    ctx->cfg_blocks = NULL;
69
0
    if (ctx->cfg_edges) {
70
0
      ir_mem_free(ctx->cfg_edges);
71
0
      ctx->cfg_edges = NULL;
72
0
    }
73
0
    if (ctx->cfg_map) {
74
0
      ir_mem_free(ctx->cfg_map);
75
0
      ctx->cfg_map = NULL;
76
0
    }
77
0
  }
78
0
}
79
80
static void ir_remove_phis_inputs(ir_ctx *ctx, ir_use_list *use_list, int new_inputs_count, ir_bitset life_inputs)
81
0
{
82
0
  ir_ref i, j, n, k, *p, *q, use;
83
0
  ir_insn *use_insn;
84
85
0
  if (new_inputs_count == 1) {
86
0
    for (k = use_list->count, p = q = &ctx->use_edges[use_list->refs]; k > 0; p++, k--) {
87
0
      use = *p;
88
0
      use_insn = &ctx->ir_base[use];
89
0
      if (use_insn->op == IR_PHI) {
90
        /* Convert PHI to COPY */
91
0
        n = use_insn->inputs_count;
92
0
        i = 2;
93
0
        for (j = 2; j <= n; j++) {
94
0
          ir_ref input = ir_insn_op(use_insn, j);
95
96
0
          if (ir_bitset_in(life_inputs, j - 1)) {
97
0
            use_insn->op1 = ir_insn_op(use_insn, j);
98
0
          } else if (input > 0) {
99
0
            ir_use_list_remove_one(ctx, input, use);
100
0
          }
101
0
        }
102
0
        use_insn->op = IR_COPY;
103
0
        use_insn->inputs_count = 1;
104
0
        for (j = 2; j <= n; j++) {
105
0
          ir_insn_set_op(use_insn, j, IR_UNUSED);
106
0
        }
107
0
        continue;
108
0
      }
109
110
/* compact use list */
111
0
if (p != q) {
112
0
        *q = use;
113
0
      }
114
0
      q++;
115
0
    }
116
117
0
    if (p != q) {
118
0
      use_list->count -= (p - q);
119
0
      do {
120
0
*q = IR_UNUSED; /* clean-up the removed tail */
121
0
        q++;
122
0
      } while (p != q);
123
0
    }
124
0
  } else {
125
0
    for (k = use_list->count, p = &ctx->use_edges[use_list->refs]; k > 0; p++, k--) {
126
0
      use = *p;
127
0
      use_insn = &ctx->ir_base[use];
128
0
      if (use_insn->op == IR_PHI) {
129
0
        n = use_insn->inputs_count;
130
0
        i = 2;
131
0
        for (j = 2; j <= n; j++) {
132
0
          ir_ref input = ir_insn_op(use_insn, j);
133
134
0
          if (ir_bitset_in(life_inputs, j - 1)) {
135
0
            IR_ASSERT(input);
136
0
            if (i != j) {
137
0
              ir_insn_set_op(use_insn, i, input);
138
0
            }
139
0
            i++;
140
0
          } else if (input > 0) {
141
0
            ir_use_list_remove_one(ctx, input, use);
142
0
          }
143
0
        }
144
0
        use_insn->inputs_count = i - 1;
145
0
        for (j = i; j <= n; j++) {
146
0
          ir_insn_set_op(use_insn, j, IR_UNUSED);
147
0
        }
148
0
      }
149
0
    }
150
0
  }
151
0
}
152
153
static uint32_t IR_NEVER_INLINE ir_cfg_remove_dead_inputs(ir_ctx *ctx, uint32_t *_blocks, ir_block *blocks, uint32_t bb_count)
154
0
{
155
0
  uint32_t b, count = 0;
156
0
  ir_block *bb = blocks + 1;
157
0
  ir_insn *insn;
158
0
  ir_ref i, j, n, *ops, input;
159
0
  ir_bitset life_inputs = NULL;
160
161
0
  for (b = 1; b <= bb_count; b++, bb++) {
162
0
    bb->successors = count;
163
0
    count += ctx->use_lists[bb->end].count;
164
0
    bb->successors_count = 0;
165
0
    bb->predecessors = count;
166
0
    insn = &ctx->ir_base[bb->start];
167
0
    if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
168
0
      n = insn->inputs_count;
169
0
      ops = insn->ops;
170
0
      for (i = 1, j = 1; i <= n; i++) {
171
0
        input = ops[i];
172
0
        if (_blocks[input]) {
173
0
          if (life_inputs) {
174
0
            ir_bitset_incl(life_inputs, i);
175
0
          }
176
0
          if (i != j) {
177
0
            ops[j] = ops[i];
178
0
          }
179
0
          j++;
180
0
        } else {
181
0
          if (ctx->use_lists[bb->start].count > 1) {
182
/* Some inputs of this MERGE were deleted and we have to update the dependent PHIs */
183
0
            if (!life_inputs) {
184
0
              int k;
185
0
              life_inputs = ir_bitset_malloc(n + 1);
186
0
              for (k = 1; k < i; k++) {
187
0
                ir_bitset_incl(life_inputs, k);
188
0
              }
189
0
            }
190
0
          }
191
0
          if (input > 0) {
192
0
            ir_use_list_remove_one(ctx, input, bb->start);
193
0
          }
194
0
        }
195
0
      }
196
0
      j--;
197
0
      if (j != n) {
198
0
        if (j == 1) {
199
0
          insn->op = IR_BEGIN;
200
0
        }
201
0
        insn->inputs_count = j;
202
0
        bb->predecessors_count = j;
203
0
        j++;
204
0
        for (;j <= n; j++) {
205
0
          ops[j] = IR_UNUSED;
206
0
        }
207
0
        if (life_inputs) {
208
0
          ir_remove_phis_inputs(ctx, &ctx->use_lists[bb->start], insn->inputs_count, life_inputs);
209
0
          ir_mem_free(life_inputs);
210
0
        }
211
0
      }
212
0
    }
213
0
    count += bb->predecessors_count;
214
0
  }
215
0
  return count;
216
0
}
217
218
int ir_build_cfg(ir_ctx *ctx)
219
0
{
220
0
  ir_ref n, *p, ref, start, end;
221
0
  uint32_t b;
222
0
  ir_insn *insn;
223
0
  ir_worklist worklist;
224
0
uint32_t bb_init_flags;
225
0
  uint32_t count, bb_count = 0;
226
0
  uint32_t edges_count = 0;
227
0
  ir_block *blocks, *bb;
228
0
  uint32_t *_blocks, *edges;
229
0
  ir_use_list *use_list;
230
0
  uint32_t len = ir_bitset_len(ctx->insns_count);
231
0
  ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
232
0
  ir_bitset bb_leaks = bb_starts + len;
233
0
  _blocks = ir_mem_calloc(ctx->insns_limit, sizeof(uint32_t));
234
0
  ir_worklist_init(&worklist, ctx->insns_count);
235
236
/* First, try to perform a backward DFS starting from the "stop" nodes */
237
238
  /* Add all "stop" nodes */
239
0
  ref = ctx->ir_base[1].op1;
240
0
  while (ref) {
241
0
    ir_worklist_push(&worklist, ref);
242
0
    ref = ctx->ir_base[ref].op3;
243
0
  }
244
245
0
  while (ir_worklist_len(&worklist)) {
246
0
    ref = ir_worklist_pop(&worklist);
247
0
    insn = &ctx->ir_base[ref];
248
249
0
    if (insn->op == IR_NOP) {
250
0
      continue;
251
0
    }
252
0
    IR_ASSERT(IR_IS_BB_END(insn->op));
253
    /* Remember BB end */
254
0
    end = ref;
255
    /* Some successors of IF and SWITCH nodes may be inaccessible by backward DFS */
256
0
    use_list = &ctx->use_lists[end];
257
0
    n = use_list->count;
258
0
    if (n > 1 || (n == 1 && (ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR) != 0)) {
259
0
      for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
260
        /* Remember possible inaccessible successors */
261
0
        ir_bitset_incl(bb_leaks, *p);
262
0
      }
263
0
    }
264
/* Skip control nodes until BB start */
265
0
    ref = insn->op1;
266
0
    while (1) {
267
0
      insn = &ctx->ir_base[ref];
268
0
      if (IR_IS_BB_START(insn->op)) {
269
0
        break;
270
0
      }
271
0
ref = insn->op1; // follow connected control nodes until BB start
272
0
    }
273
    /* Mark BB Start */
274
0
    bb_count++;
275
0
    _blocks[ref] = end;
276
0
    ir_bitset_incl(bb_starts, ref);
277
    /* Add predecessors */
278
0
    _ir_add_predecessors(insn, &worklist);
279
0
  }
280
281
/* A backward DFS may miss some branches that end in infinite loops. */
282
/* Try a forward DFS (in most cases all nodes have already been processed). */
283
284
  /* START node may be inaccessible from "stop" nodes */
285
0
  ir_bitset_incl(bb_leaks, 1);
286
287
/* Add the unprocessed START node and the successors of IF and SWITCH nodes */
288
0
  IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
289
0
    ir_worklist_push(&worklist, start);
290
0
  } IR_BITSET_FOREACH_END();
291
292
0
  if (ir_worklist_len(&worklist)) {
293
0
    ir_bitset_union(worklist.visited, bb_starts, len);
294
0
    do {
295
0
      ref = ir_worklist_pop(&worklist);
296
0
      insn = &ctx->ir_base[ref];
297
298
0
      if (insn->op == IR_NOP) {
299
0
        continue;
300
0
      }
301
0
      IR_ASSERT(IR_IS_BB_START(insn->op));
302
      /* Remember BB start */
303
0
      start = ref;
304
/* Skip control nodes until BB end */
305
0
      while (1) {
306
0
        ref = ir_next_control(ctx, ref);
307
0
        insn = &ctx->ir_base[ref];
308
0
        if (IR_IS_BB_END(insn->op)) {
309
0
          break;
310
0
        }
311
0
      }
312
      /* Mark BB Start */
313
0
      bb_count++;
314
0
      _blocks[start] = ref;
315
0
      ir_bitset_incl(bb_starts, start);
316
      /* Add successors */
317
0
      _ir_add_successors(ctx, ref, &worklist);
318
0
    } while (ir_worklist_len(&worklist));
319
0
  }
320
321
0
  IR_ASSERT(bb_count > 0);
322
323
/* Create the array of basic blocks and count successor/predecessor edges for each BB */
324
0
  blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
325
0
  b = 1;
326
0
  bb = blocks + 1;
327
0
  count = 0;
328
/* SCCP has already removed UNREACHABLE blocks; otherwise all blocks are first marked as UNREACHABLE */
329
0
bb_init_flags = (ctx->flags2 & IR_CFG_REACHABLE) ? 0 : IR_BB_UNREACHABLE;
330
0
  IR_BITSET_FOREACH(bb_starts, len, start) {
331
0
    insn = &ctx->ir_base[start];
332
0
    if (insn->op == IR_NOP) {
333
0
      _blocks[start] = 0;
334
0
      continue;
335
0
    }
336
0
    end = _blocks[start];
337
0
    _blocks[start] = b;
338
0
    _blocks[end] = b;
339
0
    IR_ASSERT(IR_IS_BB_START(insn->op));
340
0
    bb->start = start;
341
0
    bb->end = end;
342
0
    bb->successors = count;
343
0
    count += ctx->use_lists[end].count;
344
0
    bb->successors_count = 0;
345
0
    bb->predecessors = count;
346
0
    bb->dom_parent = 0;
347
0
    bb->dom_depth = 0;
348
0
    bb->dom_child = 0;
349
0
    bb->dom_next_child = 0;
350
0
    bb->loop_header = 0;
351
0
    bb->loop_depth = 0;
352
0
    if (insn->op == IR_START) {
353
0
      bb->flags = IR_BB_START;
354
0
      bb->predecessors_count = 0;
355
0
    } else {
356
0
bb->flags = bb_init_flags;
357
0
      if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
358
0
        n = insn->inputs_count;
359
0
        bb->predecessors_count = n;
360
0
        edges_count += n;
361
0
        count += n;
362
0
      } else if (EXPECTED(insn->op1)) {
363
0
        if (insn->op == IR_ENTRY) {
364
0
          bb->flags |= IR_BB_ENTRY;
365
0
          ctx->entries_count++;
366
0
        }
367
0
        bb->predecessors_count = 1;
368
0
        edges_count++;
369
0
        count++;
370
0
      } else {
371
0
        IR_ASSERT(insn->op == IR_BEGIN); /* start of unreachable block */
372
0
        bb->predecessors_count = 0;
373
0
      }
374
0
    }
375
0
    b++;
376
0
    bb++;
377
0
  } IR_BITSET_FOREACH_END();
378
0
  bb_count = b - 1;
379
0
  if (UNEXPECTED(count != edges_count * 2)) {
380
0
    count = ir_cfg_remove_dead_inputs(ctx, _blocks, blocks, bb_count);
381
0
IR_ASSERT(count == edges_count * 2);
382
0
  }
383
0
  ir_mem_free(bb_starts);
384
385
/* Create an array of successor/predecessor control edges */
386
0
  edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
387
0
  bb = blocks + 1;
388
0
  for (b = 1; b <= bb_count; b++, bb++) {
389
0
    insn = &ctx->ir_base[bb->start];
390
0
    if (bb->predecessors_count > 1) {
391
0
      uint32_t *q = edges + bb->predecessors;
392
0
      n = insn->inputs_count;
393
0
      for (p = insn->ops + 1; n > 0; p++, q++, n--) {
394
0
        ref = *p;
395
0
        IR_ASSERT(ref);
396
0
        ir_ref pred_b = _blocks[ref];
397
0
        ir_block *pred_bb = &blocks[pred_b];
398
0
        IR_ASSERT(pred_b > 0);
399
0
        *q = pred_b;
400
0
        edges[pred_bb->successors + pred_bb->successors_count++] = b;
401
0
      }
402
0
    } else if (bb->predecessors_count == 1) {
403
0
      ref = insn->op1;
404
0
      IR_ASSERT(ref);
405
0
      IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
406
0
      ir_ref pred_b = _blocks[ref];
407
0
      ir_block *pred_bb = &blocks[pred_b];
408
0
      edges[bb->predecessors] = pred_b;
409
0
      edges[pred_bb->successors + pred_bb->successors_count++] = b;
410
0
    }
411
0
  }
412
413
0
  ctx->cfg_blocks_count = bb_count;
414
0
  ctx->cfg_edges_count = edges_count * 2;
415
0
  ctx->cfg_blocks = blocks;
416
0
  ctx->cfg_edges = edges;
417
0
  ctx->cfg_map = _blocks;
418
419
0
  if (!(ctx->flags2 & IR_CFG_REACHABLE)) {
420
0
    uint32_t reachable_count = 0;
421
422
    /* Mark reachable blocks */
423
0
    ir_worklist_clear(&worklist);
424
0
    ir_worklist_push(&worklist, 1);
425
0
    while (ir_worklist_len(&worklist) != 0) {
426
0
      uint32_t *p;
427
428
0
      reachable_count++;
429
0
      b = ir_worklist_pop(&worklist);
430
0
      bb = &blocks[b];
431
0
      bb->flags &= ~IR_BB_UNREACHABLE;
432
0
      n = bb->successors_count;
433
0
      if (n > 1) {
434
0
        for (p = edges + bb->successors; n > 0; p++, n--) {
435
0
          ir_worklist_push(&worklist, *p);
436
0
        }
437
0
      } else if (n == 1) {
438
0
        ir_worklist_push(&worklist, edges[bb->successors]);
439
0
      }
440
0
    }
441
0
    if (reachable_count != ctx->cfg_blocks_count) {
442
0
      ir_remove_unreachable_blocks(ctx);
443
0
    }
444
0
  }
445
446
0
  ir_worklist_free(&worklist);
447
448
0
  return 1;
449
0
}
450
451
static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
452
0
{
453
0
  uint32_t i, *p, *q, n = 0;
454
455
0
  p = q = &ctx->cfg_edges[bb->predecessors];
456
0
  for (i = 0; i < bb->predecessors_count; i++, p++) {
457
0
    if (*p != from) {
458
0
      if (p != q) {
459
0
        *q = *p;
460
0
      }
461
0
      q++;
462
0
      n++;
463
0
    }
464
0
  }
465
0
  IR_ASSERT(n != bb->predecessors_count);
466
0
  bb->predecessors_count = n;
467
0
}
468
469
static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
470
0
{
471
0
  ir_ref i, j, n;
472
0
  ir_use_list *use_list;
473
0
  ir_bitset life_inputs;
474
0
  ir_insn *insn = &ctx->ir_base[merge];
475
476
0
  IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
477
0
  n = insn->inputs_count;
478
0
  i = 1;
479
0
  life_inputs = ir_bitset_malloc(n + 1);
480
0
  for (j = 1; j <= n; j++) {
481
0
    ir_ref input = ir_insn_op(insn, j);
482
483
0
    if (input != from) {
484
0
      if (i != j) {
485
0
        ir_insn_set_op(insn, i, input);
486
0
      }
487
0
      ir_bitset_incl(life_inputs, j);
488
0
      i++;
489
0
    }
490
0
  }
491
0
  i--;
492
0
  for (j = i + 1; j <= n; j++) {
493
0
    ir_insn_set_op(insn, j, IR_UNUSED);
494
0
  }
495
0
  if (i == 1) {
496
0
    insn->op = IR_BEGIN;
497
0
  }
498
0
  insn->inputs_count = i;
499
500
0
  use_list = &ctx->use_lists[merge];
501
0
  if (use_list->count > 1) {
502
0
    ir_remove_phis_inputs(ctx, use_list, i, life_inputs);
503
0
  }
504
505
0
  ir_mem_free(life_inputs);
506
0
  ir_use_list_remove_all(ctx, from, merge);
507
0
}
508
509
/* A CFG constructed after the SCCP pass has no unreachable BBs; otherwise they have to be removed */
510
static int ir_remove_unreachable_blocks(ir_ctx *ctx)
511
0
{
512
0
  uint32_t b, *p, i;
513
0
  uint32_t unreachable_count = 0;
514
0
  uint32_t bb_count = ctx->cfg_blocks_count;
515
0
  ir_block *bb = ctx->cfg_blocks + 1;
516
517
0
  for (b = 1; b <= bb_count; b++, bb++) {
518
0
    if (bb->flags & IR_BB_UNREACHABLE) {
519
#if 0
520
      do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while(0);
521
#endif
522
0
      if (bb->successors_count) {
523
0
        for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
524
0
          ir_block *succ_bb = &ctx->cfg_blocks[*p];
525
526
0
          if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
527
0
            ir_remove_predecessor(ctx, succ_bb, b);
528
0
            ir_remove_merge_input(ctx, succ_bb->start, bb->end);
529
0
          }
530
0
        }
531
0
      } else {
532
0
        ir_ref prev, ref = bb->end;
533
0
        ir_insn *insn = &ctx->ir_base[ref];
534
535
0
        IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
536
        /* remove from terminators list */
537
0
        prev = ctx->ir_base[1].op1;
538
0
        if (prev == ref) {
539
0
          ctx->ir_base[1].op1 = insn->op3;
540
0
        } else {
541
0
          while (prev) {
542
0
            if (ctx->ir_base[prev].op3 == ref) {
543
0
              ctx->ir_base[prev].op3 = insn->op3;
544
0
              break;
545
0
            }
546
0
            prev = ctx->ir_base[prev].op3;
547
0
          }
548
0
        }
549
0
      }
550
0
      ctx->cfg_map[bb->start] = 0;
551
0
      ctx->cfg_map[bb->end] = 0;
552
0
      unreachable_count++;
553
0
    }
554
0
  }
555
556
0
  if (unreachable_count) {
557
0
    ir_block *dst_bb;
558
0
    uint32_t n = 1;
559
0
    uint32_t *edges;
560
561
0
    dst_bb = bb = ctx->cfg_blocks + 1;
562
0
    for (b = 1; b <= bb_count; b++, bb++) {
563
0
      if (!(bb->flags & IR_BB_UNREACHABLE)) {
564
0
        if (dst_bb != bb) {
565
0
          memcpy(dst_bb, bb, sizeof(ir_block));
566
0
          ctx->cfg_map[dst_bb->start] = n;
567
0
          ctx->cfg_map[dst_bb->end] = n;
568
0
        }
569
0
        dst_bb->successors_count = 0;
570
0
        dst_bb++;
571
0
        n++;
572
0
      }
573
0
    }
574
0
    ctx->cfg_blocks_count = bb_count = n - 1;
575
576
/* Rebuild successor/predecessor control edges */
577
0
    edges = ctx->cfg_edges;
578
0
    bb = ctx->cfg_blocks + 1;
579
0
    for (b = 1; b <= bb_count; b++, bb++) {
580
0
      ir_insn *insn = &ctx->ir_base[bb->start];
581
0
      ir_ref *p, ref;
582
583
0
      n = bb->predecessors_count;
584
0
      if (n > 1) {
585
0
        uint32_t *q = edges + bb->predecessors;
586
587
0
        IR_ASSERT(n == insn->inputs_count);
588
0
        for (p = insn->ops + 1; n > 0; p++, q++, n--) {
589
0
          ref = *p;
590
0
          IR_ASSERT(ref);
591
0
          ir_ref pred_b = ctx->cfg_map[ref];
592
0
          ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
593
0
          *q = pred_b;
594
0
          edges[pred_bb->successors + pred_bb->successors_count++] = b;
595
0
        }
596
0
      } else if (n == 1) {
597
0
        ref = insn->op1;
598
0
        IR_ASSERT(ref);
599
0
        IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
600
0
        ir_ref pred_b = ctx->cfg_map[ref];
601
0
        ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
602
0
        edges[bb->predecessors] = pred_b;
603
0
        edges[pred_bb->successors + pred_bb->successors_count++] = b;
604
0
      }
605
0
    }
606
0
  }
607
608
0
  return 1;
609
0
}
610
611
static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
612
0
{
613
0
  uint32_t i, *p;
614
0
  ir_block *bb = &ctx->cfg_blocks[b];
615
616
0
  if (bb->postnum != 0) {
617
0
    return;
618
0
  }
619
620
0
  if (bb->successors_count) {
621
0
    bb->postnum = -1; /* Marker for "currently visiting" */
622
0
    p = ctx->cfg_edges + bb->successors;
623
0
    i = bb->successors_count;
624
0
    do {
625
0
      compute_postnum(ctx, cur, *p);
626
0
      p++;
627
0
    } while (--i);
628
0
  }
629
0
  bb->postnum = (*cur)++;
630
0
}
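As a worked example of the numbering (a sketch for a hypothetical diamond-shaped CFG; not part of the measured source):

/* For the diamond CFG
 *
 *        BB1
 *       /   \
 *     BB2   BB3
 *       \   /
 *        BB4
 *
 * a DFS that visits successors in the order {BB2, BB3} assigns
 * postnum(BB4) = 1, postnum(BB2) = 2, postnum(BB3) = 3, postnum(BB1) = 4:
 * every block finishes after its DFS-tree descendants, so it gets a higher
 * postnum than all of them. The dominator construction below relies on this
 * ordering when intersecting idom candidates by postnum.
 */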
631
632
/* Computes dominator tree using algorithm from "A Simple, Fast Dominance Algorithm" by
633
 * Cooper, Harvey and Kennedy. */
634
static IR_NEVER_INLINE int ir_build_dominators_tree_slow(ir_ctx *ctx)
635
0
{
636
0
  uint32_t blocks_count, b, postnum;
637
0
  ir_block *blocks, *bb;
638
0
  uint32_t *edges;
639
0
  bool changed;
640
641
0
  blocks = ctx->cfg_blocks;
642
0
  edges  = ctx->cfg_edges;
643
0
  blocks_count = ctx->cfg_blocks_count;
644
645
  /* Clear the dominators tree */
646
0
  for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
647
0
    bb->idom = 0;
648
0
    bb->dom_depth = 0;
649
0
    bb->dom_child = 0;
650
0
    bb->dom_next_child = 0;
651
0
  }
652
653
0
  ctx->flags2 &= ~IR_NO_LOOPS;
654
655
0
  postnum = 1;
656
0
  compute_postnum(ctx, &postnum, 1);
657
658
  /* Find immediate dominators by iterative fixed-point algorithm */
659
0
  blocks[1].idom = 1;
660
0
  do {
661
0
    changed = 0;
662
    /* Iterating in Reverse Post Order */
663
0
    for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
664
0
      IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
665
0
      IR_ASSERT(bb->predecessors_count > 0);
666
0
      if (bb->predecessors_count == 1) {
667
0
        uint32_t pred_b = edges[bb->predecessors];
668
669
0
        if (blocks[pred_b].idom > 0 && bb->idom != pred_b) {
670
0
          bb->idom = pred_b;
671
0
          changed = 1;
672
0
        }
673
0
      } else if (bb->predecessors_count) {
674
0
        uint32_t idom = 0;
675
0
        uint32_t k = bb->predecessors_count;
676
0
        uint32_t *p = edges + bb->predecessors;
677
678
0
        do {
679
0
          uint32_t pred_b = *p;
680
0
          ir_block *pred_bb = &blocks[pred_b];
681
0
          ir_block *idom_bb;
682
683
0
          if (pred_bb->idom > 0) {
684
0
            idom = pred_b;
685
0
            idom_bb = &blocks[idom];
686
687
0
            while (--k > 0) {
688
0
              pred_b = *(++p);
689
0
              pred_bb = &blocks[pred_b];
690
0
              if (pred_bb->idom > 0) {
691
0
                while (idom != pred_b) {
692
0
                  while (pred_bb->postnum < idom_bb->postnum) {
693
0
                    pred_b = pred_bb->idom;
694
0
                    pred_bb = &blocks[pred_b];
695
0
                  }
696
0
                  while (idom_bb->postnum < pred_bb->postnum) {
697
0
                    idom = idom_bb->idom;
698
0
                    idom_bb = &blocks[idom];
699
0
                  }
700
0
                }
701
0
              }
702
0
            }
703
704
0
            if (bb->idom != idom) {
705
0
              bb->idom = idom;
706
0
              changed = 1;
707
0
            }
708
0
            break;
709
0
          }
710
0
          p++;
711
0
        } while (--k > 0);
712
0
      }
713
0
    }
714
0
  } while (changed);
715
716
  /* Build dominators tree */
717
0
  blocks[1].idom = 0;
718
0
  blocks[1].dom_depth = 0;
719
720
  /* Construct children lists sorted by block number */
721
0
  for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
722
0
    ir_block *idom_bb = &blocks[bb->idom];
723
0
    bb->dom_depth = 0;
724
0
    bb->dom_next_child = idom_bb->dom_child;
725
0
    idom_bb->dom_child = b;
726
0
  }
727
728
  /* Recalculate dom_depth for all blocks */
729
0
  for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
730
0
    uint32_t idom = bb->idom;
731
0
    uint32_t dom_depth = 0;
732
0
    while (idom) {
733
0
      dom_depth++;
734
0
      if (blocks[idom].dom_depth > 0) {
735
0
        dom_depth += blocks[idom].dom_depth;
736
0
        break;
737
0
      } else {
738
0
        idom = blocks[idom].idom;
739
0
      }
740
0
    }
741
0
    bb->dom_depth = dom_depth;
742
0
  }
743
744
0
  return 1;
745
0
}
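The two nested while loops above are the paper's "intersect" step inlined. As a standalone sketch (a hypothetical helper, not present in the file):

/* Walk both candidates up the current idom links until they meet; the block
 * with the smaller postnum is deeper in the DFS tree, so it climbs first. */
static uint32_t intersect(const ir_block *blocks, uint32_t b1, uint32_t b2)
{
  while (b1 != b2) {
    while (blocks[b1].postnum < blocks[b2].postnum) {
      b1 = blocks[b1].idom;
    }
    while (blocks[b2].postnum < blocks[b1].postnum) {
      b2 = blocks[b2].idom;
    }
  }
  return b1; /* the nearest common dominator of the two candidates */
}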
746
747
/* A single-pass modification of "A Simple, Fast Dominance Algorithm" by
748
 * Cooper, Harvey and Kennedy that relies on IR block ordering.
749
 * It may fall back to the general slow fixed-point algorithm.  */
750
static int ir_build_dominators_tree_iterative(ir_ctx *ctx);
751
int ir_build_dominators_tree(ir_ctx *ctx)
752
0
{
753
0
  uint32_t blocks_count, b;
754
0
  ir_block *blocks, *bb;
755
0
  uint32_t *edges;
756
0
  ir_list worklist;
757
758
0
  ir_list_init(&worklist, ctx->cfg_blocks_count / 2);
759
760
0
  ctx->flags2 |= IR_NO_LOOPS;
761
762
  /* Find immediate dominators */
763
0
  blocks = ctx->cfg_blocks;
764
0
  edges  = ctx->cfg_edges;
765
0
  blocks_count = ctx->cfg_blocks_count;
766
0
  blocks[1].idom = 1;
767
0
  blocks[1].dom_depth = 0;
768
769
  /* Iterating in Reverse Post Order */
770
0
  for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
771
0
    IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
772
0
    IR_ASSERT(bb->predecessors_count > 0);
773
0
    uint32_t k = bb->predecessors_count;
774
0
    uint32_t *p = edges + bb->predecessors;
775
0
    uint32_t idom = *p;
776
0
    ir_block *idom_bb;
777
778
0
    if (UNEXPECTED(idom >= b)) {
779
      /* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
780
0
      ctx->flags2 &= ~IR_NO_LOOPS;
781
//      IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor");
782
0
      if (UNEXPECTED(k <= 1)) {
783
0
slow_case:
784
0
        ir_list_free(&worklist);
785
0
        return ir_build_dominators_tree_slow(ctx);
786
0
      }
787
0
      ir_list_push(&worklist, idom);
788
0
      while (1) {
789
0
        k--;
790
0
        p++;
791
0
        idom = *p;
792
0
        if (idom < b) {
793
0
          break;
794
0
        }
795
0
        if (UNEXPECTED(k == 0)) {
796
0
          goto slow_case;
797
0
        }
798
0
        ir_list_push(&worklist, idom);
799
0
      }
800
0
    }
801
0
    IR_ASSERT(blocks[idom].idom > 0);
802
0
    IR_ASSERT(k != 0);
803
804
0
    while (--k > 0) {
805
0
      uint32_t pred_b = *(++p);
806
807
0
      if (pred_b < b) {
808
0
        IR_ASSERT(blocks[pred_b].idom > 0);
809
0
        while (idom != pred_b) {
810
0
          while (pred_b > idom) {
811
0
            pred_b = blocks[pred_b].idom;
812
0
          }
813
0
          while (idom > pred_b) {
814
0
            idom = blocks[idom].idom;
815
0
          }
816
0
        }
817
0
      } else {
818
0
        ctx->flags2 &= ~IR_NO_LOOPS;
819
0
        IR_ASSERT(bb->predecessors_count > 1);
820
0
        ir_list_push(&worklist, pred_b);
821
0
      }
822
0
    }
823
0
    bb->idom = idom;
824
0
    idom_bb = &blocks[idom];
825
0
    bb->dom_depth = idom_bb->dom_depth + 1;
826
0
  }
827
828
  /* Construct children lists sorted by block number */
829
0
  for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
830
0
    ir_block *idom_bb = &blocks[bb->idom];
831
0
    bb->dom_next_child = idom_bb->dom_child;
832
0
    idom_bb->dom_child = b;
833
0
  }
834
835
0
  blocks[1].idom = 0;
836
837
0
  if (ir_list_len(&worklist) != 0) {
838
0
    uint32_t dom_depth;
839
0
    uint32_t succ_b;
840
0
    bool complete = 1;
841
842
    /* Check if all the back-edges lead to the loop headers */
843
0
    do {
844
0
      b = ir_list_pop(&worklist);
845
0
      bb = &blocks[b];
846
0
      succ_b = ctx->cfg_edges[bb->successors];
847
0
      if (bb->successors_count != 1) {
848
        /* LOOP_END/END may be linked with the following ENTRY by a fake edge */
849
0
        if (bb->successors_count != 2) {
850
0
          complete = 0;
851
0
          break;
852
0
        } else if (blocks[succ_b].flags & IR_BB_ENTRY) {
853
0
          succ_b = ctx->cfg_edges[bb->successors + 1];
854
0
        } else if (!(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY)) {
855
0
          complete = 0;
856
0
          break;
857
0
        }
858
0
      }
859
0
dom_depth = blocks[succ_b].dom_depth;
860
0
      while (bb->dom_depth > dom_depth) {
861
0
        b = bb->dom_parent;
862
0
        bb = &blocks[b];
863
0
      }
864
0
      if (UNEXPECTED(b != succ_b)) {
865
0
        complete = 0;
866
0
        break;
867
0
      }
868
0
    } while (ir_list_len(&worklist) != 0);
869
870
0
    if (UNEXPECTED(!complete)) {
871
0
      ir_list_free(&worklist);
872
0
      return ir_build_dominators_tree_iterative(ctx);
873
0
    }
874
0
  }
875
876
0
  ir_list_free(&worklist);
877
878
0
  return 1;
879
0
}
880
881
static int ir_build_dominators_tree_iterative(ir_ctx *ctx)
882
0
{
883
0
  bool changed;
884
0
  uint32_t blocks_count, b;
885
0
  ir_block *blocks, *bb;
886
0
  uint32_t *edges;
887
888
  /* Find immediate dominators */
889
0
  blocks = ctx->cfg_blocks;
890
0
  edges  = ctx->cfg_edges;
891
0
  blocks_count = ctx->cfg_blocks_count;
892
893
  /* Clear the dominators tree, but keep already found dominators */
894
0
  for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
895
0
    bb->dom_depth = 0;
896
0
    bb->dom_child = 0;
897
0
    bb->dom_next_child = 0;
898
0
  }
899
900
  /* Find immediate dominators by iterative fixed-point algorithm */
901
0
  blocks[1].idom = 1;
902
0
  do {
903
0
    changed = 0;
904
905
0
    for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
906
0
      IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
907
0
      IR_ASSERT(bb->predecessors_count > 0);
908
0
      uint32_t k = bb->predecessors_count;
909
0
      uint32_t *p = edges + bb->predecessors;
910
0
      uint32_t idom = *p;
911
912
0
      if (blocks[idom].idom == 0) {
913
0
        while (1) {
914
0
          k--;
915
0
          p++;
916
0
          idom = *p;
917
0
          if (blocks[idom].idom > 0) {
918
0
            break;
919
0
          }
920
0
          IR_ASSERT(k > 0);
921
0
        }
922
0
      }
923
0
      IR_ASSERT(k != 0);
924
0
      while (--k > 0) {
925
0
        uint32_t pred_b = *(++p);
926
927
0
        if (blocks[pred_b].idom > 0) {
928
0
          IR_ASSERT(blocks[pred_b].idom > 0);
929
0
          while (idom != pred_b) {
930
0
            while (pred_b > idom) {
931
0
              pred_b = blocks[pred_b].idom;
932
0
            }
933
0
            while (idom > pred_b) {
934
0
              idom = blocks[idom].idom;
935
0
            }
936
0
          }
937
0
        }
938
0
      }
939
0
      if (bb->idom != idom) {
940
0
        bb->idom = idom;
941
0
        changed = 1;
942
0
      }
943
0
    }
944
0
  } while (changed);
945
946
  /* Build dominators tree */
947
0
  blocks[1].idom = 0;
948
0
  blocks[1].dom_depth = 0;
949
0
  for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
950
0
    uint32_t idom = bb->idom;
951
0
    ir_block *idom_bb = &blocks[idom];
952
953
0
    bb->dom_depth = idom_bb->dom_depth + 1;
954
0
  }
955
956
  /* Construct children lists sorted by block number */
957
0
  for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
958
0
    ir_block *idom_bb = &blocks[bb->idom];
959
0
    bb->dom_next_child = idom_bb->dom_child;
960
0
    idom_bb->dom_child = b;
961
0
  }
962
963
0
  return 1;
964
0
}
965
966
static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
967
0
{
968
0
  uint32_t b1_depth = blocks[b1].dom_depth;
969
0
  const ir_block *bb2 = &blocks[b2];
970
971
0
  while (bb2->dom_depth > b1_depth) {
972
0
    b2 = bb2->dom_parent;
973
0
    bb2 = &blocks[b2];
974
0
  }
975
0
  return b1 == b2;
976
0
}
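A usage sketch (hypothetical wrapper; ir_find_loops() below inlines this test when classifying join edges):

/* An edge pred -> b is a loop back-edge iff its target dominates its source. */
static bool is_back_edge(const ir_block *blocks, uint32_t pred, uint32_t b)
{
  return ir_dominates(blocks, b, pred);
}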
977
978
int ir_find_loops(ir_ctx *ctx)
979
0
{
980
0
  uint32_t b, j, n, count;
981
0
  uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
982
0
  ir_block *blocks = ctx->cfg_blocks;
983
0
  uint32_t *edges = ctx->cfg_edges;
984
0
  ir_worklist work;
985
986
0
  if (ctx->flags2 & IR_NO_LOOPS) {
987
0
    return 1;
988
0
  }
989
990
/* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
991
 * queries. These are implemented by checking entry/exit times of the DFS traversal. */
992
0
  ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
993
0
  entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
994
0
  exit_times = entry_times + ctx->cfg_blocks_count + 1;
995
0
  sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;
996
997
0
  memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));
998
999
0
  ir_worklist_push(&work, 1);
1000
0
  while (ir_worklist_len(&work)) {
1001
0
    ir_block *bb;
1002
0
    int child;
1003
1004
0
next:
1005
0
    b = ir_worklist_peek(&work);
1006
0
    if (!entry_times[b]) {
1007
0
      entry_times[b] = time++;
1008
0
    }
1009
1010
    /* Visit blocks immediately dominated by "b". */
1011
0
    bb = &blocks[b];
1012
0
    for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
1013
0
      if (ir_worklist_push(&work, child)) {
1014
0
        goto next;
1015
0
      }
1016
0
    }
1017
1018
    /* Visit join edges. */
1019
0
    if (bb->successors_count) {
1020
0
      uint32_t *p = edges + bb->successors;
1021
0
      for (j = 0; j < bb->successors_count; j++, p++) {
1022
0
        uint32_t succ = *p;
1023
1024
0
        if (blocks[succ].idom == b) {
1025
0
          continue;
1026
0
        } else if (ir_worklist_push(&work, succ)) {
1027
0
          goto next;
1028
0
        }
1029
0
      }
1030
0
    }
1031
0
    exit_times[b] = time++;
1032
0
    ir_worklist_pop(&work);
1033
0
  }
1034
1035
/* Sort blocks by dominator-tree level, which is the reverse of the order in which we want to process them */
1036
0
  sorted_blocks[1] = 1;
1037
0
  j = 1;
1038
0
  n = 2;
1039
0
  while (j != n) {
1040
0
    uint32_t i = j;
1041
0
    j = n;
1042
0
    for (; i < j; i++) {
1043
0
      int child;
1044
0
      for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
1045
0
        sorted_blocks[n++] = child;
1046
0
      }
1047
0
    }
1048
0
  }
1049
0
  count = n;
1050
1051
/* Identify loops. See Sreedhar et al., "Identifying Loops Using DJ Graphs". */
1052
0
  uint32_t prev_dom_depth = blocks[sorted_blocks[n - 1]].dom_depth;
1053
0
  uint32_t prev_irreducible = 0;
1054
0
  while (n > 1) {
1055
0
    b = sorted_blocks[--n];
1056
0
    ir_block *bb = &blocks[b];
1057
1058
0
    IR_ASSERT(bb->dom_depth <= prev_dom_depth);
1059
0
    if (UNEXPECTED(prev_irreducible) && bb->dom_depth != prev_dom_depth) {
1060
/* process delayed irreducible loops */
1061
0
      do {
1062
0
        b = sorted_blocks[prev_irreducible];
1063
0
        bb = &blocks[b];
1064
0
        if ((bb->flags & IR_BB_IRREDUCIBLE_LOOP) && !bb->loop_depth) {
1065
          /* process irreducible loop */
1066
0
          uint32_t hdr = b;
1067
1068
0
          bb->loop_depth = 1;
1069
0
          if (ctx->ir_base[bb->start].op == IR_MERGE) {
1070
0
            ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
1071
0
          }
1072
1073
/* find the closing edge(s) of the irreducible loop */
1074
0
          IR_ASSERT(bb->predecessors_count > 1);
1075
0
          uint32_t *p = &edges[bb->predecessors];
1076
0
          j = bb->predecessors_count;
1077
0
          do {
1078
0
            uint32_t pred = *p;
1079
1080
0
            if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) {
1081
0
              if (!ir_worklist_len(&work)) {
1082
0
                ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
1083
0
              }
1084
0
              blocks[pred].loop_header = 0; /* support for merged loops */
1085
0
              ir_worklist_push(&work, pred);
1086
0
            }
1087
0
            p++;
1088
0
          } while (--j);
1089
0
          if (ir_worklist_len(&work) == 0) continue;
1090
1091
          /* collect members of the irreducible loop */
1092
0
          while (ir_worklist_len(&work)) {
1093
0
            b = ir_worklist_pop(&work);
1094
0
            if (b != hdr) {
1095
0
              ir_block *bb = &blocks[b];
1096
0
              bb->loop_header = hdr;
1097
0
              if (bb->predecessors_count) {
1098
0
                uint32_t *p = &edges[bb->predecessors];
1099
0
                uint32_t n = bb->predecessors_count;
1100
0
                do {
1101
0
                  uint32_t pred = *p;
1102
0
                  while (blocks[pred].loop_header > 0) {
1103
0
                    pred = blocks[pred].loop_header;
1104
0
                  }
1105
0
                  if (pred != hdr) {
1106
0
                    if (entry_times[pred] > entry_times[hdr] && exit_times[pred] < exit_times[hdr]) {
1107
                      /* "pred" is a descendant of "hdr" */
1108
0
                      ir_worklist_push(&work, pred);
1109
0
                    } else {
1110
                      /* another entry to the irreducible loop */
1111
0
                      bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
1112
0
                      if (ctx->ir_base[bb->start].op == IR_MERGE) {
1113
0
                        ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
1114
0
                      }
1115
0
                    }
1116
0
                  }
1117
0
                  p++;
1118
0
                } while (--n);
1119
0
              }
1120
0
            }
1121
0
          }
1122
0
        }
1123
0
      } while (--prev_irreducible != n);
1124
0
      prev_irreducible = 0;
1125
0
      b = sorted_blocks[n];
1126
0
      bb = &blocks[b];
1127
0
    }
1128
1129
0
    if (bb->predecessors_count > 1) {
1130
0
      bool irreducible = 0;
1131
0
      uint32_t *p = &edges[bb->predecessors];
1132
1133
0
      j = bb->predecessors_count;
1134
0
      do {
1135
0
        uint32_t pred = *p;
1136
1137
        /* A join edge is one for which the predecessor does not
1138
           immediately dominate the successor.  */
1139
0
        if (bb->idom != pred) {
1140
          /* In a loop back-edge (back-join edge), the successor dominates
1141
             the predecessor.  */
1142
0
          if (ir_dominates(blocks, b, pred)) {
1143
0
            if (!ir_worklist_len(&work)) {
1144
0
              ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
1145
0
            }
1146
0
            blocks[pred].loop_header = 0; /* support for merged loops */
1147
0
            ir_worklist_push(&work, pred);
1148
0
          } else {
1149
            /* Otherwise it's a cross-join edge.  See if it's a branch
1150
               to an ancestor on the DJ spanning tree.  */
1151
0
            if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) {
1152
0
              irreducible = 1;
1153
0
              break;
1154
0
            }
1155
0
          }
1156
0
        }
1157
0
        p++;
1158
0
      } while (--j);
1159
1160
0
      if (UNEXPECTED(irreducible)) {
1161
0
        bb->flags |= IR_BB_LOOP_HEADER | IR_BB_IRREDUCIBLE_LOOP;
1162
0
        ctx->flags2 |= IR_CFG_HAS_LOOPS | IR_IRREDUCIBLE_CFG;
1163
/* Remember the position of the first irreducible loop, so that all the irreducible loops
1164
 * are processed after the reducible loops with the same dominator-tree depth
1165
 */
1166
0
        if (!prev_irreducible) {
1167
0
          prev_irreducible = n;
1168
0
          prev_dom_depth = bb->dom_depth;
1169
0
        }
1170
0
        ir_list_clear(&work.l);
1171
0
      } else if (ir_worklist_len(&work)) {
1172
        /* collect members of the reducible loop */
1173
0
        uint32_t hdr = b;
1174
1175
0
        bb->flags |= IR_BB_LOOP_HEADER;
1176
0
        ctx->flags2 |= IR_CFG_HAS_LOOPS;
1177
0
        bb->loop_depth = 1;
1178
0
        if (ctx->ir_base[bb->start].op == IR_MERGE) {
1179
0
          ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
1180
0
        }
1181
0
        while (ir_worklist_len(&work)) {
1182
0
          b = ir_worklist_pop(&work);
1183
0
          if (b != hdr) {
1184
0
            ir_block *bb = &blocks[b];
1185
0
            bb->loop_header = hdr;
1186
0
            if (bb->predecessors_count) {
1187
0
              uint32_t *p = &edges[bb->predecessors];
1188
0
              uint32_t n = bb->predecessors_count;
1189
0
              do {
1190
0
                uint32_t pred = *p;
1191
0
                while (blocks[pred].loop_header > 0) {
1192
0
                  pred = blocks[pred].loop_header;
1193
0
                }
1194
0
                if (pred != hdr) {
1195
0
                  ir_worklist_push(&work, pred);
1196
0
                }
1197
0
                p++;
1198
0
              } while (--n);
1199
0
            }
1200
0
          }
1201
0
        }
1202
0
      }
1203
0
    }
1204
0
  }
1205
0
  IR_ASSERT(!prev_irreducible);
1206
1207
0
  if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
1208
0
    for (n = 1; n < count; n++) {
1209
0
      b = sorted_blocks[n];
1210
0
      ir_block *bb = &blocks[b];
1211
0
      if (bb->loop_header > 0) {
1212
0
        ir_block *loop = &blocks[bb->loop_header];
1213
0
        uint32_t loop_depth = loop->loop_depth;
1214
1215
0
        if (bb->flags & IR_BB_LOOP_HEADER) {
1216
0
          loop_depth++;
1217
0
        }
1218
0
        bb->loop_depth = loop_depth;
1219
0
        if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
1220
0
          loop->flags |= IR_BB_LOOP_WITH_ENTRY;
1221
0
          if (loop_depth > 1) {
1222
            /* Set IR_BB_LOOP_WITH_ENTRY flag for all the enclosing loops */
1223
0
            bb = &blocks[loop->loop_header];
1224
0
            while (1) {
1225
0
              if (bb->flags & IR_BB_LOOP_WITH_ENTRY) {
1226
0
                break;
1227
0
              }
1228
0
              bb->flags |= IR_BB_LOOP_WITH_ENTRY;
1229
0
              if (bb->loop_depth == 1) {
1230
0
                break;
1231
0
              }
1232
0
bb = &blocks[bb->loop_header];
1233
0
            }
1234
0
          }
1235
0
        }
1236
0
      }
1237
0
    }
1238
0
  }
1239
1240
0
  ir_mem_free(entry_times);
1241
0
  ir_worklist_free(&work);
1242
1243
0
  return 1;
1244
0
}
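The entry/exit-time comparisons above are DFS-interval ancestor tests on the DJ spanning tree. As a standalone sketch (a hypothetical predicate; the loop-detection code above inlines it):

/* Block "d" is a descendant of block "a" iff a's DFS interval encloses d's,
 * i.e. "a" was entered before and exited after "d". */
static bool dj_is_descendant(const uint32_t *entry_times, const uint32_t *exit_times,
                             uint32_t d, uint32_t a)
{
  return entry_times[d] > entry_times[a] && exit_times[d] < exit_times[a];
}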
1245
1246
static uint32_t _ir_skip_empty_blocks(const ir_ctx *ctx, uint32_t b)
1247
0
{
1248
0
  while (1) {
1249
0
    ir_block *bb = &ctx->cfg_blocks[b];
1250
1251
0
    if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1252
0
      IR_ASSERT(bb->successors_count == 1);
1253
0
      b = ctx->cfg_edges[bb->successors];
1254
0
    } else {
1255
0
      return b;
1256
0
    }
1257
0
  }
1258
0
}
1259
1260
/* A variation of the "Bottom-up Positioning" algorithm described by
1261
 * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
1262
 */
1263
typedef struct _ir_edge_info {
1264
  uint32_t from;
1265
  uint32_t to;
1266
  float    freq;
1267
} ir_edge_info;
1268
1269
typedef struct _ir_chain {
1270
  uint32_t head;
1271
  uint32_t next;
1272
  union {
1273
    uint32_t prev;
1274
    uint32_t tail;
1275
  };
1276
} ir_chain;
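The chains drive the Pettis-Hansen bottom-up merge: edges are sorted hottest-first and the two chains an edge connects are concatenated while both endpoints are still free. A minimal sketch of such a driver loop (an assumption about the part of the pass that lies beyond this report's truncation; i, edges, edges_count and chains are as in ir_schedule_blocks_bottom_up() below):

/* Merge chains along the hottest edges first; "edges" is assumed to be
 * already sorted by ir_edge_info_cmp(). */
for (i = 0; i < edges_count; i++) {
  uint32_t from = edges[i].from;
  uint32_t to = edges[i].to;

  if (!chains[from].head || !chains[to].head) {
    continue; /* a block already scheduled to the end (head == 0) */
  }
  uint32_t src = ir_chain_head(chains, from);
  uint32_t dst = ir_chain_head(chains, to);

  /* join only if it keeps both chains' internal order: "from" must still
   * end its chain and "to" must still begin its own chain */
  if (src != dst && chains[src].tail == from && dst == to) {
    ir_join_chains(chains, src, dst); /* appends dst's chain after src's */
  }
}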
1277
1278
static int ir_edge_info_cmp(const void *b1, const void *b2)
1279
0
{
1280
0
  ir_edge_info *e1 = (ir_edge_info*)b1;
1281
0
  ir_edge_info *e2 = (ir_edge_info*)b2;
1282
1283
0
  if (e1->freq != e2->freq) {
1284
0
    return e1->freq < e2->freq ? 1 : -1;
1285
0
  }
1286
  /* In case of equal frequencies, try to avoid penalization of one of the "equal" paths by
1287
   * preferring the first RPO successor (in conditional branches) and the last RPO predecessor
1288
   * (in merge points).
1289
   *
1290
   * See "Static Basic Block Reordering Heuristics for Implicit Control Flow in Baseline JITs"
1291
   * Polito Guillermo, Ducasse Stephane, and Tesone Pablo (2021)
1292
   */
1293
0
  if (e1->from != e2->from) {
1294
0
    return e2->from - e1->from;
1295
0
  } else {
1296
0
    return e1->to - e2->to;
1297
0
  }
1298
0
}
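This comparator is written for qsort(); a usage sketch (assuming <stdlib.h> and the edges array built by ir_schedule_blocks_bottom_up() below):

/* Sort edges by descending frequency; ties fall back to the RPO heuristics
 * described in the comment above. */
qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);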
1299
1300
static IR_NEVER_INLINE uint32_t ir_chain_head_path_compress(ir_chain *chains, uint32_t src, uint32_t head)
1301
0
{
1302
0
  IR_ASSERT(head != 0);
1303
0
  do {
1304
0
    head = chains[head].head;
1305
0
  } while (chains[head].head != head);
1306
0
  do {
1307
0
    uint32_t next = chains[src].head;
1308
0
    chains[src].head = head;
1309
0
    src = next;
1310
0
  } while (chains[src].head != head);
1311
0
  return head;
1312
0
}
1313
1314
IR_ALWAYS_INLINE uint32_t ir_chain_head(ir_chain *chains, uint32_t src)
1315
0
{
1316
0
  uint32_t head = chains[src].head;
1317
0
  if (chains[head].head == head) {
1318
0
    return head;
1319
0
  } else {
1320
0
    return ir_chain_head_path_compress(chains, src, head);
1321
0
  }
1322
0
}
1323
1324
static void ir_join_chains(ir_chain *chains, uint32_t src, uint32_t dst)
1325
0
{
1326
0
  uint32_t dst_tail = chains[dst].tail;
1327
0
  uint32_t src_tail = chains[src].tail;
1328
1329
0
  chains[dst_tail].next = src;
1330
0
  chains[dst].prev = src_tail;
1331
0
  chains[src_tail].next = dst;
1332
0
  chains[src].tail = dst_tail;
1333
0
  chains[dst].head = src;
1334
0
}
1335
1336
static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before)
1337
0
{
1338
0
  ir_chain *this = &chains[c];
1339
0
  ir_chain *next = &chains[before];
1340
1341
0
  IR_ASSERT(chains[c].head == c);
1342
0
  if (chains[before].head != before) {
1343
0
    this->head = next->head;
1344
0
  } else {
1345
0
    next->head = c;
1346
0
  }
1347
0
  this->next = before;
1348
0
  this->prev = next->prev;
1349
0
  next->prev = c;
1350
0
  chains[this->prev].next = c;
1351
0
}
1352
1353
#ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
1354
# ifdef IR_DEBUG
1355
0
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 1
1356
# else
1357
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 0
1358
# endif
1359
#endif
1360
1361
#if IR_DEBUG_BB_SCHEDULE_GRAPH
1362
static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_count, ir_edge_info *edges, ir_chain *chains)
1363
0
{
1364
0
  uint32_t b, i;
1365
0
  ir_block *bb;
1366
0
  uint8_t c, *colors;
1367
0
  bool is_head, is_empty;
1368
0
  uint32_t max_colors = 12;
1369
1370
0
  colors = alloca(sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
1371
0
  memset(colors, 0, sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
1372
0
  i = 0;
1373
0
  for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
1374
0
    if (chains[b].head == b) {
1375
0
      colors[b] = (i % max_colors) + 1;
1376
0
      i++;
1377
0
    }
1378
0
  }
1379
1380
0
  fprintf(stderr, "digraph {\n");
1381
0
  fprintf(stderr, "\trankdir=TB;\n");
1382
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1383
0
    bb = &ctx->cfg_blocks[b];
1384
0
    c = (chains[b].head) ? colors[ir_chain_head(chains, b)] : 0;
1385
0
    is_head = chains[b].head == b;
1386
0
    is_empty = (bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY;
1387
0
    if (c) {
1388
0
      fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\",colorscheme=set312,fillcolor=%d%s%s]\n", b, b,
1389
0
        bb->loop_depth, bb_freq[b],
1390
0
        ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
1391
0
        c,
1392
0
        is_head ? ",penwidth=3" : "",
1393
0
        is_empty ? ",style=\"dotted,filled\"" : ",style=\"filled\"");
1394
0
    } else {
1395
0
      fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\"%s%s]\n", b, b,
1396
0
        bb->loop_depth, bb_freq[b],
1397
0
        ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
1398
0
        is_head ? ",penwidth=3" : "",
1399
0
        is_empty ? ",style=\"dotted\"" : "");
1400
0
    }
1401
0
  }
1402
0
  fprintf(stderr, "\n");
1403
1404
0
  for (i = 0; i < edges_count; i++) {
1405
0
    fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
1406
0
  }
1407
0
  fprintf(stderr, "}\n");
1408
0
}
1409
#endif
1410
1411
#ifdef IR_DEBUG
1412
static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
1413
0
{
1414
0
  uint32_t i;
1415
1416
0
  fprintf(stderr, "Edges:\n");
1417
0
  for (i = 0; i < edges_count; i++) {
1418
0
    fprintf(stderr, "\tBB%d -> BB%d %0.3f\n", edges[i].from, edges[i].to, edges[i].freq);
1419
0
  }
1420
0
}
1421
1422
static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
1423
0
{
1424
0
  uint32_t b, tail, i;
1425
1426
0
  fprintf(stderr, "Chains:\n");
1427
0
  for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
1428
0
    if (chains[b].head == b) {
1429
0
      tail = chains[b].tail;
1430
0
      i = b;
1431
0
      fprintf(stderr, "(BB%d", i);
1432
0
      while (i != tail) {
1433
0
        i = chains[i].next;
1434
0
        fprintf(stderr, ", BB%d", i);
1435
0
      }
1436
0
      fprintf(stderr, ")\n");
1437
0
    }
1438
0
  }
1439
0
}
1440
#endif
1441
1442
static int ir_schedule_blocks_bottom_up(ir_ctx *ctx)
1443
0
{
1444
0
  uint32_t max_edges_count = ctx->cfg_edges_count / 2;
1445
0
  uint32_t edges_count = 0;
1446
0
  uint32_t b, i, loop_depth;
1447
0
  float *bb_freq, freq;
1448
0
  ir_block *bb;
1449
0
  ir_edge_info *edges, *e;
1450
0
  ir_chain *chains;
1451
0
  ir_bitqueue worklist;
1452
0
  ir_bitset visited;
1453
0
  uint32_t *schedule_end, count;
1454
1455
0
  ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1456
0
  schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count;
1457
1458
  /* 1. Create initial chains for each BB */
1459
0
  chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1));
1460
0
  chains[0].head = 0;
1461
0
  chains[0].next = 0;
1462
0
  chains[0].prev = 0;
1463
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1464
0
    chains[b].head = b;
1465
0
    chains[b].next = b;
1466
0
    chains[b].prev = b;
1467
0
  }
1468
1469
/* 2. Collect information about BB and EDGE frequencies */
1470
0
  edges = ir_mem_malloc(sizeof(ir_edge_info) * max_edges_count);
1471
0
  bb_freq = ir_mem_calloc(ctx->cfg_blocks_count + 1, sizeof(float));
1472
0
  bb_freq[1] = 1.0f;
1473
0
  visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
1474
0
  ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
1475
0
  ir_bitqueue_add(&worklist, 1);
1476
0
  while ((b = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
1477
0
restart:
1478
0
    bb = &ctx->cfg_blocks[b];
1479
0
    if (bb->predecessors_count) {
1480
0
      uint32_t n = bb->predecessors_count;
1481
0
      uint32_t *p = ctx->cfg_edges + bb->predecessors;
1482
0
      for (; n > 0; p++, n--) {
1483
0
        uint32_t predecessor = *p;
1484
/* Basic blocks are ordered in a way that usual predecessor ids are less than successor ids.
1485
 * So we may compare block ids (predecessor < b) instead of a more expensive check for a back edge
1486
 * (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b)
1487
 */
1488
0
        if (predecessor < b) {
1489
0
          if (!ir_bitset_in(visited, predecessor)) {
1490
0
            b = predecessor;
1491
0
            ir_bitqueue_del(&worklist, b);
1492
0
            goto restart;
1493
0
          }
1494
0
} else {
1495
/* otherwise this must be a self-reference or a loop back-edge */
1496
0
IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b);
1497
0
        }
1498
0
      }
1499
0
    }
1500
1501
0
    ir_bitset_incl(visited, b);
1502
1503
0
    if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1504
0
      uint32_t successor = ctx->cfg_edges[bb->successors];
1505
1506
      /* move empty blocks to the end */
1507
0
      IR_ASSERT(chains[b].head == b);
1508
0
      chains[b].head = 0;
1509
0
      *schedule_end = b;
1510
0
      schedule_end--;
1511
1512
0
      if (successor > b) {
1513
0
        bb_freq[successor] += bb_freq[b];
1514
0
        b = successor;
1515
0
        ir_bitqueue_del(&worklist, b);
1516
0
        goto restart;
1517
0
      } else {
1518
0
        continue;
1519
0
      }
1520
0
    }
1521
1522
0
    loop_depth = bb->loop_depth;
1523
0
    if (bb->flags & IR_BB_LOOP_HEADER) {
1524
// TODO: Estimate the loop iteration count
1525
0
      bb_freq[b] *= 10;
1526
0
    }
1527
1528
0
    if (bb->successors_count) {
1529
0
      uint32_t n = bb->successors_count;
1530
0
      uint32_t *p = ctx->cfg_edges + bb->successors;
1531
1532
0
      if (n == 1) {
1533
0
        uint32_t successor = *p;
1534
1535
0
        IR_ASSERT(edges_count < max_edges_count);
1536
0
        freq = bb_freq[b];
1537
0
        if (successor > b) {
1538
0
          IR_ASSERT(!ir_bitset_in(visited, successor));
1539
0
          bb_freq[successor] += freq;
1540
0
          ir_bitqueue_add(&worklist, successor);
1541
0
        }
1542
0
        successor = _ir_skip_empty_blocks(ctx, successor);
1543
0
        edges[edges_count].from = b;
1544
0
        edges[edges_count].to = successor;
1545
0
        edges[edges_count].freq = freq;
1546
0
        edges_count++;
1547
0
      } else if (n == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1548
0
        uint32_t successor1 = *p;
1549
0
        ir_block *successor1_bb = &ctx->cfg_blocks[successor1];
1550
0
        ir_insn *insn1 = &ctx->ir_base[successor1_bb->start];
1551
0
        uint32_t successor2 = *(p + 1);
1552
0
        ir_block *successor2_bb = &ctx->cfg_blocks[successor2];
1553
0
        ir_insn *insn2 = &ctx->ir_base[successor2_bb->start];
1554
0
        int prob1, prob2, probN = 100;
1555
1556
0
        if (insn1->op2) {
1557
0
          prob1 = insn1->op2;
1558
0
          if (insn2->op2) {
1559
0
            prob2 = insn2->op2;
1560
0
            probN = prob1 + prob2;
1561
0
          } else {
1562
0
            if (prob1 > 100) {
1563
0
              prob1 = 100;
1564
0
            }
1565
0
            prob2 = 100 - prob1;
1566
0
          }
1567
1568
0
        } else if (insn2->op2) {
1569
0
          prob2 = insn2->op2;
1570
0
          if (prob2 > 100) {
1571
0
            prob2 = 100;
1572
0
          }
1573
0
          prob1 = 100 - prob2;
1574
0
        } else if (successor1_bb->loop_depth >= loop_depth
1575
0
            && successor2_bb->loop_depth < loop_depth) {
1576
0
          prob1 = 90;
1577
0
          prob2 = 10;
1578
0
        } else if (successor1_bb->loop_depth < loop_depth
1579
0
            && successor2_bb->loop_depth >= loop_depth) {
1580
0
          prob1 = 10;
1581
0
          prob2 = 90;
1582
0
        } else if (successor2_bb->flags & IR_BB_EMPTY) {
1583
0
          prob1 = 51;
1584
0
          prob2 = 49;
1585
0
        } else if (successor1_bb->flags & IR_BB_EMPTY) {
1586
0
          prob1 = 49;
1587
0
          prob2 = 51;
1588
0
        } else {
1589
0
          prob1 = prob2 = 50;
1590
0
        }
1591
0
        do {
1592
0
          freq = bb_freq[b] * (float)prob1 / (float)probN;
1593
0
          if (successor1 > b) {
1594
0
            IR_ASSERT(!ir_bitset_in(visited, successor1));
1595
0
            bb_freq[successor1] += freq;
1596
0
            if (successor1_bb->successors_count == 0 && insn1->op2 == 1) {
1597
              /* move cold block without successors to the end */
1598
0
              IR_ASSERT(chains[successor1].head == successor1);
1599
0
              chains[successor1].head = 0;
1600
0
              *schedule_end = successor1;
1601
0
              schedule_end--;
1602
0
              break;
1603
0
            } else {
1604
0
              ir_bitqueue_add(&worklist, successor1);
1605
0
            }
1606
0
          }
1607
/* try to join edges early to reduce the number of edges and the cost of their sorting */
1608
0
          if (prob1 > prob2
1609
0
           && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1610
0
            uint32_t src = chains[b].next;
1611
0
            IR_ASSERT(chains[src].head == src);
1612
0
            IR_ASSERT(src == ir_chain_head(chains, b));
1613
0
            IR_ASSERT(chains[successor1].head == successor1);
1614
0
            ir_join_chains(chains, src, successor1);
1615
0
            if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1616
0
          }
1617
0
          successor1 = _ir_skip_empty_blocks(ctx, successor1);
1618
0
          IR_ASSERT(edges_count < max_edges_count);
1619
0
          edges[edges_count].from = b;
1620
0
          edges[edges_count].to = successor1;
1621
0
          edges[edges_count].freq = freq;
1622
0
          edges_count++;
1623
0
        } while (0);
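        /* The do { ... } while (0) wrapper lets "break" skip recording the
         * edge: either a cold successor without successors was moved to the
         * end of the schedule, or the chains were already joined early (the
         * edge is still recorded when IR_DEBUG_BB_SCHEDULE_GRAPH needs the
         * full edge list). The same pattern repeats for successor2 below. */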
1624
0
        do {
1625
0
          freq = bb_freq[b] * (float)prob2 / (float)probN;
1626
0
          if (successor2 > b) {
1627
0
            IR_ASSERT(!ir_bitset_in(visited, successor2));
1628
0
            bb_freq[successor2] += freq;
1629
0
            if (successor2_bb->successors_count == 0 && insn2->op2 == 1) {
1630
              /* move cold block without successors to the end */
1631
0
              IR_ASSERT(chains[successor2].head == successor2);
1632
0
              chains[successor2].head = 0;
1633
0
              *schedule_end = successor2;
1634
0
              schedule_end--;
1635
0
              break;
1636
0
            } else {
1637
0
              ir_bitqueue_add(&worklist, successor2);
1638
0
            }
1639
0
          }
1640
0
          if (prob2 > prob1
1641
0
           && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1642
0
            uint32_t src = chains[b].next;
1643
0
            IR_ASSERT(chains[src].head == src);
1644
0
            IR_ASSERT(src == ir_chain_head(chains, b));
1645
0
            IR_ASSERT(chains[successor2].head == successor2);
1646
0
            ir_join_chains(chains, src, successor2);
1647
0
            if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1648
0
          }
1649
0
          successor2 = _ir_skip_empty_blocks(ctx, successor2);
1650
0
          IR_ASSERT(edges_count < max_edges_count);
1651
0
          edges[edges_count].from = b;
1652
0
          edges[edges_count].to = successor2;
1653
0
          edges[edges_count].freq = freq;
1654
0
          edges_count++;
1655
0
        } while (0);
1656
0
      } else {
1657
0
        int prob;
1658
1659
0
        for (; n > 0; p++, n--) {
1660
0
          uint32_t successor = *p;
1661
0
          ir_block *successor_bb = &ctx->cfg_blocks[successor];
1662
0
          ir_insn *insn = &ctx->ir_base[successor_bb->start];
1663
1664
0
          if (insn->op == IR_CASE_DEFAULT) {
1665
0
            prob = insn->op2;
1666
0
            if (!prob) {
1667
0
              prob = 100 / bb->successors_count;
1668
0
            }
1669
0
          } else if (insn->op == IR_CASE_VAL) {
1670
0
            prob = insn->op3;
1671
0
            if (!prob) {
1672
0
              prob = 100 / bb->successors_count;
1673
0
            }
1674
0
          } else if (insn->op == IR_ENTRY) {
1675
0
            if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1676
0
            prob = 99; /* prefer the empty ENTRY block to go first */
1677
0
            } else {
1678
0
              prob = 1;
1679
0
            }
1680
0
          } else {
1681
0
            prob = 100 / bb->successors_count;
1682
0
          }
1683
0
          freq = bb_freq[b] * (float)prob / 100.0f;
1684
0
          if (successor > b) {
1685
0
            IR_ASSERT(!ir_bitset_in(visited, successor));
1686
0
            bb_freq[successor] += freq;
1687
0
            ir_bitqueue_add(&worklist, successor);
1688
0
          }
1689
0
          successor = _ir_skip_empty_blocks(ctx, successor);
1690
0
          IR_ASSERT(edges_count < max_edges_count);
1691
0
          edges[edges_count].from = b;
1692
0
          edges[edges_count].to = successor;
1693
0
          edges[edges_count].freq = freq;
1694
0
          edges_count++;
1695
0
        }
1696
0
      }
1697
0
    }
1698
0
  }
1699
0
  ir_bitqueue_free(&worklist);
1700
0
  ir_mem_free(visited);
1701
1702
  /* 2. Sort EDGEs according to their frequencies */
1703
0
  qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);
1704
1705
0
#ifdef IR_DEBUG
1706
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1707
0
    ir_dump_edges(ctx, edges_count, edges);
1708
0
  }
1709
0
#endif
1710
1711
  /* 3. Process EDGEs in decreasing frequency order and join the connected chains */
1712
0
  for (e = edges, i = edges_count; i > 0; e++, i--) {
1713
0
    uint32_t dst = chains[e->to].head;
1714
0
    if (dst == e->to) {
1715
0
      uint32_t src = chains[e->from].next;
1716
0
      if (chains[src].head == src) {
1717
0
        IR_ASSERT(src == ir_chain_head(chains, e->from) && chains[src].tail == e->from);
1718
0
        if (src != dst) {
1719
0
          ir_join_chains(chains, src, dst);
1720
0
        } else if (ctx->cfg_blocks[e->from].successors_count < 2) {
1721
          /* Try to rotate the loop chain to finish it with a conditional branch */
1722
0
          uint32_t tail = e->from;
1723
0
          uint32_t prev = src;
1724
0
          uint32_t next = chains[prev].next;
1725
0
          uint32_t best = 0;
1726
1727
0
          while (prev != tail) {
1728
0
            if (ctx->ir_base[ctx->cfg_blocks[prev].end].op == IR_IF) {
1729
0
              if (ctx->ir_base[ctx->cfg_blocks[prev].start].op == IR_LOOP_BEGIN
1730
0
               && ctx->cfg_blocks[prev].loop_depth > 1) {
1731
0
                best = next;
1732
0
                break;
1733
0
              } else if (!best || bb_freq[next] < bb_freq[best]) {
1734
                /* Find the successor of IF with the least frequency */
1735
0
                best = next;
1736
0
              }
1737
0
            }
1738
0
            prev = next;
1739
0
            next = chains[next].next;
1740
0
          }
1741
0
          if (best) {
1742
            /* change the head of the chain */
1743
0
            chains[src].head = best;
1744
0
            chains[best].head = best;
1745
0
          }
1746
0
        }
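        /* Putting the loop's test at the bottom this way lets the back edge
         * be taken by the conditional jump itself, avoiding an extra
         * unconditional JMP on each iteration. */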
1747
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
1748
        e->from = 0; /* reset "from" to avoid the check on step #6 */
1749
#endif
1750
0
      }
1751
0
    }
1752
0
  }
1753
1754
0
#if IR_DEBUG_BB_SCHEDULE_GRAPH
1755
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1756
0
    ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
1757
0
  }
1758
0
#endif
1759
1760
0
  ir_mem_free(bb_freq);
1761
1762
0
#ifdef IR_DEBUG
1763
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1764
0
    ir_dump_chains(ctx, chains);
1765
0
  }
1766
0
#endif
1767
1768
  /* 4. Merge empty entry blocks */
1769
0
  if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && ctx->entries_count) {
1770
0
    for (i = 0; i < ctx->entries_count; i++) {
1771
0
      b = ctx->entries[i];
1772
0
      IR_ASSERT(ctx->cfg_blocks[b].flags & IR_BB_ENTRY);
1773
0
      if (b && chains[b].head == b && chains[b].tail == b) {
1774
0
        bb = &ctx->cfg_blocks[b];
1775
0
        if (bb->flags & IR_BB_EMPTY) {
1776
0
          uint32_t successor;
1777
1778
0
          do {
1779
0
            IR_ASSERT(bb->successors_count == 1);
1780
0
            successor = ctx->cfg_edges[bb->successors];
1781
0
            bb = &ctx->cfg_blocks[successor];
1782
0
          } while ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY);
1783
0
          IR_ASSERT(chains[successor].head);
1784
0
          ir_insert_chain_before(chains, b, successor);
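          /* The empty ENTRY block is spliced into the chain right before the
           * first non-empty block reachable through its single-successor
           * path, so the ENTRY falls through directly into real code. */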
1785
0
        }
1786
0
      }
1787
0
    }
1788
1789
0
#ifdef IR_DEBUG
1790
0
    if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1791
0
      ir_dump_chains(ctx, chains);
1792
0
    }
1793
0
#endif
1794
0
  }
1795
1796
  /* 5. Align loop headers */
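  /* A chain head that is itself a loop header (or that leads the chain
   * containing its loop header) is flagged IR_BB_ALIGN_LOOP; presumably the
   * code generator uses this flag to align the loop entry point. */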
1797
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1798
0
    if (chains[b].head == b) {
1799
0
      bb = &ctx->cfg_blocks[b];
1800
0
      if (bb->loop_depth) {
1801
0
        if ((bb->flags & IR_BB_LOOP_HEADER) || ir_chain_head(chains, bb->loop_header) == b) {
1802
0
          bb->flags |= IR_BB_ALIGN_LOOP;
1803
0
        }
1804
0
      }
1805
0
    }
1806
0
  }
1807
1808
  /* 6. Group chains according to the most frequent edge between them */
1809
  // TODO: Try to find a better heuristic
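  /* Chains are connected along their most frequent remaining edges. The
   * dst == 1 special case keeps the chain headed by block 1 (the function
   * entry) in front: the entry chain always absorbs the other chain rather
   * than being appended to it. */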
1810
0
  for (e = edges, i = edges_count; i > 0; e++, i--) {
1811
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
1812
    if (!e->from) continue;
1813
#endif
1814
0
    uint32_t src = ir_chain_head(chains, e->from);
1815
0
    uint32_t dst = ir_chain_head(chains, e->to);
1816
0
    if (src != dst) {
1817
0
      if (dst == 1) {
1818
0
        ir_join_chains(chains, dst, src);
1819
0
      } else {
1820
0
        ir_join_chains(chains, src, dst);
1821
0
      }
1822
0
    }
1823
0
  }
1824
1825
0
#ifdef IR_DEBUG
1826
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1827
0
    ir_dump_chains(ctx, chains);
1828
0
  }
1829
0
#endif
1830
1831
  /* 7. Form a final BB order */
1832
0
  count = 0;
1833
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1834
0
    if (chains[b].head == b) {
1835
0
      uint32_t tail = chains[b].tail;
1836
0
      uint32_t i = b;
1837
0
      while (1) {
1838
0
        count++;
1839
0
        ctx->cfg_schedule[count] = i;
1840
0
        if (i == tail) break;
1841
0
        i = chains[i].next;
1842
0
      }
1843
0
    }
1844
0
  }
1845
1846
0
  IR_ASSERT(ctx->cfg_schedule + count == schedule_end);
1847
0
  ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0;
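  /* cfg_schedule is 1-based and zero-terminated; ir_next_block() relies on
   * the trailing 0 to detect the end of the final block order. */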
1848
1849
0
  ir_mem_free(edges);
1850
0
  ir_mem_free(chains);
1851
1852
0
  return 1;
1853
0
}
1854
1855
/* A variation of "Top-down Positioning" algorithm described by
1856
 * Karl Pettis and Robert C. Hansen "Profile Guided Code Positioning"
1857
 */
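/* In outline: instead of merging chains bottom-up by edge frequency, this
 * variant grows one trace at a time: pop the lowest-numbered unscheduled
 * block, then keep extending the trace with the most promising unscheduled
 * successor, while empty blocks are pushed to the end of the schedule. */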
1858
static int ir_schedule_blocks_top_down(ir_ctx *ctx)
1859
0
{
1860
0
  ir_bitqueue blocks;
1861
0
  uint32_t b, best_successor, last_non_empty;
1862
0
  ir_block *bb, *best_successor_bb;
1863
0
  ir_insn *insn;
1864
0
  uint32_t *list, *schedule_end;
1865
0
  uint32_t count = 0;
1866
1867
0
  ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
1868
0
  blocks.pos = 0;
1869
0
  list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1870
0
  list[ctx->cfg_blocks_count + 1] = 0;
1871
0
  schedule_end = list + ctx->cfg_blocks_count;
1872
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1873
0
    ir_bitset_incl(blocks.set, b);
1874
0
  }
1875
1876
0
  while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
1877
0
    bb = &ctx->cfg_blocks[b];
1878
    /* Start trace */
1879
0
    last_non_empty = 0;
1880
0
    do {
1881
0
      if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
1882
        /* Schedule the previous empty ENTRY block before this one */
1883
0
        uint32_t predecessor = b - 1;
1884
1885
0
        ir_bitqueue_del(&blocks, predecessor);
1886
0
        count++;
1887
0
        list[count] = predecessor;
1888
0
      }
1889
0
      if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1890
        /* move empty blocks to the end */
1891
0
        *schedule_end = b;
1892
0
        schedule_end--;
1893
0
      } else {
1894
0
        count++;
1895
0
        list[count] = b;
1896
0
        last_non_empty = b;
1897
0
      }
1898
0
      best_successor_bb = NULL;
1899
0
      if (bb->successors_count == 1) {
1900
0
        best_successor = ctx->cfg_edges[bb->successors];
1901
0
        if (ir_bitqueue_in(&blocks, best_successor)) {
1902
0
          best_successor_bb = &ctx->cfg_blocks[best_successor];
1903
0
        }
1904
0
      } else if (bb->successors_count > 1) {
1905
0
        uint32_t prob, best_successor_prob;
1906
0
        uint32_t *p, successor;
1907
0
        ir_block *successor_bb;
1908
1909
0
        for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
1910
0
          successor = *p;
1911
0
          if (ir_bitqueue_in(&blocks, successor)) {
1912
0
            successor_bb = &ctx->cfg_blocks[successor];
1913
0
            insn = &ctx->ir_base[successor_bb->start];
1914
0
            if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
1915
0
              prob = insn->op2;
1916
0
              if (!prob) {
1917
0
                prob = 100 / bb->successors_count;
1918
0
                if (!(successor_bb->flags & IR_BB_EMPTY)) {
1919
0
                  prob++;
1920
0
                }
1921
0
              }
1922
0
            } else if (insn->op == IR_CASE_DEFAULT) {
1923
0
              prob = insn->op2;
1924
0
              if (!prob) {
1925
0
                prob = 100 / bb->successors_count;
1926
0
              }
1927
0
            } else if (insn->op == IR_CASE_VAL) {
1928
0
              prob = insn->op3;
1929
0
              if (!prob) {
1930
0
                prob = 100 / bb->successors_count;
1931
0
              }
1932
0
            } else if (insn->op == IR_ENTRY) {
1933
0
              if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1934
0
                prob = 99; /* prefer the empty ENTRY block to go first */
1935
0
              } else {
1936
0
                prob = 1;
1937
0
              }
1938
0
            } else {
1939
0
              prob = 100 / bb->successors_count;
1940
0
            }
1941
0
            if (!best_successor_bb
1942
0
             || successor_bb->loop_depth > best_successor_bb->loop_depth
1943
0
             || prob > best_successor_prob) {
1944
0
              best_successor = successor;
1945
0
              best_successor_bb = successor_bb;
1946
0
              best_successor_prob = prob;
1947
0
            }
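            /* A candidate replaces the current best if it sits in a deeper
             * loop or has a higher estimated probability, so both loop depth
             * and probability pull the trace forward. */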
1948
0
          }
1949
0
        }
1950
0
      }
1951
0
      if (!best_successor_bb) {
1952
        /* Try to continue trace using the other successor of the last IF */
1953
0
        if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
1954
0
          bb = &ctx->cfg_blocks[last_non_empty];
1955
0
          if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1956
0
            b = ctx->cfg_edges[bb->successors];
1957
1958
0
            if (!ir_bitqueue_in(&blocks, b)) {
1959
0
              b = ctx->cfg_edges[bb->successors + 1];
1960
0
            }
1961
0
            if (ir_bitqueue_in(&blocks, b)) {
1962
0
              bb = &ctx->cfg_blocks[b];
1963
0
              ir_bitqueue_del(&blocks, b);
1964
0
              continue;
1965
0
            }
1966
0
          }
1967
0
        }
1968
        /* End trace */
1969
0
        break;
1970
0
      }
1971
0
      b = best_successor;
1972
0
      bb = best_successor_bb;
1973
0
      ir_bitqueue_del(&blocks, b);
1974
0
    } while (1);
1975
0
  }
1976
1977
0
  IR_ASSERT(list + count == schedule_end);
1978
0
  ctx->cfg_schedule = list;
1979
0
  ir_bitqueue_free(&blocks);
1980
1981
0
  return 1;
1982
0
}
1983
1984
int ir_schedule_blocks(ir_ctx *ctx)
1985
0
{
1986
0
  ir_ref ref;
1987
1988
0
  if (ctx->cfg_blocks_count <= 2) {
1989
0
    return 1;
1990
0
  }
1991
1992
  /* Mark "stop" blocks terminated with UNREACHABLE as "unexpected" */
1993
0
  ref = ctx->ir_base[1].op1;
1994
0
  while (ref) {
1995
0
    ir_insn *insn = &ctx->ir_base[ref];
1996
1997
0
    if (insn->op == IR_UNREACHABLE && ctx->ir_base[insn->op1].op != IR_TAILCALL) {
1998
0
      ir_block *bb = &ctx->cfg_blocks[ctx->cfg_map[ref]];
1999
0
      uint32_t n = bb->predecessors_count;
2000
2001
0
      if (n == 1) {
2002
0
        ir_insn *start_insn = &ctx->ir_base[bb->start];
2003
0
        if (start_insn->op == IR_IF_TRUE
2004
0
         || start_insn->op == IR_IF_FALSE
2005
0
         || start_insn->op == IR_CASE_DEFAULT) {
2006
0
          if (!start_insn->op2) start_insn->op2 = 1;
2007
0
        } else if (start_insn->op == IR_CASE_VAL) {
2008
0
          if (!start_insn->op3) start_insn->op3 = 1;
2009
0
        }
2010
0
      } else if (n > 1) {
2011
0
        uint32_t *p = &ctx->cfg_edges[bb->predecessors];
2012
2013
0
        for (; n > 0; p++, n--) {
2014
0
          bb = &ctx->cfg_blocks[*p];
2015
0
          if (bb->predecessors_count == 1) {
2016
0
            ir_insn *start_insn = &ctx->ir_base[bb->start];
2017
0
            if (start_insn->op == IR_IF_TRUE
2018
0
             || start_insn->op == IR_IF_FALSE
2019
0
             || start_insn->op == IR_CASE_DEFAULT) {
2020
0
              if (!start_insn->op2) start_insn->op2 = 1;
2021
0
            } else if (start_insn->op == IR_CASE_VAL) {
2022
0
              if (!start_insn->op3) start_insn->op3 = 1;
2023
0
            }
2024
0
          }
2025
0
        }
2026
0
      }
2027
0
    }
2028
0
    ref = insn->op3;
2029
0
  }
2030
2031
  /* The bottom-up Pettis-Hansen algorithm is expensive, O(n^3),
2032
   * so use it only for relatively small functions.
2033
   *
2034
   * TODO: make the choice between top-down and bottom-up algorithm configurable
2035
   */
2036
0
  if (UNEXPECTED(ctx->flags2 & IR_IRREDUCIBLE_CFG) || ctx->cfg_blocks_count > 256) {
2037
0
    return ir_schedule_blocks_top_down(ctx);
2038
0
  } else {
2039
0
    return ir_schedule_blocks_bottom_up(ctx);
2040
0
  }
2041
0
}
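/* A minimal usage sketch (hypothetical driver code, not part of this file);
 * the pass assumes the CFG and the loop information have been computed:
 *
 *     ir_build_cfg(&ctx);
 *     ir_find_loops(&ctx);
 *     if (ir_schedule_blocks(&ctx)) {
 *         // ctx.cfg_schedule now holds the final 1-based, 0-terminated order
 *     }
 */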
2042
2043
/* JMP target optimisation */
2044
uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
2045
0
{
2046
0
  return _ir_skip_empty_blocks(ctx, b);
2047
0
}
2048
2049
uint32_t ir_next_block(const ir_ctx *ctx, uint32_t b)
2050
0
{
2051
0
  ir_block *bb;
2052
2053
0
  if (ctx->cfg_schedule) {
2054
0
    uint32_t ret = ctx->cfg_schedule[++b];
2055
2056
    /* Check for empty ENTRY block */
2057
0
    while (ret && ((ctx->cfg_blocks[ret].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
2058
0
      ret = ctx->cfg_schedule[++b];
2059
0
    }
2060
0
    return ret;
2061
0
  }
2062
2063
0
  b++;
2064
0
  while (1) {
2065
0
    if (b > ctx->cfg_blocks_count) {
2066
0
      return 0;
2067
0
    }
2068
2069
0
    bb = &ctx->cfg_blocks[b];
2070
2071
0
    if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
2072
0
      b++;
2073
0
    } else {
2074
0
      break;
2075
0
    }
2076
0
  }
2077
0
  return b;
2078
0
}
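/* A typical (hypothetical) back-end use of ir_next_block() above together
 * with ir_get_true_false_blocks() below: when one of the returned targets
 * equals the next scheduled block, that branch can fall through and the
 * corresponding jump can be omitted. */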
2079
2080
void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block)
2081
0
{
2082
0
  ir_block *bb;
2083
0
  uint32_t *p, use_block;
2084
2085
0
  *true_block = 0;
2086
0
  *false_block = 0;
2087
0
  bb = &ctx->cfg_blocks[b];
2088
0
  IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
2089
0
  IR_ASSERT(bb->successors_count == 2);
2090
0
  p = &ctx->cfg_edges[bb->successors];
2091
0
  use_block = *p;
2092
0
  if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
2093
0
    *true_block = ir_skip_empty_target_blocks(ctx, use_block);
2094
0
    use_block = *(p+1);
2095
0
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
2096
0
    *false_block = ir_skip_empty_target_blocks(ctx, use_block);
2097
0
  } else {
2098
0
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
2099
0
    *false_block = ir_skip_empty_target_blocks(ctx, use_block);
2100
0
    use_block = *(p+1);
2101
0
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
2102
0
    *true_block = ir_skip_empty_target_blocks(ctx, use_block);
2103
0
  }
2104
0
  IR_ASSERT(*true_block && *false_block);
2105
0
}