Coverage Report

Created: 2025-12-31 07:28

/src/php-src/ext/opcache/jit/ir/ir_cfg.c
(Every line of this file has an execution count of 0 — the file was never executed in this run.)
/*
 * IR - Lightweight JIT Compilation Framework
 * (CFG - Control Flow Graph)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

#include "ir.h"
#include "ir_private.h"

static int ir_remove_unreachable_blocks(ir_ctx *ctx);

IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
{
  ir_use_list *use_list = &ctx->use_lists[ref];
  ir_ref *p, use, n = use_list->count;

  if (n < 2) {
    if (n == 1) {
      use = ctx->use_edges[use_list->refs];
      IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
      ir_worklist_push(worklist, use);
    }
  } else {
    p = &ctx->use_edges[use_list->refs];
    if (n == 2) {
      use = *p;
      IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
      ir_worklist_push(worklist, use);
      use = *(p + 1);
      IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
      ir_worklist_push(worklist, use);
    } else {
      for (; n > 0; p++, n--) {
        use = *p;
        IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
        ir_worklist_push(worklist, use);
      }
    }
  }
}

IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
{
  ir_ref n, ref;
  const ir_ref *p;

  if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
    n = insn->inputs_count;
    for (p = insn->ops + 1; n > 0; p++, n--) {
      ref = *p;
      IR_ASSERT(ref);
      ir_worklist_push(worklist, ref);
    }
  } else if (insn->op != IR_START) {
    if (EXPECTED(insn->op1)) {
      ir_worklist_push(worklist, insn->op1);
    }
  }
}

void ir_reset_cfg(ir_ctx *ctx)
{
  ctx->cfg_blocks_count = 0;
  ctx->cfg_edges_count = 0;
  if (ctx->cfg_blocks) {
    ir_mem_free(ctx->cfg_blocks);
    ctx->cfg_blocks = NULL;
    if (ctx->cfg_edges) {
      ir_mem_free(ctx->cfg_edges);
      ctx->cfg_edges = NULL;
    }
    if (ctx->cfg_map) {
      ir_mem_free(ctx->cfg_map);
      ctx->cfg_map = NULL;
    }
  }
}

static uint32_t IR_NEVER_INLINE ir_cfg_remove_dead_inputs(ir_ctx *ctx, uint32_t *_blocks, ir_block *blocks, uint32_t bb_count)
{
  uint32_t b, count = 0;
  ir_block *bb = blocks + 1;
  ir_insn *insn;
  ir_ref i, j, n, *ops, input;

  for (b = 1; b <= bb_count; b++, bb++) {
    bb->successors = count;
    count += ctx->use_lists[bb->end].count;
    bb->successors_count = 0;
    bb->predecessors = count;
    insn = &ctx->ir_base[bb->start];
    if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
      n = insn->inputs_count;
      ops = insn->ops;
      for (i = 1, j = 1; i <= n; i++) {
        input = ops[i];
        if (_blocks[input]) {
          if (i != j) {
            ops[j] = ops[i];
          }
          j++;
        } else if (input > 0) {
          ir_use_list_remove_one(ctx, input, bb->start);
        }
      }
      j--;
      if (j != n) {
        if (j == 1) {
          insn->op = IR_BEGIN;
        }
        insn->inputs_count = j;
        bb->predecessors_count = j;
        j++;
        for (; j <= n; j++) {
          ops[j] = IR_UNUSED;
        }
      }
    }
    count += bb->predecessors_count;
  }
  return count;
}

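/* Overview (as far as the comments below indicate): ir_build_cfg() discovers
 * basic blocks in two passes, a backward DFS from the "stop" nodes and then a
 * forward DFS for blocks reachable only through paths that never stop (e.g.
 * infinite loops), and finally materializes the block and edge arrays. */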
int ir_build_cfg(ir_ctx *ctx)
{
  ir_ref n, *p, ref, start, end;
  uint32_t b;
  ir_insn *insn;
  ir_worklist worklist;
  uint32_t bb_init_flags;
  uint32_t count, bb_count = 0;
  uint32_t edges_count = 0;
  ir_block *blocks, *bb;
  uint32_t *_blocks, *edges;
  ir_use_list *use_list;
  uint32_t len = ir_bitset_len(ctx->insns_count);
  ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
  ir_bitset bb_leaks = bb_starts + len;
  _blocks = ir_mem_calloc(ctx->insns_limit, sizeof(uint32_t));
  ir_worklist_init(&worklist, ctx->insns_count);

  /* First try to perform a backward DFS search starting from the "stop" nodes */

  /* Add all "stop" nodes */
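  /* ctx->ir_base[1].op1 appears to head a singly-linked list of terminator
   * instructions chained through their op3 fields (the same links that
   * ir_remove_unreachable_blocks() unlinks below). */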
  ref = ctx->ir_base[1].op1;
  while (ref) {
    ir_worklist_push(&worklist, ref);
    ref = ctx->ir_base[ref].op3;
  }

  while (ir_worklist_len(&worklist)) {
    ref = ir_worklist_pop(&worklist);
    insn = &ctx->ir_base[ref];

    if (insn->op == IR_NOP) {
      continue;
    }
    IR_ASSERT(IR_IS_BB_END(insn->op));
    /* Remember BB end */
    end = ref;
    /* Some successors of IF and SWITCH nodes may be inaccessible by backward DFS */
    use_list = &ctx->use_lists[end];
    n = use_list->count;
    if (n > 1 || (n == 1 && (ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR) != 0)) {
      for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
        /* Remember possibly inaccessible successors */
        ir_bitset_incl(bb_leaks, *p);
      }
    }
    /* Skip control nodes until BB start */
    ref = insn->op1;
    while (1) {
      insn = &ctx->ir_base[ref];
      if (IR_IS_BB_START(insn->op)) {
        break;
      }
      ref = insn->op1; // follow connected control blocks until BB start
    }
    /* Mark BB Start */
    bb_count++;
    _blocks[ref] = end;
    ir_bitset_incl(bb_starts, ref);
    /* Add predecessors */
    _ir_add_predecessors(insn, &worklist);
  }

  /* A backward DFS may miss branches that end in infinite loops. */
  /* Try a forward DFS (in most cases all nodes have already been processed). */

  /* The START node may be inaccessible from "stop" nodes */
  ir_bitset_incl(bb_leaks, 1);

  /* Add the unprocessed START node and successors of IF and SWITCH */
  IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
    ir_worklist_push(&worklist, start);
  } IR_BITSET_FOREACH_END();

  if (ir_worklist_len(&worklist)) {
    ir_bitset_union(worklist.visited, bb_starts, len);
    do {
      ref = ir_worklist_pop(&worklist);
      insn = &ctx->ir_base[ref];

      if (insn->op == IR_NOP) {
        continue;
      }
      IR_ASSERT(IR_IS_BB_START(insn->op));
      /* Remember BB start */
      start = ref;
      /* Skip control nodes until BB end */
      while (1) {
        ref = ir_next_control(ctx, ref);
        insn = &ctx->ir_base[ref];
        if (IR_IS_BB_END(insn->op)) {
          break;
        }
      }
      /* Mark BB Start */
      bb_count++;
      _blocks[start] = ref;
      ir_bitset_incl(bb_starts, start);
      /* Add successors */
      _ir_add_successors(ctx, ref, &worklist);
    } while (ir_worklist_len(&worklist));
  }

  IR_ASSERT(bb_count > 0);

  /* Create an array of basic blocks and count successor/predecessor edges for each BB */
  blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
  b = 1;
  bb = blocks + 1;
  count = 0;
  /* SCCP already removed UNREACHABLE blocks; otherwise all blocks are marked as UNREACHABLE first */
  bb_init_flags = (ctx->flags2 & IR_CFG_REACHABLE) ? 0 : IR_BB_UNREACHABLE;
  IR_BITSET_FOREACH(bb_starts, len, start) {
    insn = &ctx->ir_base[start];
    if (insn->op == IR_NOP) {
      _blocks[start] = 0;
      continue;
    }
    end = _blocks[start];
    _blocks[start] = b;
    _blocks[end] = b;
    IR_ASSERT(IR_IS_BB_START(insn->op));
    bb->start = start;
    bb->end = end;
    bb->successors = count;
    count += ctx->use_lists[end].count;
    bb->successors_count = 0;
    bb->predecessors = count;
    bb->dom_parent = 0;
    bb->dom_depth = 0;
    bb->dom_child = 0;
    bb->dom_next_child = 0;
    bb->loop_header = 0;
    bb->loop_depth = 0;
    if (insn->op == IR_START) {
      bb->flags = IR_BB_START;
      bb->predecessors_count = 0;
    } else {
      bb->flags = bb_init_flags;
      if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
        n = insn->inputs_count;
        bb->predecessors_count = n;
        edges_count += n;
        count += n;
      } else if (EXPECTED(insn->op1)) {
        if (insn->op == IR_ENTRY) {
          bb->flags |= IR_BB_ENTRY;
          ctx->entries_count++;
        }
        bb->predecessors_count = 1;
        edges_count++;
        count++;
      } else {
        IR_ASSERT(insn->op == IR_BEGIN); /* start of an unreachable block */
        bb->predecessors_count = 0;
      }
    }
    b++;
    bb++;
  } IR_BITSET_FOREACH_END();
  bb_count = b - 1;
  if (UNEXPECTED(count != edges_count * 2)) {
    count = ir_cfg_remove_dead_inputs(ctx, _blocks, blocks, bb_count);
    IR_ASSERT(count == edges_count * 2);
  }
  ir_mem_free(bb_starts);

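  /* Each CFG edge occupies two slots: one in the source block's successor
   * list and one in the destination block's predecessor list, hence the
   * edges_count * 2 sizing below. */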
  /* Create an array of successor/predecessor control edges */
  edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
  bb = blocks + 1;
  for (b = 1; b <= bb_count; b++, bb++) {
    insn = &ctx->ir_base[bb->start];
    if (bb->predecessors_count > 1) {
      uint32_t *q = edges + bb->predecessors;
      n = insn->inputs_count;
      for (p = insn->ops + 1; n > 0; p++, q++, n--) {
        ref = *p;
        IR_ASSERT(ref);
        ir_ref pred_b = _blocks[ref];
        ir_block *pred_bb = &blocks[pred_b];
        IR_ASSERT(pred_b > 0);
        *q = pred_b;
        edges[pred_bb->successors + pred_bb->successors_count++] = b;
      }
    } else if (bb->predecessors_count == 1) {
      ref = insn->op1;
      IR_ASSERT(ref);
      IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
      ir_ref pred_b = _blocks[ref];
      ir_block *pred_bb = &blocks[pred_b];
      edges[bb->predecessors] = pred_b;
      edges[pred_bb->successors + pred_bb->successors_count++] = b;
    }
  }

  ctx->cfg_blocks_count = bb_count;
  ctx->cfg_edges_count = edges_count * 2;
  ctx->cfg_blocks = blocks;
  ctx->cfg_edges = edges;
  ctx->cfg_map = _blocks;

  if (!(ctx->flags2 & IR_CFG_REACHABLE)) {
    uint32_t reachable_count = 0;

    /* Mark reachable blocks */
    ir_worklist_clear(&worklist);
    ir_worklist_push(&worklist, 1);
    while (ir_worklist_len(&worklist) != 0) {
      uint32_t *p;

      reachable_count++;
      b = ir_worklist_pop(&worklist);
      bb = &blocks[b];
      bb->flags &= ~IR_BB_UNREACHABLE;
      n = bb->successors_count;
      if (n > 1) {
        for (p = edges + bb->successors; n > 0; p++, n--) {
          ir_worklist_push(&worklist, *p);
        }
      } else if (n == 1) {
        ir_worklist_push(&worklist, edges[bb->successors]);
      }
    }
    if (reachable_count != ctx->cfg_blocks_count) {
      ir_remove_unreachable_blocks(ctx);
    }
  }

  ir_worklist_free(&worklist);

  return 1;
}

static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
{
  uint32_t i, *p, *q, n = 0;

  p = q = &ctx->cfg_edges[bb->predecessors];
  for (i = 0; i < bb->predecessors_count; i++, p++) {
    if (*p != from) {
      if (p != q) {
        *q = *p;
      }
      q++;
      n++;
    }
  }
  IR_ASSERT(n != bb->predecessors_count);
  bb->predecessors_count = n;
}

static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
{
  ir_ref i, j, n, k, *p, *q, use;
  ir_insn *use_insn;
  ir_use_list *use_list;
  ir_bitset life_inputs;
  ir_insn *insn = &ctx->ir_base[merge];

  IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
  n = insn->inputs_count;
  i = 1;
  life_inputs = ir_bitset_malloc(n + 1);
  for (j = 1; j <= n; j++) {
    ir_ref input = ir_insn_op(insn, j);

    if (input != from) {
      if (i != j) {
        ir_insn_set_op(insn, i, input);
      }
      ir_bitset_incl(life_inputs, j);
      i++;
    }
  }
  i--;
  for (j = i + 1; j <= n; j++) {
    ir_insn_set_op(insn, j, IR_UNUSED);
  }
  if (i == 1) {
    insn->op = IR_BEGIN;
    insn->inputs_count = 1;
    use_list = &ctx->use_lists[merge];
    if (use_list->count > 1) {
      n++;
      for (k = use_list->count, p = q = &ctx->use_edges[use_list->refs]; k > 0; p++, k--) {
        use = *p;
        use_insn = &ctx->ir_base[use];
        if (use_insn->op == IR_PHI) {
          /* Convert PHI to COPY */
          i = 2;
          for (j = 2; j <= n; j++) {
            ir_ref input = ir_insn_op(use_insn, j);

            if (ir_bitset_in(life_inputs, j - 1)) {
              use_insn->op1 = ir_insn_op(use_insn, j);
            } else if (input > 0) {
              ir_use_list_remove_one(ctx, input, use);
            }
          }
          use_insn->op = IR_COPY;
          use_insn->inputs_count = 1;
          for (j = 2; j <= n; j++) {
            ir_insn_set_op(use_insn, j, IR_UNUSED);
          }
          continue;
        }

        /* compact the use list */
        if (p != q) {
          *q = use;
        }
        q++;
      }

      if (p != q) {
        use_list->count -= (p - q);
        do {
          *q = IR_UNUSED; /* clean up the removed tail */
          q++;
        } while (p != q);
      }
    }
  } else {
    insn->inputs_count = i;

    use_list = &ctx->use_lists[merge];
    if (use_list->count > 1) {
      n++;
      for (k = use_list->count, p = &ctx->use_edges[use_list->refs]; k > 0; p++, k--) {
        use = *p;
        use_insn = &ctx->ir_base[use];
        if (use_insn->op == IR_PHI) {
          i = 2;
          for (j = 2; j <= n; j++) {
            ir_ref input = ir_insn_op(use_insn, j);

            if (ir_bitset_in(life_inputs, j - 1)) {
              IR_ASSERT(input);
              if (i != j) {
                ir_insn_set_op(use_insn, i, input);
              }
              i++;
            } else if (input > 0) {
              ir_use_list_remove_one(ctx, input, use);
            }
          }
          use_insn->inputs_count = i - 1;
          for (j = i; j <= n; j++) {
            ir_insn_set_op(use_insn, j, IR_UNUSED);
          }
        }
      }
    }
  }
  ir_mem_free(life_inputs);
  ir_use_list_remove_all(ctx, from, merge);
}

/* A CFG constructed after the SCCP pass has no unreachable BBs; otherwise they must be removed */
static int ir_remove_unreachable_blocks(ir_ctx *ctx)
{
  uint32_t b, *p, i;
  uint32_t unreachable_count = 0;
  uint32_t bb_count = ctx->cfg_blocks_count;
  ir_block *bb = ctx->cfg_blocks + 1;

  for (b = 1; b <= bb_count; b++, bb++) {
    if (bb->flags & IR_BB_UNREACHABLE) {
#if 0
      do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while(0);
#endif
      if (bb->successors_count) {
        for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
          ir_block *succ_bb = &ctx->cfg_blocks[*p];

          if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
            ir_remove_predecessor(ctx, succ_bb, b);
            ir_remove_merge_input(ctx, succ_bb->start, bb->end);
          }
        }
      } else {
        ir_ref prev, ref = bb->end;
        ir_insn *insn = &ctx->ir_base[ref];

        IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
        /* remove from the terminators list */
        prev = ctx->ir_base[1].op1;
        if (prev == ref) {
          ctx->ir_base[1].op1 = insn->op3;
        } else {
          while (prev) {
            if (ctx->ir_base[prev].op3 == ref) {
              ctx->ir_base[prev].op3 = insn->op3;
              break;
            }
            prev = ctx->ir_base[prev].op3;
          }
        }
      }
      ctx->cfg_map[bb->start] = 0;
      ctx->cfg_map[bb->end] = 0;
      unreachable_count++;
    }
  }

  if (unreachable_count) {
    ir_block *dst_bb;
    uint32_t n = 1;
    uint32_t *edges;

    dst_bb = bb = ctx->cfg_blocks + 1;
    for (b = 1; b <= bb_count; b++, bb++) {
      if (!(bb->flags & IR_BB_UNREACHABLE)) {
        if (dst_bb != bb) {
          memcpy(dst_bb, bb, sizeof(ir_block));
          ctx->cfg_map[dst_bb->start] = n;
          ctx->cfg_map[dst_bb->end] = n;
        }
        dst_bb->successors_count = 0;
        dst_bb++;
        n++;
      }
    }
    ctx->cfg_blocks_count = bb_count = n - 1;

    /* Rebuild successor/predecessor control edges */
    edges = ctx->cfg_edges;
    bb = ctx->cfg_blocks + 1;
    for (b = 1; b <= bb_count; b++, bb++) {
      ir_insn *insn = &ctx->ir_base[bb->start];
      ir_ref *p, ref;

      n = bb->predecessors_count;
      if (n > 1) {
        uint32_t *q = edges + bb->predecessors;

        IR_ASSERT(n == insn->inputs_count);
        for (p = insn->ops + 1; n > 0; p++, q++, n--) {
          ref = *p;
          IR_ASSERT(ref);
          ir_ref pred_b = ctx->cfg_map[ref];
          ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
          *q = pred_b;
          edges[pred_bb->successors + pred_bb->successors_count++] = b;
        }
      } else if (n == 1) {
        ref = insn->op1;
        IR_ASSERT(ref);
        IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
        ir_ref pred_b = ctx->cfg_map[ref];
        ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
        edges[bb->predecessors] = pred_b;
        edges[pred_bb->successors + pred_bb->successors_count++] = b;
      }
    }
  }

  return 1;
}

static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
{
  uint32_t i, *p;
  ir_block *bb = &ctx->cfg_blocks[b];

  if (bb->postnum != 0) {
    return;
  }

  if (bb->successors_count) {
    bb->postnum = -1; /* Marker for "currently visiting" */
    p = ctx->cfg_edges + bb->successors;
    i = bb->successors_count;
    do {
      compute_postnum(ctx, cur, *p);
      p++;
    } while (--i);
  }
  bb->postnum = (*cur)++;
}

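/* compute_postnum() assigns DFS post-order numbers: every block is numbered
 * after all of its DFS descendants. The slow dominator builder below relies
 * on these numbers for its intersection walks. */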
/* Computes the dominator tree using the algorithm from "A Simple, Fast Dominance Algorithm"
 * by Cooper, Harvey and Kennedy. */
static IR_NEVER_INLINE int ir_build_dominators_tree_slow(ir_ctx *ctx)
{
  uint32_t blocks_count, b, postnum;
  ir_block *blocks, *bb;
  uint32_t *edges;
  bool changed;

  blocks = ctx->cfg_blocks;
  edges  = ctx->cfg_edges;
  blocks_count = ctx->cfg_blocks_count;

  /* Clear the dominators tree */
  for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
    bb->idom = 0;
    bb->dom_depth = 0;
    bb->dom_child = 0;
    bb->dom_next_child = 0;
  }

  ctx->flags2 &= ~IR_NO_LOOPS;

  postnum = 1;
  compute_postnum(ctx, &postnum, 1);

  /* Find immediate dominators by an iterative fixed-point algorithm */
  blocks[1].idom = 1;
  do {
    changed = 0;
    /* Iterating in Reverse Post Order */
    for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
      IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
      IR_ASSERT(bb->predecessors_count > 0);
      if (bb->predecessors_count == 1) {
        uint32_t pred_b = edges[bb->predecessors];

        if (blocks[pred_b].idom > 0 && bb->idom != pred_b) {
          bb->idom = pred_b;
          changed = 1;
        }
      } else if (bb->predecessors_count) {
        uint32_t idom = 0;
        uint32_t k = bb->predecessors_count;
        uint32_t *p = edges + bb->predecessors;

        do {
          uint32_t pred_b = *p;
          ir_block *pred_bb = &blocks[pred_b];
          ir_block *idom_bb;

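          /* Intersect the current idom candidate with each already-processed
           * predecessor by climbing both idom chains until they meet; the
           * post-order numbers decide which side must climb. This is the
           * classic "intersect" step of the Cooper-Harvey-Kennedy algorithm. */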
          if (pred_bb->idom > 0) {
            idom = pred_b;
            idom_bb = &blocks[idom];

            while (--k > 0) {
              pred_b = *(++p);
              pred_bb = &blocks[pred_b];
              if (pred_bb->idom > 0) {
                while (idom != pred_b) {
                  while (pred_bb->postnum < idom_bb->postnum) {
                    pred_b = pred_bb->idom;
                    pred_bb = &blocks[pred_b];
                  }
                  while (idom_bb->postnum < pred_bb->postnum) {
                    idom = idom_bb->idom;
                    idom_bb = &blocks[idom];
                  }
                }
              }
            }

            if (bb->idom != idom) {
              bb->idom = idom;
              changed = 1;
            }
            break;
          }
          p++;
        } while (--k > 0);
      }
    }
  } while (changed);

  /* Build the dominators tree */
  blocks[1].idom = 0;
  blocks[1].dom_depth = 0;

  /* Construct children lists sorted by block number */
  for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
    ir_block *idom_bb = &blocks[bb->idom];
    bb->dom_depth = 0;
    bb->dom_next_child = idom_bb->dom_child;
    idom_bb->dom_child = b;
  }

  /* Recalculate dom_depth for all blocks */
  for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
    uint32_t idom = bb->idom;
    uint32_t dom_depth = 0;
    while (idom) {
      dom_depth++;
      if (blocks[idom].dom_depth > 0) {
        dom_depth += blocks[idom].dom_depth;
        break;
      } else {
        idom = blocks[idom].idom;
      }
    }
    bb->dom_depth = dom_depth;
  }

  return 1;
}

/* A single-pass modification of "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy, that relies on IR block ordering.
 * It may fall back to the general slow fixed-point algorithm. */
static int ir_build_dominators_tree_iterative(ir_ctx *ctx);
int ir_build_dominators_tree(ir_ctx *ctx)
{
  uint32_t blocks_count, b;
  ir_block *blocks, *bb;
  uint32_t *edges;
  ir_list worklist;

  ir_list_init(&worklist, ctx->cfg_blocks_count / 2);

  ctx->flags2 |= IR_NO_LOOPS;

  /* Find immediate dominators */
  blocks = ctx->cfg_blocks;
  edges  = ctx->cfg_edges;
  blocks_count = ctx->cfg_blocks_count;
  blocks[1].idom = 1;
  blocks[1].dom_depth = 0;

  /* Iterating in Reverse Post Order */
  for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
    IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
    IR_ASSERT(bb->predecessors_count > 0);
    uint32_t k = bb->predecessors_count;
    uint32_t *p = edges + bb->predecessors;
    uint32_t idom = *p;
    ir_block *idom_bb;

    if (UNEXPECTED(idom >= b)) {
      /* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
      ctx->flags2 &= ~IR_NO_LOOPS;
//      IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor");
      if (UNEXPECTED(k <= 1)) {
slow_case:
        ir_list_free(&worklist);
        return ir_build_dominators_tree_slow(ctx);
      }
      ir_list_push(&worklist, idom);
      while (1) {
        k--;
        p++;
        idom = *p;
        if (idom < b) {
          break;
        }
        if (UNEXPECTED(k == 0)) {
          goto slow_case;
        }
        ir_list_push(&worklist, idom);
      }
    }
    IR_ASSERT(blocks[idom].idom > 0);
    IR_ASSERT(k != 0);

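    /* Blocks are numbered in RPO, so a greater block id means "later in RPO"
     * and the intersection walk below can compare block ids directly instead
     * of separately computed post-order numbers. */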
    while (--k > 0) {
      uint32_t pred_b = *(++p);

      if (pred_b < b) {
        IR_ASSERT(blocks[pred_b].idom > 0);
        while (idom != pred_b) {
          while (pred_b > idom) {
            pred_b = blocks[pred_b].idom;
          }
          while (idom > pred_b) {
            idom = blocks[idom].idom;
          }
        }
      } else {
        ctx->flags2 &= ~IR_NO_LOOPS;
        IR_ASSERT(bb->predecessors_count > 1);
        ir_list_push(&worklist, pred_b);
      }
    }
    bb->idom = idom;
    idom_bb = &blocks[idom];
    bb->dom_depth = idom_bb->dom_depth + 1;
  }

  /* Construct children lists sorted by block number */
  for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
    ir_block *idom_bb = &blocks[bb->idom];
    bb->dom_next_child = idom_bb->dom_child;
    idom_bb->dom_child = b;
  }

  blocks[1].idom = 0;

  if (ir_list_len(&worklist) != 0) {
    uint32_t dom_depth;
    uint32_t succ_b;
    bool complete = 1;

    /* Check if all the back-edges lead to loop headers */
    do {
      b = ir_list_pop(&worklist);
      bb = &blocks[b];
      succ_b = ctx->cfg_edges[bb->successors];
      if (bb->successors_count != 1) {
        /* LOOP_END/END may be linked with the following ENTRY by a fake edge */
        if (bb->successors_count != 2) {
          complete = 0;
          break;
        } else if (blocks[succ_b].flags & IR_BB_ENTRY) {
          succ_b = ctx->cfg_edges[bb->successors + 1];
        } else if (!(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY)) {
          complete = 0;
          break;
        }
      }
      dom_depth = blocks[succ_b].dom_depth;
      while (bb->dom_depth > dom_depth) {
        b = bb->dom_parent;
        bb = &blocks[b];
      }
      if (UNEXPECTED(b != succ_b)) {
        complete = 0;
        break;
      }
    } while (ir_list_len(&worklist) != 0);

    if (UNEXPECTED(!complete)) {
      ir_list_free(&worklist);
      return ir_build_dominators_tree_iterative(ctx);
    }
  }

  ir_list_free(&worklist);

  return 1;
}

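/* Fallback for the rare case when the single-pass construction above sees a
 * back-edge that does not lead to a loop header: redo the fixed-point
 * iteration, keeping the dominators found so far as a starting point. */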
static int ir_build_dominators_tree_iterative(ir_ctx *ctx)
{
  bool changed;
  uint32_t blocks_count, b;
  ir_block *blocks, *bb;
  uint32_t *edges;

  /* Find immediate dominators */
  blocks = ctx->cfg_blocks;
  edges  = ctx->cfg_edges;
  blocks_count = ctx->cfg_blocks_count;

  /* Clear the dominators tree, but keep already found dominators */
  for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
    bb->dom_depth = 0;
    bb->dom_child = 0;
    bb->dom_next_child = 0;
  }

  /* Find immediate dominators by an iterative fixed-point algorithm */
  blocks[1].idom = 1;
  do {
    changed = 0;

    for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
      IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
      IR_ASSERT(bb->predecessors_count > 0);
      uint32_t k = bb->predecessors_count;
      uint32_t *p = edges + bb->predecessors;
      uint32_t idom = *p;

      if (blocks[idom].idom == 0) {
        while (1) {
          k--;
          p++;
          idom = *p;
          if (blocks[idom].idom > 0) {
            break;
          }
          IR_ASSERT(k > 0);
        }
      }
      IR_ASSERT(k != 0);
      while (--k > 0) {
        uint32_t pred_b = *(++p);

        if (blocks[pred_b].idom > 0) {
          IR_ASSERT(blocks[pred_b].idom > 0);
          while (idom != pred_b) {
            while (pred_b > idom) {
              pred_b = blocks[pred_b].idom;
            }
            while (idom > pred_b) {
              idom = blocks[idom].idom;
            }
          }
        }
      }
      if (bb->idom != idom) {
        bb->idom = idom;
        changed = 1;
      }
    }
  } while (changed);

  /* Build the dominators tree */
  blocks[1].idom = 0;
  blocks[1].dom_depth = 0;
  for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
    uint32_t idom = bb->idom;
    ir_block *idom_bb = &blocks[idom];

    bb->dom_depth = idom_bb->dom_depth + 1;
  }

  /* Construct children lists sorted by block number */
  for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
    ir_block *idom_bb = &blocks[bb->idom];
    bb->dom_next_child = idom_bb->dom_child;
    idom_bb->dom_child = b;
  }

  return 1;
}

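/* ir_dominates() climbs the dominator tree from b2 up to b1's depth;
 * b1 dominates b2 exactly when that walk ends on b1 itself. */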
static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
{
  uint32_t b1_depth = blocks[b1].dom_depth;
  const ir_block *bb2 = &blocks[b2];

  while (bb2->dom_depth > b1_depth) {
    b2 = bb2->dom_parent;
    bb2 = &blocks[b2];
  }
  return b1 == b2;
}

int ir_find_loops(ir_ctx *ctx)
{
  uint32_t b, j, n, count;
  uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
  ir_block *blocks = ctx->cfg_blocks;
  uint32_t *edges = ctx->cfg_edges;
  ir_worklist work;

  if (ctx->flags2 & IR_NO_LOOPS) {
    return 1;
  }

  /* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
   * queries. These are implemented by checking entry/exit times of the DFS search. */
  ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
  entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
  exit_times = entry_times + ctx->cfg_blocks_count + 1;
  sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;

  memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));

  ir_worklist_push(&work, 1);
  while (ir_worklist_len(&work)) {
    ir_block *bb;
    int child;

next:
    b = ir_worklist_peek(&work);
    if (!entry_times[b]) {
      entry_times[b] = time++;
    }

    /* Visit blocks immediately dominated by "b". */
    bb = &blocks[b];
    for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
      if (ir_worklist_push(&work, child)) {
        goto next;
      }
    }

    /* Visit join edges. */
    if (bb->successors_count) {
      uint32_t *p = edges + bb->successors;
      for (j = 0; j < bb->successors_count; j++, p++) {
        uint32_t succ = *p;

        if (blocks[succ].idom == b) {
          continue;
        } else if (ir_worklist_push(&work, succ)) {
          goto next;
        }
      }
    }
    exit_times[b] = time++;
    ir_worklist_pop(&work);
  }

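  /* With these timestamps, "pred is a DFS descendant of b" reduces to
   * entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b],
   * i.e. pred's DFS interval nests inside b's. */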
  /* Sort blocks by level, which is the opposite of the order in which we want to process them */
  sorted_blocks[1] = 1;
  j = 1;
  n = 2;
  while (j != n) {
    uint32_t i = j;
    j = n;
    for (; i < j; i++) {
      int child;
      for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
        sorted_blocks[n++] = child;
      }
    }
  }
  count = n;

  /* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". */
  uint32_t prev_dom_depth = blocks[sorted_blocks[n - 1]].dom_depth;
  uint32_t prev_irreducible = 0;
  while (n > 1) {
    b = sorted_blocks[--n];
    ir_block *bb = &blocks[b];

    IR_ASSERT(bb->dom_depth <= prev_dom_depth);
    if (UNEXPECTED(prev_irreducible) && bb->dom_depth != prev_dom_depth) {
      /* process delayed irreducible loops */
      do {
        b = sorted_blocks[prev_irreducible];
        bb = &blocks[b];
        if ((bb->flags & IR_BB_IRREDUCIBLE_LOOP) && !bb->loop_depth) {
          /* process an irreducible loop */
          uint32_t hdr = b;

          bb->loop_depth = 1;
          if (ctx->ir_base[bb->start].op == IR_MERGE) {
            ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
          }

          /* find the closing edge(s) of the irreducible loop */
          IR_ASSERT(bb->predecessors_count > 1);
          uint32_t *p = &edges[bb->predecessors];
          j = bb->predecessors_count;
          do {
            uint32_t pred = *p;

            if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) {
              if (!ir_worklist_len(&work)) {
                ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
              }
              blocks[pred].loop_header = 0; /* support for merged loops */
              ir_worklist_push(&work, pred);
            }
            p++;
          } while (--j);
          if (ir_worklist_len(&work) == 0) continue;

          /* collect members of the irreducible loop */
          while (ir_worklist_len(&work)) {
            b = ir_worklist_pop(&work);
            if (b != hdr) {
              ir_block *bb = &blocks[b];
              bb->loop_header = hdr;
              if (bb->predecessors_count) {
                uint32_t *p = &edges[bb->predecessors];
                uint32_t n = bb->predecessors_count;
                do {
                  uint32_t pred = *p;
                  while (blocks[pred].loop_header > 0) {
                    pred = blocks[pred].loop_header;
                  }
                  if (pred != hdr) {
                    if (entry_times[pred] > entry_times[hdr] && exit_times[pred] < exit_times[hdr]) {
                      /* "pred" is a descendant of "hdr" */
                      ir_worklist_push(&work, pred);
                    } else {
                      /* another entry to the irreducible loop */
                      bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
                      if (ctx->ir_base[bb->start].op == IR_MERGE) {
                        ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
                      }
                    }
                  }
                  p++;
                } while (--n);
              }
            }
          }
        }
      } while (--prev_irreducible != n);
      prev_irreducible = 0;
      b = sorted_blocks[n];
      bb = &blocks[b];
    }

    if (bb->predecessors_count > 1) {
      bool irreducible = 0;
      uint32_t *p = &edges[bb->predecessors];

      j = bb->predecessors_count;
      do {
        uint32_t pred = *p;

        /* A join edge is one for which the predecessor does not
           immediately dominate the successor.  */
        if (bb->idom != pred) {
          /* In a loop back-edge (back-join edge), the successor dominates
             the predecessor.  */
          if (ir_dominates(blocks, b, pred)) {
            if (!ir_worklist_len(&work)) {
              ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
            }
            blocks[pred].loop_header = 0; /* support for merged loops */
            ir_worklist_push(&work, pred);
          } else {
            /* Otherwise it's a cross-join edge.  See if it's a branch
               to an ancestor on the DJ spanning tree.  */
            if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) {
              irreducible = 1;
              break;
            }
          }
        }
        p++;
      } while (--j);

      if (UNEXPECTED(irreducible)) {
        bb->flags |= IR_BB_LOOP_HEADER | IR_BB_IRREDUCIBLE_LOOP;
        ctx->flags2 |= IR_CFG_HAS_LOOPS | IR_IRREDUCIBLE_CFG;
        /* Remember the position of the first irreducible loop to process all the irreducible loops
         * after the reducible loops with the same dominator tree depth
         */
        if (!prev_irreducible) {
          prev_irreducible = n;
          prev_dom_depth = bb->dom_depth;
        }
        ir_list_clear(&work.l);
      } else if (ir_worklist_len(&work)) {
        /* collect members of the reducible loop */
        uint32_t hdr = b;

        bb->flags |= IR_BB_LOOP_HEADER;
        ctx->flags2 |= IR_CFG_HAS_LOOPS;
        bb->loop_depth = 1;
        if (ctx->ir_base[bb->start].op == IR_MERGE) {
          ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
        }
        while (ir_worklist_len(&work)) {
          b = ir_worklist_pop(&work);
          if (b != hdr) {
            ir_block *bb = &blocks[b];
            bb->loop_header = hdr;
            if (bb->predecessors_count) {
              uint32_t *p = &edges[bb->predecessors];
              uint32_t n = bb->predecessors_count;
              do {
                uint32_t pred = *p;
                while (blocks[pred].loop_header > 0) {
                  pred = blocks[pred].loop_header;
                }
                if (pred != hdr) {
                  ir_worklist_push(&work, pred);
                }
                p++;
              } while (--n);
            }
          }
        }
      }
    }
  }
  IR_ASSERT(!prev_irreducible);

  if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
    for (n = 1; n < count; n++) {
      b = sorted_blocks[n];
      ir_block *bb = &blocks[b];
      if (bb->loop_header > 0) {
        ir_block *loop = &blocks[bb->loop_header];
        uint32_t loop_depth = loop->loop_depth;

        if (bb->flags & IR_BB_LOOP_HEADER) {
          loop_depth++;
        }
        bb->loop_depth = loop_depth;
        if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
          loop->flags |= IR_BB_LOOP_WITH_ENTRY;
          if (loop_depth > 1) {
            /* Set the IR_BB_LOOP_WITH_ENTRY flag for all the enclosing loops */
            bb = &blocks[loop->loop_header];
            while (1) {
              if (bb->flags & IR_BB_LOOP_WITH_ENTRY) {
                break;
              }
              bb->flags |= IR_BB_LOOP_WITH_ENTRY;
              if (bb->loop_depth == 1) {
                break;
              }
              bb = &blocks[loop->loop_header];
            }
          }
        }
      }
    }
  }

  ir_mem_free(entry_times);
  ir_worklist_free(&work);

  return 1;
}

static uint32_t _ir_skip_empty_blocks(const ir_ctx *ctx, uint32_t b)
{
  while (1) {
    ir_block *bb = &ctx->cfg_blocks[b];

    if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
      IR_ASSERT(bb->successors_count == 1);
      b = ctx->cfg_edges[bb->successors];
    } else {
      return b;
    }
  }
}

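/* Used by the scheduler below: empty blocks are moved to the end of the
 * schedule, so recorded edges point at the first non-empty successor. */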
/* A variation of the "Bottom-up Positioning" algorithm described by
 * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
 */
typedef struct _ir_edge_info {
  uint32_t from;
  uint32_t to;
  float    freq;
} ir_edge_info;

typedef struct _ir_chain {
  uint32_t head;
  uint32_t next;
  union {
    uint32_t prev;
    uint32_t tail;
  };
} ir_chain;

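/* Chains are circular doubly-linked lists of block ids: "next"/"prev" link
 * the members, the head entry reuses "prev" as "tail", and "head" points
 * (possibly transitively, see the path compression below) at the chain head. */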
static int ir_edge_info_cmp(const void *b1, const void *b2)
{
  ir_edge_info *e1 = (ir_edge_info*)b1;
  ir_edge_info *e2 = (ir_edge_info*)b2;

  if (e1->freq != e2->freq) {
    return e1->freq < e2->freq ? 1 : -1;
  }
  /* In case of equal frequencies, try to avoid penalizing one of the "equal" paths by
   * preferring the first RPO successor (in conditional branches) and the last RPO predecessor
   * (in merge points).
   *
   * See "Static Basic Block Reordering Heuristics for Implicit Control Flow in Baseline JITs"
   * by Polito Guillermo, Ducasse Stephane, and Tesone Pablo (2021)
   */
  if (e1->from != e2->from) {
    return e2->from - e1->from;
  } else {
    return e1->to - e2->to;
  }
}

static IR_NEVER_INLINE uint32_t ir_chain_head_path_compress(ir_chain *chains, uint32_t src, uint32_t head)
{
  IR_ASSERT(head != 0);
  do {
    head = chains[head].head;
  } while (chains[head].head != head);
  do {
    uint32_t next = chains[src].head;
    chains[src].head = head;
    src = next;
  } while (chains[src].head != head);
  return head;
}

IR_ALWAYS_INLINE uint32_t ir_chain_head(ir_chain *chains, uint32_t src)
{
  uint32_t head = chains[src].head;
  if (chains[head].head == head) {
    return head;
  } else {
    return ir_chain_head_path_compress(chains, src, head);
  }
}

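/* ir_chain_head() and its helper behave like the "find" half of a union-find
 * structure: follow "head" links to the representative, then rewrite the
 * links along the visited path to point at it directly. */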
static void ir_join_chains(ir_chain *chains, uint32_t src, uint32_t dst)
{
  uint32_t dst_tail = chains[dst].tail;
  uint32_t src_tail = chains[src].tail;

  chains[dst_tail].next = src;
  chains[dst].prev = src_tail;
  chains[src_tail].next = dst;
  chains[src].tail = dst_tail;
  chains[dst].head = src;
}

static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before)
{
  ir_chain *this = &chains[c];
  ir_chain *next = &chains[before];

  IR_ASSERT(chains[c].head == c);
  if (chains[before].head != before) {
    this->head = next->head;
  } else {
    next->head = c;
  }
  this->next = before;
  this->prev = next->prev;
  next->prev = c;
  chains[this->prev].next = c;
}

#ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
# ifdef IR_DEBUG
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 1
# else
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 0
# endif
#endif

#if IR_DEBUG_BB_SCHEDULE_GRAPH
static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_count, ir_edge_info *edges, ir_chain *chains)
{
  uint32_t b, i;
  ir_block *bb;
  uint8_t c, *colors;
  bool is_head, is_empty;
  uint32_t max_colors = 12;

  colors = alloca(sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
  memset(colors, 0, sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
  i = 0;
  for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
    if (chains[b].head == b) {
      colors[b] = (i % max_colors) + 1;
      i++;
    }
  }

  fprintf(stderr, "digraph {\n");
  fprintf(stderr, "\trankdir=TB;\n");
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
    bb = &ctx->cfg_blocks[b];
    c = (chains[b].head) ? colors[ir_chain_head(chains, b)] : 0;
    is_head = chains[b].head == b;
    is_empty = (bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY;
    if (c) {
      fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\",colorscheme=set312,fillcolor=%d%s%s]\n", b, b,
        bb->loop_depth, bb_freq[b],
        ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
        c,
        is_head ? ",penwidth=3" : "",
        is_empty ? ",style=\"dotted,filled\"" : ",style=\"filled\"");
    } else {
      fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\"%s%s]\n", b, b,
        bb->loop_depth, bb_freq[b],
        ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
        is_head ? ",penwidth=3" : "",
        is_empty ? ",style=\"dotted\"" : "");
    }
  }
  fprintf(stderr, "\n");

  for (i = 0; i < edges_count; i++) {
    fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
  }
  fprintf(stderr, "}\n");
}
#endif

#ifdef IR_DEBUG
static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
{
  uint32_t i;

  fprintf(stderr, "Edges:\n");
  for (i = 0; i < edges_count; i++) {
    fprintf(stderr, "\tBB%d -> BB%d %0.3f\n", edges[i].from, edges[i].to, edges[i].freq);
  }
}

static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
{
  uint32_t b, tail, i;

  fprintf(stderr, "Chains:\n");
  for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
    if (chains[b].head == b) {
      tail = chains[b].tail;
      i = b;
      fprintf(stderr, "(BB%d", i);
      while (i != tail) {
        i = chains[i].next;
        fprintf(stderr, ", BB%d", i);
      }
      fprintf(stderr, ")\n");
    }
  }
}
#endif

static int ir_schedule_blocks_bottom_up(ir_ctx *ctx)
{
  uint32_t max_edges_count = ctx->cfg_edges_count / 2;
  uint32_t edges_count = 0;
  uint32_t b, i, loop_depth;
  float *bb_freq, freq;
  ir_block *bb;
  ir_edge_info *edges, *e;
  ir_chain *chains;
  ir_bitqueue worklist;
  ir_bitset visited;
  uint32_t *schedule_end, count;

  ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
  schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count;

  /* 1. Create initial chains for each BB */
  chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1));
  chains[0].head = 0;
  chains[0].next = 0;
  chains[0].prev = 0;
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
    chains[b].head = b;
    chains[b].next = b;
    chains[b].prev = b;
  }

  /* 2. Collect information about BB and EDGE frequencies */
  edges = ir_mem_malloc(sizeof(ir_edge_info) * max_edges_count);
  bb_freq = ir_mem_calloc(ctx->cfg_blocks_count + 1, sizeof(float));
  bb_freq[1] = 1.0f;
  visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
  ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
  ir_bitqueue_add(&worklist, 1);
  while ((b = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
restart:
    bb = &ctx->cfg_blocks[b];
    if (bb->predecessors_count) {
      uint32_t n = bb->predecessors_count;
      uint32_t *p = ctx->cfg_edges + bb->predecessors;
      for (; n > 0; p++, n--) {
        uint32_t predecessor = *p;
        /* Basic blocks are ordered so that the usual predecessor ids are less than successor ids.
         * So we may compare block ids (predecessor < b) instead of the more expensive back-edge check
         * (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b)
         */
        if (predecessor < b) {
          if (!ir_bitset_in(visited, predecessor)) {
            b = predecessor;
            ir_bitqueue_del(&worklist, b);
            goto restart;
          }
        } else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) {
          /* not a loop back-edge */
          IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b);
        }
      }
    }

    ir_bitset_incl(visited, b);

    if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
      uint32_t successor = ctx->cfg_edges[bb->successors];

      /* move empty blocks to the end */
      IR_ASSERT(chains[b].head == b);
      chains[b].head = 0;
      *schedule_end = b;
      schedule_end--;

      if (successor > b) {
        bb_freq[successor] += bb_freq[b];
        b = successor;
        ir_bitqueue_del(&worklist, b);
        goto restart;
      } else {
        continue;
      }
    }

    loop_depth = bb->loop_depth;
    if (bb->flags & IR_BB_LOOP_HEADER) {
      // TODO: Estimate the loop iteration count
      bb_freq[b] *= 10;
    }

    if (bb->successors_count) {
      uint32_t n = bb->successors_count;
      uint32_t *p = ctx->cfg_edges + bb->successors;

      if (n == 1) {
        uint32_t successor = *p;

        IR_ASSERT(edges_count < max_edges_count);
        freq = bb_freq[b];
        if (successor > b) {
          IR_ASSERT(!ir_bitset_in(visited, successor));
          bb_freq[successor] += freq;
          ir_bitqueue_add(&worklist, successor);
        }
        successor = _ir_skip_empty_blocks(ctx, successor);
        edges[edges_count].from = b;
        edges[edges_count].to = successor;
        edges[edges_count].freq = freq;
        edges_count++;
      } else if (n == 2 && ctx->ir_base[bb->end].op == IR_IF) {
        uint32_t successor1 = *p;
        ir_block *successor1_bb = &ctx->cfg_blocks[successor1];
        ir_insn *insn1 = &ctx->ir_base[successor1_bb->start];
        uint32_t successor2 = *(p + 1);
        ir_block *successor2_bb = &ctx->cfg_blocks[successor2];
        ir_insn *insn2 = &ctx->ir_base[successor2_bb->start];
        int prob1, prob2, probN = 100;

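        /* Derive branch probabilities: explicit probability hints in op2 (in
         * percent) win; otherwise guess 90/10 for the edge that stays in the
         * current loop vs. the one that leaves it, 51/49 against an empty
         * successor, and 50/50 by default. */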
        if (insn1->op2) {
1531
0
          prob1 = insn1->op2;
1532
0
          if (insn2->op2) {
1533
0
            prob2 = insn2->op2;
1534
0
            probN = prob1 + prob2;
1535
0
          } else {
1536
0
            if (prob1 > 100) {
1537
0
              prob1 = 100;
1538
0
            }
1539
0
            prob2 = 100 - prob1;
1540
0
          }
1541
1542
0
        } else if (insn2->op2) {
1543
0
          prob2 = insn2->op2;
1544
0
          if (prob2 > 100) {
1545
0
            prob2 = 100;
1546
0
          }
1547
0
          prob1 = 100 - prob2;
1548
0
        } else if (successor1_bb->loop_depth >= loop_depth
1549
0
            && successor2_bb->loop_depth < loop_depth) {
1550
0
          prob1 = 90;
1551
0
          prob2 = 10;
1552
0
        } else if (successor1_bb->loop_depth < loop_depth
1553
0
            && successor2_bb->loop_depth >= loop_depth) {
1554
0
          prob1 = 10;
1555
0
          prob2 = 90;
1556
0
        } else if (successor2_bb->flags & IR_BB_EMPTY) {
1557
0
          prob1 = 51;
1558
0
          prob2 = 49;
1559
0
        } else if (successor1_bb->flags & IR_BB_EMPTY) {
1560
0
          prob1 = 49;
1561
0
          prob2 = 51;
1562
0
        } else {
1563
0
          prob1 = prob2 = 50;
1564
0
        }
1565
0
        do {
1566
0
          freq = bb_freq[b] * (float)prob1 / (float)probN;
1567
0
          if (successor1 > b) {
1568
0
            IR_ASSERT(!ir_bitset_in(visited, successor1));
1569
0
            bb_freq[successor1] += freq;
1570
0
            if (successor1_bb->successors_count == 0 && insn1->op2 == 1) {
1571
              /* move cold block without successors to the end */
1572
0
              IR_ASSERT(chains[successor1].head == successor1);
1573
0
              chains[successor1].head = 0;
1574
0
              *schedule_end = successor1;
1575
0
              schedule_end--;
1576
0
              break;
1577
0
            } else {
1578
0
              ir_bitqueue_add(&worklist, successor1);
1579
0
            }
1580
0
          }
1581
          /* try to join edges early to reduce number of edges and the cost of their sorting */
1582
0
          if (prob1 > prob2
1583
0
           && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1584
0
            uint32_t src = chains[b].next;
1585
0
            IR_ASSERT(chains[src].head == src);
1586
0
            IR_ASSERT(src == ir_chain_head(chains, b));
1587
0
            IR_ASSERT(chains[successor1].head == successor1);
1588
0
            ir_join_chains(chains, src, successor1);
1589
0
            if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1590
0
          }
1591
0
          successor1 = _ir_skip_empty_blocks(ctx, successor1);
1592
0
          IR_ASSERT(edges_count < max_edges_count);
1593
0
          edges[edges_count].from = b;
1594
0
          edges[edges_count].to = successor1;
1595
0
          edges[edges_count].freq = freq;
1596
0
          edges_count++;
1597
0
        } while (0);
1598
0
        do {
1599
0
          freq = bb_freq[b] * (float)prob2 / (float)probN;
1600
0
          if (successor2 > b) {
1601
0
            IR_ASSERT(!ir_bitset_in(visited, successor2));
1602
0
            bb_freq[successor2] += freq;
1603
0
            if (successor2_bb->successors_count == 0 && insn2->op2 == 1) {
1604
              /* move cold block without successors to the end */
1605
0
              IR_ASSERT(chains[successor2].head == successor2);
1606
0
              chains[successor2].head = 0;
1607
0
              *schedule_end = successor2;
1608
0
              schedule_end--;
1609
0
              break;
1610
0
            } else {
1611
0
              ir_bitqueue_add(&worklist, successor2);
1612
0
            }
1613
0
          }
1614
0
          if (prob2 > prob1
1615
0
           && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1616
0
            uint32_t src = chains[b].next;
1617
0
            IR_ASSERT(chains[src].head == src);
1618
0
            IR_ASSERT(src == ir_chain_head(chains, b));
1619
0
            IR_ASSERT(chains[successor2].head == successor2);
1620
0
            ir_join_chains(chains, src, successor2);
1621
0
            if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1622
0
          }
1623
0
          successor2 = _ir_skip_empty_blocks(ctx, successor2);
1624
0
          IR_ASSERT(edges_count < max_edges_count);
1625
0
          edges[edges_count].from = b;
1626
0
          edges[edges_count].to = successor2;
1627
0
          edges[edges_count].freq = freq;
1628
0
          edges_count++;
1629
0
        } while (0);
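        /* Both do { ... } while (0) blocks above perform the same
         * per-successor work: propagate the frequency estimate along
         * forward edges, park a cold exit block at the end of the
         * schedule, eagerly join the chain of the more probable
         * successor (unless it is a plain empty block that will be
         * skipped anyway), and record an edge to the first non-empty
         * target for the sort in step 2. */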
1630
0
      } else {
1631
0
        int prob;
1632
1633
0
        for (; n > 0; p++, n--) {
1634
0
          uint32_t successor = *p;
1635
0
          ir_block *successor_bb = &ctx->cfg_blocks[successor];
1636
0
          ir_insn *insn = &ctx->ir_base[successor_bb->start];
1637
1638
0
          if (insn->op == IR_CASE_DEFAULT) {
1639
0
            prob = insn->op2;
1640
0
            if (!prob) {
1641
0
              prob = 100 / bb->successors_count;
1642
0
            }
1643
0
          } else if (insn->op == IR_CASE_VAL) {
1644
0
            prob = insn->op3;
1645
0
            if (!prob) {
1646
0
              prob = 100 / bb->successors_count;
1647
0
            }
1648
0
          } else if (insn->op == IR_ENTRY) {
1649
0
            if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1650
0
              prob = 99; /* prefer empty ENTRY block to go first */
1651
0
            } else {
1652
0
              prob = 1;
1653
0
            }
1654
0
          } else {
1655
0
            prob = 100 / bb->successors_count;
1656
0
          }
1657
0
          freq = bb_freq[b] * (float)prob / 100.0f;
1658
0
          if (successor > b) {
1659
0
            IR_ASSERT(!ir_bitset_in(visited, successor));
1660
0
            bb_freq[successor] += freq;
1661
0
            ir_bitqueue_add(&worklist, successor);
1662
0
          }
1663
0
          successor = _ir_skip_empty_blocks(ctx, successor);
1664
0
          IR_ASSERT(edges_count < max_edges_count);
1665
0
          edges[edges_count].from = b;
1666
0
          edges[edges_count].to = successor;
1667
0
          edges[edges_count].freq = freq;
1668
0
          edges_count++;
1669
0
        }
1670
0
      }
1671
0
    }
1672
0
  }
1673
0
  ir_bitqueue_free(&worklist);
1674
0
  ir_mem_free(visited);
1675
1676
  /* 2. Sort EDGEs according to their frequencies */
1677
0
  qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);
1678
1679
0
#ifdef IR_DEBUG
1680
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1681
0
    ir_dump_edges(ctx, edges_count, edges);
1682
0
  }
1683
0
#endif
1684
1685
  /* 3. Process EDGEs in the decreasing frequency order and join the connected chains */
1686
0
  for (e = edges, i = edges_count; i > 0; e++, i--) {
1687
0
    uint32_t dst = chains[e->to].head;
1688
0
    if (dst == e->to) {
1689
0
      uint32_t src = chains[e->from].next;
1690
0
      if (chains[src].head == src) {
1691
0
        IR_ASSERT(src == ir_chain_head(chains, e->from) && chains[src].tail == e->from);
1692
0
        if (src != dst) {
1693
0
          ir_join_chains(chains, src, dst);
1694
0
        } else if (ctx->cfg_blocks[e->from].successors_count < 2) {
1695
          /* Try to rotate loop chain to finish it with a conditional branch */
1696
0
          uint32_t tail = e->from;
1697
0
          uint32_t prev = src;
1698
0
          uint32_t next = chains[prev].next;
1699
0
          uint32_t best = 0;
1700
1701
0
          while (prev != tail) {
1702
0
            if (ctx->ir_base[ctx->cfg_blocks[prev].end].op == IR_IF) {
1703
0
              if (ctx->ir_base[ctx->cfg_blocks[prev].start].op == IR_LOOP_BEGIN
1704
0
               && ctx->cfg_blocks[prev].loop_depth > 1) {
1705
0
                best = next;
1706
0
                break;
1707
0
              } else if (!best || bb_freq[next] < bb_freq[best]) {
1708
              /* Find the IF successor with the lowest frequency */
1709
0
                best = next;
1710
0
              }
1711
0
            }
1712
0
            prev = next;
1713
0
            next = chains[next].next;
1714
0
          }
1715
0
          if (best) {
1716
            /* change the head of the chain */
1717
0
            chains[src].head = best;
1718
0
            chains[best].head = best;
1719
0
          }
1720
0
        }
1721
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
1722
        e->from = 0; /* reset "from" to avoid the check in step #6 */
1723
#endif
1724
0
      }
1725
0
    }
1726
0
  }
1727
1728
0
#if IR_DEBUG_BB_SCHEDULE_GRAPH
1729
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1730
0
    ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
1731
0
  }
1732
0
#endif
1733
1734
0
  ir_mem_free(bb_freq);
1735
1736
0
#ifdef IR_DEBUG
1737
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1738
0
    ir_dump_chains(ctx, chains);
1739
0
  }
1740
0
#endif
1741
1742
  /* 4. Merge empty entry blocks */
1743
0
  if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && ctx->entries_count) {
1744
0
    for (i = 0; i < ctx->entries_count; i++) {
1745
0
      b = ctx->entries[i];
1746
0
      IR_ASSERT(ctx->cfg_blocks[b].flags & IR_BB_ENTRY);
1747
0
      if (b && chains[b].head == b && chains[b].tail == b) {
1748
0
        bb = &ctx->cfg_blocks[b];
1749
0
        if (bb->flags & IR_BB_EMPTY) {
1750
0
          uint32_t successor;
1751
1752
0
          do {
1753
0
            IR_ASSERT(bb->successors_count == 1);
1754
0
            successor = ctx->cfg_edges[bb->successors];
1755
0
            bb = &ctx->cfg_blocks[successor];
1756
0
          } while ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY);
1757
0
          IR_ASSERT(chains[successor].head);
1758
0
          ir_insert_chain_before(chains, b, successor);
1759
0
        }
1760
0
      }
1761
0
    }
1762
1763
0
#ifdef IR_DEBUG
1764
0
    if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1765
0
      ir_dump_chains(ctx, chains);
1766
0
    }
1767
0
#endif
1768
0
  }
1769
1770
  /* 5. Align loop headers */
1771
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1772
0
    if (chains[b].head == b) {
1773
0
      bb = &ctx->cfg_blocks[b];
1774
0
      if (bb->loop_depth) {
1775
0
        if ((bb->flags & IR_BB_LOOP_HEADER) || ir_chain_head(chains, bb->loop_header) == b) {
1776
0
          bb->flags |= IR_BB_ALIGN_LOOP;
1777
0
        }
1778
0
      }
1779
0
    }
1780
0
  }
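  /* A chain head inside a loop gets IR_BB_ALIGN_LOOP when it is itself
   * a loop header, or when the header of its loop starts the same
   * chain, so the back end can align the entry of the hot loop body. */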
1781
1782
  /* 6. Group chains according to the most frequent edge between them */
1783
  // TODO: Try to find a better heuristic
1784
0
  for (e = edges, i = edges_count; i > 0; e++, i--) {
1785
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
1786
    if (!e->from) continue;
1787
#endif
1788
0
    uint32_t src = ir_chain_head(chains, e->from);
1789
0
    uint32_t dst = ir_chain_head(chains, e->to);
1790
0
    if (src != dst) {
1791
0
      if (dst == 1) {
1792
0
        ir_join_chains(chains, dst, src);
1793
0
      } else {
1794
0
        ir_join_chains(chains, src, dst);
1795
0
      }
1796
0
    }
1797
0
  }
1798
1799
0
#ifdef IR_DEBUG
1800
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1801
0
    ir_dump_chains(ctx, chains);
1802
0
  }
1803
0
#endif
1804
1805
  /* 7. Form a final BB order */
1806
0
  count = 0;
1807
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1808
0
    if (chains[b].head == b) {
1809
0
      uint32_t tail = chains[b].tail;
1810
0
      uint32_t i = b;
1811
0
      while (1) {
1812
0
        count++;
1813
0
        ctx->cfg_schedule[count] = i;
1814
0
        if (i == tail) break;
1815
0
        i = chains[i].next;
1816
0
      }
1817
0
    }
1818
0
  }
1819
1820
0
  IR_ASSERT(ctx->cfg_schedule + count == schedule_end);
1821
0
  ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0;
1822
1823
0
  ir_mem_free(edges);
1824
0
  ir_mem_free(chains);
1825
1826
0
  return 1;
1827
0
}
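/* The scheduler above leans on a small chain structure (ir_chain,
 * ir_chain_head(), ir_join_chains()).  Below is a minimal sketch of how
 * such a structure could look, assuming circular next links (a tail
 * points back to its head) and lazily-updated head links with path
 * compression; the real IR definitions may differ. */

#include <stdint.h>

typedef struct sketch_chain {
	uint32_t head; /* chain head; equals the block id when this block heads a chain */
	uint32_t next; /* next block inside the chain (the tail links back to the head) */
	uint32_t tail; /* last block of the chain; valid only at the head */
} sketch_chain;

/* Resolve the chain head of block b, compressing stale head links
 * left behind by earlier joins (as ir_chain_head() appears to do). */
static uint32_t sketch_chain_head(sketch_chain *chains, uint32_t b)
{
	uint32_t head = chains[b].head;

	while (chains[head].head != head) {
		head = chains[head].head;
	}
	chains[b].head = head;
	return head;
}

/* Append chain dst to chain src: splice the circular next links and
 * mark dst as no longer being a head.  Interior blocks of dst keep
 * stale head links that sketch_chain_head() resolves lazily. */
static void sketch_join_chains(sketch_chain *chains, uint32_t src, uint32_t dst)
{
	uint32_t src_tail = chains[src].tail;
	uint32_t dst_tail = chains[dst].tail;

	chains[src_tail].next = dst;  /* old src tail now leads into dst */
	chains[dst_tail].next = src;  /* new tail points back to the head */
	chains[src].tail = dst_tail;
	chains[dst].head = src;
}

/* This lazy design is visible in step 3 above: chains[e->to].head == e->to
 * cheaply tests whether e->to still heads its own chain, and the circular
 * links are what let a loop chain be rotated by only moving its head. */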
1828
1829
/* A variation of "Top-down Positioning" algorithm described by
1830
 * Karl Pettis and Robert C. Hansen "Profile Guided Code Positioning"
1831
 */
1832
static int ir_schedule_blocks_top_down(ir_ctx *ctx)
1833
0
{
1834
0
  ir_bitqueue blocks;
1835
0
  uint32_t b, best_successor, last_non_empty;
1836
0
  ir_block *bb, *best_successor_bb;
1837
0
  ir_insn *insn;
1838
0
  uint32_t *list, *schedule_end;
1839
0
  uint32_t count = 0;
1840
1841
0
  ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
1842
0
  blocks.pos = 0;
1843
0
  list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1844
0
  list[ctx->cfg_blocks_count + 1] = 0;
1845
0
  schedule_end = list + ctx->cfg_blocks_count;
1846
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1847
0
    ir_bitset_incl(blocks.set, b);
1848
0
  }
1849
1850
0
  while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
1851
0
    bb = &ctx->cfg_blocks[b];
1852
    /* Start trace */
1853
0
    last_non_empty = 0;
1854
0
    do {
1855
0
      if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
1856
        /* Schedule the previous empty ENTRY block before this one */
1857
0
        uint32_t predecessor = b - 1;
1858
1859
0
        ir_bitqueue_del(&blocks, predecessor);
1860
0
        count++;
1861
0
        list[count] = predecessor;
1862
0
      }
1863
0
      if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1864
        /* move empty blocks to the end */
1865
0
        *schedule_end = b;
1866
0
        schedule_end--;
1867
0
      } else {
1868
0
        count++;
1869
0
        list[count] = b;
1870
0
        last_non_empty = b;
1871
0
      }
1872
0
      best_successor_bb = NULL;
1873
0
      if (bb->successors_count == 1) {
1874
0
        best_successor = ctx->cfg_edges[bb->successors];
1875
0
        if (ir_bitqueue_in(&blocks, best_successor)) {
1876
0
          best_successor_bb = &ctx->cfg_blocks[best_successor];
1877
0
        }
1878
0
      } else if (bb->successors_count > 1) {
1879
0
        uint32_t prob, best_successor_prob;
1880
0
        uint32_t *p, successor;
1881
0
        ir_block *successor_bb;
1882
1883
0
        for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
1884
0
          successor = *p;
1885
0
          if (ir_bitqueue_in(&blocks, successor)) {
1886
0
            successor_bb = &ctx->cfg_blocks[successor];
1887
0
            insn = &ctx->ir_base[successor_bb->start];
1888
0
            if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
1889
0
              prob = insn->op2;
1890
0
              if (!prob) {
1891
0
                prob = 100 / bb->successors_count;
1892
0
                if (!(successor_bb->flags & IR_BB_EMPTY)) {
1893
0
                  prob++;
1894
0
                }
1895
0
              }
1896
0
            } else if (insn->op == IR_CASE_DEFAULT) {
1897
0
              prob = insn->op2;
1898
0
              if (!prob) {
1899
0
                prob = 100 / bb->successors_count;
1900
0
              }
1901
0
            } else if (insn->op == IR_CASE_VAL) {
1902
0
              prob = insn->op3;
1903
0
              if (!prob) {
1904
0
                prob = 100 / bb->successors_count;
1905
0
              }
1906
0
            } else if (insn->op == IR_ENTRY) {
1907
0
              if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1908
0
                prob = 99; /* prefer empty ENTRY block to go first */
1909
0
              } else {
1910
0
                prob = 1;
1911
0
              }
1912
0
            } else {
1913
0
              prob = 100 / bb->successors_count;
1914
0
            }
1915
0
            if (!best_successor_bb
1916
0
             || successor_bb->loop_depth > best_successor_bb->loop_depth
1917
0
             || prob > best_successor_prob) {
1918
0
              best_successor = successor;
1919
0
              best_successor_bb = successor_bb;
1920
0
              best_successor_prob = prob;
1921
0
            }
1922
0
          }
1923
0
        }
1924
0
      }
1925
0
      if (!best_successor_bb) {
1926
        /* Try to continue the trace using the other successor of the last IF */
1927
0
        if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
1928
0
          bb = &ctx->cfg_blocks[last_non_empty];
1929
0
          if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1930
0
            b = ctx->cfg_edges[bb->successors];
1931
1932
0
            if (!ir_bitqueue_in(&blocks, b)) {
1933
0
              b = ctx->cfg_edges[bb->successors + 1];
1934
0
            }
1935
0
            if (ir_bitqueue_in(&blocks, b)) {
1936
0
              bb = &ctx->cfg_blocks[b];
1937
0
              ir_bitqueue_del(&blocks, b);
1938
0
              continue;
1939
0
            }
1940
0
          }
1941
0
        }
1942
        /* End trace */
1943
0
        break;
1944
0
      }
1945
0
      b = best_successor;
1946
0
      bb = best_successor_bb;
1947
0
      ir_bitqueue_del(&blocks, b);
1948
0
    } while (1);
1949
0
  }
1950
1951
0
  IR_ASSERT(list + count == schedule_end);
1952
0
  ctx->cfg_schedule = list;
1953
0
  ir_bitqueue_free(&blocks);
1954
1955
0
  return 1;
1956
0
}
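/* Stripped-down sketch of the trace-growing idea implemented above, with
 * all IR specifics removed (the successor table is invented for the
 * example): take each unscheduled block in turn, append it, then keep
 * following its most likely not-yet-scheduled successor until the trace
 * dies out. */

#include <stdbool.h>
#include <stdint.h>

#define N_BLOCKS 7

/* Invented CFG: each block's most probable successor (0 = none),
 * standing in for the probability comparison in the real code. */
static const uint32_t best_successor[N_BLOCKS + 1] = {
	0, 2, 4, 0, 5, 0, 7, 0
};

static uint32_t sketch_top_down(uint32_t order[N_BLOCKS])
{
	bool scheduled[N_BLOCKS + 1] = { false };
	uint32_t count = 0;

	for (uint32_t b = 1; b <= N_BLOCKS; b++) {
		uint32_t cur = b;

		while (cur && !scheduled[cur]) { /* grow one trace */
			scheduled[cur] = true;
			order[count++] = cur;
			cur = best_successor[cur];   /* 0 terminates the trace */
		}
	}
	return count; /* order becomes 1 2 4 5 3 6 7 for the table above */
}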
1957
1958
int ir_schedule_blocks(ir_ctx *ctx)
1959
0
{
1960
0
  ir_ref ref;
1961
1962
0
  if (ctx->cfg_blocks_count <= 2) {
1963
0
    return 1;
1964
0
  }
1965
1966
  /* Mark "stop" blocks termintated with UNREACHABLE as "unexpected" */
1967
0
  ref = ctx->ir_base[1].op1;
1968
0
  while (ref) {
1969
0
    ir_insn *insn = &ctx->ir_base[ref];
1970
1971
0
    if (insn->op == IR_UNREACHABLE && ctx->ir_base[insn->op1].op != IR_TAILCALL) {
1972
0
      ir_block *bb = &ctx->cfg_blocks[ctx->cfg_map[ref]];
1973
0
      uint32_t n = bb->predecessors_count;
1974
1975
0
      if (n == 1) {
1976
0
        ir_insn *start_insn = &ctx->ir_base[bb->start];
1977
0
        if (start_insn->op == IR_IF_TRUE
1978
0
         || start_insn->op == IR_IF_FALSE
1979
0
         || start_insn->op == IR_CASE_DEFAULT) {
1980
0
          if (!start_insn->op2) start_insn->op2 = 1;
1981
0
        } else if (start_insn->op == IR_CASE_VAL) {
1982
0
          if (!start_insn->op3) start_insn->op3 = 1;
1983
0
        }
1984
0
      } else if (n > 1) {
1985
0
        uint32_t *p = &ctx->cfg_edges[bb->predecessors];
1986
1987
0
        for (; n > 0; p++, n--) {
1988
0
          bb = &ctx->cfg_blocks[*p];
1989
0
          if (bb->predecessors_count == 1) {
1990
0
            ir_insn *start_insn = &ctx->ir_base[bb->start];
1991
0
            if (start_insn->op == IR_IF_TRUE
1992
0
             || start_insn->op == IR_IF_FALSE
1993
0
             || start_insn->op == IR_CASE_DEFAULT) {
1994
0
              if (!start_insn->op2) start_insn->op2 = 1;
1995
0
            } else if (start_insn->op == IR_CASE_VAL) {
1996
0
              if (!start_insn->op3) start_insn->op3 = 1;
1997
0
            }
1998
0
          }
1999
0
        }
2000
0
      }
2001
0
    }
2002
0
    ref = insn->op3;
2003
0
  }
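  /* The walk above follows the list of terminators threaded through op3,
   * whose head is kept in ir_base[1].op1; forcing a probability of 1 on
   * the branch that leads to UNREACHABLE makes every later heuristic
   * treat the sibling edge as the expected path (100 - 1 = 99). */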
2004
2005
  /* The bottom-up Pettis-Hansen algorithm is expensive (O(n^3)),
2006
   * so it is used only for relatively small functions.
2007
   *
2008
   * TODO: make the choice between top-down and bottom-up algorithm configurable
2009
   */
2010
0
  if (UNEXPECTED(ctx->flags2 & IR_IRREDUCIBLE_CFG) || ctx->cfg_blocks_count > 256) {
2011
0
    return ir_schedule_blocks_top_down(ctx);
2012
0
  } else {
2013
0
    return ir_schedule_blocks_bottom_up(ctx);
2014
0
  }
2015
0
}
2016
2017
/* JMP target optimisation */
2018
uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
2019
0
{
2020
0
  return _ir_skip_empty_blocks(ctx, b);
2021
0
}
2022
2023
uint32_t ir_next_block(const ir_ctx *ctx, uint32_t b)
2024
0
{
2025
0
  ir_block *bb;
2026
2027
0
  if (ctx->cfg_schedule) {
2028
0
    uint32_t ret = ctx->cfg_schedule[++b];
2029
2030
    /* Check for empty ENTRY block */
2031
0
    while (ret && ((ctx->cfg_blocks[ret].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
2032
0
      ret = ctx->cfg_schedule[++b];
2033
0
    }
2034
0
    return ret;
2035
0
  }
2036
2037
0
  b++;
2038
0
  while (1) {
2039
0
    if (b > ctx->cfg_blocks_count) {
2040
0
      return 0;
2041
0
    }
2042
2043
0
    bb = &ctx->cfg_blocks[b];
2044
2045
0
    if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
2046
0
      b++;
2047
0
    } else {
2048
0
      break;
2049
0
    }
2050
0
  }
2051
0
  return b;
2052
0
}
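/* Note on ir_next_block() above: with a schedule in place the argument
 * is treated as a position in ctx->cfg_schedule and the block id at the
 * next position is returned; without one, block numbering is already
 * the layout order and the argument is the block id itself.  In both
 * cases empty blocks are skipped, so the caller sees the next block
 * that will actually be emitted. */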
2053
2054
void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block)
2055
0
{
2056
0
  ir_block *bb;
2057
0
  uint32_t *p, use_block;
2058
2059
0
  *true_block = 0;
2060
0
  *false_block = 0;
2061
0
  bb = &ctx->cfg_blocks[b];
2062
0
  IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
2063
0
  IR_ASSERT(bb->successors_count == 2);
2064
0
  p = &ctx->cfg_edges[bb->successors];
2065
0
  use_block = *p;
2066
0
  if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
2067
0
    *true_block = ir_skip_empty_target_blocks(ctx, use_block);
2068
0
    use_block = *(p+1);
2069
0
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
2070
0
    *false_block = ir_skip_empty_target_blocks(ctx, use_block);
2071
0
  } else {
2072
0
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
2073
0
    *false_block = ir_skip_empty_target_blocks(ctx, use_block);
2074
0
    use_block = *(p+1);
2075
0
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
2076
0
    *true_block = ir_skip_empty_target_blocks(ctx, use_block);
2077
0
  }
2078
0
  IR_ASSERT(*true_block && *false_block);
2079
0
}
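/* Hedged sketch of how a back end might combine ir_next_block() and
 * ir_get_true_false_blocks() when emitting an IF; the emit_* helpers
 * are invented placeholders, not part of the IR API, and "ir.h" is
 * assumed to provide ir_ctx and the two functions above. */

#include "ir.h"

void emit_branch_if_true(ir_ctx *ctx, uint32_t target);  /* placeholder */
void emit_branch_if_false(ir_ctx *ctx, uint32_t target); /* placeholder */
void emit_jump(ir_ctx *ctx, uint32_t target);            /* placeholder */

static void sketch_emit_if(ir_ctx *ctx, uint32_t b)
{
	uint32_t true_block, false_block;
	uint32_t next = ir_next_block(ctx, b);

	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
	if (true_block == next) {
		/* invert the condition so the true target falls through */
		emit_branch_if_false(ctx, false_block);
	} else if (false_block == next) {
		/* the false target falls through */
		emit_branch_if_true(ctx, true_block);
	} else {
		/* no fall-through: conditional branch plus unconditional jump */
		emit_branch_if_true(ctx, true_block);
		emit_jump(ctx, false_block);
	}
}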