Coverage Report

Created: 2025-09-27 06:26

/src/php-src/ext/opcache/jit/ir/ir_cfg.c
Execution count: 0 for every instrumented line (the file is entirely uncovered); only the source is shown below.

/*
 * IR - Lightweight JIT Compilation Framework
 * (CFG - Control Flow Graph)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

#include "ir.h"
#include "ir_private.h"

static int ir_remove_unreachable_blocks(ir_ctx *ctx);

IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
{
    ir_use_list *use_list = &ctx->use_lists[ref];
    ir_ref *p, use, n = use_list->count;

    if (n < 2) {
        if (n == 1) {
            use = ctx->use_edges[use_list->refs];
            IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
            ir_worklist_push(worklist, use);
        }
    } else {
        p = &ctx->use_edges[use_list->refs];
        if (n == 2) {
            use = *p;
            IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
            ir_worklist_push(worklist, use);
            use = *(p + 1);
            IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
            ir_worklist_push(worklist, use);
        } else {
            for (; n > 0; p++, n--) {
                use = *p;
                IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
                ir_worklist_push(worklist, use);
            }
        }
    }
}

IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
{
    ir_ref n, ref;
    const ir_ref *p;

    if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
        n = insn->inputs_count;
        for (p = insn->ops + 1; n > 0; p++, n--) {
            ref = *p;
            IR_ASSERT(ref);
            ir_worklist_push(worklist, ref);
        }
    } else if (insn->op != IR_START) {
        if (EXPECTED(insn->op1)) {
            ir_worklist_push(worklist, insn->op1);
        }
    }
}
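
Both helpers rely on ir_worklist's push-once discipline: a node is enqueued at most once, so each DFS below terminates even on cyclic control flow. A minimal sketch of that discipline, assuming fixed capacity and small non-negative ids (the real ir_worklist in ir_private.h may be laid out differently):

#include <stdbool.h>
#include <stdint.h>

typedef struct {
    int      *items;   /* LIFO stack of pending ids */
    uint32_t  len;
    bool     *visited; /* one flag per possible id */
} toy_worklist;

static bool toy_worklist_push(toy_worklist *w, int id)
{
    if (w->visited[id]) {
        return false;          /* already queued or already processed */
    }
    w->visited[id] = true;     /* marked on push, never on pop */
    w->items[w->len++] = id;
    return true;
}

static int toy_worklist_pop(toy_worklist *w)
{
    return w->items[--w->len];
}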

void ir_reset_cfg(ir_ctx *ctx)
{
    ctx->cfg_blocks_count = 0;
    ctx->cfg_edges_count = 0;
    if (ctx->cfg_blocks) {
        ir_mem_free(ctx->cfg_blocks);
        ctx->cfg_blocks = NULL;
        if (ctx->cfg_edges) {
            ir_mem_free(ctx->cfg_edges);
            ctx->cfg_edges = NULL;
        }
        if (ctx->cfg_map) {
            ir_mem_free(ctx->cfg_map);
            ctx->cfg_map = NULL;
        }
    }
}

static uint32_t IR_NEVER_INLINE ir_cfg_remove_dead_inputs(ir_ctx *ctx, uint32_t *_blocks, ir_block *blocks, uint32_t bb_count)
{
    uint32_t b, count = 0;
    ir_block *bb = blocks + 1;
    ir_insn *insn;
    ir_ref i, j, n, *ops, input;

    for (b = 1; b <= bb_count; b++, bb++) {
        bb->successors = count;
        count += ctx->use_lists[bb->end].count;
        bb->successors_count = 0;
        bb->predecessors = count;
        insn = &ctx->ir_base[bb->start];
        if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
            n = insn->inputs_count;
            ops = insn->ops;
            for (i = 1, j = 1; i <= n; i++) {
                input = ops[i];
                if (_blocks[input]) {
                    if (i != j) {
                        ops[j] = ops[i];
                    }
                    j++;
                } else if (input > 0) {
                    ir_use_list_remove_one(ctx, input, bb->start);
                }
            }
            j--;
            if (j != n) {
                if (j == 1) {
                    insn->op = IR_BEGIN;
                }
                insn->inputs_count = j;
                bb->predecessors_count = j;
                j++;
                for (; j <= n; j++) {
                    ops[j] = IR_UNUSED;
                }
            }
        }
        count += bb->predecessors_count;
    }
    return count;
}
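
The dead-input loop above is the classic read/write two-pointer compaction: index i reads every slot, index j tracks the next write position, and surviving entries shift left in place. The same idiom in isolation, a sketch assuming a plain int array and a caller-supplied predicate:

#include <stddef.h>

/* Compact arr[0..n) in place, keeping only entries accepted by keep();
 * returns the new length. Mirrors the i/j walk in the function above. */
static size_t compact(int *arr, size_t n, int (*keep)(int))
{
    size_t i, j = 0;

    for (i = 0; i < n; i++) {
        if (keep(arr[i])) {
            if (i != j) {
                arr[j] = arr[i];   /* shift survivor into the gap */
            }
            j++;
        }
    }
    return j;
}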

int ir_build_cfg(ir_ctx *ctx)
{
    ir_ref n, *p, ref, start, end;
    uint32_t b;
    ir_insn *insn;
    ir_worklist worklist;
    uint32_t bb_init_flags;
    uint32_t count, bb_count = 0;
    uint32_t edges_count = 0;
    ir_block *blocks, *bb;
    uint32_t *_blocks, *edges;
    ir_use_list *use_list;
    uint32_t len = ir_bitset_len(ctx->insns_count);
    ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
    ir_bitset bb_leaks = bb_starts + len;
    _blocks = ir_mem_calloc(ctx->insns_limit, sizeof(uint32_t));
    ir_worklist_init(&worklist, ctx->insns_count);

    /* First, try a backward DFS starting from the "stop" nodes */

    /* Add all "stop" nodes */
    ref = ctx->ir_base[1].op1;
    while (ref) {
        ir_worklist_push(&worklist, ref);
        ref = ctx->ir_base[ref].op3;
    }

    while (ir_worklist_len(&worklist)) {
        ref = ir_worklist_pop(&worklist);
        insn = &ctx->ir_base[ref];

        if (insn->op == IR_NOP) {
            continue;
        }
        IR_ASSERT(IR_IS_BB_END(insn->op));
        /* Remember the BB end */
        end = ref;
        /* Some successors of IF and SWITCH nodes may be inaccessible by backward DFS */
        use_list = &ctx->use_lists[end];
        n = use_list->count;
        if (n > 1 || (n == 1 && (ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR) != 0)) {
            for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
                /* Remember possibly inaccessible successors */
                ir_bitset_incl(bb_leaks, *p);
            }
        }
        /* Skip control nodes until the BB start */
        ref = insn->op1;
        while (1) {
            insn = &ctx->ir_base[ref];
            if (IR_IS_BB_START(insn->op)) {
                break;
            }
            ref = insn->op1; /* follow connected control nodes until the BB start */
        }
        /* Mark the BB start */
        bb_count++;
        _blocks[ref] = end;
        ir_bitset_incl(bb_starts, ref);
        /* Add predecessors */
        _ir_add_predecessors(insn, &worklist);
    }

    /* A backward DFS misses branches that end in infinite loops. */
    /* Try a forward DFS (in most cases all nodes have already been processed). */

    /* The START node may be inaccessible from the "stop" nodes */
    ir_bitset_incl(bb_leaks, 1);

    /* Add the unprocessed START node and successors of IF and SWITCH */
    IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
        ir_worklist_push(&worklist, start);
    } IR_BITSET_FOREACH_END();

    if (ir_worklist_len(&worklist)) {
        ir_bitset_union(worklist.visited, bb_starts, len);
        do {
            ref = ir_worklist_pop(&worklist);
            insn = &ctx->ir_base[ref];

            if (insn->op == IR_NOP) {
                continue;
            }
            IR_ASSERT(IR_IS_BB_START(insn->op));
            /* Remember the BB start */
            start = ref;
            /* Skip control nodes until the BB end */
            while (1) {
                ref = ir_next_control(ctx, ref);
                insn = &ctx->ir_base[ref];
                if (IR_IS_BB_END(insn->op)) {
                    break;
                }
            }
            /* Mark the BB start */
            bb_count++;
            _blocks[start] = ref;
            ir_bitset_incl(bb_starts, start);
            /* Add successors */
            _ir_add_successors(ctx, ref, &worklist);
        } while (ir_worklist_len(&worklist));
    }

    IR_ASSERT(bb_count > 0);

    /* Create the array of basic blocks and count successor/predecessor edges for each BB */
    blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
    b = 1;
    bb = blocks + 1;
    count = 0;
    /* SCCP already removed UNREACHABLE blocks; otherwise all blocks are first marked as UNREACHABLE */
    bb_init_flags = (ctx->flags2 & IR_CFG_REACHABLE) ? 0 : IR_BB_UNREACHABLE;
    IR_BITSET_FOREACH(bb_starts, len, start) {
        insn = &ctx->ir_base[start];
        if (insn->op == IR_NOP) {
            _blocks[start] = 0;
            continue;
        }
        end = _blocks[start];
        _blocks[start] = b;
        _blocks[end] = b;
        IR_ASSERT(IR_IS_BB_START(insn->op));
        bb->start = start;
        bb->end = end;
        bb->successors = count;
        count += ctx->use_lists[end].count;
        bb->successors_count = 0;
        bb->predecessors = count;
        bb->dom_parent = 0;
        bb->dom_depth = 0;
        bb->dom_child = 0;
        bb->dom_next_child = 0;
        bb->loop_header = 0;
        bb->loop_depth = 0;
        if (insn->op == IR_START) {
            bb->flags = IR_BB_START;
            bb->predecessors_count = 0;
        } else {
            bb->flags = bb_init_flags;
            if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
                n = insn->inputs_count;
                bb->predecessors_count = n;
                edges_count += n;
                count += n;
            } else if (EXPECTED(insn->op1)) {
                if (insn->op == IR_ENTRY) {
                    bb->flags |= IR_BB_ENTRY;
                    ctx->entries_count++;
                }
                bb->predecessors_count = 1;
                edges_count++;
                count++;
            } else {
                IR_ASSERT(insn->op == IR_BEGIN); /* start of an unreachable block */
                bb->predecessors_count = 0;
            }
        }
        b++;
        bb++;
    } IR_BITSET_FOREACH_END();
    bb_count = b - 1;
    if (UNEXPECTED(count != edges_count * 2)) {
        count = ir_cfg_remove_dead_inputs(ctx, _blocks, blocks, bb_count);
        IR_ASSERT(count != edges_count * 2);
    }
    ir_mem_free(bb_starts);

    /* Create the array of successor/predecessor control edges */
    edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
    bb = blocks + 1;
    for (b = 1; b <= bb_count; b++, bb++) {
        insn = &ctx->ir_base[bb->start];
        if (bb->predecessors_count > 1) {
            uint32_t *q = edges + bb->predecessors;
            n = insn->inputs_count;
            for (p = insn->ops + 1; n > 0; p++, q++, n--) {
                ref = *p;
                IR_ASSERT(ref);
                ir_ref pred_b = _blocks[ref];
                ir_block *pred_bb = &blocks[pred_b];
                IR_ASSERT(pred_b > 0);
                *q = pred_b;
                edges[pred_bb->successors + pred_bb->successors_count++] = b;
            }
        } else if (bb->predecessors_count == 1) {
            ref = insn->op1;
            IR_ASSERT(ref);
            IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
            ir_ref pred_b = _blocks[ref];
            ir_block *pred_bb = &blocks[pred_b];
            edges[bb->predecessors] = pred_b;
            edges[pred_bb->successors + pred_bb->successors_count++] = b;
        }
    }

    ctx->cfg_blocks_count = bb_count;
    ctx->cfg_edges_count = edges_count * 2;
    ctx->cfg_blocks = blocks;
    ctx->cfg_edges = edges;
    ctx->cfg_map = _blocks;

    if (!(ctx->flags2 & IR_CFG_REACHABLE)) {
        uint32_t reachable_count = 0;

        /* Mark reachable blocks */
        ir_worklist_clear(&worklist);
        ir_worklist_push(&worklist, 1);
        while (ir_worklist_len(&worklist) != 0) {
            uint32_t *p;

            reachable_count++;
            b = ir_worklist_pop(&worklist);
            bb = &blocks[b];
            bb->flags &= ~IR_BB_UNREACHABLE;
            n = bb->successors_count;
            if (n > 1) {
                for (p = edges + bb->successors; n > 0; p++, n--) {
                    ir_worklist_push(&worklist, *p);
                }
            } else if (n == 1) {
                ir_worklist_push(&worklist, edges[bb->successors]);
            }
        }
        if (reachable_count != ctx->cfg_blocks_count) {
            ir_remove_unreachable_blocks(ctx);
        }
    }

    ir_worklist_free(&worklist);

    return 1;
}
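
ir_build_cfg stores every successor and predecessor list in one shared edges array: each block records only an offset (bb->successors, bb->predecessors) and a count into it, the compressed-sparse-row (CSR) layout commonly used for graphs. A minimal sketch of reading such a layout (the field names mirror ir_block, but the surrounding types are simplified):

#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint32_t successors;        /* offset into the shared edges[] array */
    uint32_t successors_count;
} toy_block;

/* Print the outgoing edges of block b in the CSR-style layout. */
static void print_successors(const toy_block *blocks, const uint32_t *edges, uint32_t b)
{
    const toy_block *bb = &blocks[b];
    uint32_t i;

    for (i = 0; i < bb->successors_count; i++) {
        printf("BB%u -> BB%u\n", b, edges[bb->successors + i]);
    }
}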

static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
{
    uint32_t i, *p, *q, n = 0;

    p = q = &ctx->cfg_edges[bb->predecessors];
    for (i = 0; i < bb->predecessors_count; i++, p++) {
        if (*p != from) {
            if (p != q) {
                *q = *p;
            }
            q++;
            n++;
        }
    }
    IR_ASSERT(n != bb->predecessors_count);
    bb->predecessors_count = n;
}
static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
{
    ir_ref i, j, n, k, *p, *q, use;
    ir_insn *use_insn;
    ir_use_list *use_list;
    ir_bitset life_inputs;
    ir_insn *insn = &ctx->ir_base[merge];

    IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
    n = insn->inputs_count;
    i = 1;
    life_inputs = ir_bitset_malloc(n + 1);
    for (j = 1; j <= n; j++) {
        ir_ref input = ir_insn_op(insn, j);

        if (input != from) {
            if (i != j) {
                ir_insn_set_op(insn, i, input);
            }
            ir_bitset_incl(life_inputs, j);
            i++;
        }
    }
    i--;
    for (j = i + 1; j <= n; j++) {
        ir_insn_set_op(insn, j, IR_UNUSED);
    }
    if (i == 1) {
        insn->op = IR_BEGIN;
        insn->inputs_count = 1;
        use_list = &ctx->use_lists[merge];
        if (use_list->count > 1) {
            n++;
            for (k = use_list->count, p = q = &ctx->use_edges[use_list->refs]; k > 0; p++, k--) {
                use = *p;
                use_insn = &ctx->ir_base[use];
                if (use_insn->op == IR_PHI) {
                    /* Convert the PHI into a COPY */
                    i = 2;
                    for (j = 2; j <= n; j++) {
                        ir_ref input = ir_insn_op(use_insn, j);

                        if (ir_bitset_in(life_inputs, j - 1)) {
                            use_insn->op1 = ir_insn_op(use_insn, j);
                        } else if (input > 0) {
                            ir_use_list_remove_one(ctx, input, use);
                        }
                    }
                    use_insn->op = IR_COPY;
                    use_insn->inputs_count = 1;
                    for (j = 2; j <= n; j++) {
                        ir_insn_set_op(use_insn, j, IR_UNUSED);
                    }
                    continue;
                }

                /* compact the use list */
                if (p != q) {
                    *q = use;
                }
                q++;
            }

            if (p != q) {
                use_list->count -= (p - q);
                do {
                    *q = IR_UNUSED; /* clean up the removed tail */
                    q++;
                } while (p != q);
            }
        }
    } else {
        insn->inputs_count = i;

        use_list = &ctx->use_lists[merge];
        if (use_list->count > 1) {
            n++;
            for (k = use_list->count, p = &ctx->use_edges[use_list->refs]; k > 0; p++, k--) {
                use = *p;
                use_insn = &ctx->ir_base[use];
                if (use_insn->op == IR_PHI) {
                    i = 2;
                    for (j = 2; j <= n; j++) {
                        ir_ref input = ir_insn_op(use_insn, j);

                        if (ir_bitset_in(life_inputs, j - 1)) {
                            IR_ASSERT(input);
                            if (i != j) {
                                ir_insn_set_op(use_insn, i, input);
                            }
                            i++;
                        } else if (input > 0) {
                            ir_use_list_remove_one(ctx, input, use);
                        }
                    }
                    use_insn->inputs_count = i - 1;
                    for (j = i; j <= n; j++) {
                        ir_insn_set_op(use_insn, j, IR_UNUSED);
                    }
                }
            }
        }
    }
    ir_mem_free(life_inputs);
    ir_use_list_remove_all(ctx, from, merge);
}
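
When a MERGE drops to a single live input, every dependent phi keeps exactly one operand and degenerates into a plain copy, which is what the IR_PHI -> IR_COPY branch above performs. A toy illustration of that rewrite, assuming a simplified instruction record (not the real ir_insn layout):

#include <assert.h>

enum toy_op { TOY_PHI, TOY_COPY };

typedef struct {
    enum toy_op op;
    int         inputs[3];    /* inputs[0] is the control input of a phi */
    int         inputs_count;
} toy_insn;

/* Rewrite a two-input phi whose only surviving data input is at
 * position `live` (1 or 2) into a copy of that input. */
static void phi_to_copy(toy_insn *phi, int live)
{
    assert(phi->op == TOY_PHI && (live == 1 || live == 2));
    phi->inputs[0] = phi->inputs[live]; /* the copied value */
    phi->inputs[1] = 0;
    phi->inputs[2] = 0;
    phi->op = TOY_COPY;
    phi->inputs_count = 1;
}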

/* A CFG constructed after the SCCP pass has no unreachable BBs; otherwise they must be removed */
static int ir_remove_unreachable_blocks(ir_ctx *ctx)
{
    uint32_t b, *p, i;
    uint32_t unreachable_count = 0;
    uint32_t bb_count = ctx->cfg_blocks_count;
    ir_block *bb = ctx->cfg_blocks + 1;

    for (b = 1; b <= bb_count; b++, bb++) {
        if (bb->flags & IR_BB_UNREACHABLE) {
#if 0
            do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while (0);
#endif
            if (bb->successors_count) {
                for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
                    ir_block *succ_bb = &ctx->cfg_blocks[*p];

                    if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
                        ir_remove_predecessor(ctx, succ_bb, b);
                        ir_remove_merge_input(ctx, succ_bb->start, bb->end);
                    }
                }
            } else {
                ir_ref prev, ref = bb->end;
                ir_insn *insn = &ctx->ir_base[ref];

                IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
                /* remove from the terminators list */
                prev = ctx->ir_base[1].op1;
                if (prev == ref) {
                    ctx->ir_base[1].op1 = insn->op3;
                } else {
                    while (prev) {
                        if (ctx->ir_base[prev].op3 == ref) {
                            ctx->ir_base[prev].op3 = insn->op3;
                            break;
                        }
                        prev = ctx->ir_base[prev].op3;
                    }
                }
            }
            ctx->cfg_map[bb->start] = 0;
            ctx->cfg_map[bb->end] = 0;
            unreachable_count++;
        }
    }

    if (unreachable_count) {
        ir_block *dst_bb;
        uint32_t n = 1;
        uint32_t *edges;

        dst_bb = bb = ctx->cfg_blocks + 1;
        for (b = 1; b <= bb_count; b++, bb++) {
            if (!(bb->flags & IR_BB_UNREACHABLE)) {
                if (dst_bb != bb) {
                    memcpy(dst_bb, bb, sizeof(ir_block));
                    ctx->cfg_map[dst_bb->start] = n;
                    ctx->cfg_map[dst_bb->end] = n;
                }
                dst_bb->successors_count = 0;
                dst_bb++;
                n++;
            }
        }
        ctx->cfg_blocks_count = bb_count = n - 1;

        /* Rebuild the successor/predecessor control edges */
        edges = ctx->cfg_edges;
        bb = ctx->cfg_blocks + 1;
        for (b = 1; b <= bb_count; b++, bb++) {
            ir_insn *insn = &ctx->ir_base[bb->start];
            ir_ref *p, ref;

            n = bb->predecessors_count;
            if (n > 1) {
                uint32_t *q = edges + bb->predecessors;

                IR_ASSERT(n == insn->inputs_count);
                for (p = insn->ops + 1; n > 0; p++, q++, n--) {
                    ref = *p;
                    IR_ASSERT(ref);
                    ir_ref pred_b = ctx->cfg_map[ref];
                    ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
                    *q = pred_b;
                    edges[pred_bb->successors + pred_bb->successors_count++] = b;
                }
            } else if (n == 1) {
                ref = insn->op1;
                IR_ASSERT(ref);
                IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
                ir_ref pred_b = ctx->cfg_map[ref];
                ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
                edges[bb->predecessors] = pred_b;
                edges[pred_bb->successors + pred_bb->successors_count++] = b;
            }
        }
    }

    return 1;
}
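
The survivor pass above compacts the block array in place while keeping a remap (ctx->cfg_map) from old block ids to new ones, so the later edge rebuild can translate references. The same pattern in miniature, a sketch assuming 1-based ids and 0 meaning "removed":

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

typedef struct { bool dead; int payload; } rec;

/* Compact recs[1..n] in place; remap[old_id] receives the new id,
 * or 0 if the record was dropped. Returns the new count. */
static size_t compact_and_remap(rec *recs, size_t n, size_t *remap)
{
    size_t src, dst = 1;

    for (src = 1; src <= n; src++) {
        if (recs[src].dead) {
            remap[src] = 0;
            continue;
        }
        if (dst != src) {
            memcpy(&recs[dst], &recs[src], sizeof(rec));
        }
        remap[src] = dst;
        dst++;
    }
    return dst - 1;
}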

static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b)
{
    uint32_t i, *p;
    ir_block *bb = &ctx->cfg_blocks[b];

    if (bb->postnum != 0) {
        return;
    }

    if (bb->successors_count) {
        bb->postnum = -1; /* marker for "currently visiting" */
        p = ctx->cfg_edges + bb->successors;
        i = bb->successors_count;
        do {
            compute_postnum(ctx, cur, *p);
            p++;
        } while (--i);
    }
    bb->postnum = (*cur)++;
}
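
compute_postnum assigns DFS postorder numbers, which the dominator intersection below compares; the -1 sentinel guards against revisiting a node on the current DFS path. A standalone version of the same idea on a plain adjacency list (a sketch; numbering must start at 1 so that 0 can mean "unvisited"):

#include <stdint.h>

#define VISITING UINT32_MAX

typedef struct {
    const uint32_t *succ;       /* successor ids */
    uint32_t        succ_count;
    uint32_t        postnum;    /* 0 = unvisited */
} dfs_node;

static void postorder(dfs_node *nodes, uint32_t *cur, uint32_t n)
{
    uint32_t i;

    if (nodes[n].postnum != 0) {
        return; /* done, or on the current DFS path (VISITING) */
    }
    nodes[n].postnum = VISITING;
    for (i = 0; i < nodes[n].succ_count; i++) {
        postorder(nodes, cur, nodes[n].succ[i]);
    }
    nodes[n].postnum = (*cur)++; /* numbered after all its successors */
}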

/* Computes the dominator tree using the algorithm from "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy. */
static IR_NEVER_INLINE int ir_build_dominators_tree_slow(ir_ctx *ctx)
{
    uint32_t blocks_count, b, postnum;
    ir_block *blocks, *bb;
    uint32_t *edges;
    bool changed;

    blocks = ctx->cfg_blocks;
    edges  = ctx->cfg_edges;
    blocks_count = ctx->cfg_blocks_count;

    /* Clear the dominators tree */
    for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
        bb->idom = 0;
        bb->dom_depth = 0;
        bb->dom_child = 0;
        bb->dom_next_child = 0;
    }

    ctx->flags2 &= ~IR_NO_LOOPS;

    postnum = 1;
    compute_postnum(ctx, &postnum, 1);

    /* Find immediate dominators by the iterative fixed-point algorithm */
    blocks[1].idom = 1;
    do {
        changed = 0;
        /* Iterating in Reverse Post Order */
        for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
            IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
            IR_ASSERT(bb->predecessors_count > 0);
            if (bb->predecessors_count == 1) {
                uint32_t pred_b = edges[bb->predecessors];

                if (blocks[pred_b].idom > 0 && bb->idom != pred_b) {
                    bb->idom = pred_b;
                    changed = 1;
                }
            } else if (bb->predecessors_count) {
                uint32_t idom = 0;
                uint32_t k = bb->predecessors_count;
                uint32_t *p = edges + bb->predecessors;

                do {
                    uint32_t pred_b = *p;
                    ir_block *pred_bb = &blocks[pred_b];
                    ir_block *idom_bb;

                    if (pred_bb->idom > 0) {
                        idom = pred_b;
                        idom_bb = &blocks[idom];

                        while (--k > 0) {
                            pred_b = *(++p);
                            pred_bb = &blocks[pred_b];
                            if (pred_bb->idom > 0) {
                                while (idom != pred_b) {
                                    while (pred_bb->postnum < idom_bb->postnum) {
                                        pred_b = pred_bb->idom;
                                        pred_bb = &blocks[pred_b];
                                    }
                                    while (idom_bb->postnum < pred_bb->postnum) {
                                        idom = idom_bb->idom;
                                        idom_bb = &blocks[idom];
                                    }
                                }
                            }
                        }

                        if (bb->idom != idom) {
                            bb->idom = idom;
                            changed = 1;
                        }
                        break;
                    }
                    p++;
                } while (--k > 0);
            }
        }
    } while (changed);

    /* Build the dominators tree */
    blocks[1].idom = 0;
    blocks[1].dom_depth = 0;

    /* Construct children lists sorted by block number */
    for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
        ir_block *idom_bb = &blocks[bb->idom];
        bb->dom_depth = 0;
        bb->dom_next_child = idom_bb->dom_child;
        idom_bb->dom_child = b;
    }

    /* Recalculate dom_depth for all blocks */
    for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
        uint32_t idom = bb->idom;
        uint32_t dom_depth = 0;
        while (idom) {
            dom_depth++;
            if (blocks[idom].dom_depth > 0) {
                dom_depth += blocks[idom].dom_depth;
                break;
            } else {
                idom = blocks[idom].idom;
            }
        }
        bb->dom_depth = dom_depth;
    }

    return 1;
}
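
The nested postnum-comparison loops above are the "two-finger" intersection from the Cooper, Harvey and Kennedy paper. Factored out on plain arrays (a sketch: idom[entry] must be initialized to entry itself, and a higher postorder number means closer to the entry in reverse post order):

#include <stdint.h>

/* Intersect two dominator-tree paths: move whichever finger has the
 * smaller postorder number up its idom chain until both fingers meet.
 * The meeting point is the nearest common dominator. */
static uint32_t intersect(const uint32_t *idom, const uint32_t *postnum,
                          uint32_t b1, uint32_t b2)
{
    while (b1 != b2) {
        while (postnum[b1] < postnum[b2]) {
            b1 = idom[b1];
        }
        while (postnum[b2] < postnum[b1]) {
            b2 = idom[b2];
        }
    }
    return b1;
}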

/* A single-pass modification of "A Simple, Fast Dominance Algorithm" by
 * Cooper, Harvey and Kennedy that relies on IR block ordering.
 * It may fall back to the general slow fixed-point algorithm. */
static int ir_build_dominators_tree_iterative(ir_ctx *ctx);
int ir_build_dominators_tree(ir_ctx *ctx)
{
    uint32_t blocks_count, b;
    ir_block *blocks, *bb;
    uint32_t *edges;
    ir_list worklist;

    ir_list_init(&worklist, ctx->cfg_blocks_count / 2);

    ctx->flags2 |= IR_NO_LOOPS;

    /* Find immediate dominators */
    blocks = ctx->cfg_blocks;
    edges  = ctx->cfg_edges;
    blocks_count = ctx->cfg_blocks_count;
    blocks[1].idom = 1;
    blocks[1].dom_depth = 0;

    /* Iterating in Reverse Post Order */
    for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
        IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
        IR_ASSERT(bb->predecessors_count > 0);
        uint32_t k = bb->predecessors_count;
        uint32_t *p = edges + bb->predecessors;
        uint32_t idom = *p;
        ir_block *idom_bb;

        if (UNEXPECTED(idom >= b)) {
            /* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
            ctx->flags2 &= ~IR_NO_LOOPS;
//          IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor");
            if (UNEXPECTED(k <= 1)) {
slow_case:
                ir_list_free(&worklist);
                return ir_build_dominators_tree_slow(ctx);
            }
            ir_list_push(&worklist, idom);
            while (1) {
                k--;
                p++;
                idom = *p;
                if (idom < b) {
                    break;
                }
                if (UNEXPECTED(k == 0)) {
                    goto slow_case;
                }
                ir_list_push(&worklist, idom);
            }
        }
        IR_ASSERT(blocks[idom].idom > 0);
        IR_ASSERT(k != 0);

        while (--k > 0) {
            uint32_t pred_b = *(++p);

            if (pred_b < b) {
                IR_ASSERT(blocks[pred_b].idom > 0);
                while (idom != pred_b) {
                    while (pred_b > idom) {
                        pred_b = blocks[pred_b].idom;
                    }
                    while (idom > pred_b) {
                        idom = blocks[idom].idom;
                    }
                }
            } else {
                ctx->flags2 &= ~IR_NO_LOOPS;
                IR_ASSERT(bb->predecessors_count > 1);
                ir_list_push(&worklist, pred_b);
            }
        }
        bb->idom = idom;
        idom_bb = &blocks[idom];
        bb->dom_depth = idom_bb->dom_depth + 1;
    }

    /* Construct children lists sorted by block number */
    for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
        ir_block *idom_bb = &blocks[bb->idom];
        bb->dom_next_child = idom_bb->dom_child;
        idom_bb->dom_child = b;
    }

    blocks[1].idom = 0;

    if (ir_list_len(&worklist) != 0) {
        uint32_t dom_depth;
        uint32_t succ_b;
        bool complete = 1;

        /* Check whether all the back-edges lead to loop headers */
        do {
            b = ir_list_pop(&worklist);
            bb = &blocks[b];
            succ_b = ctx->cfg_edges[bb->successors];
            if (bb->successors_count != 1) {
                /* LOOP_END/END may be linked with the following ENTRY by a fake edge */
                IR_ASSERT(bb->successors_count == 2);
                if (blocks[succ_b].flags & IR_BB_ENTRY) {
                    succ_b = ctx->cfg_edges[bb->successors + 1];
                } else {
                    IR_ASSERT(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
                }
            }
            dom_depth = blocks[succ_b].dom_depth;
            while (bb->dom_depth > dom_depth) {
                b = bb->dom_parent;
                bb = &blocks[b];
            }
            if (UNEXPECTED(b != succ_b)) {
                complete = 0;
                break;
            }
        } while (ir_list_len(&worklist) != 0);

        if (UNEXPECTED(!complete)) {
            ir_list_free(&worklist);
            return ir_build_dominators_tree_iterative(ctx);
        }
    }

    ir_list_free(&worklist);

    return 1;
}
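
Both tree builders finish by threading each block into its parent's child list through dom_child/dom_next_child, an intrusive singly linked list; iterating blocks from the highest id down makes each list come out sorted by block number. The idiom in isolation (field names mirror ir_block, everything else simplified):

#include <stdint.h>

typedef struct {
    uint32_t idom;           /* immediate dominator (parent), 0 = none */
    uint32_t dom_child;      /* first child, 0 = none */
    uint32_t dom_next_child; /* next sibling, 0 = none */
} tree_node;

/* Prepend each node b (high ids first) to its parent's child list,
 * so every child list ends up in increasing id order. */
static void link_children(tree_node *nodes, uint32_t count)
{
    uint32_t b;

    for (b = count; b >= 2; b--) {
        tree_node *parent = &nodes[nodes[b].idom];

        nodes[b].dom_next_child = parent->dom_child;
        parent->dom_child = b;
    }
}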
static int ir_build_dominators_tree_iterative(ir_ctx *ctx)
{
    bool changed;
    uint32_t blocks_count, b;
    ir_block *blocks, *bb;
    uint32_t *edges;

    /* Find immediate dominators */
    blocks = ctx->cfg_blocks;
    edges  = ctx->cfg_edges;
    blocks_count = ctx->cfg_blocks_count;

    /* Clear the dominators tree, but keep already found dominators */
    for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
        bb->dom_depth = 0;
        bb->dom_child = 0;
        bb->dom_next_child = 0;
    }

    /* Find immediate dominators by the iterative fixed-point algorithm */
    blocks[1].idom = 1;
    do {
        changed = 0;

        for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
            IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
            IR_ASSERT(bb->predecessors_count > 0);
            uint32_t k = bb->predecessors_count;
            uint32_t *p = edges + bb->predecessors;
            uint32_t idom = *p;

            if (blocks[idom].idom == 0) {
                while (1) {
                    k--;
                    p++;
                    idom = *p;
                    if (blocks[idom].idom > 0) {
                        break;
                    }
                    IR_ASSERT(k > 0);
                }
            }
            IR_ASSERT(k != 0);
            while (--k > 0) {
                uint32_t pred_b = *(++p);

                if (blocks[pred_b].idom > 0) {
                    IR_ASSERT(blocks[pred_b].idom > 0);
                    while (idom != pred_b) {
                        while (pred_b > idom) {
                            pred_b = blocks[pred_b].idom;
                        }
                        while (idom > pred_b) {
                            idom = blocks[idom].idom;
                        }
                    }
                }
            }
            if (bb->idom != idom) {
                bb->idom = idom;
                changed = 1;
            }
        }
    } while (changed);

    /* Build the dominators tree */
    blocks[1].idom = 0;
    blocks[1].dom_depth = 0;
    for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
        uint32_t idom = bb->idom;
        ir_block *idom_bb = &blocks[idom];

        bb->dom_depth = idom_bb->dom_depth + 1;
    }

    /* Construct children lists sorted by block number */
    for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
        ir_block *idom_bb = &blocks[bb->idom];
        bb->dom_next_child = idom_bb->dom_child;
        idom_bb->dom_child = b;
    }

    return 1;
}

static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
{
    uint32_t b1_depth = blocks[b1].dom_depth;
    const ir_block *bb2 = &blocks[b2];

    while (bb2->dom_depth > b1_depth) {
        b2 = bb2->dom_parent;
        bb2 = &blocks[b2];
    }
    return b1 == b2;
}
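
ir_find_loops below answers the same ancestor question a different way: instead of climbing dom_parent links, it compares the DFS entry/exit timestamps computed over the spanning tree, which turns each query into two comparisons. The check in isolation (a sketch assuming both times come from a single DFS, as in the code below):

#include <stdbool.h>
#include <stdint.h>

/* a is a proper ancestor of d iff a's DFS interval encloses d's. */
static bool is_ancestor(const uint32_t *entry_times, const uint32_t *exit_times,
                        uint32_t a, uint32_t d)
{
    return entry_times[d] > entry_times[a] && exit_times[d] < exit_times[a];
}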

int ir_find_loops(ir_ctx *ctx)
{
    uint32_t b, j, n, count;
    uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1;
    ir_block *blocks = ctx->cfg_blocks;
    uint32_t *edges = ctx->cfg_edges;
    ir_worklist work;

    if (ctx->flags2 & IR_NO_LOOPS) {
        return 1;
    }

    /* We don't materialize the DJ spanning tree explicitly, as we are only interested in ancestor
     * queries. These are implemented by checking entry/exit times of the DFS search. */
    ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
    entry_times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
    exit_times = entry_times + ctx->cfg_blocks_count + 1;
    sorted_blocks = exit_times + ctx->cfg_blocks_count + 1;

    memset(entry_times, 0, (ctx->cfg_blocks_count + 1) * sizeof(uint32_t));

    ir_worklist_push(&work, 1);
    while (ir_worklist_len(&work)) {
        ir_block *bb;
        int child;

next:
        b = ir_worklist_peek(&work);
        if (!entry_times[b]) {
            entry_times[b] = time++;
        }

        /* Visit blocks immediately dominated by "b". */
        bb = &blocks[b];
        for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) {
            if (ir_worklist_push(&work, child)) {
                goto next;
            }
        }

        /* Visit join edges. */
        if (bb->successors_count) {
            uint32_t *p = edges + bb->successors;
            for (j = 0; j < bb->successors_count; j++, p++) {
                uint32_t succ = *p;

                if (blocks[succ].idom == b) {
                    continue;
                } else if (ir_worklist_push(&work, succ)) {
                    goto next;
                }
            }
        }
        exit_times[b] = time++;
        ir_worklist_pop(&work);
    }

    /* Sort blocks by level, which is the opposite of the order in which we want to process them */
    sorted_blocks[1] = 1;
    j = 1;
    n = 2;
    while (j != n) {
        uint32_t i = j;
        j = n;
        for (; i < j; i++) {
            int child;
            for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
                sorted_blocks[n++] = child;
            }
        }
    }
    count = n;

    /* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". */
    uint32_t prev_dom_depth = blocks[sorted_blocks[n - 1]].dom_depth;
    uint32_t prev_irreducible = 0;
    while (n > 1) {
        b = sorted_blocks[--n];
        ir_block *bb = &blocks[b];

        IR_ASSERT(bb->dom_depth <= prev_dom_depth);
        if (UNEXPECTED(prev_irreducible) && bb->dom_depth != prev_dom_depth) {
            /* process delayed irreducible loops */
            do {
                b = sorted_blocks[prev_irreducible];
                bb = &blocks[b];
                if ((bb->flags & IR_BB_IRREDUCIBLE_LOOP) && !bb->loop_depth) {
                    /* process an irreducible loop */
                    uint32_t hdr = b;

                    bb->loop_depth = 1;
                    if (ctx->ir_base[bb->start].op == IR_MERGE) {
                        ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
                    }

                    /* find the closing edge(s) of the irreducible loop */
                    IR_ASSERT(bb->predecessors_count > 1);
                    uint32_t *p = &edges[bb->predecessors];
                    j = bb->predecessors_count;
                    do {
                        uint32_t pred = *p;

                        if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) {
                            if (!ir_worklist_len(&work)) {
                                ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
                            }
                            blocks[pred].loop_header = 0; /* support for merged loops */
                            ir_worklist_push(&work, pred);
                        }
                        p++;
                    } while (--j);
                    if (ir_worklist_len(&work) == 0) continue;

                    /* collect members of the irreducible loop */
                    while (ir_worklist_len(&work)) {
                        b = ir_worklist_pop(&work);
                        if (b != hdr) {
                            ir_block *bb = &blocks[b];
                            bb->loop_header = hdr;
                            if (bb->predecessors_count) {
                                uint32_t *p = &edges[bb->predecessors];
                                uint32_t n = bb->predecessors_count;
                                do {
                                    uint32_t pred = *p;
                                    while (blocks[pred].loop_header > 0) {
                                        pred = blocks[pred].loop_header;
                                    }
                                    if (pred != hdr) {
                                        if (entry_times[pred] > entry_times[hdr] && exit_times[pred] < exit_times[hdr]) {
                                            /* "pred" is a descendant of "hdr" */
                                            ir_worklist_push(&work, pred);
                                        } else {
                                            /* another entry to the irreducible loop */
                                            bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
                                            if (ctx->ir_base[bb->start].op == IR_MERGE) {
                                                ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
                                            }
                                        }
                                    }
                                    p++;
                                } while (--n);
                            }
                        }
                    }
                }
            } while (--prev_irreducible != n);
            prev_irreducible = 0;
            b = sorted_blocks[n];
            bb = &blocks[b];
        }

        if (bb->predecessors_count > 1) {
            bool irreducible = 0;
            uint32_t *p = &edges[bb->predecessors];

            j = bb->predecessors_count;
            do {
                uint32_t pred = *p;

                /* A join edge is one for which the predecessor does not
                   immediately dominate the successor.  */
                if (bb->idom != pred) {
                    /* In a loop back-edge (back-join edge), the successor dominates
                       the predecessor.  */
                    if (ir_dominates(blocks, b, pred)) {
                        if (!ir_worklist_len(&work)) {
                            ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
                        }
                        blocks[pred].loop_header = 0; /* support for merged loops */
                        ir_worklist_push(&work, pred);
                    } else {
                        /* Otherwise it's a cross-join edge.  See if it's a branch
                           to an ancestor on the DJ spanning tree.  */
                        if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) {
                            irreducible = 1;
                            break;
                        }
                    }
                }
                p++;
            } while (--j);

            if (UNEXPECTED(irreducible)) {
                bb->flags |= IR_BB_LOOP_HEADER | IR_BB_IRREDUCIBLE_LOOP;
                ctx->flags2 |= IR_CFG_HAS_LOOPS | IR_IRREDUCIBLE_CFG;
                /* Remember the position of the first irreducible loop, to process all the irreducible
                 * loops after the reducible loops with the same dominator tree depth
                 */
                if (!prev_irreducible) {
                    prev_irreducible = n;
                    prev_dom_depth = bb->dom_depth;
                }
                ir_list_clear(&work.l);
            } else if (ir_worklist_len(&work)) {
                /* collect members of the reducible loop */
                uint32_t hdr = b;

                bb->flags |= IR_BB_LOOP_HEADER;
                ctx->flags2 |= IR_CFG_HAS_LOOPS;
                bb->loop_depth = 1;
                if (ctx->ir_base[bb->start].op == IR_MERGE) {
                    ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
                }
                while (ir_worklist_len(&work)) {
                    b = ir_worklist_pop(&work);
                    if (b != hdr) {
                        ir_block *bb = &blocks[b];
                        bb->loop_header = hdr;
                        if (bb->predecessors_count) {
                            uint32_t *p = &edges[bb->predecessors];
                            uint32_t n = bb->predecessors_count;
                            do {
                                uint32_t pred = *p;
                                while (blocks[pred].loop_header > 0) {
                                    pred = blocks[pred].loop_header;
                                }
                                if (pred != hdr) {
                                    ir_worklist_push(&work, pred);
                                }
                                p++;
                            } while (--n);
                        }
                    }
                }
            }
        }
    }
    IR_ASSERT(!prev_irreducible);

    if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
        for (n = 1; n < count; n++) {
            b = sorted_blocks[n];
            ir_block *bb = &blocks[b];
            if (bb->loop_header > 0) {
                ir_block *loop = &blocks[bb->loop_header];
                uint32_t loop_depth = loop->loop_depth;

                if (bb->flags & IR_BB_LOOP_HEADER) {
                    loop_depth++;
                }
                bb->loop_depth = loop_depth;
                if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
                    loop->flags |= IR_BB_LOOP_WITH_ENTRY;
                    if (loop_depth > 1) {
                        /* Set the IR_BB_LOOP_WITH_ENTRY flag for all the enclosing loops */
                        bb = &blocks[loop->loop_header];
                        while (1) {
                            if (bb->flags & IR_BB_LOOP_WITH_ENTRY) {
                                break;
                            }
                            bb->flags |= IR_BB_LOOP_WITH_ENTRY;
                            if (bb->loop_depth == 1) {
                                break;
                            }
                            bb = &blocks[loop->loop_header];
                        }
                    }
                }
            }
        }
    }

    ir_mem_free(entry_times);
    ir_worklist_free(&work);

    return 1;
}
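
The final pass recovers nesting depth from the loop_header links: a block inherits its header's depth, and a header contributes one level of its own. The same relation as a per-block walk (a sketch; ir_find_loops instead fills the depths top-down in sorted order so each block is visited once):

#include <stdint.h>

typedef struct {
    uint32_t loop_header; /* id of the enclosing loop header, 0 = none */
    int      is_header;
} loop_node;

/* Count how many loops enclose block b, counting b itself when it is
 * a header; every loop_header link up the chain is one enclosing loop. */
static uint32_t loop_depth(const loop_node *nodes, uint32_t b)
{
    uint32_t depth = nodes[b].is_header ? 1 : 0;

    for (b = nodes[b].loop_header; b != 0; b = nodes[b].loop_header) {
        depth++;
    }
    return depth;
}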

static uint32_t _ir_skip_empty_blocks(const ir_ctx *ctx, uint32_t b)
{
    while (1) {
        ir_block *bb = &ctx->cfg_blocks[b];

        if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
            IR_ASSERT(bb->successors_count == 1);
            b = ctx->cfg_edges[bb->successors];
        } else {
            return b;
        }
    }
}

/* A variation of the "Bottom-up Positioning" algorithm described by
 * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
 */
typedef struct _ir_edge_info {
    uint32_t from;
    uint32_t to;
    float    freq;
} ir_edge_info;

typedef struct _ir_chain {
    uint32_t head;
    uint32_t next;
    union {
        uint32_t prev;
        uint32_t tail;
    };
} ir_chain;

static int ir_edge_info_cmp(const void *b1, const void *b2)
{
    ir_edge_info *e1 = (ir_edge_info*)b1;
    ir_edge_info *e2 = (ir_edge_info*)b2;

    if (e1->freq != e2->freq) {
        return e1->freq < e2->freq ? 1 : -1;
    }
    /* In case of equal frequencies, try to avoid penalizing one of the "equal" paths by
     * preferring the first RPO successor (in conditional branches) and the last RPO predecessor
     * (in merge points).
     *
     * See "Static Basic Block Reordering Heuristics for Implicit Control Flow in Baseline JITs"
     * by Guillermo Polito, Stephane Ducasse, and Pablo Tesone (2021).
     */
    if (e1->from != e2->from) {
        return e2->from - e1->from;
    } else {
        return e1->to - e2->to;
    }
}
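
The comparator orders edges by decreasing frequency for the greedy chain-joining pass below. A minimal usage sketch with the same struct and comparator shape (tie-breaking omitted, as noted in the comment):

#include <stdio.h>
#include <stdlib.h>

typedef struct { unsigned from, to; float freq; } edge_info;

static int edge_cmp(const void *a, const void *b)
{
    const edge_info *e1 = (const edge_info *)a;
    const edge_info *e2 = (const edge_info *)b;

    if (e1->freq != e2->freq) {
        return e1->freq < e2->freq ? 1 : -1; /* higher frequency first */
    }
    return 0; /* the real comparator breaks ties by block order */
}

int main(void)
{
    edge_info edges[] = {{1, 2, 0.5f}, {1, 3, 0.9f}, {2, 4, 0.1f}};

    qsort(edges, 3, sizeof(edge_info), edge_cmp);
    for (int i = 0; i < 3; i++) {
        printf("BB%u -> BB%u %.3f\n", edges[i].from, edges[i].to, edges[i].freq);
    }
    return 0; /* prints the 0.9 edge, then 0.5, then 0.1 */
}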

static IR_NEVER_INLINE uint32_t ir_chain_head_path_compress(ir_chain *chains, uint32_t src, uint32_t head)
{
    IR_ASSERT(head != 0);
    do {
        head = chains[head].head;
    } while (chains[head].head != head);
    do {
        uint32_t next = chains[src].head;
        chains[src].head = head;
        src = next;
    } while (chains[src].head != head);
    return head;
}

IR_ALWAYS_INLINE uint32_t ir_chain_head(ir_chain *chains, uint32_t src)
{
    uint32_t head = chains[src].head;
    if (chains[head].head == head) {
        return head;
    } else {
        return ir_chain_head_path_compress(chains, src, head);
    }
}
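
ir_chain_head is the find operation of a disjoint-set forest with path compression: a chain's representative is the fixed point of the head links, and the slow path rewrites every link on the way up to point directly at it. The textbook form of the same idea:

#include <stdint.h>

/* Disjoint-set find with full path compression; parent[r] == r marks
 * a representative, just as chains[head].head == head does above. */
static uint32_t find_root(uint32_t *parent, uint32_t x)
{
    uint32_t root = x;

    while (parent[root] != root) {
        root = parent[root];       /* first pass: locate the root */
    }
    while (parent[x] != root) {    /* second pass: compress the path */
        uint32_t next = parent[x];
        parent[x] = root;
        x = next;
    }
    return root;
}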

static void ir_join_chains(ir_chain *chains, uint32_t src, uint32_t dst)
{
    uint32_t dst_tail = chains[dst].tail;
    uint32_t src_tail = chains[src].tail;

    chains[dst_tail].next = src;
    chains[dst].prev = src_tail;
    chains[src_tail].next = dst;
    chains[src].tail = dst_tail;
    chains[dst].head = src;
}

static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before)
{
    ir_chain *this = &chains[c];
    ir_chain *next = &chains[before];

    IR_ASSERT(chains[c].head == c);
    if (chains[before].head != before) {
        this->head = next->head;
    } else {
        next->head = c;
    }
    this->next = before;
    this->prev = next->prev;
    next->prev = c;
    chains[this->prev].next = c;
}

#ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
# ifdef IR_DEBUG
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 1
# else
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 0
# endif
#endif

#if IR_DEBUG_BB_SCHEDULE_GRAPH
static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_count, ir_edge_info *edges, ir_chain *chains)
{
    uint32_t b, i;
    ir_block *bb;
    uint8_t c, *colors;
    bool is_head, is_empty;
    uint32_t max_colors = 12;

    colors = alloca(sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
    memset(colors, 0, sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
    i = 0;
    for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
        if (chains[b].head == b) {
            colors[b] = (i % max_colors) + 1;
            i++;
        }
    }

    fprintf(stderr, "digraph {\n");
    fprintf(stderr, "\trankdir=TB;\n");
    for (b = 1; b <= ctx->cfg_blocks_count; b++) {
        bb = &ctx->cfg_blocks[b];
        c = (chains[b].head) ? colors[ir_chain_head(chains, b)] : 0;
        is_head = chains[b].head == b;
        is_empty = (bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY;
        if (c) {
            fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\",colorscheme=set312,fillcolor=%d%s%s]\n", b, b,
                bb->loop_depth, bb_freq[b],
                ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
                c,
                is_head ? ",penwidth=3" : "",
                is_empty ? ",style=\"dotted,filled\"" : ",style=\"filled\"");
        } else {
            fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\"%s%s]\n", b, b,
                bb->loop_depth, bb_freq[b],
                ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
                is_head ? ",penwidth=3" : "",
                is_empty ? ",style=\"dotted\"" : "");
        }
    }
    fprintf(stderr, "\n");

    for (i = 0; i < edges_count; i++) {
        fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
    }
    fprintf(stderr, "}\n");
}
#endif

#ifdef IR_DEBUG
static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
{
    uint32_t i;

    fprintf(stderr, "Edges:\n");
    for (i = 0; i < edges_count; i++) {
        fprintf(stderr, "\tBB%d -> BB%d %0.3f\n", edges[i].from, edges[i].to, edges[i].freq);
    }
}

static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
{
    uint32_t b, tail, i;

    fprintf(stderr, "Chains:\n");
    for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
        if (chains[b].head == b) {
            tail = chains[b].tail;
            i = b;
            fprintf(stderr, "(BB%d", i);
            while (i != tail) {
                i = chains[i].next;
                fprintf(stderr, ", BB%d", i);
            }
            fprintf(stderr, ")\n");
        }
    }
}
#endif
1412
1413
static int ir_schedule_blocks_bottom_up(ir_ctx *ctx)
1414
0
{
1415
0
  uint32_t max_edges_count = ctx->cfg_edges_count / 2;
1416
0
  uint32_t edges_count = 0;
1417
0
  uint32_t b, i, loop_depth;
1418
0
  float *bb_freq, freq;
1419
0
  ir_block *bb;
1420
0
  ir_edge_info *edges, *e;
1421
0
  ir_chain *chains;
1422
0
  ir_bitqueue worklist;
1423
0
  ir_bitset visited;
1424
0
  uint32_t *schedule_end, count;
1425
1426
0
  ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1427
0
  schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count;
1428
1429
  /* 1. Create initial chains for each BB */
1430
0
  chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1));
1431
0
  chains[0].head = 0;
1432
0
  chains[0].next = 0;
1433
0
  chains[0].prev = 0;
1434
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1435
0
    chains[b].head = b;
1436
0
    chains[b].next = b;
1437
0
    chains[b].prev = b;
1438
0
  }
1439
1440
  /* 2. Collect information about BBs and EDGEs frequencies */
1441
0
  edges = ir_mem_malloc(sizeof(ir_edge_info) * max_edges_count);
1442
0
  bb_freq = ir_mem_calloc(ctx->cfg_blocks_count + 1, sizeof(float));
1443
0
  bb_freq[1] = 1.0f;
1444
0
  visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
1445
0
  ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
1446
0
  ir_bitqueue_add(&worklist, 1);
1447
0
  while ((b = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
1448
0
restart:
1449
0
    bb = &ctx->cfg_blocks[b];
1450
0
    if (bb->predecessors_count) {
1451
0
      uint32_t n = bb->predecessors_count;
1452
0
      uint32_t *p = ctx->cfg_edges + bb->predecessors;
1453
0
      for (; n > 0; p++, n--) {
1454
0
        uint32_t predecessor = *p;
1455
        /* Basic Blocks are ordered in a way that usual predecessors ids are less than successors.
1456
         * So we may compare blocks ids (predecessor < b) instead of a more expensive check for back edge
1457
         * (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b)
1458
         */
1459
0
        if (predecessor < b) {
1460
0
          if (!ir_bitset_in(visited, predecessor)) {
1461
0
            b = predecessor;
1462
0
            ir_bitqueue_del(&worklist, b);
1463
0
            goto restart;
1464
0
          }
1465
0
        } else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) {
1466
          /* not a loop back-edge */
1467
0
          IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b);
1468
0
        }
1469
0
      }
1470
0
    }
1471
1472
0
    ir_bitset_incl(visited, b);
1473
1474
0
    if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1475
0
      uint32_t successor = ctx->cfg_edges[bb->successors];
1476
1477
      /* move empty blocks to the end */
1478
0
      IR_ASSERT(chains[b].head == b);
1479
0
      chains[b].head = 0;
1480
0
      *schedule_end = b;
1481
0
      schedule_end--;
1482
1483
0
      if (successor > b) {
1484
0
        bb_freq[successor] += bb_freq[b];
1485
0
        b = successor;
1486
0
        ir_bitqueue_del(&worklist, b);
1487
0
        goto restart;
1488
0
      } else {
1489
0
        continue;
1490
0
      }
1491
0
    }
1492
1493
0
    loop_depth = bb->loop_depth;
1494
0
    if (bb->flags & IR_BB_LOOP_HEADER) {
1495
      // TODO: Estimate the loop iterations count
1496
0
      bb_freq[b] *= 10;
1497
0
    }
1498
1499
0
    if (bb->successors_count) {
1500
0
      uint32_t n = bb->successors_count;
1501
0
      uint32_t *p = ctx->cfg_edges + bb->successors;
1502
1503
0
      if (n == 1) {
1504
0
        uint32_t successor = *p;
1505
1506
0
        IR_ASSERT(edges_count < max_edges_count);
1507
0
        freq = bb_freq[b];
1508
0
        if (successor > b) {
1509
0
          IR_ASSERT(!ir_bitset_in(visited, successor));
1510
0
          bb_freq[successor] += freq;
1511
0
          ir_bitqueue_add(&worklist, successor);
1512
0
        }
1513
0
        successor = _ir_skip_empty_blocks(ctx, successor);
1514
0
        edges[edges_count].from = b;
1515
0
        edges[edges_count].to = successor;
1516
0
        edges[edges_count].freq = freq;
1517
0
        edges_count++;
1518
0
      } else if (n == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1519
0
        uint32_t successor1 = *p;
1520
0
        ir_block *successor1_bb = &ctx->cfg_blocks[successor1];
1521
0
        ir_insn *insn1 = &ctx->ir_base[successor1_bb->start];
1522
0
        uint32_t successor2 = *(p + 1);
1523
0
        ir_block *successor2_bb = &ctx->cfg_blocks[successor2];
1524
0
        ir_insn *insn2 = &ctx->ir_base[successor2_bb->start];
1525
0
        int prob1, prob2, probN = 100;
1526
1527
0
        if (insn1->op2) {
1528
0
          prob1 = insn1->op2;
1529
0
          if (insn2->op2) {
1530
0
            prob2 = insn2->op2;
1531
0
            probN = prob1 + prob2;
1532
0
          } else {
1533
0
            if (prob1 > 100) {
1534
0
              prob1 = 100;
1535
0
            }
1536
0
            prob2 = 100 - prob1;
1537
0
          }
1538
1539
0
        } else if (insn2->op2) {
1540
0
          prob2 = insn2->op2;
1541
0
          if (prob2 > 100) {
1542
0
            prob2 = 100;
1543
0
          }
1544
0
          prob1 = 100 - prob2;
1545
0
        } else if (successor1_bb->loop_depth >= loop_depth
1546
0
            && successor2_bb->loop_depth < loop_depth) {
1547
0
          prob1 = 90;
1548
0
          prob2 = 10;
1549
0
        } else if (successor1_bb->loop_depth < loop_depth
1550
0
            && successor2_bb->loop_depth >= loop_depth) {
1551
0
          prob1 = 10;
1552
0
          prob2 = 90;
1553
0
        } else if (successor2_bb->flags & IR_BB_EMPTY) {
1554
0
          prob1 = 51;
1555
0
          prob2 = 49;
1556
0
        } else if (successor1_bb->flags & IR_BB_EMPTY) {
1557
0
          prob1 = 49;
1558
0
          prob2 = 51;
1559
0
        } else {
1560
0
          prob1 = prob2 = 50;
1561
0
        }
1562
0
        do {
1563
0
          freq = bb_freq[b] * (float)prob1 / (float)probN;
1564
0
          if (successor1 > b) {
1565
0
            IR_ASSERT(!ir_bitset_in(visited, successor1));
1566
0
            bb_freq[successor1] += freq;
1567
0
            if (successor1_bb->successors_count == 0 && insn1->op2 == 1) {
1568
              /* move cold block without successors to the end */
1569
0
              IR_ASSERT(chains[successor1].head == successor1);
1570
0
              chains[successor1].head = 0;
1571
0
              *schedule_end = successor1;
1572
0
              schedule_end--;
1573
0
              break;
1574
0
            } else {
1575
0
              ir_bitqueue_add(&worklist, successor1);
1576
0
            }
1577
0
          }
1578
          /* try to join edges early to reduce number of edges and the cost of their sorting */
1579
0
          if (prob1 > prob2
1580
0
           && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1581
0
            uint32_t src = chains[b].next;
1582
0
            IR_ASSERT(chains[src].head == src);
1583
0
            IR_ASSERT(src == ir_chain_head(chains, b));
1584
0
            IR_ASSERT(chains[successor1].head == successor1);
1585
0
            ir_join_chains(chains, src, successor1);
1586
0
            if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1587
0
          }
1588
0
          successor1 = _ir_skip_empty_blocks(ctx, successor1);
1589
0
          IR_ASSERT(edges_count < max_edges_count);
1590
0
          edges[edges_count].from = b;
1591
0
          edges[edges_count].to = successor1;
1592
0
          edges[edges_count].freq = freq;
1593
0
          edges_count++;
1594
0
        } while (0);
        do {
          freq = bb_freq[b] * (float)prob2 / (float)probN;
          if (successor2 > b) {
            IR_ASSERT(!ir_bitset_in(visited, successor2));
            bb_freq[successor2] += freq;
            if (successor2_bb->successors_count == 0 && insn2->op2 == 1) {
              /* move cold block without successors to the end */
              IR_ASSERT(chains[successor2].head == successor2);
              chains[successor2].head = 0;
              *schedule_end = successor2;
              schedule_end--;
              break;
            } else {
              ir_bitqueue_add(&worklist, successor2);
            }
          }
          if (prob2 > prob1
           && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
            uint32_t src = chains[b].next;
            IR_ASSERT(chains[src].head == src);
            IR_ASSERT(src == ir_chain_head(chains, b));
            IR_ASSERT(chains[successor2].head == successor2);
            ir_join_chains(chains, src, successor2);
            if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
          }
          successor2 = _ir_skip_empty_blocks(ctx, successor2);
          IR_ASSERT(edges_count < max_edges_count);
          edges[edges_count].from = b;
          edges[edges_count].to = successor2;
          edges[edges_count].freq = freq;
          edges_count++;
        } while (0);
      } else {
        int prob;

        for (; n > 0; p++, n--) {
          uint32_t successor = *p;
          ir_block *successor_bb = &ctx->cfg_blocks[successor];
          ir_insn *insn = &ctx->ir_base[successor_bb->start];

          if (insn->op == IR_CASE_DEFAULT) {
            prob = insn->op2;
            if (!prob) {
              prob = 100 / bb->successors_count;
            }
          } else if (insn->op == IR_CASE_VAL) {
            prob = insn->op3;
            if (!prob) {
              prob = 100 / bb->successors_count;
            }
          } else if (insn->op == IR_ENTRY) {
            if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
              prob = 99; /* prefer empty ENTRY block to go first */
            } else {
              prob = 1;
            }
          } else {
            prob = 100 / bb->successors_count;
          }
          freq = bb_freq[b] * (float)prob / 100.0f;
          if (successor > b) {
            IR_ASSERT(!ir_bitset_in(visited, successor));
            bb_freq[successor] += freq;
            ir_bitqueue_add(&worklist, successor);
          }
          successor = _ir_skip_empty_blocks(ctx, successor);
          IR_ASSERT(edges_count < max_edges_count);
          edges[edges_count].from = b;
          edges[edges_count].to = successor;
          edges[edges_count].freq = freq;
          edges_count++;
        }
      }
    }
  }
  ir_bitqueue_free(&worklist);
  ir_mem_free(visited);

  /* 2. Sort EDGEs according to their frequencies */
  qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);

#ifdef IR_DEBUG
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
    ir_dump_edges(ctx, edges_count, edges);
  }
#endif
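  /* The comparator is expected to order edges by decreasing frequency, so the
   * hottest edges are considered first in step 3 below. */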
  /* 3. Process EDGEs in the decreasing frequency order and join the connected chains */
  for (e = edges, i = edges_count; i > 0; e++, i--) {
    uint32_t dst = chains[e->to].head;
    if (dst == e->to) {
      uint32_t src = chains[e->from].next;
      if (chains[src].head == src) {
        IR_ASSERT(src == ir_chain_head(chains, e->from) && chains[src].tail == e->from);
        if (src != dst) {
          ir_join_chains(chains, src, dst);
        } else if (ctx->cfg_blocks[e->from].successors_count < 2) {
          /* Try to rotate the loop chain so that it ends with a conditional branch */
          uint32_t tail = e->from;
          uint32_t prev = src;
          uint32_t next = chains[prev].next;
          uint32_t best = 0;

          while (prev != tail) {
            if (ctx->ir_base[ctx->cfg_blocks[prev].end].op == IR_IF) {
              if (ctx->ir_base[ctx->cfg_blocks[prev].start].op == IR_LOOP_BEGIN
               && ctx->cfg_blocks[prev].loop_depth > 1) {
                best = next;
                break;
              } else if (!best || bb_freq[next] < bb_freq[best]) {
                /* Find the successor of IF with the lowest frequency */
                best = next;
              }
            }
            prev = next;
            next = chains[next].next;
          }
          if (best) {
            /* change the head of the chain */
            chains[src].head = best;
            chains[best].head = best;
          }
        }
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
        e->from = 0; /* reset "from" to skip this edge at step #6 */
#endif
      }
    }
  }
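  /* Illustration: with hot edges 1->2 (freq 0.9) and 2->4 (freq 0.81) and a
   * cold edge 1->3 (freq 0.1), step 3 joins {1,2} first, then appends 4 to get
   * the chain 1-2-4, while 3 stays in its own chain; the rotation branch above
   * additionally re-heads a chain that closed a loop on itself so that the
   * loop body ends, rather than begins, with its conditional branch. */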
#if IR_DEBUG_BB_SCHEDULE_GRAPH
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
    ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
  }
#endif

  ir_mem_free(bb_freq);

#ifdef IR_DEBUG
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
    ir_dump_chains(ctx, chains);
  }
#endif

  /* 4. Merge empty entry blocks */
  if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && ctx->entries_count) {
    for (i = 0; i < ctx->entries_count; i++) {
      b = ctx->entries[i];
      IR_ASSERT(ctx->cfg_blocks[b].flags & IR_BB_ENTRY);
      if (b && chains[b].head == b && chains[b].tail == b) {
        bb = &ctx->cfg_blocks[b];
        if (bb->flags & IR_BB_EMPTY) {
          uint32_t successor;

          do {
            IR_ASSERT(bb->successors_count == 1);
            successor = ctx->cfg_edges[bb->successors];
            bb = &ctx->cfg_blocks[successor];
          } while ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY);
          IR_ASSERT(chains[successor].head);
          ir_insert_chain_before(chains, b, successor);
        }
      }
    }

#ifdef IR_DEBUG
    if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
      ir_dump_chains(ctx, chains);
    }
#endif
  }
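  /* Placing an empty ENTRY block immediately before its first non-empty
   * successor presumably lets the back end enter via fall-through instead of
   * emitting an extra jump. */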
  /* 5. Align loop headers */
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
    if (chains[b].head == b) {
      bb = &ctx->cfg_blocks[b];
      if (bb->loop_depth) {
        if ((bb->flags & IR_BB_LOOP_HEADER) || ir_chain_head(chains, bb->loop_header) == b) {
          bb->flags |= IR_BB_ALIGN_LOOP;
        }
      }
    }
  }
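  /* The IR_BB_ALIGN_LOOP flag is a hint for the code emitter: aligning the
   * first block of a hot loop chain typically improves instruction fetch and
   * branch-target behaviour on the hot path. */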
  /* 6. Group chains according to the most frequent edge between them */
  // TODO: Try to find a better heuristic
  for (e = edges, i = edges_count; i > 0; e++, i--) {
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
    if (!e->from) continue;
#endif
    uint32_t src = ir_chain_head(chains, e->from);
    uint32_t dst = ir_chain_head(chains, e->to);
    if (src != dst) {
      if (dst == 1) {
        ir_join_chains(chains, dst, src);
      } else {
        ir_join_chains(chains, src, dst);
      }
    }
  }
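  /* The join direction above keeps the chain that contains block 1 (the
   * function entry) at the front, so the final order always starts with the
   * entry block. */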
#ifdef IR_DEBUG
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
    ir_dump_chains(ctx, chains);
  }
#endif

  /* 7. Form a final BB order */
  count = 0;
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
    if (chains[b].head == b) {
      uint32_t tail = chains[b].tail;
      uint32_t i = b;
      while (1) {
        count++;
        ctx->cfg_schedule[count] = i;
        if (i == tail) break;
        i = chains[i].next;
      }
    }
  }

  IR_ASSERT(ctx->cfg_schedule + count == schedule_end);
  ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0;

  ir_mem_free(edges);
  ir_mem_free(chains);

  return 1;
}
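/* A minimal, self-contained sketch (not part of ir_cfg.c) of the chain
 * representation manipulated above. Each chain is a singly linked list of
 * block ids whose head record also caches the tail, so concatenation is O(1);
 * the names below are hypothetical stand-ins for the real ir_chain helpers. */
#include <stdint.h>

typedef struct { uint32_t head, next, tail; } chain_sketch_t;

/* Append chain 'dst' after chain 'src'; both ids must currently be heads. */
static void join_chains_sketch(chain_sketch_t *chains, uint32_t src, uint32_t dst)
{
  chains[chains[src].tail].next = dst; /* link src's tail to dst's head   */
  chains[src].tail = chains[dst].tail; /* src now ends where dst ended    */
  chains[dst].head = src;              /* dst defers to src as chain head */
}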
/* A variation of the "Top-down Positioning" algorithm described by
 * Karl Pettis and Robert C. Hansen in "Profile Guided Code Positioning"
 */
static int ir_schedule_blocks_top_down(ir_ctx *ctx)
{
  ir_bitqueue blocks;
  uint32_t b, best_successor, last_non_empty;
  ir_block *bb, *best_successor_bb;
  ir_insn *insn;
  uint32_t *list, *schedule_end;
  uint32_t count = 0;

  ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
  blocks.pos = 0;
  list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
  list[ctx->cfg_blocks_count + 1] = 0;
  schedule_end = list + ctx->cfg_blocks_count;
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
    ir_bitset_incl(blocks.set, b);
  }

  while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
    bb = &ctx->cfg_blocks[b];
    /* Start trace */
    last_non_empty = 0;
    do {
      if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
        /* Schedule the previous empty ENTRY block before this one */
        uint32_t predecessor = b - 1;

        ir_bitqueue_del(&blocks, predecessor);
        count++;
        list[count] = predecessor;
      }
      if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
        /* move empty blocks to the end */
        *schedule_end = b;
        schedule_end--;
      } else {
        count++;
        list[count] = b;
        last_non_empty = b;
      }
      best_successor_bb = NULL;
      if (bb->successors_count == 1) {
        best_successor = ctx->cfg_edges[bb->successors];
        if (ir_bitqueue_in(&blocks, best_successor)) {
          best_successor_bb = &ctx->cfg_blocks[best_successor];
        }
      } else if (bb->successors_count > 1) {
        uint32_t prob, best_successor_prob;
        uint32_t *p, successor;
        ir_block *successor_bb;

        for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
          successor = *p;
          if (ir_bitqueue_in(&blocks, successor)) {
            successor_bb = &ctx->cfg_blocks[successor];
            insn = &ctx->ir_base[successor_bb->start];
            if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
              prob = insn->op2;
              if (!prob) {
                prob = 100 / bb->successors_count;
                if (!(successor_bb->flags & IR_BB_EMPTY)) {
                  prob++;
                }
              }
            } else if (insn->op == IR_CASE_DEFAULT) {
              prob = insn->op2;
              if (!prob) {
                prob = 100 / bb->successors_count;
              }
            } else if (insn->op == IR_CASE_VAL) {
              prob = insn->op3;
              if (!prob) {
                prob = 100 / bb->successors_count;
              }
            } else if (insn->op == IR_ENTRY) {
              if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
                prob = 99; /* prefer empty ENTRY block to go first */
              } else {
                prob = 1;
              }
            } else {
              prob = 100 / bb->successors_count;
            }
            if (!best_successor_bb
             || successor_bb->loop_depth > best_successor_bb->loop_depth
             || prob > best_successor_prob) {
              best_successor = successor;
              best_successor_bb = successor_bb;
              best_successor_prob = prob;
            }
          }
        }
      }
      if (!best_successor_bb) {
        /* Try to continue the trace using the other successor of the last IF */
        if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
          bb = &ctx->cfg_blocks[last_non_empty];
          if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
            b = ctx->cfg_edges[bb->successors];

            if (!ir_bitqueue_in(&blocks, b)) {
              b = ctx->cfg_edges[bb->successors + 1];
            }
            if (ir_bitqueue_in(&blocks, b)) {
              bb = &ctx->cfg_blocks[b];
              ir_bitqueue_del(&blocks, b);
              continue;
            }
          }
        }
        /* End trace */
        break;
      }
      b = best_successor;
      bb = best_successor_bb;
      ir_bitqueue_del(&blocks, b);
    } while (1);
  }

  IR_ASSERT(list + count == schedule_end);
  ctx->cfg_schedule = list;
  ir_bitqueue_free(&blocks);

  return 1;
}
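/* Unlike the bottom-up pass above, this top-down variant grows traces greedily
 * from each not-yet-scheduled block, visiting every block and edge once, so it
 * stays roughly linear in the size of the CFG; it trades some layout quality
 * for speed on large or irreducible CFGs (see the dispatch in
 * ir_schedule_blocks below). */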
int ir_schedule_blocks(ir_ctx *ctx)
{
  ir_ref ref;

  if (ctx->cfg_blocks_count <= 2) {
    return 1;
  }

  /* Mark "stop" blocks terminated with UNREACHABLE as "unexpected" */
  ref = ctx->ir_base[1].op1;
  while (ref) {
    ir_insn *insn = &ctx->ir_base[ref];

    if (insn->op == IR_UNREACHABLE && ctx->ir_base[insn->op1].op != IR_TAILCALL) {
      ir_block *bb = &ctx->cfg_blocks[ctx->cfg_map[ref]];
      uint32_t n = bb->predecessors_count;

      if (n == 1) {
        ir_insn *start_insn = &ctx->ir_base[bb->start];
        if (start_insn->op == IR_IF_TRUE
         || start_insn->op == IR_IF_FALSE
         || start_insn->op == IR_CASE_DEFAULT) {
          if (!start_insn->op2) start_insn->op2 = 1;
        } else if (start_insn->op == IR_CASE_VAL) {
          if (!start_insn->op3) start_insn->op3 = 1;
        }
      } else if (n > 1) {
        uint32_t *p = &ctx->cfg_edges[bb->predecessors];

        for (; n > 0; p++, n--) {
          bb = &ctx->cfg_blocks[*p];
          if (bb->predecessors_count == 1) {
            ir_insn *start_insn = &ctx->ir_base[bb->start];
            if (start_insn->op == IR_IF_TRUE
             || start_insn->op == IR_IF_FALSE
             || start_insn->op == IR_CASE_DEFAULT) {
              if (!start_insn->op2) start_insn->op2 = 1;
            } else if (start_insn->op == IR_CASE_VAL) {
              if (!start_insn->op3) start_insn->op3 = 1;
            }
          }
        }
      }
    }
    ref = insn->op3;
  }

  /* The bottom-up Pettis-Hansen algorithm is expensive - O(n^3),
   * use it only for relatively small functions.
   *
   * TODO: make the choice between top-down and bottom-up algorithm configurable
   */
  if (UNEXPECTED(ctx->flags2 & IR_IRREDUCIBLE_CFG) || ctx->cfg_blocks_count > 256) {
    return ir_schedule_blocks_top_down(ctx);
  } else {
    return ir_schedule_blocks_bottom_up(ctx);
  }
}
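/* Dispatch example: a 300-block function, or any function whose CFG was
 * flagged IR_IRREDUCIBLE_CFG, takes the cheap top-down path; a small reducible
 * function gets the more expensive bottom-up Pettis-Hansen layout. */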
/* JMP target optimisation */
uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
{
  return _ir_skip_empty_blocks(ctx, b);
}

uint32_t ir_next_block(const ir_ctx *ctx, uint32_t b)
{
  ir_block *bb;

  if (ctx->cfg_schedule) {
    uint32_t ret = ctx->cfg_schedule[++b];

    /* Check for empty ENTRY block */
    while (ret && ((ctx->cfg_blocks[ret].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
      ret = ctx->cfg_schedule[++b];
    }
    return ret;
  }

  b++;
  while (1) {
    if (b > ctx->cfg_blocks_count) {
      return 0;
    }

    bb = &ctx->cfg_blocks[b];

    if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
      b++;
    } else {
      break;
    }
  }
  return b;
}
void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block)
{
  ir_block *bb;
  uint32_t *p, use_block;

  *true_block = 0;
  *false_block = 0;
  bb = &ctx->cfg_blocks[b];
  IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
  IR_ASSERT(bb->successors_count == 2);
  p = &ctx->cfg_edges[bb->successors];
  use_block = *p;
  if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
    *true_block = ir_skip_empty_target_blocks(ctx, use_block);
    use_block = *(p+1);
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
    *false_block = ir_skip_empty_target_blocks(ctx, use_block);
  } else {
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
    *false_block = ir_skip_empty_target_blocks(ctx, use_block);
    use_block = *(p+1);
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
    *true_block = ir_skip_empty_target_blocks(ctx, use_block);
  }
  IR_ASSERT(*true_block && *false_block);
}
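/* Because both targets go through ir_skip_empty_target_blocks(), the returned
 * blocks are the ones that will actually hold code, so a back end emitting the
 * IF can branch straight past empty IF_TRUE/IF_FALSE stubs. */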