Coverage Report

Created: 2026-04-01 06:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/php-src/ext/opcache/jit/ir/ir_cfg.c
Line
Count
Source
1
/*
2
 * IR - Lightweight JIT Compilation Framework
3
 * (CFG - Control Flow Graph)
4
 * Copyright (C) 2022 Zend by Perforce.
5
 * Authors: Dmitry Stogov <dmitry@php.net>
6
 */
7
8
#ifndef _GNU_SOURCE
9
# define _GNU_SOURCE
10
#endif
11
12
#include "ir.h"
13
#include "ir_private.h"
14
15
static int ir_remove_unreachable_blocks(ir_ctx *ctx);
16
17
IR_ALWAYS_INLINE void _ir_add_successors(const ir_ctx *ctx, ir_ref ref, ir_worklist *worklist)
18
0
{
19
0
  ir_use_list *use_list = &ctx->use_lists[ref];
20
0
  ir_ref *p, use, n = use_list->count;
21
22
0
  if (n < 2) {
23
0
    if (n == 1) {
24
0
      use = ctx->use_edges[use_list->refs];
25
0
      IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
26
0
      ir_worklist_push(worklist, use);
27
0
    }
28
0
  } else {
29
0
    p = &ctx->use_edges[use_list->refs];
30
0
    if (n == 2) {
31
0
      use = *p;
32
0
      IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
33
0
      ir_worklist_push(worklist, use);
34
0
      use = *(p + 1);
35
0
      IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
36
0
      ir_worklist_push(worklist, use);
37
0
    } else {
38
0
      for (; n > 0; p++, n--) {
39
0
        use = *p;
40
0
        IR_ASSERT(ir_op_flags[ctx->ir_base[use].op] & IR_OP_FLAG_CONTROL);
41
0
        ir_worklist_push(worklist, use);
42
0
      }
43
0
    }
44
0
  }
45
0
}
46
47
IR_ALWAYS_INLINE void _ir_add_predecessors(const ir_insn *insn, ir_worklist *worklist)
48
0
{
49
0
  ir_ref n, ref;
50
0
  const ir_ref *p;
51
52
0
  if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
53
0
    n = insn->inputs_count;
54
0
    for (p = insn->ops + 1; n > 0; p++, n--) {
55
0
      ref = *p;
56
0
      IR_ASSERT(ref);
57
0
      ir_worklist_push(worklist, ref);
58
0
    }
59
0
  } else if (insn->op != IR_START) {
60
0
    if (EXPECTED(insn->op1)) {
61
0
      ir_worklist_push(worklist, insn->op1);
62
0
    }
63
0
  }
64
0
}
65
66
void ir_reset_cfg(ir_ctx *ctx)
67
0
{
68
0
  ctx->cfg_blocks_count = 0;
69
0
  ctx->cfg_edges_count = 0;
70
0
  if (ctx->cfg_blocks) {
71
0
    ir_mem_free(ctx->cfg_blocks);
72
0
    ctx->cfg_blocks = NULL;
73
0
    if (ctx->cfg_edges) {
74
0
      ir_mem_free(ctx->cfg_edges);
75
0
      ctx->cfg_edges = NULL;
76
0
    }
77
0
    if (ctx->cfg_map) {
78
0
      ir_mem_free(ctx->cfg_map);
79
0
      ctx->cfg_map = NULL;
80
0
    }
81
0
  }
82
0
}
83
84
static void ir_remove_phis_inputs(ir_ctx *ctx, ir_use_list *use_list, int new_inputs_count, ir_bitset life_inputs)
85
0
{
86
0
  ir_ref i, j, n, k, *p, *q, use;
87
0
  ir_insn *use_insn;
88
89
0
  if (new_inputs_count == 1) {
90
0
    for (k = use_list->count, p = q = &ctx->use_edges[use_list->refs]; k > 0; p++, k--) {
91
0
      use = *p;
92
0
      use_insn = &ctx->ir_base[use];
93
0
      if (use_insn->op == IR_PHI) {
94
        /* Convert PHI to COPY */
95
0
        n = use_insn->inputs_count;
96
0
        i = 2;
97
0
        for (j = 2; j <= n; j++) {
98
0
          ir_ref input = ir_insn_op(use_insn, j);
99
100
0
          if (ir_bitset_in(life_inputs, j - 1)) {
101
0
            use_insn->op1 = ir_insn_op(use_insn, j);
102
0
          } else if (input > 0) {
103
0
            ir_use_list_remove_one(ctx, input, use);
104
0
          }
105
0
        }
106
0
        use_insn->op = IR_COPY;
107
0
        use_insn->inputs_count = 1;
108
0
        for (j = 2; j <= n; j++) {
109
0
          ir_insn_set_op(use_insn, j, IR_UNUSED);
110
0
        }
111
0
        continue;
112
0
      }
113
114
      /* compact use list */
115
0
      if (p != q){
116
0
        *q = use;
117
0
      }
118
0
      q++;
119
0
    }
120
121
0
    if (p != q) {
122
0
      use_list->count -= (p - q);
123
0
      do {
124
0
        *q = IR_UNUSED; /* clean up the removed tail */
125
0
        q++;
126
0
      } while (p != q);
127
0
    }
128
0
  } else {
129
0
    for (k = use_list->count, p = &ctx->use_edges[use_list->refs]; k > 0; p++, k--) {
130
0
      use = *p;
131
0
      use_insn = &ctx->ir_base[use];
132
0
      if (use_insn->op == IR_PHI) {
133
0
        n = use_insn->inputs_count;
134
0
        i = 2;
135
0
        for (j = 2; j <= n; j++) {
136
0
          ir_ref input = ir_insn_op(use_insn, j);
137
138
0
          if (ir_bitset_in(life_inputs, j - 1)) {
139
0
            IR_ASSERT(input);
140
0
            if (i != j) {
141
0
              ir_insn_set_op(use_insn, i, input);
142
0
            }
143
0
            i++;
144
0
          } else if (input > 0) {
145
0
            ir_use_list_remove_one(ctx, input, use);
146
0
          }
147
0
        }
148
0
        use_insn->inputs_count = i - 1;
149
0
        for (j = i; j <= n; j++) {
150
0
          ir_insn_set_op(use_insn, j, IR_UNUSED);
151
0
        }
152
0
      }
153
0
    }
154
0
  }
155
0
}
156
157
static uint32_t IR_NEVER_INLINE ir_cfg_remove_dead_inputs(ir_ctx *ctx, uint32_t *_blocks, ir_block *blocks, uint32_t bb_count)
158
0
{
159
0
  uint32_t b, count = 0;
160
0
  ir_block *bb = blocks + 1;
161
0
  ir_insn *insn;
162
0
  ir_ref i, j, n, *ops, input;
163
0
  ir_bitset life_inputs = NULL;
164
165
0
  for (b = 1; b <= bb_count; b++, bb++) {
166
0
    bb->successors = count;
167
0
    count += ctx->use_lists[bb->end].count;
168
0
    bb->successors_count = 0;
169
0
    bb->predecessors = count;
170
0
    insn = &ctx->ir_base[bb->start];
171
0
    if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
172
0
      n = insn->inputs_count;
173
0
      ops = insn->ops;
174
0
      for (i = 1, j = 1; i <= n; i++) {
175
0
        input = ops[i];
176
0
        if (_blocks[input]) {
177
0
          if (life_inputs) {
178
0
            ir_bitset_incl(life_inputs, i);
179
0
          }
180
0
          if (i != j) {
181
0
            ops[j] = ops[i];
182
0
          }
183
0
          j++;
184
0
        } else {
185
0
          if (ctx->use_lists[bb->start].count > 1) {
186
            /* Some inputs of this MERGE are deleted and we have to update the depended PHIs */
187
0
            if (!life_inputs) {
188
0
              int k;
189
0
              life_inputs = ir_bitset_malloc(n + 1);
190
0
              for (k = 1; k < i; k++) {
191
0
                ir_bitset_incl(life_inputs, k);
192
0
              }
193
0
            }
194
0
          }
195
0
          IR_ASSERT(input > 0);
196
0
          IR_ASSERT(ctx->use_lists[input].count == 1 && ctx->use_edges[ctx->use_lists[input].refs] == bb->start);
197
0
          CLEAR_USES(input);
198
0
        }
199
0
      }
200
0
      j--;
201
0
      if (j != n) {
202
0
        if (j == 1) {
203
0
          insn->op = IR_BEGIN;
204
0
        }
205
0
        insn->inputs_count = j;
206
0
        bb->predecessors_count = j;
207
0
        j++;
208
0
        for (;j <= n; j++) {
209
0
          ops[j] = IR_UNUSED;
210
0
        }
211
0
        if (life_inputs) {
212
0
          ir_remove_phis_inputs(ctx, &ctx->use_lists[bb->start], insn->inputs_count, life_inputs);
213
0
          ir_mem_free(life_inputs);
214
0
          life_inputs = NULL;
215
0
        }
216
0
      }
217
0
    }
218
0
    count += bb->predecessors_count;
219
0
  }
220
0
  return count;
221
0
}
222
223
int ir_build_cfg(ir_ctx *ctx)
224
0
{
225
0
  ir_ref n, *p, ref, start, end;
226
0
  uint32_t b;
227
0
  ir_insn *insn;
228
0
  ir_worklist worklist;
229
0
  uint32_t bb_init_falgs;
230
0
  uint32_t count, bb_count = 0;
231
0
  uint32_t edges_count = 0;
232
0
  ir_block *blocks, *bb;
233
0
  uint32_t *_blocks, *edges;
234
0
  ir_use_list *use_list;
235
0
  uint32_t len = ir_bitset_len(ctx->insns_count);
236
0
  ir_bitset bb_starts = ir_mem_calloc(len * 2, IR_BITSET_BITS / 8);
237
0
  ir_bitset bb_leaks = bb_starts + len;
238
0
  _blocks = ir_mem_calloc(ctx->insns_limit, sizeof(uint32_t));
239
0
  ir_worklist_init(&worklist, ctx->insns_count);
240
241
  /* First try to perform backward DFS search starting from "stop" nodes */
242
243
  /* Add all "stop" nodes */
244
0
  ref = ctx->ir_base[1].op1;
245
0
  while (ref) {
246
0
    ir_worklist_push(&worklist, ref);
247
0
    ref = ctx->ir_base[ref].op3;
248
0
  }
249
250
0
  while (ir_worklist_len(&worklist)) {
251
0
    ref = ir_worklist_pop(&worklist);
252
0
    insn = &ctx->ir_base[ref];
253
254
0
    if (insn->op == IR_NOP) {
255
0
      continue;
256
0
    }
257
0
    IR_ASSERT(IR_IS_BB_END(insn->op));
258
    /* Remember BB end */
259
0
    end = ref;
260
    /* Some successors of IF and SWITCH nodes may be inaccessible by backward DFS */
261
0
    use_list = &ctx->use_lists[end];
262
0
    n = use_list->count;
263
0
    if (n > 1 || (n == 1 && (ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR) != 0)) {
264
0
      for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
265
        /* Remember possible inaccessible successors */
266
0
        ir_bitset_incl(bb_leaks, *p);
267
0
      }
268
0
    }
269
    /* Skip control nodes until BB start */
270
0
    ref = insn->op1;
271
0
    while (1) {
272
0
      insn = &ctx->ir_base[ref];
273
0
      if (IR_IS_BB_START(insn->op)) {
274
0
        break;
275
0
      }
276
0
      ref = insn->op1; // follow connected control blocks until BB start
277
0
    }
278
    /* Mark BB Start */
279
0
    bb_count++;
280
0
    _blocks[ref] = end;
281
0
    ir_bitset_incl(bb_starts, ref);
282
    /* Add predecessors */
283
0
    _ir_add_predecessors(insn, &worklist);
284
0
  }
285
286
  /* The backward DFS may miss some branches that end in infinite loops. */
287
  /* Try forward DFS. (In most cases all nodes are already processed.) */
288
289
  /* START node may be inaccessible from "stop" nodes */
290
0
  ir_bitset_incl(bb_leaks, 1);
291
292
  /* Add not processed START and successor of IF and SWITCH */
293
0
  IR_BITSET_FOREACH_DIFFERENCE(bb_leaks, bb_starts, len, start) {
294
0
    ir_worklist_push(&worklist, start);
295
0
  } IR_BITSET_FOREACH_END();
296
297
0
  if (ir_worklist_len(&worklist)) {
298
0
    ir_bitset_union(worklist.visited, bb_starts, len);
299
0
    do {
300
0
      ref = ir_worklist_pop(&worklist);
301
0
      insn = &ctx->ir_base[ref];
302
303
0
      if (insn->op == IR_NOP) {
304
0
        continue;
305
0
      }
306
0
      IR_ASSERT(IR_IS_BB_START(insn->op));
307
      /* Remember BB start */
308
0
      start = ref;
309
      /* Skip control nodes until BB end */
310
0
      while (1) {
311
0
        ref = ir_next_control(ctx, ref);
312
0
        insn = &ctx->ir_base[ref];
313
0
        if (IR_IS_BB_END(insn->op)) {
314
0
          break;
315
0
        }
316
0
      }
317
      /* Mark BB Start */
318
0
      bb_count++;
319
0
      _blocks[start] = ref;
320
0
      ir_bitset_incl(bb_starts, start);
321
      /* Add successors */
322
0
      _ir_add_successors(ctx, ref, &worklist);
323
0
    } while (ir_worklist_len(&worklist));
324
0
  }
325
326
0
  IR_ASSERT(bb_count > 0);
327
328
  /* Create array of basic blocks and count successor/predecessors edges for each BB */
329
0
  blocks = ir_mem_malloc((bb_count + 1) * sizeof(ir_block));
330
0
  b = 1;
331
0
  bb = blocks + 1;
332
0
  count = 0;
333
  /* SCCP already removed UNREACHABLE blocks, otherwise all blocks are marked as UNREACHABLE first */
334
0
  bb_init_falgs = (ctx->flags2 & IR_CFG_REACHABLE) ? 0 : IR_BB_UNREACHABLE;
335
0
  IR_BITSET_FOREACH(bb_starts, len, start) {
336
0
    insn = &ctx->ir_base[start];
337
0
    if (insn->op == IR_NOP) {
338
0
      _blocks[start] = 0;
339
0
      continue;
340
0
    }
341
0
    end = _blocks[start];
342
0
    _blocks[start] = b;
343
0
    _blocks[end] = b;
344
0
    IR_ASSERT(IR_IS_BB_START(insn->op));
345
0
    bb->start = start;
346
0
    bb->end = end;
347
0
    bb->successors = count;
348
0
    count += ctx->use_lists[end].count;
349
0
    bb->successors_count = 0;
350
0
    bb->predecessors = count;
351
0
    bb->dom_parent = 0;
352
0
    bb->dom_depth = 0;
353
0
    bb->dom_child = 0;
354
0
    bb->dom_next_child = 0;
355
0
    bb->loop_header = 0;
356
0
    bb->loop_depth = 0;
357
0
    if (insn->op == IR_START) {
358
0
      bb->flags = IR_BB_START;
359
0
      bb->predecessors_count = 0;
360
0
    } else {
361
0
      bb->flags = bb_init_falgs;
362
0
      if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) {
363
0
        n = insn->inputs_count;
364
0
        bb->predecessors_count = n;
365
0
        edges_count += n;
366
0
        count += n;
367
0
      } else if (EXPECTED(insn->op1)) {
368
0
        if (insn->op == IR_ENTRY) {
369
0
          bb->flags |= IR_BB_ENTRY;
370
0
          ctx->entries_count++;
371
0
        }
372
0
        bb->predecessors_count = 1;
373
0
        edges_count++;
374
0
        count++;
375
0
      } else {
376
0
        IR_ASSERT(insn->op == IR_BEGIN); /* start of unreachable block */
377
0
        bb->predecessors_count = 0;
378
0
      }
379
0
    }
380
0
    b++;
381
0
    bb++;
382
0
  } IR_BITSET_FOREACH_END();
383
0
  bb_count = b - 1;
384
0
  if (UNEXPECTED(count != edges_count * 2)) {
385
0
    count = ir_cfg_remove_dead_inputs(ctx, _blocks, blocks, bb_count);
386
0
    IR_ASSERT(count != edges_count * 2);
387
0
  }
388
0
  ir_mem_free(bb_starts);
389
390
  /* Create an array of successor/predecessors control edges */
391
0
  edges = ir_mem_malloc(edges_count * 2 * sizeof(uint32_t));
392
0
  bb = blocks + 1;
393
0
  for (b = 1; b <= bb_count; b++, bb++) {
394
0
    insn = &ctx->ir_base[bb->start];
395
0
    if (bb->predecessors_count > 1) {
396
0
      uint32_t *q = edges + bb->predecessors;
397
0
      n = insn->inputs_count;
398
0
      for (p = insn->ops + 1; n > 0; p++, q++, n--) {
399
0
        ref = *p;
400
0
        IR_ASSERT(ref);
401
0
        ir_ref pred_b = _blocks[ref];
402
0
        ir_block *pred_bb = &blocks[pred_b];
403
0
        IR_ASSERT(pred_b > 0);
404
0
        *q = pred_b;
405
0
        edges[pred_bb->successors + pred_bb->successors_count++] = b;
406
0
      }
407
0
    } else if (bb->predecessors_count == 1) {
408
0
      ref = insn->op1;
409
0
      IR_ASSERT(ref);
410
0
      IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
411
0
      ir_ref pred_b = _blocks[ref];
412
0
      ir_block *pred_bb = &blocks[pred_b];
413
0
      edges[bb->predecessors] = pred_b;
414
0
      edges[pred_bb->successors + pred_bb->successors_count++] = b;
415
0
    }
416
0
  }
417
418
0
  ctx->cfg_blocks_count = bb_count;
419
0
  ctx->cfg_edges_count = edges_count * 2;
420
0
  ctx->cfg_blocks = blocks;
421
0
  ctx->cfg_edges = edges;
422
0
  ctx->cfg_map = _blocks;
423
424
0
  if (!(ctx->flags2 & IR_CFG_REACHABLE)) {
425
0
    uint32_t reachable_count = 0;
426
427
    /* Mark reachable blocks */
428
0
    ir_worklist_clear(&worklist);
429
0
    ir_worklist_push(&worklist, 1);
430
0
    while (ir_worklist_len(&worklist) != 0) {
431
0
      uint32_t *p;
432
433
0
      reachable_count++;
434
0
      b = ir_worklist_pop(&worklist);
435
0
      bb = &blocks[b];
436
0
      bb->flags &= ~IR_BB_UNREACHABLE;
437
0
      n = bb->successors_count;
438
0
      if (n > 1) {
439
0
        for (p = edges + bb->successors; n > 0; p++, n--) {
440
0
          ir_worklist_push(&worklist, *p);
441
0
        }
442
0
      } else if (n == 1) {
443
0
        ir_worklist_push(&worklist, edges[bb->successors]);
444
0
      }
445
0
    }
446
0
    if (reachable_count != ctx->cfg_blocks_count) {
447
0
      ir_remove_unreachable_blocks(ctx);
448
0
    }
449
0
  }
450
451
0
  ir_worklist_free(&worklist);
452
453
0
  return 1;
454
0
}
455
456
static void ir_remove_predecessor(ir_ctx *ctx, ir_block *bb, uint32_t from)
457
0
{
458
0
  uint32_t i, *p, *q, n = 0;
459
460
0
  p = q = &ctx->cfg_edges[bb->predecessors];
461
0
  for (i = 0; i < bb->predecessors_count; i++, p++) {
462
0
    if (*p != from) {
463
0
      if (p != q) {
464
0
        *q = *p;
465
0
      }
466
0
      q++;
467
0
      n++;
468
0
    }
469
0
  }
470
0
  IR_ASSERT(n != bb->predecessors_count);
471
0
  bb->predecessors_count = n;
472
0
}
473
474
static void ir_remove_merge_input(ir_ctx *ctx, ir_ref merge, ir_ref from)
475
0
{
476
0
  ir_ref i, j, n;
477
0
  ir_use_list *use_list;
478
0
  ir_bitset life_inputs;
479
0
  ir_insn *insn = &ctx->ir_base[merge];
480
481
0
  IR_ASSERT(insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN);
482
0
  n = insn->inputs_count;
483
0
  i = 1;
484
0
  life_inputs = ir_bitset_malloc(n + 1);
485
0
  for (j = 1; j <= n; j++) {
486
0
    ir_ref input = ir_insn_op(insn, j);
487
488
0
    if (input != from) {
489
0
      if (i != j) {
490
0
        ir_insn_set_op(insn, i, input);
491
0
      }
492
0
      ir_bitset_incl(life_inputs, j);
493
0
      i++;
494
0
    }
495
0
  }
496
0
  i--;
497
0
  for (j = i + 1; j <= n; j++) {
498
0
    ir_insn_set_op(insn, j, IR_UNUSED);
499
0
  }
500
0
  if (i == 1) {
501
0
    insn->op = IR_BEGIN;
502
0
  }
503
0
  insn->inputs_count = i;
504
505
0
  use_list = &ctx->use_lists[merge];
506
0
  if (use_list->count > 1) {
507
0
    ir_remove_phis_inputs(ctx, use_list, i, life_inputs);
508
0
  }
509
510
0
  ir_mem_free(life_inputs);
511
0
  IR_ASSERT(ctx->use_lists[from].count == 1 && ctx->use_edges[ctx->use_lists[from].refs] == merge);
512
0
  CLEAR_USES(from);
513
0
}
514
515
/* CFG constructed after SCCP pass doesn't have unreachable BBs, otherwise they should be removed */
516
static int ir_remove_unreachable_blocks(ir_ctx *ctx)
517
0
{
518
0
  uint32_t b, *p, i;
519
0
  uint32_t unreachable_count = 0;
520
0
  uint32_t bb_count = ctx->cfg_blocks_count;
521
0
  ir_block *bb = ctx->cfg_blocks + 1;
522
523
0
  for (b = 1; b <= bb_count; b++, bb++) {
524
0
    if (bb->flags & IR_BB_UNREACHABLE) {
525
#if 0
526
      do {if (!unreachable_count) ir_dump_cfg(ctx, stderr);} while(0);
527
#endif
528
0
      if (bb->successors_count) {
529
0
        for (i = 0, p = &ctx->cfg_edges[bb->successors]; i < bb->successors_count; i++, p++) {
530
0
          ir_block *succ_bb = &ctx->cfg_blocks[*p];
531
532
0
          if (!(succ_bb->flags & IR_BB_UNREACHABLE)) {
533
0
            ir_remove_predecessor(ctx, succ_bb, b);
534
0
            ir_remove_merge_input(ctx, succ_bb->start, bb->end);
535
0
          }
536
0
        }
537
0
      } else {
538
0
        ir_ref prev, ref = bb->end;
539
0
        ir_insn *insn = &ctx->ir_base[ref];
540
541
0
        IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_TERMINATOR);
542
        /* remove from terminators list */
543
0
        prev = ctx->ir_base[1].op1;
544
0
        if (prev == ref) {
545
0
          ctx->ir_base[1].op1 = insn->op3;
546
0
        } else {
547
0
          while (prev) {
548
0
            if (ctx->ir_base[prev].op3 == ref) {
549
0
              ctx->ir_base[prev].op3 = insn->op3;
550
0
              break;
551
0
            }
552
0
            prev = ctx->ir_base[prev].op3;
553
0
          }
554
0
        }
555
0
      }
556
0
      ctx->cfg_map[bb->start] = 0;
557
0
      ctx->cfg_map[bb->end] = 0;
558
0
      unreachable_count++;
559
0
    }
560
0
  }
561
562
0
  if (unreachable_count) {
563
0
    ir_block *dst_bb;
564
0
    uint32_t n = 1;
565
0
    uint32_t *edges;
566
567
0
    dst_bb = bb = ctx->cfg_blocks + 1;
568
0
    for (b = 1; b <= bb_count; b++, bb++) {
569
0
      if (!(bb->flags & IR_BB_UNREACHABLE)) {
570
0
        if (dst_bb != bb) {
571
0
          memcpy(dst_bb, bb, sizeof(ir_block));
572
0
          ctx->cfg_map[dst_bb->start] = n;
573
0
          ctx->cfg_map[dst_bb->end] = n;
574
0
        }
575
0
        dst_bb->successors_count = 0;
576
0
        dst_bb++;
577
0
        n++;
578
0
      }
579
0
    }
580
0
    ctx->cfg_blocks_count = bb_count = n - 1;
581
582
    /* Rebuild successor/predecessors control edges */
583
0
    edges = ctx->cfg_edges;
584
0
    bb = ctx->cfg_blocks + 1;
585
0
    for (b = 1; b <= bb_count; b++, bb++) {
586
0
      ir_insn *insn = &ctx->ir_base[bb->start];
587
0
      ir_ref *p, ref;
588
589
0
      n = bb->predecessors_count;
590
0
      if (n > 1) {
591
0
        uint32_t *q = edges + bb->predecessors;
592
593
0
        IR_ASSERT(n == insn->inputs_count);
594
0
        for (p = insn->ops + 1; n > 0; p++, q++, n--) {
595
0
          ref = *p;
596
0
          IR_ASSERT(ref);
597
0
          ir_ref pred_b = ctx->cfg_map[ref];
598
0
          ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
599
0
          *q = pred_b;
600
0
          edges[pred_bb->successors + pred_bb->successors_count++] = b;
601
0
        }
602
0
      } else if (n == 1) {
603
0
        ref = insn->op1;
604
0
        IR_ASSERT(ref);
605
0
        IR_ASSERT(IR_OPND_KIND(ir_op_flags[insn->op], 1) == IR_OPND_CONTROL);
606
0
        ir_ref pred_b = ctx->cfg_map[ref];
607
0
        ir_block *pred_bb = &ctx->cfg_blocks[pred_b];
608
0
        edges[bb->predecessors] = pred_b;
609
0
        edges[pred_bb->successors + pred_bb->successors_count++] = b;
610
0
      }
611
0
    }
612
0
  }
613
614
0
  return 1;
615
0
}
616
617
/* Computes dominator tree using algorithm from "A Simple, Fast Dominance Algorithm" by
618
 * Cooper, Harvey and Kennedy. */
619
static IR_NEVER_INLINE int ir_build_dominators_tree_slow(ir_ctx *ctx)
620
0
{
621
0
  uint32_t blocks_count, b, postnum, i;
622
0
  ir_block *blocks, *bb;
623
0
  uint32_t *edges;
624
0
  uint32_t *rpo = ir_mem_malloc((ctx->cfg_blocks_count + 1) * sizeof(uint32_t));
625
0
  bool changed;
626
627
0
  blocks = ctx->cfg_blocks;
628
0
  edges  = ctx->cfg_edges;
629
0
  blocks_count = ctx->cfg_blocks_count;
630
631
0
  ctx->flags2 &= ~IR_NO_LOOPS;
632
633
0
  postnum = 1;
634
0
  ir_worklist work;
635
0
  ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
636
0
  ir_worklist_push(&work, 1);
637
0
  IR_ASSERT(blocks[1].next_succ == 0);
638
0
  while (ir_worklist_len(&work)) {
639
0
next:
640
0
    b = ir_worklist_peek(&work);
641
0
    bb = &blocks[b];
642
0
    uint32_t n = bb->successors_count - bb->next_succ;
643
0
    if (n) {
644
0
      uint32_t *p = edges + bb->successors + bb->next_succ;
645
0
      for (; n > 0; p++, n--) {
646
0
        uint32_t succ = *p;
647
0
        if (ir_worklist_push(&work, succ)) {
648
0
          bb->next_succ = bb->successors_count - n + 1;
649
0
          IR_ASSERT(blocks[succ].next_succ == 0);
650
0
          goto next;
651
0
        }
652
0
      }
653
0
    }
654
655
    /* Start from bb->idom calculated by the fast dominators algorithm */
656
    // bb->idom = 0;
657
0
    bb->next_succ = 0;
658
0
    rpo[postnum] = b;
659
0
    bb->postnum = postnum++;
660
0
    ir_worklist_pop(&work);
661
0
  }
662
0
  ir_worklist_free(&work);
663
664
0
  IR_ASSERT(rpo[blocks_count] == 1);
665
666
  /* Find immediate dominators by iterative fixed-point algorithm */
667
0
  blocks[1].idom = 1;
668
0
  do {
669
0
    changed = 0;
670
671
    /* Iterating in Reverse Post Order */
672
0
    for (i = blocks_count - 1; i > 0; i--) {
673
0
      b = rpo[i];
674
0
      bb = &blocks[b];
675
0
      IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
676
0
      IR_ASSERT(bb->predecessors_count > 0);
677
0
      if (bb->predecessors_count == 1) {
678
0
        uint32_t pred_b = edges[bb->predecessors];
679
680
0
        if (blocks[pred_b].idom > 0 && bb->idom != pred_b) {
681
0
          bb->idom = pred_b;
682
0
          changed = 1;
683
0
        }
684
0
      } else if (bb->predecessors_count) {
685
0
        uint32_t idom = 0;
686
0
        uint32_t k = bb->predecessors_count;
687
0
        uint32_t *p = edges + bb->predecessors;
688
689
0
        do {
690
0
          uint32_t pred_b = *p;
691
0
          ir_block *pred_bb = &blocks[pred_b];
692
0
          ir_block *idom_bb;
693
694
0
          if (pred_bb->idom > 0) {
695
0
            idom = pred_b;
696
0
            idom_bb = &blocks[idom];
697
698
0
            while (--k > 0) {
699
0
              pred_b = *(++p);
700
0
              pred_bb = &blocks[pred_b];
701
0
              if (pred_bb->idom > 0) {
702
0
                while (idom != pred_b) {
703
0
                  while (pred_bb->postnum < idom_bb->postnum) {
704
0
                    pred_b = pred_bb->idom;
705
0
                    pred_bb = &blocks[pred_b];
706
0
                  }
707
0
                  while (idom_bb->postnum < pred_bb->postnum) {
708
0
                    idom = idom_bb->idom;
709
0
                    idom_bb = &blocks[idom];
710
0
                  }
711
0
                }
712
0
              }
713
0
            }
714
715
0
            if (bb->idom != idom) {
716
0
              bb->idom = idom;
717
0
              changed = 1;
718
0
            }
719
0
            break;
720
0
          }
721
0
          p++;
722
0
        } while (--k > 0);
723
0
      }
724
0
    }
725
0
  } while (changed);
726
727
0
  ir_mem_free(rpo);
728
729
  /* Build dominators tree */
730
0
  blocks[1].idom = 0;
731
0
  blocks[1].dom_depth = 0;
732
733
  /* Construct children lists sorted by block number */
734
0
  for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
735
0
    ir_block *idom_bb = &blocks[bb->idom];
736
0
    bb->dom_depth = 0;
737
0
    bb->dom_next_child = idom_bb->dom_child;
738
0
    idom_bb->dom_child = b;
739
0
  }
740
741
  /* Recalculate dom_depth for all blocks */
742
0
  for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
743
0
    uint32_t idom = bb->idom;
744
0
    uint32_t dom_depth = 0;
745
0
    while (idom) {
746
0
      dom_depth++;
747
0
      if (blocks[idom].dom_depth > 0) {
748
0
        dom_depth += blocks[idom].dom_depth;
749
0
        break;
750
0
      } else {
751
0
        idom = blocks[idom].idom;
752
0
      }
753
0
    }
754
0
    bb->dom_depth = dom_depth;
755
0
  }
756
757
0
  return 1;
758
0
}
759
760
/* A single pass modification of "A Simple, Fast Dominance Algorithm" by
761
 * Cooper, Harvey and Kennedy, that relies on IR block ordering.
762
 * It may fallback to the general slow fixed-point algorithm.  */
763
static int ir_build_dominators_tree_iterative(ir_ctx *ctx);
764
int ir_build_dominators_tree(ir_ctx *ctx)
765
0
{
766
0
  uint32_t blocks_count, b;
767
0
  ir_block *blocks, *bb;
768
0
  uint32_t *edges;
769
0
  ir_list worklist;
770
771
0
  ir_list_init(&worklist, ctx->cfg_blocks_count / 2);
772
773
0
  ctx->flags2 |= IR_NO_LOOPS;
774
775
  /* Find immediate dominators */
776
0
  blocks = ctx->cfg_blocks;
777
0
  edges  = ctx->cfg_edges;
778
0
  blocks_count = ctx->cfg_blocks_count;
779
0
  blocks[1].idom = 1;
780
0
  blocks[1].dom_depth = 0;
781
782
  /* Iterating in Reverse Post Order (rely on existing BB order and fall back to slow algorithm) */
783
0
  for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
784
0
    IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
785
0
    IR_ASSERT(bb->predecessors_count > 0);
786
0
    uint32_t k = bb->predecessors_count;
787
0
    uint32_t *p = edges + bb->predecessors;
788
0
    uint32_t idom = *p;
789
0
    ir_block *idom_bb;
790
791
0
    if (UNEXPECTED(idom >= b)) {
792
      /* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. */
793
0
      ctx->flags2 &= ~IR_NO_LOOPS;
794
0
      if (UNEXPECTED(k <= 1)) {
795
        // IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor");
796
0
slow_case:
797
0
        ir_list_free(&worklist);
798
0
        return ir_build_dominators_tree_slow(ctx);
799
0
      }
800
0
      ir_list_push(&worklist, idom);
801
0
      while (1) {
802
0
        k--;
803
0
        p++;
804
0
        idom = *p;
805
0
        if (idom < b) {
806
0
          break;
807
0
        }
808
0
        if (UNEXPECTED(k == 0)) {
809
          // IR_ASSERT(0 && "Wrong blocks order: BB is before all its predecessors");
810
0
          goto slow_case;
811
0
        }
812
0
        ir_list_push(&worklist, idom);
813
0
      }
814
0
    }
815
0
    IR_ASSERT(blocks[idom].idom > 0);
816
0
    IR_ASSERT(k != 0);
817
818
0
    while (--k > 0) {
819
0
      uint32_t pred_b = *(++p);
820
821
0
      if (pred_b < b) {
822
0
        IR_ASSERT(blocks[pred_b].idom > 0);
823
0
        while (idom != pred_b) {
824
0
          while (pred_b > idom) {
825
0
            pred_b = blocks[pred_b].idom;
826
0
          }
827
0
          while (idom > pred_b) {
828
0
            idom = blocks[idom].idom;
829
0
          }
830
0
        }
831
0
      } else {
832
0
        ctx->flags2 &= ~IR_NO_LOOPS;
833
0
        IR_ASSERT(bb->predecessors_count > 1);
834
0
        ir_list_push(&worklist, pred_b);
835
0
      }
836
0
    }
837
0
    bb->idom = idom;
838
0
    idom_bb = &blocks[idom];
839
0
    bb->dom_depth = idom_bb->dom_depth + 1;
840
0
  }
841
842
0
  blocks[1].idom = 0;
843
844
0
  if (ir_list_len(&worklist) != 0) {
845
0
    uint32_t dom_depth;
846
0
    uint32_t succ_b;
847
0
    bool complete = 1;
848
849
    /* Check if all the back-edges lead to the loop headers */
850
0
    do {
851
0
      b = ir_list_pop(&worklist);
852
0
      bb = &blocks[b];
853
0
      succ_b = ctx->cfg_edges[bb->successors];
854
0
      if (bb->successors_count != 1) {
855
        /* LOOP_END/END may be linked with the following ENTRY by a fake edge */
856
0
        if (bb->successors_count != 2) {
857
0
          complete = 0;
858
0
          break;
859
0
        } else if (blocks[succ_b].flags & IR_BB_ENTRY) {
860
0
          succ_b = ctx->cfg_edges[bb->successors + 1];
861
0
        } else if (!(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY)) {
862
0
          complete = 0;
863
0
          break;
864
0
        }
865
0
      }
866
0
      dom_depth = blocks[succ_b].dom_depth;
867
0
      while (bb->dom_depth > dom_depth) {
868
0
        b = bb->dom_parent;
869
0
        bb = &blocks[b];
870
0
      }
871
0
      if (UNEXPECTED(b != succ_b)) {
872
0
        complete = 0;
873
0
        break;
874
0
      }
875
0
    } while (ir_list_len(&worklist) != 0);
876
877
0
    if (UNEXPECTED(!complete)) {
878
0
      ir_list_free(&worklist);
879
      // TODO: this algorithm may be incorrect. Prove and/or switch to ir_build_dominators_tree_slow() ???
880
0
      return ir_build_dominators_tree_iterative(ctx);
881
0
    }
882
0
  }
883
884
  /* Construct children lists sorted by block number */
885
0
  for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
886
0
    ir_block *idom_bb = &blocks[bb->idom];
887
0
    bb->dom_next_child = idom_bb->dom_child;
888
0
    idom_bb->dom_child = b;
889
0
  }
890
891
0
  ir_list_free(&worklist);
892
893
0
  return 1;
894
0
}
895
896
static int ir_build_dominators_tree_iterative(ir_ctx *ctx)
897
0
{
898
0
  bool changed;
899
0
  uint32_t blocks_count, b;
900
0
  ir_block *blocks, *bb;
901
0
  uint32_t *edges;
902
903
  /* Find immediate dominators */
904
0
  blocks = ctx->cfg_blocks;
905
0
  edges  = ctx->cfg_edges;
906
0
  blocks_count = ctx->cfg_blocks_count;
907
908
  /* Clear the dominators tree, but keep already found dominators */
909
0
  for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) {
910
0
    bb->dom_depth = 0;
911
0
  }
912
913
  /* Find immediate dominators by iterative fixed-point algorithm */
914
0
  blocks[1].idom = 1;
915
0
  do {
916
0
    changed = 0;
917
918
0
    for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
919
0
      IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
920
0
      IR_ASSERT(bb->predecessors_count > 0);
921
0
      uint32_t k = bb->predecessors_count;
922
0
      uint32_t *p = edges + bb->predecessors;
923
0
      uint32_t idom = *p;
924
925
0
      if (blocks[idom].idom == 0) {
926
0
        while (1) {
927
0
          k--;
928
0
          if (UNEXPECTED(k == 0)) break;
929
0
          p++;
930
0
          idom = *p;
931
0
          if (blocks[idom].idom > 0) {
932
0
            break;
933
0
          }
934
0
        }
935
0
        if (UNEXPECTED(k == 0)) continue;
936
0
      }
937
0
      IR_ASSERT(k != 0);
938
0
      while (--k > 0) {
939
0
        uint32_t pred_b = *(++p);
940
941
0
        if (blocks[pred_b].idom > 0) {
942
0
          while (idom != pred_b) {
943
0
            while (pred_b > idom) {
944
0
              pred_b = blocks[pred_b].idom;
945
0
            }
946
0
            while (idom > pred_b) {
947
0
              idom = blocks[idom].idom;
948
0
            }
949
0
          }
950
0
        }
951
0
      }
952
0
      if (bb->idom != idom) {
953
0
        bb->idom = idom;
954
0
        changed = 1;
955
0
      }
956
0
    }
957
0
  } while (changed);
958
959
  /* Build dominators tree */
960
0
  blocks[1].idom = 0;
961
0
  blocks[1].dom_depth = 0;
962
0
  for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) {
963
0
    uint32_t idom = bb->idom;
964
0
    ir_block *idom_bb = &blocks[idom];
965
966
0
    bb->dom_depth = idom_bb->dom_depth + 1;
967
0
  }
968
969
  /* Construct children lists sorted by block number */
970
0
  for (b = blocks_count, bb = &blocks[b]; b >= 2; b--, bb--) {
971
0
    ir_block *idom_bb = &blocks[bb->idom];
972
0
    bb->dom_next_child = idom_bb->dom_child;
973
0
    idom_bb->dom_child = b;
974
0
  }
975
976
0
  return 1;
977
0
}
978
979
static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2)
980
0
{
981
0
  uint32_t b1_depth = blocks[b1].dom_depth;
982
0
  const ir_block *bb2 = &blocks[b2];
983
984
0
  while (bb2->dom_depth > b1_depth) {
985
0
    b2 = bb2->dom_parent;
986
0
    bb2 = &blocks[b2];
987
0
  }
988
0
  return b1 == b2;
989
0
}
990
991
/* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs" and
992
 * G. Ramalingam "Identifying Loops In Almost Linear Time". */
993
994
0
#define ENTRY_TIME(b) times[(b) * 2]
995
0
#define EXIT_TIME(b)  times[(b) * 2 + 1]
996
997
/* Process basic blocks previously flagged IR_BB_IRREDUCIBLE_LOOP.
 *
 * For each delayed irreducible-loop entry in "list", mark it as a loop
 * header (depth 1) and collect the loop members by walking CFG
 * predecessors backwards from the closing edges.  DFS entry/exit times
 * in "times" (via ENTRY_TIME/EXIT_TIME) are used to test whether a
 * block is a descendant of the header in the DFS spanning tree.
 * "work" is reused as a scratch worklist; its visited bitset is
 * re-initialized here.
 */
static IR_NEVER_INLINE void ir_collect_irreducible_loops(ir_ctx *ctx, uint32_t *times, ir_worklist *work, ir_list *list)
{
	ir_block *blocks = ctx->cfg_blocks;
	uint32_t *edges = ctx->cfg_edges;

	IR_ASSERT(ir_list_len(list) != 0);
	if (ir_list_len(list) > 1) {
		/* Sort list to process irreducible loops in DFS order (insertion sort) */
		ir_ref *a = list->a.refs;
		uint32_t n = ir_list_len(list);
		uint32_t i = 1;
		while (i < n) {
			uint32_t j = i;
			while (j > 0 && ENTRY_TIME(a[j-1]) > ENTRY_TIME(a[j])) {
				ir_ref tmp = a[j];
				a[j] = a[j-1];
				a[j-1] = tmp;
				j--;
			}
			i++;
		}
	}
	while (ir_list_len(list)) {
		uint32_t hdr = ir_list_pop(list);
		ir_block *bb = &blocks[hdr];

		IR_ASSERT(bb->flags & IR_BB_IRREDUCIBLE_LOOP);
		IR_ASSERT(!bb->loop_depth);
		if (!bb->loop_depth) {
			/* process irreducible loop */

			bb->flags |= IR_BB_LOOP_HEADER;
			bb->loop_depth = 1;
			if (ctx->ir_base[bb->start].op == IR_MERGE) {
				ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
			}

			/* find the closing edge(s) of the irreducible loop:
			 * predecessors that are DFS descendants of "hdr" */
			IR_ASSERT(bb->predecessors_count > 1);
			IR_ASSERT(ir_worklist_len(work) == 0);
			ir_bitset_clear(work->visited, ir_bitset_len(ir_worklist_capasity(work)));
			ir_bitset_incl(work->visited, hdr);

			uint32_t *p = &edges[bb->predecessors];
			uint32_t n = bb->predecessors_count;
			do {
				uint32_t pred = *p;
				if (ENTRY_TIME(pred) > ENTRY_TIME(hdr) && EXIT_TIME(pred) < EXIT_TIME(hdr)) {
					ir_worklist_push(work, pred);
				}
				p++;
			} while (--n);

			/* collect members of the irreducible loop */
			while (ir_worklist_len(work)) {
				uint32_t b = ir_worklist_pop(work);

				bb = &blocks[b];
				if (!bb->loop_header) {
					bb->loop_header = hdr;
				}

				uint32_t *p = &edges[bb->predecessors];
				uint32_t n = bb->predecessors_count;

				for (; n > 0; p++, n--) {
					uint32_t pred = *p;
					if (!ir_bitset_in(work->visited, pred)) {
						if (blocks[pred].loop_header) {
							/* collapse "pred" to its outermost enclosing loop header */
							if (blocks[pred].loop_header == b) continue;
							do {
								pred = blocks[pred].loop_header;
							} while (blocks[pred].loop_header > 0);
						}
						if (ENTRY_TIME(pred) > ENTRY_TIME(hdr) && EXIT_TIME(pred) < EXIT_TIME(hdr)) {
							/* "pred" is a descendant of "hdr" */
							ir_worklist_push(work, pred);
						} else if (bb->predecessors_count > 1) {
							/* another entry to the irreducible loop */
							bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
							if (ctx->ir_base[bb->start].op == IR_MERGE) {
								ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
							}
						}
					}
				}
			}
		}
	}
}
1087
1088
/* Identify natural and irreducible loops in the CFG and assign
 * loop headers and loop depths to all basic blocks.
 *
 * Algorithm (see Sreedhar et al, "Identifying Loops Using DJ Graphs"):
 *  1. Compute DFS entry/exit times for ancestor/back-edge queries.
 *  2. Order blocks by dominator-tree depth (BFS over dom children).
 *  3. Scan blocks from deepest to shallowest: a predecessor that is a
 *     DFS descendant closes a loop; if the block dominates it, the loop
 *     is reducible, otherwise the block is queued for delayed
 *     irreducible-loop processing (ir_collect_irreducible_loops).
 *  4. Propagate loop depths top-down and set IR_BB_LOOP_WITH_ENTRY on
 *     loops containing ENTRY blocks.
 *
 * Returns 1 (always succeeds).
 */
int ir_find_loops(ir_ctx *ctx)
{
	uint32_t b, j, n;
	uint32_t *times, *sorted_blocks, time = 1;
	ir_block *blocks = ctx->cfg_blocks;
	uint32_t *edges = ctx->cfg_edges;
	ir_worklist work;

	if (ctx->flags2 & IR_NO_LOOPS) {
		return 1;
	}

	/* Compute entry/exit times for the CFG DFS spanning tree to perform ancestor and back-edge queries. */
	ir_worklist_init(&work, ctx->cfg_blocks_count + 1);
	times = ir_mem_malloc((ctx->cfg_blocks_count + 1) * 3 * sizeof(uint32_t));
	sorted_blocks = times + (ctx->cfg_blocks_count + 1) * 2;

	ir_bitset visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
	ir_worklist_push(&work, 1);
	while (ir_worklist_len(&work)) {
next:
		b = ir_worklist_peek(&work);
		ir_block *bb = &blocks[b];

		if (!ir_bitset_in(visited, b)) {
			ir_bitset_incl(visited, b);
			ENTRY_TIME(b) = time++;
		}

		/* "next_succ" remembers how many successors were already visited,
		 * so the iterative DFS can resume after returning to this block */
		uint32_t n = bb->successors_count - bb->next_succ;
		if (n) {
			uint32_t *p = edges + bb->successors + bb->next_succ;
			for (; n > 0; p++, n--) {
				uint32_t succ = *p;
				if (ir_worklist_push(&work, succ)) {
					bb->next_succ = bb->successors_count - n + 1;
					IR_ASSERT(blocks[succ].next_succ == 0);
					goto next;
				}
			}
		}

		bb->next_succ = 0;
		EXIT_TIME(b) = time++;
		ir_worklist_pop(&work);
	}
	ir_mem_free(visited);

	/* Sort blocks by level, which is the opposite order in which we want to process them */
	/* (Breadth First Search using "sorted_blocks" as a queue) */
	sorted_blocks[1] = 1;
	j = 1;
	n = 2;
	while (j != n) {
		uint32_t i = j;
		j = n;
		for (; i < j; i++) {
			int child;
			for (child = blocks[sorted_blocks[i]].dom_child; child > 0; child = blocks[child].dom_next_child) {
				sorted_blocks[n++] = child;
			}
		}
	}
	IR_ASSERT(n == ctx->cfg_blocks_count + 1);

#if IR_DEBUG
	uint32_t prev_dom_depth = blocks[sorted_blocks[n - 1]].dom_depth;
#endif
	uint32_t irreducible_depth = 0;
	ir_list irreducible_list = {0};

	/* Process blocks from the deepest dominator-tree level upwards */
	while (n > 1) {
		b = sorted_blocks[--n];
		ir_block *bb = &blocks[b];

		IR_ASSERT(bb->dom_depth <= prev_dom_depth);

		if (UNEXPECTED(bb->dom_depth < irreducible_depth)) {
			/* all blocks at the level of the delayed irreducible loops were seen */
			ir_collect_irreducible_loops(ctx, times, &work, &irreducible_list);
			irreducible_depth = 0;
		}

		if (bb->predecessors_count > 1) {
			bool irreducible = 0;
			uint32_t b_entry_time = ENTRY_TIME(b);
			uint32_t b_exit_time = EXIT_TIME(b);
			uint32_t *p = &edges[bb->predecessors];

			for (j = bb->predecessors_count; j > 0; p++, j--) {
				uint32_t pred = *p;

				/* Check back-edges: "pred" inside b's DFS interval is a descendant */
				if (ENTRY_TIME(pred) >= b_entry_time && EXIT_TIME(pred) <= b_exit_time) {
					if (ir_dominates(blocks, b, pred)) {
						/* In a loop back-edge (back-join edge), the successor dominates
						   the predecessor.  */
						if (!ir_worklist_len(&work)) {
							ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work)));
						}
						ir_worklist_push(&work, pred);
					} else {
						/* Otherwise it's a back-edge of irreducible loop. */
						irreducible = 1;
						break;
					}
				}
			}

			if (UNEXPECTED(irreducible)) {
				bb->flags |= IR_BB_IRREDUCIBLE_LOOP;
				ctx->flags2 |= IR_CFG_HAS_LOOPS | IR_IRREDUCIBLE_CFG;
				/* Delay processing of all irreducible loops
				 * after all reducible loops with the same dominator tree depth
				 */
				irreducible_depth = bb->dom_depth;
				if (!ir_list_capasity(&irreducible_list)) {
					ir_list_init(&irreducible_list, 16);
				}
				ir_list_push(&irreducible_list, b);
				ir_list_clear(&work.l);
			} else if (ir_worklist_len(&work)) {
				/* collect members of the reducible loop */
				uint32_t hdr = b;

				bb->flags |= IR_BB_LOOP_HEADER;
				ctx->flags2 |= IR_CFG_HAS_LOOPS;
				bb->loop_depth = 1;
				if (ctx->ir_base[bb->start].op == IR_MERGE) {
					ctx->ir_base[bb->start].op = IR_LOOP_BEGIN;
				}
				ir_bitset_incl(work.visited, hdr);
				while (ir_worklist_len(&work)) {
					b = ir_worklist_pop(&work);
					if (b != hdr) {
						ir_block *bb = &blocks[b];

						if (!bb->loop_header) {
							bb->loop_header = hdr;
						}

						uint32_t *p = &edges[bb->predecessors];
						uint32_t n = bb->predecessors_count;
						for (; n > 0; p++, n--) {
							uint32_t pred = *p;
							if (!ir_bitset_in(work.visited, pred)) {
								if (blocks[pred].loop_header) {
									/* collapse nested loops to their outermost header */
									if (blocks[pred].loop_header == b) continue;
									do {
										pred = blocks[pred].loop_header;
									} while (blocks[pred].loop_header > 0);
									ir_worklist_push(&work, pred);
								} else {
									ir_bitset_incl(work.visited, pred);
									ir_list_push_unchecked(&work.l, pred);
								}
							}
						}
					}
				}
			}
		}
	}

	IR_ASSERT(!irreducible_depth);
	if (ir_list_capasity(&irreducible_list)) {
		ir_list_free(&irreducible_list);
	}

	/* Propagate loop depths top-down (sorted_blocks is in increasing dom-depth order) */
	if (ctx->flags2 & IR_CFG_HAS_LOOPS) {
		n = ctx->cfg_blocks_count + 1;
		for (j = 1; j < n; j++) {
			b = sorted_blocks[j];
			ir_block *bb = &blocks[b];
			if (bb->loop_header > 0) {
				ir_block *loop = &blocks[bb->loop_header];
				uint32_t loop_depth = loop->loop_depth;

				if (bb->flags & IR_BB_LOOP_HEADER) {
					loop_depth++;
				}
				bb->loop_depth = loop_depth;
				if (bb->flags & (IR_BB_ENTRY|IR_BB_LOOP_WITH_ENTRY)) {
					loop->flags |= IR_BB_LOOP_WITH_ENTRY;
					if (loop_depth > 1) {
						/* Set IR_BB_LOOP_WITH_ENTRY flag for all the enclosing loops */
						bb = &blocks[loop->loop_header];
						while (1) {
							if (bb->flags & IR_BB_LOOP_WITH_ENTRY) {
								break;
							}
							bb->flags |= IR_BB_LOOP_WITH_ENTRY;
							if (bb->loop_depth == 1) {
								break;
							}
							/* advance to the next enclosing loop header; using
							 * loop->loop_header here would re-visit the same block
							 * and stop after marking a single enclosing loop */
							bb = &blocks[bb->loop_header];
						}
					}
				}
			}
		}
	}

	ir_mem_free(times);
	ir_worklist_free(&work);

	return 1;
}
1295
1296
static uint32_t _ir_skip_empty_blocks(const ir_ctx *ctx, uint32_t b)
1297
0
{
1298
0
  while (1) {
1299
0
    ir_block *bb = &ctx->cfg_blocks[b];
1300
1301
0
    if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1302
0
      IR_ASSERT(bb->successors_count == 1);
1303
0
      b = ctx->cfg_edges[bb->successors];
1304
0
    } else {
1305
0
      return b;
1306
0
    }
1307
0
  }
1308
0
}
1309
1310
/* A variation of "Bottom-up Positioning" algorithm described by
1311
 * Karl Pettis and Robert C. Hansen "Profile Guided Code Positioning"
1312
 */
1313
/* A weighted CFG edge used by the bottom-up block scheduler. */
typedef struct _ir_edge_info {
	uint32_t from; /* source basic block */
	uint32_t to;   /* target basic block (empty blocks already skipped) */
	float    freq; /* estimated execution frequency of the edge */
} ir_edge_info;
1318
1319
/* A node of a chain (ordered sequence) of basic blocks built during
 * block scheduling.  A block "b" is a chain head when chains[b].head == b;
 * head == 0 marks a block removed from chaining (scheduled at the end). */
typedef struct _ir_chain {
	uint32_t head; /* chain head this block belongs to (with path compression) */
	uint32_t next; /* next block in the chain */
	union {
		uint32_t prev; /* previous block (for non-head nodes) */
		uint32_t tail; /* last block of the chain (for the head node) */
	};
} ir_chain;
1327
1328
static int ir_edge_info_cmp(const void *b1, const void *b2)
1329
0
{
1330
0
  ir_edge_info *e1 = (ir_edge_info*)b1;
1331
0
  ir_edge_info *e2 = (ir_edge_info*)b2;
1332
1333
0
  if (e1->freq != e2->freq) {
1334
0
    return e1->freq < e2->freq ? 1 : -1;
1335
0
  }
1336
  /* In case of equal frequencies, try to avoid penalization of one of the "equal" paths by
1337
   * preferring the first RPO successor (in conditional branches) and the last RPO predecessor
1338
   * (in merge points).
1339
   *
1340
   * See "Static Basic Block Reordering Heuristics for Implicit Control Flow in Baseline JITs"
1341
   * Polito Guillermo, Ducasse Stephane, and Tesone Pablo (2021)
1342
   */
1343
0
  if (e1->from != e2->from) {
1344
0
    return e2->from - e1->from;
1345
0
  } else {
1346
0
    return e1->to - e2->to;
1347
0
  }
1348
0
}
1349
1350
/* Slow path of ir_chain_head(): find the real chain head of "src" and
 * compress the path so future lookups are O(1).
 *
 * First loop: follow "head" links from the stale head until reaching a
 * self-referential node (the true head).  Second loop: rewrite the head
 * link of every node on the path from "src" up to (but excluding) the
 * first node that already points at the true head. */
static IR_NEVER_INLINE uint32_t ir_chain_head_path_compress(ir_chain *chains, uint32_t src, uint32_t head)
{
	IR_ASSERT(head != 0);
	do {
		head = chains[head].head;
	} while (chains[head].head != head);
	do {
		uint32_t next = chains[src].head;
		chains[src].head = head;
		src = next;
	} while (chains[src].head != head);
	return head;
}
1363
1364
/* Return the head block of the chain containing "src".
 * Fast path: the stored head link is already final (self-referential);
 * otherwise delegate to the path-compressing slow path. */
IR_ALWAYS_INLINE uint32_t ir_chain_head(ir_chain *chains, uint32_t src)
{
	const uint32_t head = chains[src].head;

	return (chains[head].head == head)
		? head
		: ir_chain_head_path_compress(chains, src, head);
}
1373
1374
/* Concatenate chain "dst" after chain "src" (both are chain heads).
 * "src" stays the head of the combined chain; "dst" becomes an interior
 * node keeping a back link in "prev". */
static void ir_join_chains(ir_chain *chains, uint32_t src, uint32_t dst)
{
	const uint32_t src_last = chains[src].tail;
	const uint32_t dst_last = chains[dst].tail;

	chains[src_last].next = dst;
	chains[dst_last].next = src;
	chains[dst].prev = src_last;
	chains[dst].head = src;
	chains[src].tail = dst_last;
}
1385
1386
/* Insert single-block chain "c" immediately before block "before" in
 * before's chain.  If "before" was a chain head, "c" becomes the new
 * head; otherwise "c" inherits the existing head. */
static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before)
{
	ir_chain *node = &chains[c];
	ir_chain *pos = &chains[before];
	uint32_t old_prev = pos->prev;

	IR_ASSERT(chains[c].head == c);
	if (pos->head == before) {
		pos->head = c;
	} else {
		node->head = pos->head;
	}
	node->next = before;
	node->prev = old_prev;
	pos->prev = c;
	chains[old_prev].next = c;
}
1402
1403
#ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
1404
# ifdef IR_DEBUG
1405
0
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 1
1406
# else
1407
#  define IR_DEBUG_BB_SCHEDULE_GRAPH 0
1408
# endif
1409
#endif
1410
1411
#if IR_DEBUG_BB_SCHEDULE_GRAPH
1412
/* Debug helper: dump the CFG with block frequencies, chains and edges
 * to stderr in Graphviz "dot" format.  Blocks of the same chain get the
 * same fill color, chain heads get a thick border, and empty blocks are
 * drawn dotted. */
static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_count, ir_edge_info *edges, ir_chain *chains)
{
	uint32_t b, i;
	ir_block *bb;
	uint8_t c, *colors;
	bool is_head, is_empty;
	uint32_t max_colors = 12; /* size of the Graphviz "set312" color scheme */

	/* assign a color to each chain head (cycling through max_colors) */
	colors = alloca(sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
	memset(colors, 0, sizeof(uint8_t) * (ctx->cfg_blocks_count + 1));
	i = 0;
	for (b = 1; b < ctx->cfg_blocks_count + 1; b++) {
		if (chains[b].head == b) {
			colors[b] = (i % max_colors) + 1;
			i++;
		}
	}

	fprintf(stderr, "digraph {\n");
	fprintf(stderr, "\trankdir=TB;\n");
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		bb = &ctx->cfg_blocks[b];
		/* head == 0 means the block was removed from chaining */
		c = (chains[b].head) ? colors[ir_chain_head(chains, b)] : 0;
		is_head = chains[b].head == b;
		is_empty = (bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY;
		if (c) {
			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\",colorscheme=set312,fillcolor=%d%s%s]\n", b, b,
				bb->loop_depth, bb_freq[b],
				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
				c,
				is_head ? ",penwidth=3" : "",
				is_empty ? ",style=\"dotted,filled\"" : ",style=\"filled\"");
		} else {
			fprintf(stderr, "\tBB%d [label=\"BB%d: (%d),%0.3f\\n%s\\n%s\"%s%s]\n", b, b,
				bb->loop_depth, bb_freq[b],
				ir_op_name[ctx->ir_base[bb->start].op], ir_op_name[ctx->ir_base[bb->end].op],
				is_head ? ",penwidth=3" : "",
				is_empty ? ",style=\"dotted\"" : "");
		}
	}
	fprintf(stderr, "\n");

	for (i = 0; i < edges_count; i++) {
		fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
	}
	fprintf(stderr, "}\n");
}
1459
#endif
1460
1461
#ifdef IR_DEBUG
1462
/* Debug helper: print every scheduling edge with its frequency to stderr. */
static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
{
	const ir_edge_info *e = edges;
	const ir_edge_info *end = edges + edges_count;

	fprintf(stderr, "Edges:\n");
	for (; e < end; e++) {
		fprintf(stderr, "\tBB%d -> BB%d %0.3f\n", e->from, e->to, e->freq);
	}
}
1471
1472
/* Debug helper: print each block chain (head first) to stderr. */
static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
{
	uint32_t b;

	fprintf(stderr, "Chains:\n");
	for (b = 1; b <= ctx->cfg_blocks_count; b++) {
		if (chains[b].head != b) {
			continue; /* only walk chains from their head */
		}

		uint32_t last = chains[b].tail;
		uint32_t cur = b;

		fprintf(stderr, "(BB%d", cur);
		while (cur != last) {
			cur = chains[cur].next;
			fprintf(stderr, ", BB%d", cur);
		}
		fprintf(stderr, ")\n");
	}
}
1490
1491
/* Debug helper: check whether the edge from "b" to "hdr" is a back edge
 * of a (possibly merged) loop headed by "hdr": climb out of the loops
 * enclosing "b" until reaching hdr's loop depth and compare. */
static bool ir_is_merged_loop_back_edge(ir_ctx *ctx, uint32_t hdr, uint32_t b)
{
	const ir_block *hdr_bb = &ctx->cfg_blocks[hdr];

	if (!(hdr_bb->flags & IR_BB_LOOP_HEADER)) {
		return 0;
	}
	while (ctx->cfg_blocks[b].loop_depth > hdr_bb->loop_depth) {
		b = ctx->cfg_blocks[b].loop_header;
	}
	return b == hdr;
}
1503
#endif
1504
1505
static int ir_schedule_blocks_bottom_up(ir_ctx *ctx)
1506
0
{
1507
0
  uint32_t max_edges_count = ctx->cfg_edges_count / 2;
1508
0
  uint32_t edges_count = 0;
1509
0
  uint32_t b, i, loop_depth;
1510
0
  float *bb_freq, freq;
1511
0
  ir_block *bb;
1512
0
  ir_edge_info *edges, *e;
1513
0
  ir_chain *chains;
1514
0
  ir_bitqueue worklist;
1515
0
  ir_bitset visited;
1516
0
  uint32_t *schedule_end, count;
1517
1518
0
  ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1519
0
  schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count;
1520
1521
  /* 1. Create initial chains for each BB */
1522
0
  chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1));
1523
0
  chains[0].head = 0;
1524
0
  chains[0].next = 0;
1525
0
  chains[0].prev = 0;
1526
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1527
0
    chains[b].head = b;
1528
0
    chains[b].next = b;
1529
0
    chains[b].prev = b;
1530
0
  }
1531
1532
  /* 2. Collect information about BBs and EDGEs frequencies */
1533
0
  edges = ir_mem_malloc(sizeof(ir_edge_info) * max_edges_count);
1534
0
  bb_freq = ir_mem_calloc(ctx->cfg_blocks_count + 1, sizeof(float));
1535
0
  bb_freq[1] = 1.0f;
1536
0
  visited = ir_bitset_malloc(ctx->cfg_blocks_count + 1);
1537
0
  ir_bitqueue_init(&worklist, ctx->cfg_blocks_count + 1);
1538
0
  ir_bitqueue_add(&worklist, 1);
1539
0
  while ((b = ir_bitqueue_pop(&worklist)) != (uint32_t)-1) {
1540
0
restart:
1541
0
    bb = &ctx->cfg_blocks[b];
1542
0
    if (bb->predecessors_count) {
1543
0
      uint32_t n = bb->predecessors_count;
1544
0
      uint32_t *p = ctx->cfg_edges + bb->predecessors;
1545
0
      for (; n > 0; p++, n--) {
1546
0
        uint32_t predecessor = *p;
1547
        /* Basic Blocks are ordered in a way that usual predecessors ids are less than successors.
1548
         * So we may compare blocks ids (predecessor < b) instead of a more expensive check for back edge
1549
         * (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b)
1550
         */
1551
0
        if (predecessor < b) {
1552
0
          if (!ir_bitset_in(visited, predecessor)) {
1553
0
            b = predecessor;
1554
0
            ir_bitqueue_del(&worklist, b);
1555
0
            goto restart;
1556
0
          }
1557
0
        } else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) {
1558
          /* not a loop back-edge */
1559
0
          IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b ||
1560
0
            ir_is_merged_loop_back_edge(ctx, b, predecessor));
1561
0
        }
1562
0
      }
1563
0
    }
1564
1565
0
    ir_bitset_incl(visited, b);
1566
1567
0
    if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1568
0
      uint32_t successor = ctx->cfg_edges[bb->successors];
1569
1570
      /* move empty blocks to the end */
1571
0
      IR_ASSERT(chains[b].head == b);
1572
0
      chains[b].head = 0;
1573
0
      *schedule_end = b;
1574
0
      schedule_end--;
1575
1576
0
      if (successor > b) {
1577
0
        bb_freq[successor] += bb_freq[b];
1578
0
        b = successor;
1579
0
        ir_bitqueue_del(&worklist, b);
1580
0
        goto restart;
1581
0
      } else {
1582
0
        continue;
1583
0
      }
1584
0
    }
1585
1586
0
    loop_depth = bb->loop_depth;
1587
0
    if (bb->flags & IR_BB_LOOP_HEADER) {
1588
      // TODO: Estimate the loop iterations count
1589
0
      bb_freq[b] *= 10;
1590
0
    }
1591
1592
0
    if (bb->successors_count) {
1593
0
      uint32_t n = bb->successors_count;
1594
0
      uint32_t *p = ctx->cfg_edges + bb->successors;
1595
1596
0
      if (n == 1) {
1597
0
        uint32_t successor = *p;
1598
1599
0
        IR_ASSERT(edges_count < max_edges_count);
1600
0
        freq = bb_freq[b];
1601
0
        if (successor > b) {
1602
0
          IR_ASSERT(!ir_bitset_in(visited, successor));
1603
0
          bb_freq[successor] += freq;
1604
0
          ir_bitqueue_add(&worklist, successor);
1605
0
        }
1606
0
        successor = _ir_skip_empty_blocks(ctx, successor);
1607
0
        edges[edges_count].from = b;
1608
0
        edges[edges_count].to = successor;
1609
0
        edges[edges_count].freq = freq;
1610
0
        edges_count++;
1611
0
      } else if (n == 2 && ctx->ir_base[bb->end].op == IR_IF) {
1612
0
        uint32_t successor1 = *p;
1613
0
        ir_block *successor1_bb = &ctx->cfg_blocks[successor1];
1614
0
        ir_insn *insn1 = &ctx->ir_base[successor1_bb->start];
1615
0
        uint32_t successor2 = *(p + 1);
1616
0
        ir_block *successor2_bb = &ctx->cfg_blocks[successor2];
1617
0
        ir_insn *insn2 = &ctx->ir_base[successor2_bb->start];
1618
0
        int prob1, prob2, probN = 100;
1619
1620
0
        if (insn1->op2) {
1621
0
          prob1 = insn1->op2;
1622
0
          if (insn2->op2) {
1623
0
            prob2 = insn2->op2;
1624
0
            probN = prob1 + prob2;
1625
0
          } else {
1626
0
            if (prob1 > 100) {
1627
0
              prob1 = 100;
1628
0
            }
1629
0
            prob2 = 100 - prob1;
1630
0
          }
1631
1632
0
        } else if (insn2->op2) {
1633
0
          prob2 = insn2->op2;
1634
0
          if (prob2 > 100) {
1635
0
            prob2 = 100;
1636
0
          }
1637
0
          prob1 = 100 - prob2;
1638
0
        } else if (successor1_bb->loop_depth >= loop_depth
1639
0
            && successor2_bb->loop_depth < loop_depth) {
1640
0
          prob1 = 90;
1641
0
          prob2 = 10;
1642
0
        } else if (successor1_bb->loop_depth < loop_depth
1643
0
            && successor2_bb->loop_depth >= loop_depth) {
1644
0
          prob1 = 10;
1645
0
          prob2 = 90;
1646
0
        } else if (successor2_bb->flags & IR_BB_EMPTY) {
1647
0
          prob1 = 51;
1648
0
          prob2 = 49;
1649
0
        } else if (successor1_bb->flags & IR_BB_EMPTY) {
1650
0
          prob1 = 49;
1651
0
          prob2 = 51;
1652
0
        } else {
1653
0
          prob1 = prob2 = 50;
1654
0
        }
1655
0
        do {
1656
0
          freq = bb_freq[b] * (float)prob1 / (float)probN;
1657
0
          if (successor1 > b) {
1658
0
            IR_ASSERT(!ir_bitset_in(visited, successor1));
1659
0
            bb_freq[successor1] += freq;
1660
0
            if (successor1_bb->successors_count == 0 && insn1->op2 == 1) {
1661
              /* move cold block without successors to the end */
1662
0
              IR_ASSERT(chains[successor1].head == successor1);
1663
0
              chains[successor1].head = 0;
1664
0
              *schedule_end = successor1;
1665
0
              schedule_end--;
1666
0
              break;
1667
0
            } else {
1668
0
              ir_bitqueue_add(&worklist, successor1);
1669
0
            }
1670
0
          }
1671
          /* try to join edges early to reduce number of edges and the cost of their sorting */
1672
0
          if (prob1 > prob2
1673
0
           && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1674
0
            uint32_t src = chains[b].next;
1675
0
            IR_ASSERT(chains[src].head == src);
1676
0
            IR_ASSERT(src == ir_chain_head(chains, b));
1677
0
            IR_ASSERT(chains[successor1].head == successor1);
1678
0
            ir_join_chains(chains, src, successor1);
1679
0
            if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1680
0
          }
1681
0
          successor1 = _ir_skip_empty_blocks(ctx, successor1);
1682
0
          IR_ASSERT(edges_count < max_edges_count);
1683
0
          edges[edges_count].from = b;
1684
0
          edges[edges_count].to = successor1;
1685
0
          edges[edges_count].freq = freq;
1686
0
          edges_count++;
1687
0
        } while (0);
1688
0
        do {
1689
0
          freq = bb_freq[b] * (float)prob2 / (float)probN;
1690
0
          if (successor2 > b) {
1691
0
            IR_ASSERT(!ir_bitset_in(visited, successor2));
1692
0
            bb_freq[successor2] += freq;
1693
0
            if (successor2_bb->successors_count == 0 && insn2->op2 == 1) {
1694
              /* move cold block without successors to the end */
1695
0
              IR_ASSERT(chains[successor2].head == successor2);
1696
0
              chains[successor2].head = 0;
1697
0
              *schedule_end = successor2;
1698
0
              schedule_end--;
1699
0
              break;
1700
0
            } else {
1701
0
              ir_bitqueue_add(&worklist, successor2);
1702
0
            }
1703
0
          }
1704
0
          if (prob2 > prob1
1705
0
           && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) {
1706
0
            uint32_t src = chains[b].next;
1707
0
            IR_ASSERT(chains[src].head == src);
1708
0
            IR_ASSERT(src == ir_chain_head(chains, b));
1709
0
            IR_ASSERT(chains[successor2].head == successor2);
1710
0
            ir_join_chains(chains, src, successor2);
1711
0
            if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break;
1712
0
          }
1713
0
          successor2 = _ir_skip_empty_blocks(ctx, successor2);
1714
0
          IR_ASSERT(edges_count < max_edges_count);
1715
0
          edges[edges_count].from = b;
1716
0
          edges[edges_count].to = successor2;
1717
0
          edges[edges_count].freq = freq;
1718
0
          edges_count++;
1719
0
        } while (0);
1720
0
      } else {
1721
0
        int prob;
1722
1723
0
        for (; n > 0; p++, n--) {
1724
0
          uint32_t successor = *p;
1725
0
          ir_block *successor_bb = &ctx->cfg_blocks[successor];
1726
0
          ir_insn *insn = &ctx->ir_base[successor_bb->start];
1727
1728
0
          if (insn->op == IR_CASE_DEFAULT) {
1729
0
            prob = insn->op2;
1730
0
            if (!prob) {
1731
0
              prob = 100 / bb->successors_count;
1732
0
            }
1733
0
          } else if (insn->op == IR_CASE_VAL) {
1734
0
            prob = insn->op3;
1735
0
            if (!prob) {
1736
0
              prob = 100 / bb->successors_count;
1737
0
            }
1738
0
          } else if (insn->op == IR_ENTRY) {
1739
0
            if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1740
0
              prob = 99; /* prefer empty ENTRY block to go first */
1741
0
            } else {
1742
0
              prob = 1;
1743
0
            }
1744
0
          } else {
1745
0
            prob = 100 / bb->successors_count;
1746
0
          }
1747
0
          freq = bb_freq[b] * (float)prob / 100.0f;
1748
0
          if (successor > b) {
1749
0
            IR_ASSERT(!ir_bitset_in(visited, successor));
1750
0
            bb_freq[successor] += freq;
1751
0
            ir_bitqueue_add(&worklist, successor);
1752
0
          }
1753
0
          successor = _ir_skip_empty_blocks(ctx, successor);
1754
0
          IR_ASSERT(edges_count < max_edges_count);
1755
0
          edges[edges_count].from = b;
1756
0
          edges[edges_count].to = successor;
1757
0
          edges[edges_count].freq = freq;
1758
0
          edges_count++;
1759
0
        }
1760
0
      }
1761
0
    }
1762
0
  }
1763
0
  ir_bitqueue_free(&worklist);
1764
0
  ir_mem_free(visited);
1765
1766
  /* 2. Sort EDGEs according to their frequencies */
1767
0
  qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);
1768
1769
0
#ifdef IR_DEBUG
1770
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1771
0
    ir_dump_edges(ctx, edges_count, edges);
1772
0
  }
1773
0
#endif
1774
1775
  /* 3. Process EDGEs in the decreasing frequency order and join the connected chains */
1776
0
  for (e = edges, i = edges_count; i > 0; e++, i--) {
1777
0
    uint32_t dst = chains[e->to].head;
1778
0
    if (dst == e->to) {
1779
0
      uint32_t src = chains[e->from].next;
1780
0
      if (chains[src].head == src) {
1781
0
        IR_ASSERT(src == ir_chain_head(chains, e->from) && chains[src].tail == e->from);
1782
0
        if (src != dst) {
1783
0
          ir_join_chains(chains, src, dst);
1784
0
        } else if (ctx->cfg_blocks[e->from].successors_count < 2) {
1785
          /* Try to rotate loop chian to finish it with a conditional branch */
1786
0
          uint32_t tail = e->from;
1787
0
          uint32_t prev = src;
1788
0
          uint32_t next = chains[prev].next;
1789
0
          uint32_t best = 0;
1790
1791
0
          while (prev != tail) {
1792
0
            if (ctx->ir_base[ctx->cfg_blocks[prev].end].op == IR_IF) {
1793
0
              if (ctx->ir_base[ctx->cfg_blocks[prev].start].op == IR_LOOP_BEGIN
1794
0
               && ctx->cfg_blocks[prev].loop_depth > 1) {
1795
0
                best = next;
1796
0
                break;
1797
0
              } else if (!best || bb_freq[next] < bb_freq[best]) {
1798
                /* Find the successor of IF with the least frequency */
1799
0
                best = next;
1800
0
              }
1801
0
            }
1802
0
            prev = next;
1803
0
            next = chains[next].next;
1804
0
          }
1805
0
          if (best) {
1806
            /* change the head of the chain */
1807
0
            chains[src].head = best;
1808
0
            chains[best].head = best;
1809
0
          }
1810
0
        }
1811
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
1812
        e->from = 0; /* reset "from" to avoid check on step #5 */
1813
#endif
1814
0
      }
1815
0
    }
1816
0
  }
1817
1818
0
#if IR_DEBUG_BB_SCHEDULE_GRAPH
1819
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1820
0
    ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
1821
0
  }
1822
0
#endif
1823
1824
0
  ir_mem_free(bb_freq);
1825
1826
0
#ifdef IR_DEBUG
1827
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1828
0
    ir_dump_chains(ctx, chains);
1829
0
  }
1830
0
#endif
1831
1832
  /* 4. Merge empty entry blocks */
1833
0
  if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && ctx->entries_count) {
1834
0
    for (i = 0; i < ctx->entries_count; i++) {
1835
0
      b = ctx->entries[i];
1836
0
      IR_ASSERT(ctx->cfg_blocks[b].flags & IR_BB_ENTRY);
1837
0
      if (b && chains[b].head == b && chains[b].tail == b) {
1838
0
        bb = &ctx->cfg_blocks[b];
1839
0
        if (bb->flags & IR_BB_EMPTY) {
1840
0
          uint32_t successor;
1841
1842
0
          do {
1843
0
            IR_ASSERT(bb->successors_count == 1);
1844
0
            successor = ctx->cfg_edges[bb->successors];
1845
0
            bb = &ctx->cfg_blocks[successor];
1846
0
          } while ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY);
1847
0
          IR_ASSERT(chains[successor].head);
1848
0
          ir_insert_chain_before(chains, b, successor);
1849
0
        }
1850
0
      }
1851
0
    }
1852
1853
0
#ifdef IR_DEBUG
1854
0
    if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1855
0
      ir_dump_chains(ctx, chains);
1856
0
    }
1857
0
#endif
1858
0
  }
1859
1860
  /* 5. Align loop headers */
1861
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1862
0
    if (chains[b].head == b) {
1863
0
      bb = &ctx->cfg_blocks[b];
1864
0
      if (bb->loop_depth) {
1865
0
        if ((bb->flags & IR_BB_LOOP_HEADER) || ir_chain_head(chains, bb->loop_header) == b) {
1866
0
          bb->flags |= IR_BB_ALIGN_LOOP;
1867
0
        }
1868
0
      }
1869
0
    }
1870
0
  }
1871
1872
  /* 6. Group chains according to the most frequent edge between them */
1873
  // TODO: Try to find a better heuristic
1874
0
  for (e = edges, i = edges_count; i > 0; e++, i--) {
1875
#if !IR_DEBUG_BB_SCHEDULE_GRAPH
1876
    if (!e->from) continue;
1877
#endif
1878
0
    uint32_t src = ir_chain_head(chains, e->from);
1879
0
    uint32_t dst = ir_chain_head(chains, e->to);
1880
0
    if (src != dst) {
1881
0
      if (dst == 1) {
1882
0
        ir_join_chains(chains, dst, src);
1883
0
      } else {
1884
0
        ir_join_chains(chains, src, dst);
1885
0
      }
1886
0
    }
1887
0
  }
1888
1889
0
#ifdef IR_DEBUG
1890
0
  if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
1891
0
    ir_dump_chains(ctx, chains);
1892
0
  }
1893
0
#endif
1894
1895
  /* 7. Form a final BB order */
1896
0
  count = 0;
1897
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1898
0
    if (chains[b].head == b) {
1899
0
      uint32_t tail = chains[b].tail;
1900
0
      uint32_t i = b;
1901
0
      while (1) {
1902
0
        count++;
1903
0
        ctx->cfg_schedule[count] = i;
1904
0
        if (i == tail) break;
1905
0
        i = chains[i].next;
1906
0
      }
1907
0
    }
1908
0
  }
1909
1910
0
  IR_ASSERT(ctx->cfg_schedule + count == schedule_end);
1911
0
  ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0;
1912
1913
0
  ir_mem_free(edges);
1914
0
  ir_mem_free(chains);
1915
1916
0
  return 1;
1917
0
}
1918
1919
/* A variation of "Top-down Positioning" algorithm described by
1920
 * Karl Pettis and Robert C. Hansen "Profile Guided Code Positioning"
1921
 */
1922
static int ir_schedule_blocks_top_down(ir_ctx *ctx)
1923
0
{
1924
0
  ir_bitqueue blocks;
1925
0
  uint32_t b, best_successor, last_non_empty;
1926
0
  ir_block *bb, *best_successor_bb;
1927
0
  ir_insn *insn;
1928
0
  uint32_t *list, *schedule_end;
1929
0
  uint32_t count = 0;
1930
1931
0
  ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1);
1932
0
  blocks.pos = 0;
1933
0
  list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
1934
0
  list[ctx->cfg_blocks_count + 1] = 0;
1935
0
  schedule_end = list + ctx->cfg_blocks_count;
1936
0
  for (b = 1; b <= ctx->cfg_blocks_count; b++) {
1937
0
    ir_bitset_incl(blocks.set, b);
1938
0
  }
1939
1940
0
  while ((b = ir_bitqueue_pop(&blocks)) != (uint32_t)-1) {
1941
0
    bb = &ctx->cfg_blocks[b];
1942
    /* Start trace */
1943
0
    last_non_empty = 0;
1944
0
    do {
1945
0
      if (UNEXPECTED(bb->flags & IR_BB_PREV_EMPTY_ENTRY) && ir_bitqueue_in(&blocks, b - 1)) {
1946
        /* Schedule the previous empty ENTRY block before this one */
1947
0
        uint32_t predecessor = b - 1;
1948
1949
0
        ir_bitqueue_del(&blocks, predecessor);
1950
0
        count++;
1951
0
        list[count] = predecessor;
1952
0
      }
1953
0
      if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
1954
        /* move empty blocks to the end */
1955
0
        *schedule_end = b;
1956
0
        schedule_end--;
1957
0
      } else {
1958
0
        count++;
1959
0
        list[count] = b;
1960
0
        last_non_empty = b;
1961
0
      }
1962
0
      best_successor_bb = NULL;
1963
0
      if (bb->successors_count == 1) {
1964
0
        best_successor = ctx->cfg_edges[bb->successors];
1965
0
        if (ir_bitqueue_in(&blocks, best_successor)) {
1966
0
          best_successor_bb = &ctx->cfg_blocks[best_successor];
1967
0
        }
1968
0
      } else if (bb->successors_count > 1) {
1969
0
        uint32_t prob, best_successor_prob;
1970
0
        uint32_t *p, successor;
1971
0
        ir_block *successor_bb;
1972
1973
0
        for (b = 0, p = &ctx->cfg_edges[bb->successors]; b < bb->successors_count; b++, p++) {
1974
0
          successor = *p;
1975
0
          if (ir_bitqueue_in(&blocks, successor)) {
1976
0
            successor_bb = &ctx->cfg_blocks[successor];
1977
0
            insn = &ctx->ir_base[successor_bb->start];
1978
0
            if (insn->op == IR_IF_TRUE || insn->op == IR_IF_FALSE) {
1979
0
              prob = insn->op2;
1980
0
              if (!prob) {
1981
0
                prob = 100 / bb->successors_count;
1982
0
                if (!(successor_bb->flags & IR_BB_EMPTY)) {
1983
0
                  prob++;
1984
0
                }
1985
0
              }
1986
0
            } else if (insn->op == IR_CASE_DEFAULT) {
1987
0
              prob = insn->op2;
1988
0
              if (!prob) {
1989
0
                prob = 100 / bb->successors_count;
1990
0
              }
1991
0
            } else if (insn->op == IR_CASE_VAL) {
1992
0
              prob = insn->op3;
1993
0
              if (!prob) {
1994
0
                prob = 100 / bb->successors_count;
1995
0
              }
1996
0
            } else if (insn->op == IR_ENTRY) {
1997
0
              if ((ctx->flags & IR_MERGE_EMPTY_ENTRIES) && (successor_bb->flags & IR_BB_EMPTY)) {
1998
0
                prob = 99; /* prefer empty ENTRY block to go first */
1999
0
              } else {
2000
0
                prob = 1;
2001
0
              }
2002
0
            } else {
2003
0
              prob = 100 / bb->successors_count;
2004
0
            }
2005
0
            if (!best_successor_bb
2006
0
             || successor_bb->loop_depth > best_successor_bb->loop_depth
2007
0
             || prob > best_successor_prob) {
2008
0
              best_successor = successor;
2009
0
              best_successor_bb = successor_bb;
2010
0
              best_successor_prob = prob;
2011
0
            }
2012
0
          }
2013
0
        }
2014
0
      }
2015
0
      if (!best_successor_bb) {
2016
        /* Try to continue trace using the other successor of the last IF */
2017
0
        if ((bb->flags & IR_BB_EMPTY) && last_non_empty) {
2018
0
          bb = &ctx->cfg_blocks[last_non_empty];
2019
0
          if (bb->successors_count == 2 && ctx->ir_base[bb->end].op == IR_IF) {
2020
0
            b = ctx->cfg_edges[bb->successors];
2021
2022
0
            if (!ir_bitqueue_in(&blocks, b)) {
2023
0
              b = ctx->cfg_edges[bb->successors + 1];
2024
0
            }
2025
0
            if (ir_bitqueue_in(&blocks, b)) {
2026
0
              bb = &ctx->cfg_blocks[b];
2027
0
              ir_bitqueue_del(&blocks, b);
2028
0
              continue;
2029
0
            }
2030
0
          }
2031
0
        }
2032
        /* End trace */
2033
0
        break;
2034
0
      }
2035
0
      b = best_successor;
2036
0
      bb = best_successor_bb;
2037
0
      ir_bitqueue_del(&blocks, b);
2038
0
    } while (1);
2039
0
  }
2040
2041
0
  IR_ASSERT(list + count == schedule_end);
2042
0
  ctx->cfg_schedule = list;
2043
0
  ir_bitqueue_free(&blocks);
2044
2045
0
  return 1;
2046
0
}
2047
2048
int ir_schedule_blocks(ir_ctx *ctx)
2049
0
{
2050
0
  ir_ref ref;
2051
2052
0
  if (ctx->cfg_blocks_count <= 2) {
2053
0
    return 1;
2054
0
  }
2055
2056
  /* Mark "stop" blocks termintated with UNREACHABLE as "unexpected" */
2057
0
  ref = ctx->ir_base[1].op1;
2058
0
  while (ref) {
2059
0
    ir_insn *insn = &ctx->ir_base[ref];
2060
2061
0
    if (insn->op == IR_UNREACHABLE && ctx->ir_base[insn->op1].op != IR_TAILCALL) {
2062
0
      ir_block *bb = &ctx->cfg_blocks[ctx->cfg_map[ref]];
2063
0
      uint32_t n = bb->predecessors_count;
2064
2065
0
      if (n == 1) {
2066
0
        ir_insn *start_insn = &ctx->ir_base[bb->start];
2067
0
        if (start_insn->op == IR_IF_TRUE
2068
0
         || start_insn->op == IR_IF_FALSE
2069
0
         || start_insn->op == IR_CASE_DEFAULT) {
2070
0
          if (!start_insn->op2) start_insn->op2 = 1;
2071
0
        } else if (start_insn->op == IR_CASE_VAL) {
2072
0
          if (!start_insn->op3) start_insn->op3 = 1;
2073
0
        }
2074
0
      } else if (n > 1) {
2075
0
        uint32_t *p = &ctx->cfg_edges[bb->predecessors];
2076
2077
0
        for (; n > 0; p++, n--) {
2078
0
          bb = &ctx->cfg_blocks[*p];
2079
0
          if (bb->predecessors_count == 1) {
2080
0
            ir_insn *start_insn = &ctx->ir_base[bb->start];
2081
0
            if (start_insn->op == IR_IF_TRUE
2082
0
             || start_insn->op == IR_IF_FALSE
2083
0
             || start_insn->op == IR_CASE_DEFAULT) {
2084
0
              if (!start_insn->op2) start_insn->op2 = 1;
2085
0
            } else if (start_insn->op == IR_CASE_VAL) {
2086
0
              if (!start_insn->op3) start_insn->op3 = 1;
2087
0
            }
2088
0
          }
2089
0
        }
2090
0
      }
2091
0
    }
2092
0
    ref = insn->op3;
2093
0
  }
2094
2095
  /* The bottom-up Pettis-Hansen algorithm is expensive - O(n^3),
2096
   * use it only for relatively small functions.
2097
   *
2098
   * TODO: make the choice between top-down and bottom-up algorithm configurable
2099
   */
2100
0
  if (UNEXPECTED(ctx->flags2 & IR_IRREDUCIBLE_CFG) || ctx->cfg_blocks_count > 256) {
2101
0
    return ir_schedule_blocks_top_down(ctx);
2102
0
  } else {
2103
0
    return ir_schedule_blocks_bottom_up(ctx);
2104
0
  }
2105
0
}
2106
2107
/* JMP target optimisation */
2108
uint32_t ir_skip_empty_target_blocks(const ir_ctx *ctx, uint32_t b)
2109
0
{
2110
0
  return _ir_skip_empty_blocks(ctx, b);
2111
0
}
2112
2113
uint32_t ir_next_block(const ir_ctx *ctx, uint32_t b)
2114
0
{
2115
0
  ir_block *bb;
2116
2117
0
  if (ctx->cfg_schedule) {
2118
0
    uint32_t ret = ctx->cfg_schedule[++b];
2119
2120
    /* Check for empty ENTRY block */
2121
0
    while (ret && ((ctx->cfg_blocks[ret].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY)) {
2122
0
      ret = ctx->cfg_schedule[++b];
2123
0
    }
2124
0
    return ret;
2125
0
  }
2126
2127
0
  b++;
2128
0
  while (1) {
2129
0
    if (b > ctx->cfg_blocks_count) {
2130
0
      return 0;
2131
0
    }
2132
2133
0
    bb = &ctx->cfg_blocks[b];
2134
2135
0
    if ((bb->flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY) {
2136
0
      b++;
2137
0
    } else {
2138
0
      break;
2139
0
    }
2140
0
  }
2141
0
  return b;
2142
0
}
2143
2144
void ir_get_true_false_blocks(const ir_ctx *ctx, uint32_t b, uint32_t *true_block, uint32_t *false_block)
2145
0
{
2146
0
  ir_block *bb;
2147
0
  uint32_t *p, use_block;
2148
2149
0
  *true_block = 0;
2150
0
  *false_block = 0;
2151
0
  bb = &ctx->cfg_blocks[b];
2152
0
  IR_ASSERT(ctx->ir_base[bb->end].op == IR_IF);
2153
0
  IR_ASSERT(bb->successors_count == 2);
2154
0
  p = &ctx->cfg_edges[bb->successors];
2155
0
  use_block = *p;
2156
0
  if (ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE) {
2157
0
    *true_block = ir_skip_empty_target_blocks(ctx, use_block);
2158
0
    use_block = *(p+1);
2159
0
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
2160
0
    *false_block = ir_skip_empty_target_blocks(ctx, use_block);
2161
0
  } else {
2162
0
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_FALSE);
2163
0
    *false_block = ir_skip_empty_target_blocks(ctx, use_block);
2164
0
    use_block = *(p+1);
2165
0
    IR_ASSERT(ctx->ir_base[ctx->cfg_blocks[use_block].start].op == IR_IF_TRUE);
2166
0
    *true_block = ir_skip_empty_target_blocks(ctx, use_block);
2167
0
  }
2168
0
  IR_ASSERT(*true_block && *false_block);
2169
0
}