/src/php-src/ext/opcache/jit/ir/ir_gcm.c

Source
/*
 * IR - Lightweight JIT Compilation Framework
 * (GCM - Global Code Motion and Scheduler)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 *
 * The GCM algorithm is based on Cliff Click's publication
 * See: C. Click. "Global code motion, global value numbering" Submitted to PLDI'95.
 */

#include "ir.h"
#include "ir_private.h"

#define IR_GCM_IS_SCHEDULED_EARLY(b) (((int32_t)(b)) < 0)
#define IR_GCM_EARLY_BLOCK(b)        ((uint32_t)-((int32_t)(b)))

#define IR_GCM_SPLIT 1
#define IR_SCHEDULE_SWAP_OPS 1

static uint32_t ir_gcm_schedule_early(ir_ctx *ctx, ir_ref ref, ir_list *queue_late)
{
  ir_ref n, *p, input;
  ir_insn *insn;
  uint32_t dom_depth;
  uint32_t b, result;

  insn = &ctx->ir_base[ref];

  IR_ASSERT(insn->op != IR_PARAM && insn->op != IR_VAR);
  IR_ASSERT(insn->op != IR_PHI && insn->op != IR_PI);

  result = 1;
  dom_depth = 0;

  n = insn->inputs_count;
  for (p = insn->ops + 1; n > 0; p++, n--) {
    input = *p;
    if (input > 0) {
      b = ctx->cfg_map[input];
      if (IR_GCM_IS_SCHEDULED_EARLY(b)) {
        b = IR_GCM_EARLY_BLOCK(b);
      } else if (!b) {
        b = ir_gcm_schedule_early(ctx, input, queue_late);
      }
      if (dom_depth < ctx->cfg_blocks[b].dom_depth) {
        dom_depth = ctx->cfg_blocks[b].dom_depth;
        result = b;
      }
    }
  }

  ctx->cfg_map[ref] = IR_GCM_EARLY_BLOCK(result);
  ir_list_push_unchecked(queue_late, ref);
  return result;
}

/* Last Common Ancestor */
static uint32_t ir_gcm_find_lca(ir_ctx *ctx, uint32_t b1, uint32_t b2)
{
  uint32_t dom_depth;

  dom_depth = ctx->cfg_blocks[b2].dom_depth;
  while (ctx->cfg_blocks[b1].dom_depth > dom_depth) {
    b1 = ctx->cfg_blocks[b1].dom_parent;
  }
  dom_depth = ctx->cfg_blocks[b1].dom_depth;
  while (ctx->cfg_blocks[b2].dom_depth > dom_depth) {
    b2 = ctx->cfg_blocks[b2].dom_parent;
  }
  while (b1 != b2) {
    b1 = ctx->cfg_blocks[b1].dom_parent;
    b2 = ctx->cfg_blocks[b2].dom_parent;
  }
  return b2;
}

static uint32_t ir_gcm_select_best_block(ir_ctx *ctx, ir_ref ref, uint32_t lca)
{
  ir_block *bb = &ctx->cfg_blocks[lca];
  uint32_t loop_depth = bb->loop_depth;
  uint32_t flags, best, b;

  if (!loop_depth) {
    return lca;
  }

#if 0 /* This is not necessary anymore. Conditions may be fused with IF across BBs. */
  if (ctx->ir_base[ref].op >= IR_EQ && ctx->ir_base[ref].op <= IR_UGT) {
    ir_use_list *use_list = &ctx->use_lists[ref];

    if (use_list->count == 1) {
      ir_ref use = ctx->use_edges[use_list->refs];
      ir_insn *insn = &ctx->ir_base[use];
      if (insn->op == IR_IF || insn->op == IR_GUARD || insn->op == IR_GUARD_NOT) {
        /* Don't hoist invariant comparison */
        return lca;
      }
    }
  }
#endif

  flags = (bb->flags & IR_BB_LOOP_HEADER) ? bb->flags : ctx->cfg_blocks[bb->loop_header].flags;
  if ((flags & IR_BB_LOOP_WITH_ENTRY)
   && !(ctx->binding && ir_binding_find(ctx, ref))) {
    /* Don't move loop invariant code across an OSR ENTRY if we can't restore it */
    return lca;
  }

  best = b = lca;
  do {
    b = bb->dom_parent;
    bb = &ctx->cfg_blocks[b];
    if (bb->loop_depth < loop_depth) {
      if (!bb->loop_depth) {
#if 1
        /* Avoid LICM if LOOP doesn't have a pre-header block */
        ir_block *loop_bb = &ctx->cfg_blocks[best];

        if (!(loop_bb->flags & IR_BB_LOOP_HEADER)) {
          loop_bb = &ctx->cfg_blocks[loop_bb->loop_header];
        }
        if (loop_bb->predecessors_count > 2) {
          int n = loop_bb->predecessors_count;
          uint32_t *p = ctx->cfg_edges + loop_bb->predecessors;

          while (n && *p != b) {
            n--; p++;
          }
          if (!n) {
            break;
          }
        }
#endif
        best = b;
        break;
      }
      flags = (bb->flags & IR_BB_LOOP_HEADER) ? bb->flags : ctx->cfg_blocks[bb->loop_header].flags;
      if ((flags & IR_BB_LOOP_WITH_ENTRY)
       && !(ctx->binding && ir_binding_find(ctx, ref))) {
        break;
      }
      loop_depth = bb->loop_depth;
      best = b;
    }
  } while (b != ctx->cfg_map[ref]);

  return best;
}

#if IR_GCM_SPLIT
/* Partially Dead Code Elimination through splitting the node and sinking the clones
 *
 * This code is based on the Benedikt Meurer's idea first implemented in V8.
 * See: https://codereview.chromium.org/899433005
 */

typedef struct _ir_gcm_split_data {
  ir_sparse_set totally_useful;
  ir_list       worklist;
} ir_gcm_split_data;

static void _push_predecessors(ir_ctx *ctx, ir_block *bb, ir_gcm_split_data *data)
{
  uint32_t *p, i, n = bb->predecessors_count;

  IR_ASSERT(n > 0);
  p = ctx->cfg_edges + bb->predecessors;
  do {
    i = *p;
    if (!ir_sparse_set_in(&data->totally_useful, i)) {
      ir_list_push(&data->worklist, i);
    }
    p++;
    n--;
  } while (n > 0);
}

static bool _check_successors(ir_ctx *ctx, ir_block *bb, ir_gcm_split_data *data)
{
  uint32_t *p, i, n = bb->successors_count;

  if (n <= 1) {
    IR_ASSERT(ir_sparse_set_in(&data->totally_useful, ctx->cfg_edges[bb->successors]));
    return 1;
  }

  p = ctx->cfg_edges + bb->successors;
  do {
    i = *p;
    if (!ir_sparse_set_in(&data->totally_useful, i)) {
      return 0;
    }
    p++;
    n--;
  } while (n > 0);

  return 1;
}

static bool ir_split_partially_dead_node(ir_ctx *ctx, ir_ref ref, uint32_t b)
{
  ir_use_list *use_list;
  ir_insn *insn;
  ir_ref n, *p, use;
  uint32_t i;
  ir_gcm_split_data *data = ctx->data;

  IR_ASSERT(b > 0 && b <= ctx->cfg_blocks_count);

  /* 1. Find a set of blocks where the node is TOTALLY_USEFUL (not PARTIALLY_DEAD)
   * 1.1. Collect the blocks where the node is really USED.
   */
  ir_sparse_set_clear(&data->totally_useful);

  use_list = &ctx->use_lists[ref];
  n = use_list->count;
  for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
    use = *p;
    insn = &ctx->ir_base[use];
    if (insn->op == IR_PHI) {
      ir_ref *p = insn->ops + 2; /* PHI data inputs */
      ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs */
      ir_ref n = insn->inputs_count - 1;

      for (;n > 0; p++, q++, n--) {
        if (*p == ref) {
          i = ctx->cfg_map[*q];
          IR_ASSERT(i > 0 && i <= ctx->cfg_blocks_count);
          if (!ir_sparse_set_in(&data->totally_useful, i)) {
            if (i == b) return 0; /* node is totally-useful in the scheduled block */
            ir_sparse_set_add(&data->totally_useful, i);
          }
        }
      }
    } else {
      i = ctx->cfg_map[use];
      if (!i) {
        continue;
      }
      IR_ASSERT(i > 0 && i <= ctx->cfg_blocks_count);
      if (!ir_sparse_set_in(&data->totally_useful, i)) {
        if (i == b) return 0; /* node is totally-useful in the scheduled block */
        ir_sparse_set_add(&data->totally_useful, i);
      }
    }
  }

#ifdef IR_DEBUG
  if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
    bool first = 1;
    fprintf(stderr, "*** Split partially dead node d_%d scheduled to BB%d\n", ref, b);
    IR_SPARSE_SET_FOREACH(&data->totally_useful, i) {
      if (first) {
        fprintf(stderr, "\td_%d is USED in [BB%d", ref, i);
        first = 0;
      } else {
        fprintf(stderr, ", BB%d", i);
      }
    } IR_SPARSE_SET_FOREACH_END();
    fprintf(stderr, "]\n");
  }
#endif

  /* 1.2. Iteratively check the predecessors of already found TOTALLY_USEFUL blocks and
   *      add them into TOTALLY_USEFUL set if all of their successors are already there.
   */
  IR_SPARSE_SET_FOREACH(&data->totally_useful, i) {
    _push_predecessors(ctx, &ctx->cfg_blocks[i], data);
  } IR_SPARSE_SET_FOREACH_END();

  while (ir_list_len(&data->worklist)) {
    i = ir_list_pop(&data->worklist);
    if (!ir_sparse_set_in(&data->totally_useful, i)) {
      ir_block *bb = &ctx->cfg_blocks[i];

      if (_check_successors(ctx, bb, data)) {
        if (i == b) {
          /* node is TOTALLY_USEFUL in the scheduled block */
          ir_list_clear(&data->worklist);
          return 0;
        }
        ir_sparse_set_add(&data->totally_useful, i);
        _push_predecessors(ctx, bb, data);
      }
    }
  }

  IR_ASSERT(!ir_sparse_set_in(&data->totally_useful, b));

#ifdef IR_DEBUG
  if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
    bool first = 1;
    IR_SPARSE_SET_FOREACH(&data->totally_useful, i) {
      if (first) {
        fprintf(stderr, "\td_%d is TOTALLY_USEFUL in [BB%d", ref, i);
        first = 0;
      } else {
        fprintf(stderr, ", BB%d", i);
      }
    } IR_SPARSE_SET_FOREACH_END();
    fprintf(stderr, "]\n");
  }
#endif

  /* 2. Split the USEs into partitions */
  use_list = &ctx->use_lists[ref];
  ir_hashtab hash;
  uint32_t j, clone, clones_count = 0, uses_count = 0;
  struct {
    ir_ref   ref;
    uint32_t block;
    uint32_t lca;
    uint32_t use_count;
    uint32_t use;
  } *clones = ir_mem_malloc(sizeof(*clones) * use_list->count);
  struct {
    ir_ref   ref;
    uint32_t block;
    uint32_t next;
  } *uses = ir_mem_malloc(sizeof(*uses) * use_list->count);

  ir_hashtab_init(&hash, use_list->count);
  n = use_list->count;
  for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
    use = *p;
    insn = &ctx->ir_base[use];
    if (insn->op == IR_PHI) {
      ir_ref *p = insn->ops + 2; /* PHI data inputs */
      ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs */
      ir_ref n = insn->inputs_count - 1;

      /* PHIs must be processed once */
      if (ir_hashtab_find(&hash, -use) != (ir_ref)IR_INVALID_VAL) {
        continue;
      }
      ir_hashtab_add(&hash, -use, IR_NULL);
      for (;n > 0; p++, q++, n--) {
        if (*p == ref) {
          j = i = ctx->cfg_map[*q];
          while (ir_sparse_set_in(&data->totally_useful, ctx->cfg_blocks[j].idom)) {
            j = ctx->cfg_blocks[j].idom;
          }
          clone = ir_hashtab_find(&hash, j);
          if (clone == IR_INVALID_VAL) {
            clone = clones_count++;
            ir_hashtab_add(&hash, j, clone);
            clones[clone].block = j;
            clones[clone].lca = i;
            clones[clone].use_count = 0;
            clones[clone].use = (uint32_t)-1;
          } else {
            clones[clone].lca = ir_gcm_find_lca(ctx, clones[clone].lca, i);
          }
          uses[uses_count].ref = use;
          uses[uses_count].block = i;
          uses[uses_count].next = clones[clone].use;
          clones[clone].use_count++;
          clones[clone].use = uses_count++;
        }
      }
    } else {
      j = i = ctx->cfg_map[use];
      if (i) {
        IR_ASSERT(i > 0);
        while (ir_sparse_set_in(&data->totally_useful, ctx->cfg_blocks[j].idom)) {
          j = ctx->cfg_blocks[j].idom;
        }
      }
      clone = ir_hashtab_find(&hash, j);
      if (clone == IR_INVALID_VAL) {
        clone = clones_count++;
        ir_hashtab_add(&hash, j, clone);
        clones[clone].block = j;
        clones[clone].lca = i;
        clones[clone].use_count = 0;
        clones[clone].use = -1;
      } else {
        clones[clone].lca = ir_gcm_find_lca(ctx, clones[clone].lca, i);
      }
      uses[uses_count].ref = use;
      uses[uses_count].block = i;
      uses[uses_count].next = clones[clone].use;
      clones[clone].use_count++;
      clones[clone].use = uses_count++;
    }
  }

  /* Select best blocks to insert clones */
  for (i = 0; i < clones_count; i++) {
    uint32_t b0 = clones[i].block;
    uint32_t lca = clones[i].lca;

    if (b0 != lca) {
      ir_block *bb = &ctx->cfg_blocks[lca];
      uint32_t loop_depth = bb->loop_depth;

      if (loop_depth) {
        uint32_t b;
        uint32_t best;

        best = b = lca;
        do {
          b = bb->dom_parent;
          bb = &ctx->cfg_blocks[b];
          if (bb->loop_depth < loop_depth) {
            if (!bb->loop_depth) {
              best = b;
              break;
            }
            loop_depth = bb->loop_depth;
            best = b;
          }
        } while (b != b0);
        lca = best;
      }
      clones[i].block = lca;
    }
  }

  // TODO: instead of inserting clone into the block where the expressin is partially available,
  //       we should insert PHI and the actual clones into the block sources where it's not available
  //       (similar to SSAPRE)

#ifdef IR_DEBUG
  if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
    for (i = 0; i < clones_count; i++) {
      uint32_t u = clones[i].use;

      fprintf(stderr, "\tCLONE #%d in BB%d USES(%d)=[d_%d/BB%d",
        i, clones[i].block, clones[i].use_count, uses[u].ref, uses[u].block);
      u = uses[u].next;
      while (u != (uint32_t)-1) {
        fprintf(stderr, ", d_%d/BB%d", uses[u].ref, uses[u].block);
        u = uses[u].next;
      }
      fprintf(stderr, "]\n");
    }
  }
#endif

  /* Create Clones */
  insn = &ctx->ir_base[ref];
  clones[0].ref = ref;
  for (i = 1; i < clones_count; i++) {
    clones[i].ref = clone = ir_emit(ctx, insn->optx, insn->op1, insn->op2, insn->op3);
    insn = &ctx->ir_base[ref];
    /* Depending on the flags in IR_OPS, these can be references or data. */
    if (insn->op1 > 0 && insn->inputs_count >= 1) ir_use_list_add(ctx, insn->op1, clone);
    if (insn->op2 > 0 && insn->inputs_count >= 2) ir_use_list_add(ctx, insn->op2, clone);
    if (insn->op3 > 0 && insn->inputs_count >= 3) ir_use_list_add(ctx, insn->op3, clone);
  }

  /* Reconstruct IR: Update DEF->USE lists, CFG mapping and etc */
  n = ctx->use_lists[ref].refs;
  for (i = 0; i < clones_count; i++) {
    clone = clones[i].ref;
    if (clones[i].block
     && clones[i].use_count == 1
     && ctx->cfg_blocks[clones[i].block].loop_depth >= ctx->cfg_blocks[uses[clones[i].use].block].loop_depth) {
      /* TOTALLY_USEFUL block may be a head of a diamond above the real usage.
       * Sink it down to the real usage block.
       * Clones with few uses will be sunk into the LCA block.
       */
      clones[i].block = uses[clones[i].use].block;
    }
    ctx->cfg_map[clone] = clones[i].block;
    ctx->use_lists[clone].count = clones[i].use_count;
    ctx->use_lists[clone].refs = n;

    uint32_t u = clones[i].use;
    while (u != (uint32_t)-1) {
      uint32_t src = uses[u].block;
      use = uses[u].ref;
      ctx->use_edges[n++] = use;
      u = uses[u].next;
      if (i > 0) {
        /* replace inputs */
        ir_insn *insn = &ctx->ir_base[use];
        ir_ref k, l = insn->inputs_count;

        if (insn->op == IR_PHI) {
          ir_insn *merge = &ctx->ir_base[insn->op1];
          for (k = 2; k <= l; k++) {
            j = ctx->cfg_map[ir_insn_op(merge, k - 1)];
            if (j == src) {
              IR_ASSERT(ir_insn_op(insn, k) == ref);
              if (j != clones[i].block) {
                uint32_t dom_depth = ctx->cfg_blocks[clones[i].block].dom_depth;
                while (ctx->cfg_blocks[j].dom_depth > dom_depth) {
                  j = ctx->cfg_blocks[j].dom_parent;
                }
                if (j != clones[i].block) {
                  continue;
                }
              }
              ir_insn_set_op(insn, k, clone);
              break;
            }
          }
        } else {
          for (k = 1; k <= l; k++) {
            if (ir_insn_op(insn, k) == ref) {
              ir_insn_set_op(insn, k, clone);
              break;
            }
          }
        }
      }
    }
  }

  ir_mem_free(uses);
  ir_mem_free(clones);
  ir_hashtab_free(&hash);

#ifdef IR_DEBUG
  if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
    ir_check(ctx);
  }
#endif

  return 1;
}
#endif

#ifdef IR_DEBUG
static bool ir_gcm_dominates(ir_ctx *ctx, uint32_t b1, uint32_t b2)
{
  uint32_t b1_depth = ctx->cfg_blocks[b1].dom_depth;
  const ir_block *bb2 = &ctx->cfg_blocks[b2];

  while (bb2->dom_depth > b1_depth) {
    b2 = bb2->dom_parent;
    bb2 = &ctx->cfg_blocks[b2];
  }
  return b1 == b2;
}
#endif

static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
{
  ir_ref n, use;
  uint32_t lca = 0;

  IR_ASSERT(ctx->ir_base[ref].op != IR_PARAM && ctx->ir_base[ref].op != IR_VAR);
  IR_ASSERT(ctx->ir_base[ref].op != IR_PHI && ctx->ir_base[ref].op != IR_PI);

  IR_ASSERT(IR_GCM_IS_SCHEDULED_EARLY(b));
  b = IR_GCM_EARLY_BLOCK(b);
  ctx->cfg_map[ref] = b;

  for (n = 0; n < ctx->use_lists[ref].count; n++) {
    use = ctx->use_edges[ctx->use_lists[ref].refs + n];
    b = ctx->cfg_map[use];
    if (IR_GCM_IS_SCHEDULED_EARLY(b)) {
      ir_gcm_schedule_late(ctx, use, b);
      b = ctx->cfg_map[use];
      IR_ASSERT(b != 0);
    } else if (!b) {
      continue;
    } else if (ctx->ir_base[use].op == IR_PHI) {
      ir_insn *insn = &ctx->ir_base[use];
      ir_ref *p = insn->ops + 2; /* PHI data inputs */
      ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs */
      ir_ref n = insn->inputs_count - 1;

      for (;n > 0; p++, q++, n--) {
        if (*p == ref) {
          b = ctx->cfg_map[*q];
          lca = !lca ? b : ir_gcm_find_lca(ctx, lca, b);
        }
      }
      continue;
    }
    lca = !lca ? b : ir_gcm_find_lca(ctx, lca, b);
  }

  IR_ASSERT(lca != 0 && "No Common Ancestor");
  IR_ASSERT(ir_gcm_dominates(ctx, ctx->cfg_map[ref], lca) && "Early placement doesn't dominate the late");

#if IR_GCM_SPLIT
  if (ctx->use_lists[ref].count > 1
   && ir_split_partially_dead_node(ctx, ref, lca)) {
    return;
  }
#endif

  if (lca != ctx->cfg_map[ref]) {
    b = ir_gcm_select_best_block(ctx, ref, lca);

    ctx->cfg_map[ref] = b;

    /* OVERFLOW is a projection of ADD/SUB/MUL_OV and must be scheduled into the same block */
    if (ctx->ir_base[ref].op >= IR_ADD_OV && ctx->ir_base[ref].op <= IR_MUL_OV) {
      ir_use_list *use_list = &ctx->use_lists[ref];
      ir_ref n, *p, use;

      for (n = use_list->count, p = &ctx->use_edges[use_list->refs]; n < 0; p++, n--) {
        use = *p;
        if (ctx->ir_base[use].op == IR_OVERFLOW) {
          ctx->cfg_map[use] = b;
          break;
        }
      }
    }
  }
}

int ir_gcm(ir_ctx *ctx)
{
  ir_ref k, n, *p, ref;
  ir_block *bb;
  ir_list queue_early;
  ir_list queue_late;
  uint32_t *_blocks, b;
  ir_insn *insn, *use_insn;
  ir_use_list *use_list;

  IR_ASSERT(ctx->cfg_map);
  _blocks = ctx->cfg_map;

  ir_list_init(&queue_early, ctx->insns_count);

  if (ctx->cfg_blocks_count == 1) {
    ref = ctx->cfg_blocks[1].end;
    do {
      insn = &ctx->ir_base[ref];
      _blocks[ref] = 1; /* pin to block */
      if (insn->inputs_count > 1) {
        /* insn has input data edges */
        ir_list_push_unchecked(&queue_early, ref);
      }
      ref = insn->op1; /* control predecessor */
    } while (ref != 1); /* IR_START */
    _blocks[1] = 1; /* pin to block */

    use_list = &ctx->use_lists[1];
    n = use_list->count;
    for (p = &ctx->use_edges[use_list->refs]; n > 0; n--, p++) {
      ref = *p;
      use_insn = &ctx->ir_base[ref];
      if (use_insn->op == IR_PARAM || use_insn->op == IR_VAR) {
        ctx->cfg_blocks[1].flags |= (use_insn->op == IR_PARAM) ? IR_BB_HAS_PARAM : IR_BB_HAS_VAR;
        _blocks[ref] = 1; /* pin to block */
      }
    }

    /* Place all live nodes to the first block */
    while (ir_list_len(&queue_early)) {
      ref = ir_list_pop(&queue_early);
      insn = &ctx->ir_base[ref];
      n = insn->inputs_count;
      for (p = insn->ops + 1; n > 0; p++, n--) {
        ref = *p;
        if (ref > 0 && _blocks[ref] == 0) {
          _blocks[ref] = 1;
          ir_list_push_unchecked(&queue_early, ref);
        }
      }
    }

    ir_list_free(&queue_early);

    return 1;
  }

  ir_list_init(&queue_late, ctx->insns_count);

  /* pin and collect control and control depended (PARAM, VAR, PHI, PI) instructions */
  b = ctx->cfg_blocks_count;
  for (bb = ctx->cfg_blocks + b; b > 0; bb--, b--) {
    IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
    ref = bb->end;

    /* process the last instruction of the block */
    insn = &ctx->ir_base[ref];
    _blocks[ref] = b; /* pin to block */
    if (insn->inputs_count > 1) {
      /* insn has input data edges */
      ir_list_push_unchecked(&queue_early, ref);
    }
    ref = insn->op1; /* control predecessor */

    while (ref != bb->start) {
      insn = &ctx->ir_base[ref];
      _blocks[ref] = b; /* pin to block */
      if (insn->inputs_count > 1) {
        /* insn has input data edges */
        ir_list_push_unchecked(&queue_early, ref);
      }
      if (insn->type != IR_VOID) {
        IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_MEM);
      }
      ref = insn->op1; /* control predecessor */
    }

    /* process the first instruction of the block */
    _blocks[ref] = b; /* pin to block */

    use_list = &ctx->use_lists[ref];
    n = use_list->count;
    if (n > 1) {
      for (p = &ctx->use_edges[use_list->refs]; n > 0; n--, p++) {
        ref = *p;
        use_insn = &ctx->ir_base[ref];
        if (use_insn->op == IR_PHI || use_insn->op == IR_PI) {
          bb->flags |= (use_insn->op == IR_PHI) ? IR_BB_HAS_PHI : IR_BB_HAS_PI;
          if (EXPECTED(ctx->use_lists[ref].count != 0)) {
            _blocks[ref] = b; /* pin to block */
            ir_list_push_unchecked(&queue_early, ref);
          }
        } else if (use_insn->op == IR_PARAM) {
          bb->flags |= IR_BB_HAS_PARAM;
          _blocks[ref] = b; /* pin to block */
        } else if (use_insn->op == IR_VAR) {
          bb->flags |= IR_BB_HAS_VAR;
          _blocks[ref] = b; /* pin to block */
        }
      }
    }
  }

  n = ir_list_len(&queue_early);
  while (n > 0) {
    n--;
    ref = ir_list_at(&queue_early, n);
    insn = &ctx->ir_base[ref];
    k = insn->inputs_count - 1;
    for (p = insn->ops + 2; k > 0; p++, k--) {
      ref = *p;
      if (ref > 0 && _blocks[ref] == 0) {
        ir_gcm_schedule_early(ctx, ref, &queue_late);
      }
    }
  }

#ifdef IR_DEBUG
  if (ctx->flags & IR_DEBUG_GCM) {
    fprintf(stderr, "GCM Schedule Early\n");
    for (n = 1; n < ctx->insns_count; n++) {
      fprintf(stderr, "%d -> %d\n", n, ctx->cfg_map[n]);
    }
  }
#endif

#if IR_GCM_SPLIT
  ir_gcm_split_data data;

  ir_sparse_set_init(&data.totally_useful, ctx->cfg_blocks_count + 1);
  ir_list_init(&data.worklist, ctx->cfg_blocks_count + 1);
  ctx->data = &data;
#endif

  n = ir_list_len(&queue_late);
  while (n > 0) {
    n--;
    ref = ir_list_at(&queue_late, n);
    b = ctx->cfg_map[ref];
    if (IR_GCM_IS_SCHEDULED_EARLY(b)) {
      ir_gcm_schedule_late(ctx, ref, b);
    }
  }

#if IR_GCM_SPLIT
  ir_list_free(&data.worklist);
  ir_sparse_set_free(&data.totally_useful);
  ctx->data = NULL;
#endif

  ir_list_free(&queue_early);
  ir_list_free(&queue_late);

#ifdef IR_DEBUG
  if (ctx->flags & IR_DEBUG_GCM) {
    fprintf(stderr, "GCM Schedule Late\n");
    for (n = 1; n < ctx->insns_count; n++) {
      fprintf(stderr, "%d -> %d\n", n, ctx->cfg_map[n]);
    }
  }
#endif

  return 1;
}

static void ir_xlat_binding(ir_ctx *ctx, ir_ref *_xlat)
{
  uint32_t n1, n2, pos;
  ir_ref key;
  ir_hashtab_bucket *b1, *b2;
  ir_hashtab *binding = ctx->binding;
  uint32_t hash_size = (uint32_t)(-(int32_t)binding->mask);

  memset((char*)binding->data - (hash_size * sizeof(uint32_t)), -1, hash_size * sizeof(uint32_t));
  n1 = binding->count;
  n2 = 0;
  pos = 0;
  b1 = binding->data;
  b2 = binding->data;
  while (n1 > 0) {
    key = b1->key;
    IR_ASSERT(key < ctx->insns_count);
    if (_xlat[key]) {
      key = _xlat[key];
      b2->key = key;
      if (b1->val > 0) {
        IR_ASSERT(_xlat[b1->val]);
        b2->val = _xlat[b1->val];
      } else {
        b2->val = b1->val;
      }
      key |= binding->mask;
      b2->next = ((uint32_t*)binding->data)[key];
      ((uint32_t*)binding->data)[key] = pos;
      pos += sizeof(ir_hashtab_bucket);
      b2++;
      n2++;
    }
    b1++;
    n1--;
  }
  binding->count = n2;
}

IR_ALWAYS_INLINE ir_ref ir_count_constant(ir_ref *_xlat, ir_ref ref)
{
  if (!_xlat[ref]) {
    _xlat[ref] = ref; /* this is only a "used constant" marker */
    return 1;
  }
  return 0;
}

IR_ALWAYS_INLINE bool ir_is_good_bb_order(ir_ctx *ctx, uint32_t b, ir_block *bb, ir_ref start)
{
  ir_insn *insn = &ctx->ir_base[start];
  uint32_t n = insn->inputs_count;
  ir_ref *p = insn->ops + 1;

  if (n == 1) {
    return ctx->cfg_map[*p] < b;
  } else {
    IR_ASSERT(n > 1);
    for (; n > 0; p++, n--) {
      ir_ref input = *p;

      if (!IR_IS_CONST_REF(input)) {
        uint32_t input_b = ctx->cfg_map[input];

        if (input_b < b) {
          /* ordered */
        } else if ((bb->flags & IR_BB_LOOP_HEADER)
          && (input_b == b || ctx->cfg_blocks[input_b].loop_header == b)) {
          /* back-edge of reducible loop */
        } else if ((bb->flags & IR_BB_IRREDUCIBLE_LOOP)
          && (ctx->cfg_blocks[input_b].loop_header == bb->loop_header)) {
          /* closing edge of irreducible loop */
        } else {
          return 0;
        }
      }
    }
    return 1;
  }
}

static IR_NEVER_INLINE void ir_fix_bb_order(ir_ctx *ctx, ir_ref *_prev, ir_ref *_next)
{
  uint32_t b, succ, count, *q, *xlat;
  ir_block *bb;
  ir_ref ref, n, prev;
  ir_worklist worklist;
  ir_block *new_blocks;

#if 0
  for (b = 1, bb = ctx->cfg_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) {
    if (!ir_is_good_bb_order(ctx, b, bb, bb->start)) {
      goto fix;
    }
  }
  return;

fix:
#endif
  count = ctx->cfg_blocks_count + 1;
  new_blocks = ir_mem_malloc(count * sizeof(ir_block));
  xlat = ir_mem_malloc(count * sizeof(uint32_t));
  ir_worklist_init(&worklist, count);
  ir_worklist_push(&worklist, 1);
  /* Schedule blocks bottom-up. Place block only after all its successurs (except back-edges) are placed. */
  while (ir_worklist_len(&worklist) != 0) {
next:
    b = ir_worklist_peek(&worklist);
    bb = &ctx->cfg_blocks[b];
    n = bb->successors_count;
    if (n == 1) {
      succ = ctx->cfg_edges[bb->successors];
      if (ir_bitset_in(worklist.visited, succ)) {
        /* already processed */
      } else if ((ctx->cfg_blocks[succ].flags & IR_BB_IRREDUCIBLE_LOOP)
          && ((ctx->cfg_blocks[b].flags & IR_BB_LOOP_HEADER) ?
            (ctx->cfg_blocks[succ].loop_header != b) :
            (ctx->cfg_blocks[succ].loop_header != ctx->cfg_blocks[b].loop_header))) {
        /* "side" entry of irreducible loop (ignore) */
      } else if (ir_worklist_push(&worklist, succ)) {
        goto next;
      }
    } else if (n > 1) {
      uint32_t best = 0;
      uint32_t best_loop_depth = 0;

      q = ctx->cfg_edges + bb->successors + n;
      do {
        q--;
        succ = *q;
        if (ir_bitset_in(worklist.visited, succ)) {
          /* already processed */
        } else if ((ctx->cfg_blocks[succ].flags & IR_BB_IRREDUCIBLE_LOOP)
            && ((ctx->cfg_blocks[b].flags & IR_BB_LOOP_HEADER) ?
              (ctx->cfg_blocks[succ].loop_header != b) :
              (ctx->cfg_blocks[succ].loop_header != ctx->cfg_blocks[b].loop_header))) {
          /* "side" entry of irreducible loop (ignore) */
        } else if (!best) {
          best = succ;
          best_loop_depth = ctx->cfg_blocks[best].loop_depth;
        } else if (ctx->cfg_blocks[succ].loop_depth < best_loop_depth) {
          /* prefer deeper loop */
          best = succ;
          best_loop_depth = ctx->cfg_blocks[best].loop_depth;
        }
        n--;
      } while (n > 0);
      if (best) {
        ir_worklist_push(&worklist, best);
        goto next;
      }
    }

    /* All successors of "b" are placed. Now we can place "b" itself. */
    ir_worklist_pop(&worklist);
    count--;
    new_blocks[count] = *bb;
    xlat[b] = count;
  }
  IR_ASSERT(count == 1);
  xlat[0] = 0;
  ir_worklist_free(&worklist);

  prev = 0;
  for (b = 1, bb = new_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) {
    bb->idom = xlat[bb->idom];
    bb->loop_header = xlat[bb->loop_header];
    n = bb->successors_count;
    if (n > 0) {
      for (q = ctx->cfg_edges + bb->successors; n > 0; q++, n--) {
        *q = xlat[*q];
      }
    }
    n = bb->predecessors_count;
    if (n > 0) {
      for (q = ctx->cfg_edges + bb->predecessors; n > 0; q++, n--) {
        *q = xlat[*q];
      }
    }
    _next[prev] = bb->start;
    _prev[bb->start] = prev;
    prev = bb->end;
  }
  _next[0] = 0;
  _next[prev] = 0;

  for (ref = 2; ref < ctx->insns_count; ref++) {
    ctx->cfg_map[ref] = xlat[ctx->cfg_map[ref]];
  }
  ir_mem_free(xlat);

  ir_mem_free(ctx->cfg_blocks);
  ctx->cfg_blocks = new_blocks;
}

#if IR_DEBUG
static void ir_schedule_print_list(const ir_ctx *ctx, uint32_t b, const ir_ref *_next,
                                   ir_ref start, ir_ref end, const char *label)
{
  ir_ref ref;

  fprintf(stderr, "  %s [%d", label, start);
  ref = _next[start];
  while (ref != end) {
    fprintf(stderr, ",%d", ref);
    ref = _next[ref];
  }
  fprintf(stderr, ",%d]\n", ref);
}
#endif

/* Simple Stable Topological Sort */
static void ir_schedule_topsort(const ir_ctx *ctx, uint32_t b, const ir_block *bb,
                                ir_ref *_xlat, ir_ref *_next, ir_ref *_prev,
                                ir_ref ref, ir_ref end,
                                ir_ref *insns_count, ir_ref *consts_count)
{
  ir_ref i = ref;
  const ir_insn *insn;

  if (bb->successors_count > 1) {
    ir_ref input, j = bb->end;
    ir_insn *end = &ctx->ir_base[j];

    if (end->op == IR_IF) {
      /* Move condition closer to IF */
      input = end->op2;
      if (input > 0
       && ctx->cfg_map[input] == b
       && !_xlat[input]
       && _prev[j] != input
       && (!(ir_op_flags[ctx->ir_base[input].op] & IR_OP_FLAG_CONTROL) || end->op1 == input)) {
        if (input == i) {
          i = _next[i];
          insn = &ctx->ir_base[i];
        }
        /* remove "input" */
        _prev[_next[input]] = _prev[input];
        _next[_prev[input]] = _next[input];
        /* insert before "j" */
        _prev[input] = _prev[j];
        _next[input] = j;
        _next[_prev[j]] = input;
        _prev[j] = input;
      }
    }
  }

  while (i != end) {
    ir_ref n, j, input;
    const ir_ref *p;

restart:
    IR_ASSERT(ctx->cfg_map[i] == b);
    insn = &ctx->ir_base[i];
    n = insn->inputs_count;
    for (j = n, p = insn->ops + 1; j > 0; p++, j--) {
      input = *p;
      if (!_xlat[input]) {
        /* input is not scheduled yet */
        if (input > 0) {
          if (ctx->cfg_map[input] == b) {
            /* "input" should be before "i" to satisfy dependency */
#ifdef IR_DEBUG
            if (ctx->flags & IR_DEBUG_SCHEDULE) {
              fprintf(stderr, "Wrong dependency %d:%d -> %d\n", b, input, i);
            }
#endif
            /* remove "input" */
            _prev[_next[input]] = _prev[input];
            _next[_prev[input]] = _next[input];
            /* insert before "i" */
            _prev[input] = _prev[i];
            _next[input] = i;
            _next[_prev[i]] = input;
            _prev[i] = input;
            /* restart from "input" */
            i = input;
            goto restart;
          }
        } else if (input < IR_TRUE) {
          *consts_count += ir_count_constant(_xlat, input);
        }
      }
    }

    _xlat[i] = *insns_count;
    *insns_count += ir_insn_inputs_to_len(n);
    IR_ASSERT(_next[i] != IR_UNUSED);
    i = _next[i];
  }
}

int ir_schedule(ir_ctx *ctx)
{
  ir_ref i, j, k, n, *p, *q, ref, new_ref, prev_ref, insns_count, consts_count, use_edges_count;
  ir_ref *_xlat;
  ir_ref *edges;
  ir_ref prev_b_end;
  uint32_t b;
  ir_ref *_next = ir_mem_malloc(ctx->insns_count * sizeof(ir_ref));
  ir_ref *_prev = ir_mem_malloc(ctx->insns_count * sizeof(ir_ref));
  ir_block *bb;
  ir_insn *insn, *new_insn, *base;
  ir_use_list *lists, *use_list, *new_list;
  bool bad_bb_order = 0;

  /* Create a double-linked list of nodes ordered by BB, respecting BB->start and BB->end */
  IR_ASSERT(ctx->cfg_map[1] == 1);

  /* link BB boundaries */
  _prev[1] = 0;
  prev_b_end = ctx->cfg_blocks[1].end;
  _next[1] = prev_b_end;
  _prev[prev_b_end] = 1;
  for (b = 2, bb = ctx->cfg_blocks + 2; b <= ctx->cfg_blocks_count; b++, bb++) {
    ir_ref start = bb->start;
    ir_ref end = bb->end;
    _next[prev_b_end] = start;
    _prev[start] = prev_b_end;
    _next[start] = end;
    _prev[end] = start;
    prev_b_end = end;
    if (!ir_is_good_bb_order(ctx, b, bb, start)) {
      bad_bb_order = 1;
    }
  }
  _next[prev_b_end] = 0;

  /* insert intermediate BB nodes */
  use_edges_count = ctx->use_lists[1].count;
  for (i = 2, use_list = &ctx->use_lists[i]; i < ctx->insns_count; use_list++, i++) {
    b = ctx->cfg_map[i];
    if (!b) continue;
    use_edges_count += use_list->count;
    bb = &ctx->cfg_blocks[b];
    if (i != bb->start && i != bb->end) {
      /* insert before "end" */
      ir_ref next = bb->end;
      ir_ref prev = _prev[next];
      _prev[i] = prev;
      _next[i] = next;
      _next[prev] = i;
      _prev[next] = i;
    }
  }

  if (bad_bb_order) {
    ir_fix_bb_order(ctx, _prev, _next);
  }

  _xlat = ir_mem_calloc((ctx->consts_count + ctx->insns_count), sizeof(ir_ref));
  _xlat += ctx->consts_count;
  _xlat[IR_TRUE] = IR_TRUE;
  _xlat[IR_FALSE] = IR_FALSE;
  _xlat[IR_NULL] = IR_NULL;
  _xlat[IR_UNUSED] = IR_UNUSED;
  insns_count = 1;
  consts_count = -(IR_TRUE - 1);

  /* Schedule instructions inside each BB (now just topological sort according to dependencies) */
  for (b = 1, bb = ctx->cfg_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) {
    ir_ref start;

#ifdef IR_DEBUG
    if (ctx->flags & IR_DEBUG_SCHEDULE) {
      fprintf(stderr, "BB%d\n", b);
      ir_schedule_print_list(ctx, b, _next, bb->start, bb->end, "INITIAL");
    }
#endif

    IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
    /* Schedule BB start */
    start = i = bb->start;
    _xlat[i] = bb->start = insns_count;
    insn = &ctx->ir_base[i];
    if (insn->op == IR_BEGIN) {
      if (insn->op2) {
        consts_count += ir_count_constant(_xlat, insn->op2);
      }
    } else if (insn->op == IR_CASE_VAL) {
      IR_ASSERT(insn->op2 < IR_TRUE);
      consts_count += ir_count_constant(_xlat, insn->op2);
    } else if (insn->op == IR_CASE_RANGE) {
      IR_ASSERT(insn->op2 < IR_TRUE);
      consts_count += ir_count_constant(_xlat, insn->op2);
      IR_ASSERT(insn->op3 < IR_TRUE);
      consts_count += ir_count_constant(_xlat, insn->op3);
    }
    n = insn->inputs_count;
    insns_count += ir_insn_inputs_to_len(n);
    i = _next[i];
    insn = &ctx->ir_base[i];
    if (bb->flags & (IR_BB_HAS_PHI|IR_BB_HAS_PI|IR_BB_HAS_PARAM|IR_BB_HAS_VAR)) {
      int count = 0;

      /* Schedule PARAM, VAR, PI */
      while (insn->op == IR_PARAM || insn->op == IR_VAR || insn->op == IR_PI) {
        _xlat[i] = insns_count;
        insns_count += 1;
        i = _next[i];
        insn = &ctx->ir_base[i];
        count++;
      }
      /* Schedule PHIs */
      while (insn->op == IR_PHI) {
        ir_ref j, *p, input;

        _xlat[i] = insns_count;
        /* Reuse "n" from MERGE and skip first input */
        insns_count += ir_insn_inputs_to_len(n + 1);
        for (j = n, p = insn->ops + 2; j > 0; p++, j--) {
          input = *p;
          if (input < IR_TRUE) {
            consts_count += ir_count_constant(_xlat, input);
          }
        }
        i = _next[i];
        insn = &ctx->ir_base[i];
        count++;
      }
      /* Schedule remaining PHIs */
      if (UNEXPECTED(count < ctx->use_lists[start].count - 1)) {
        ir_use_list *use_list = &ctx->use_lists[start];
        ir_ref *p, count = use_list->count;
        ir_ref phis = _prev[i];

        for (p = &ctx->use_edges[use_list->refs]; count > 0; p++, count--) {
          ir_ref use = *p;
          ir_insn *use_insn = &ctx->ir_base[use];
          if (!_xlat[use] && ctx->cfg_map[use]) {
            if (use_insn->op == IR_PARAM
             || use_insn->op == IR_VAR
             || use_insn->op == IR_PI
             || use_insn->op == IR_PHI) {
              IR_ASSERT(ctx->cfg_map[use] == b);
              if (_prev[use] != phis) {
                /* remove "use" */
                _prev[_next[use]] = _prev[use];
                _next[_prev[use]] = _next[use];
                /* insert "use" after "phis" */
                _prev[use] = phis;
                _next[use] = _next[phis];
                _prev[_next[phis]] = use;
                _next[phis] = use;
              }
              phis = use;
              _xlat[use] = insns_count;
              if (use_insn->op == IR_PHI) {
                ir_ref *q;
                /* Reuse "n" from MERGE and skip first input */
                insns_count += ir_insn_inputs_to_len(n + 1);
                for (j = n, q = use_insn->ops + 2; j > 0; q++, j--) {
                  ir_ref input = *q;
                  if (input < IR_TRUE) {
                    consts_count += ir_count_constant(_xlat, input);
                  }
                }
              } else {
                insns_count += 1;
              }
            }
          }
        }
        i = _next[phis];
        insn = &ctx->ir_base[i];
      }
    }

    if (i != bb->end) {
      ir_schedule_topsort(ctx, b, bb, _xlat, _next, _prev, i, bb->end, &insns_count, &consts_count);
    }

#ifdef IR_DEBUG
    if (ctx->flags & IR_DEBUG_SCHEDULE) {
      ir_schedule_print_list(ctx, b, _next, start, bb->end, "  FINAL");
    }
#endif

    /* Schedule BB end */
    i = bb->end;
    insn = &ctx->ir_base[i];
    _xlat[i] = bb->end = insns_count;
    insns_count++;
    if (IR_INPUT_EDGES_COUNT(ir_op_flags[insn->op]) == 2) {
      if (insn->op2 < IR_TRUE) {
        consts_count += ir_count_constant(_xlat, insn->op2);
      }
    }
  }

#if 1
  /* Check if scheduling didn't make any modifications */
  if (consts_count == ctx->consts_count && insns_count == ctx->insns_count) {
    bool changed = 0;

    for (i = 1; i != 0; i = _next[i]) {
      if (_xlat[i] != i) {
        changed = 1;
        break;
      }
    }
    if (!changed) {
      _xlat -= ctx->consts_count;
      ir_mem_free(_xlat);
      ir_mem_free(_next);

      ctx->prev_ref = _prev;
      ctx->flags2 |= IR_LINEAR;
      ir_truncate(ctx);

      return 1;
    }
  }
#endif

  ir_mem_free(_prev);

  uint32_t *map = ir_mem_calloc(insns_count, sizeof(uint32_t));
  _prev = ir_mem_malloc(insns_count * sizeof(ir_ref));
  lists = ir_mem_malloc(insns_count * sizeof(ir_use_list));
  ir_ref *use_edges = edges = ir_mem_malloc(use_edges_count * sizeof(ir_ref));
  base = ir_mem_malloc((consts_count + insns_count) * sizeof(ir_insn));
  base += consts_count;

  /* Copy constants */
  if (ctx->consts_count == consts_count) {
    memcpy(base - consts_count + 1, ctx->ir_base - consts_count + 1, sizeof(ir_insn) * consts_count);
    for (j = -consts_count + 1; j < IR_TRUE; j++) {
      _xlat[j] = j;
    }
  } else {
    ir_insn *src = ctx->ir_base - ctx->consts_count + 1;
    ir_insn *dst = base - consts_count + 1;

    i = -ctx->consts_count + 1;
    j = -consts_count + 1;
    while (i < IR_TRUE) {
      if (_xlat[i]) {
        *dst = *src;
        dst->prev_const = 0;
        _xlat[i] = j;
        dst++;
        j++;
      }
      src++;
      i++;
    }
    IR_ASSERT(j == IR_TRUE);
    base[IR_TRUE].optx = IR_OPT(IR_C_BOOL, IR_BOOL);
    base[IR_TRUE].val.u64 = 1;
    base[IR_FALSE].optx = IR_OPT(IR_C_BOOL, IR_BOOL);
    base[IR_FALSE].val.u64 = 0;
    base[IR_NULL].optx = IR_OPT(IR_C_ADDR, IR_ADDR);
    base[IR_NULL].val.u64 = 0;
    MAKE_NOP(&base[IR_UNUSED]);
  }

  /* Copy instructions, use lists and use edges */
#ifdef IR_DEBUG
  ir_ref orig_use_edges_count = use_edges_count;
#endif
  prev_ref = 0;
  use_edges_count = 0;
  for (i = 1; i != 0; i = _next[i]) {
    new_ref = _xlat[i];
    map[new_ref] = ctx->cfg_map[i];
    _prev[new_ref] = prev_ref;
    prev_ref = new_ref;

    use_list = &ctx->use_lists[i];
    n = use_list->count;
    k = 0;
    if (n == 1) {
      ref = ctx->use_edges[use_list->refs];
      if (EXPECTED(_xlat[ref])) {
        *edges = _xlat[ref];
        edges++;
        k = 1;
      }
    } else {
      p = &ctx->use_edges[use_list->refs];
      while (n--) {
        ref = *p;
        if (EXPECTED(_xlat[ref])) {
          *edges = _xlat[ref];
          edges++;
          k++;
        }
        p++;
      }
    }
    new_list = &lists[new_ref];
    new_list->refs = use_edges_count;
    use_edges_count += k;
    new_list->count = k;

    insn = &ctx->ir_base[i];
    new_insn = &base[new_ref];

    new_insn->optx = insn->optx;
    n = new_insn->inputs_count;
    switch (n) {
      case 0:
        new_insn->op1 = insn->op1;
        new_insn->op2 = insn->op2;
        new_insn->op3 = insn->op3;
        break;
      case 1:
        new_insn->op1 = _xlat[insn->op1];
        if (new_insn->op == IR_BEGIN && insn->op2) {
          new_insn->op2 = _xlat[insn->op2];
        } else {
          new_insn->op2 = insn->op2;
        }
        new_insn->op3 = insn->op3;
        break;
      case 2:
        new_insn->op1 = _xlat[insn->op1];
        new_insn->op2 = _xlat[insn->op2];
        new_insn->op3 = insn->op3;
#if IR_SCHEDULE_SWAP_OPS
        /* Swap operands according to folding rules */
        if (new_insn->op1 < new_insn->op2) {
          switch (new_insn->op) {
            case IR_EQ:
            case IR_NE:
            case IR_ORDERED:
            case IR_UNORDERED:
            case IR_ADD:
            case IR_MUL:
            case IR_ADD_OV:
            case IR_MUL_OV:
            case IR_OR:
            case IR_AND:
            case IR_XOR:
            case IR_MIN:
            case IR_MAX:
              SWAP_REFS(new_insn->op1, new_insn->op2);
              break;
            case IR_LT:
            case IR_GE:
            case IR_LE:
            case IR_GT:
            case IR_ULT:
            case IR_UGE:
            case IR_ULE:
            case IR_UGT:
              SWAP_REFS(new_insn->op1, new_insn->op2);
              new_insn->op ^= 3; /* [U]LT <-> [U]GT, [U]LE <-> [U]GE */
              break;
          }
        }
#endif
        break;
      case 3:
        new_insn->op1 = _xlat[insn->op1];
        new_insn->op2 = _xlat[insn->op2];
        new_insn->op3 = _xlat[insn->op3];
        break;
      default:
        for (j = n, p = insn->ops + 1, q = new_insn->ops + 1; j > 0; p++, q++, j--) {
          *q = _xlat[*p];
        }
        break;
    }
  }

  /* Update list of terminators (IR_OPND_CONTROL_REF) */
  insn = &base[1];
  ref = insn->op1;
  if (ref) {
    insn->op1 = ref = _xlat[ref];
    while (1) {
      insn = &base[ref];
      ref = insn->op3;
      if (!ref) {
        break;
      }
      insn->op3 = ref = _xlat[ref];
    }
  }

  if (ctx->binding) {
    ir_xlat_binding(ctx, _xlat);
  }

  _xlat -= ctx->consts_count;
  ir_mem_free(_xlat);
  ir_mem_free(_next);

  /* Switch to new IR buffer */
  ir_mem_free(ctx->ir_base - ctx->consts_limit);
  ctx->ir_base = base;
  ctx->insns_count = ctx->insns_limit = insns_count;
  ctx->consts_count = ctx->consts_limit = consts_count;

  ir_mem_free(ctx->use_lists);
  ir_mem_free(ctx->use_edges);
  IR_ASSERT(orig_use_edges_count >= use_edges_count);
  ctx->use_lists = lists;
  ctx->use_edges = use_edges;
  ctx->use_edges_count = use_edges_count;

  ir_mem_free(ctx->cfg_map);
  ctx->cfg_map = map;

  ctx->prev_ref = _prev;

  ctx->flags2 |= IR_LINEAR;

  return 1;
}

void ir_build_prev_refs(ir_ctx *ctx)
{
  uint32_t b;
  ir_block *bb;
  ir_ref i, n, prev;
  ir_insn *insn;

  ctx->prev_ref = ir_mem_malloc(ctx->insns_count * sizeof(ir_ref));
  prev = 0;
  for (b = 1, bb = ctx->cfg_blocks + b; b <= ctx->cfg_blocks_count; b++, bb++) {
    IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
    for (i = bb->start, insn = ctx->ir_base + i; i < bb->end;) {
      ctx->prev_ref[i] = prev;
      n = ir_insn_len(insn);
      prev = i;
      i += n;
      insn += n;
    }
    ctx->prev_ref[i] = prev;
  }
}

Coverage Report

Created: 2026-06-02 06:40