ir/ir_gcm.c
2022-08-10 23:40:48 +03:00

523 lines
13 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "ir.h"
#include "ir_private.h"
/* GCM - Global Code Motion
*
* C. Click. "Global code motion, global value numbering" Submitted to PLDI 95.
*/
static void ir_gcm_schedule_early(ir_ctx *ctx, uint32_t *_blocks, ir_ref ref)
{
ir_ref j, n, *p;
ir_insn *insn;
uint32_t flags;
if (_blocks[ref] > 0) {
return;
}
_blocks[ref] = 1;
insn = &ctx->ir_base[ref];
flags = ir_op_flags[insn->op];
if (IR_OPND_KIND(flags, 1) == IR_OPND_CONTROL_DEP) {
IR_ASSERT(_blocks[insn->op1] > 0);
_blocks[ref] = _blocks[insn->op1];
}
n = ir_input_edges_count(ctx, insn);
for (j = 1, p = insn->ops + 1; j <= n; j++, p++) {
ir_ref input = *p;
if (input > 0) {
if (_blocks[input] == 0) {
ir_gcm_schedule_early(ctx, _blocks, input);
}
if (IR_OPND_KIND(flags, 1) != IR_OPND_CONTROL_DEP
&& ctx->cfg_blocks[_blocks[ref]].dom_depth < ctx->cfg_blocks[_blocks[input]].dom_depth) {
_blocks[ref] = _blocks[input];
}
}
}
}
/* Last Common Ancestor */
static uint32_t ir_gcm_find_lca(ir_ctx *ctx, uint32_t b1, uint32_t b2)
{
while (ctx->cfg_blocks[b1].dom_depth > ctx->cfg_blocks[b2].dom_depth) {
b1 = ctx->cfg_blocks[b1].dom_parent;
}
while (ctx->cfg_blocks[b2].dom_depth > ctx->cfg_blocks[b1].dom_depth) {
b2 = ctx->cfg_blocks[b2].dom_parent;
}
while (b1 != b2) {
b1 = ctx->cfg_blocks[b1].dom_parent;
b2 = ctx->cfg_blocks[b2].dom_parent;
}
return b2;
}
static void ir_gcm_schedule_late(ir_ctx *ctx, uint32_t *_blocks, ir_bitset visited, ir_ref ref)
{
ir_ref i, n, *p, use;
ir_insn *insn;
uint32_t flags;
ir_bitset_incl(visited, ref);
n = ctx->use_lists[ref].count;
if (n) {
uint32_t lca, b;
for (i = 0, p = &ctx->use_edges[ctx->use_lists[ref].refs]; i < n; i++, p++) {
use = *p;
if (!ir_bitset_in(visited, use) && _blocks[use]) {
ir_gcm_schedule_late(ctx, _blocks, visited, use);
}
}
insn = &ctx->ir_base[ref];
flags = ir_op_flags[insn->op];
if (IR_OPND_KIND(flags, 1) == IR_OPND_CONTROL_DEP) {
return;
}
lca = 0;
for (i = 0, p -= n; i < n; i++, p++) {
use = *p;
b = _blocks[use];
if (!b) {
continue;
}
insn = &ctx->ir_base[use];
if (insn->op == IR_PHI) {
if (insn->op2 == ref) {
b = _blocks[ctx->ir_base[insn->op1].op1];
} else if (insn->op3 == ref) {
b = _blocks[ctx->ir_base[insn->op1].op2];
} else {
IR_ASSERT(0);
}
}
lca = !lca ? b : ir_gcm_find_lca(ctx, lca, b);
}
b = lca;
while (ctx->cfg_blocks[b].loop_depth && lca != ctx->cfg_blocks[_blocks[ref]].dom_parent) {
if (ctx->cfg_blocks[lca].loop_depth < ctx->cfg_blocks[b].loop_depth) {
b = lca;
}
lca = ctx->cfg_blocks[lca].dom_parent;
}
_blocks[ref] = b;
}
}
int ir_gcm(ir_ctx *ctx)
{
ir_ref i, j, k, n, *p, ref;
ir_bitset visited;
ir_block *bb;
ir_list queue;
uint32_t *_blocks;
ir_insn *insn;
uint32_t flags;
_blocks = ir_mem_calloc(ctx->insns_count, sizeof(uint32_t));
ir_list_init(&queue, ctx->insns_count);
/* pin control instructions and collect their direct inputs */
for (i = 1, bb = ctx->cfg_blocks + 1; i <= ctx->cfg_blocks_count; i++, bb++) {
if (bb->flags & IR_BB_UNREACHABLE) {
continue;
}
j = bb->end;
while (1) {
insn = &ctx->ir_base[j];
_blocks[j] = i; /* pin to block */
flags = ir_op_flags[insn->op];
n = ir_input_edges_count(ctx, insn);
for (k = 1, p = insn->ops + 1; k <= n; k++, p++) {
ref = *p;
if (ref > 0) {
if (IR_OPND_KIND(flags, k) == IR_OPND_DATA || IR_OPND_KIND(flags, k) == IR_OPND_VAR) {
ir_list_push(&queue, ref);
}
}
}
if (j == bb->start) {
break;
}
j = insn->op1; /* control predecessor */
}
}
n = ir_list_len(&queue);
for (i = 0; i < n; i++) {
ref = ir_list_at(&queue, i);
if (_blocks[ref] == 0) {
ir_gcm_schedule_early(ctx, _blocks, ref);
}
}
#ifdef IR_DEBUG
if (ctx->flags & IR_DEBUG_GCM) {
fprintf(stderr, "GCM Schedule Early\n");
for (i = 1; i < ctx->insns_count; i++) {
fprintf(stderr, "%d -> %d\n", i, _blocks[i]);
}
}
#endif
/* collect uses of control instructions */
visited = ir_bitset_malloc(ctx->insns_count);
ir_list_clear(&queue);
for (i = 1, bb = ctx->cfg_blocks + 1; i <= ctx->cfg_blocks_count; i++, bb++) {
if (bb->flags & IR_BB_UNREACHABLE) {
continue;
}
j = bb->end;
while (1) {
ir_bitset_incl(visited, j);
insn = &ctx->ir_base[j];
n = ctx->use_lists[j].count;
if (n > 0) {
for (k = 0, p = &ctx->use_edges[ctx->use_lists[j].refs]; k < n; k++, p++) {
ref = *p;
if (ctx->ir_base[ref].op == IR_PARAM || ctx->ir_base[ref].op == IR_VAR) {
_blocks[ref] = _blocks[j];
} else if (ir_op_flags[ctx->ir_base[ref].op] & IR_OP_FLAG_DATA) {
ir_list_push(&queue, ref);
}
}
}
if (j == bb->start) {
break;
}
j = insn->op1; /* control predecessor */
}
}
n = ir_list_len(&queue);
for (i = 0; i < n; i++) {
ref = ir_list_at(&queue, i);
if (!ir_bitset_in(visited, ref)) {
ir_gcm_schedule_late(ctx, _blocks, visited, ref);
}
}
ir_mem_free(visited);
ir_list_free(&queue);
#ifdef IR_DEBUG
if (ctx->flags & IR_DEBUG_GCM) {
fprintf(stderr, "GCM Schedule Late\n");
for (i = 1; i < ctx->insns_count; i++) {
fprintf(stderr, "%d -> %d\n", i, _blocks[i]);
}
}
#endif
ctx->cfg_map = _blocks;
return 1;
}
int ir_schedule(ir_ctx *ctx)
{
ir_ctx new_ctx;
ir_ref i, j, k, n, *p, ref, new_ref, insns_count, consts_count;
ir_ref *_xlat;
uint32_t flags;
uint32_t edges_count;
ir_use_list *lists;
ir_ref *edges;
ir_bitset used;
ir_ref b;
uint32_t *_blocks = ctx->cfg_map;
ir_ref *_next = ir_mem_calloc(ctx->insns_count, sizeof(ir_ref));
ir_ref *_prev = ir_mem_calloc(ctx->insns_count, sizeof(ir_ref));
ir_ref _rest = 0;
ir_block *bb;
ir_insn *insn, *new_insn;
/* Create double-linked list of nodes ordered by BB, respecting BB->start and BB->end */
IR_ASSERT(_blocks[1]);
_prev[1] = 0;
for (i = 2, j = 1; i < ctx->insns_count; i++) {
b = _blocks[i];
if (b) {
bb = &ctx->cfg_blocks[b];
if (_blocks[j] == b || i == bb->start) {
/* add to the end of list */
_next[j] = i;
_prev[i] = j;
j = i;
} else if (_prev[bb->end]) {
/* move up, insert before the end of alredy scheduled BB */
k = bb->end;
_prev[i] = _prev[k];
_next[i] = k;
_next[_prev[k]] = i;
_prev[k] = i;
} else {
/* move down late */
_next[i] = _rest;
_rest = i;
}
}
}
_next[j] = 0;
while (_rest) {
i = _rest;
_rest = _next[i];
b = _blocks[i];
bb = &ctx->cfg_blocks[b];
if (i == bb->end) {
// TODO: When removing MERGE, SCCP may move END of the block below other blocks ???
/* insert at the end of the block */
k = _next[bb->start];
while (_blocks[k] == b) {
k = _next[k];
}
} else {
/* insert after start of the block and all PARAM, VAR, PI, PHI */
k = _next[bb->start];
insn = &ctx->ir_base[k];
while (insn->op == IR_PARAM || insn->op == IR_VAR || insn->op == IR_PI || insn->op == IR_PHI) {
k = _next[k];
insn = &ctx->ir_base[k];
}
}
/* insert before "k" */
_prev[i] = _prev[k];
_next[i] = k;
_next[_prev[k]] = i;
_prev[k] = i;
}
#ifdef IR_DEBUG
if (ctx->flags & IR_DEBUG_SCHEDULE) {
fprintf(stderr, "Before Schedule\n");
for (b = 1, bb = ctx->cfg_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) {
for (i = bb->start; i <= bb->end && i > 0; i = _next[i]) {
fprintf(stderr, "%d -> %d\n", i, b);
}
}
}
#endif
/* Topological sort according dependencies inside each basic block */
ir_bitset scheduled = ir_bitset_malloc(ctx->insns_count);
for (b = 1, bb = ctx->cfg_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) {
if (bb->flags & IR_BB_UNREACHABLE) {
continue;
}
i = bb->start;
ir_bitset_incl(scheduled, i);
i = _next[i];
insn = &ctx->ir_base[i];
while (insn->op == IR_PARAM || insn->op == IR_VAR || insn->op == IR_PI || insn->op == IR_PHI) {
ir_bitset_incl(scheduled, i);
i = _next[i];
insn = &ctx->ir_base[i];
}
for (; i != bb->end; i = _next[i]) {
ir_ref n, j, *p, def;
restart:
insn = &ctx->ir_base[i];
n = ir_input_edges_count(ctx, insn);
for (j = 1, p = insn->ops + 1; j <= n; j++, p++) {
def = *p;
if (def > 0
&& _blocks[def] == b
&& !ir_bitset_in(scheduled, def)
&& insn->op != IR_PHI
&& insn->op != IR_PI) {
/* "def" should be before "i" to satisfy dependency */
#ifdef IR_DEBUG
if (ctx->flags & IR_DEBUG_SCHEDULE) {
fprintf(stderr, "Wrong dependency %d:%d -> %d\n", b, def, i);
}
#endif
/* remove "def" */
_prev[_next[def]] = _prev[def];
_next[_prev[def]] = _next[def];
/* insert before "i" */
_prev[def] = _prev[i];
_next[def] = i;
_next[_prev[i]] = def;
_prev[i] = def;
/* restart from "def" */
i = def;
goto restart;
}
}
ir_bitset_incl(scheduled, i);
}
}
ir_mem_free(scheduled);
#ifdef IR_DEBUG
if (ctx->flags & IR_DEBUG_SCHEDULE) {
fprintf(stderr, "After Schedule\n");
for (b = 1, bb = ctx->cfg_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) {
for (i = bb->start; i <= bb->end && i > 0; i = _next[i]) {
fprintf(stderr, "%d -> %d\n", i, b);
}
}
}
#endif
ir_mem_free(_prev);
/* TODO: linearize without reallocation and reconstruction ??? */
_xlat = ir_mem_calloc(ctx->consts_count + ctx->insns_count, sizeof(ir_ref));
_xlat += ctx->consts_count;
_xlat[IR_TRUE] = IR_TRUE;
_xlat[IR_FALSE] = IR_FALSE;
_xlat[IR_NULL] = IR_NULL;
used = ir_bitset_malloc(ctx->consts_count + 1);
insns_count = 1;
consts_count = -(IR_TRUE - 1);
for (i = 1; i != 0; i = _next[i]) {
_xlat[i] = insns_count;
insn = &ctx->ir_base[i];
flags = ir_op_flags[insn->op];
n = ir_input_edges_count(ctx, insn);
for (k = 1, p = insn->ops + 1; k <= n; k++, p++) {
ref = *p;
if (ref < IR_TRUE) {
if (!ir_bitset_in(used, -ref)) {
ir_bitset_incl(used, -ref);
consts_count++;
}
}
}
n = 1 + (n >> 2); // support for multi-word instructions like MERGE and PHI
insns_count += n;
}
lists = ir_mem_calloc(insns_count, sizeof(ir_use_list));
ir_init(&new_ctx, consts_count, insns_count);
new_ctx.flags = ctx->flags;
new_ctx.fixed_regset = ctx->fixed_regset;
new_ctx.fixed_save_regset = ctx->fixed_save_regset;
IR_BITSET_FOREACH(used, ir_bitset_len(ctx->consts_count + 1), ref) {
new_ref = new_ctx.consts_count;
IR_ASSERT(new_ref < ctx->consts_limit);
new_ctx.consts_count = new_ref + 1;
_xlat[-ref] = new_ref = -new_ref;
insn = &ctx->ir_base[-ref];
new_insn = &new_ctx.ir_base[new_ref];
new_insn->optx = insn->optx;
new_insn->prev_const = 0;
if (insn->op == IR_FUNC || insn->op == IR_STR) {
new_insn->val.addr = ir_str(&new_ctx, ir_get_str(ctx, insn->val.addr));
} else {
new_insn->val.u64 = insn->val.u64;
}
} IR_BITSET_FOREACH_END();
ir_mem_free(used);
edges_count = 0;
for (i = 1; i != 0; i = _next[i]) {
insn = &ctx->ir_base[i];
flags = ir_op_flags[insn->op];
n = ir_operands_count(ctx, insn);
new_ref = new_ctx.insns_count;
new_ctx.insns_count = new_ref + 1 + (n >> 2); // support for multi-word instructions like MERGE and PHI;
IR_ASSERT(new_ctx.insns_count <= new_ctx.insns_limit);
new_insn = &new_ctx.ir_base[new_ref];
new_insn->optx = insn->optx;
new_insn->op1 = IR_UNUSED;
new_insn->op2 = IR_UNUSED;
new_insn->op3 = IR_UNUSED;
for (k = 1, p = insn->ops + 1; k <= n; k++, p++) {
ref = *p;
switch (IR_OPND_KIND(flags, k)) {
case IR_OPND_DATA:
case IR_OPND_VAR:
case IR_OPND_CONTROL:
case IR_OPND_CONTROL_DEP:
ref = _xlat[ref];
if (ref > 0) {
lists[ref].refs = -1;
lists[ref].count++;
edges_count++;
}
break;
case IR_OPND_CONTROL_REF:
ref = _xlat[ref];
break;
case IR_OPND_STR:
ref = ir_str(&new_ctx, ir_get_str(ctx, ref));
break;
case IR_OPND_NUM:
case IR_OPND_PROB:
break;
default:
IR_ASSERT(0);
break;
}
new_insn->ops[k] = ref;
}
}
edges = ir_mem_malloc(edges_count * sizeof(ir_ref));
edges_count = 0;
for (i = IR_UNUSED + 1, insn = new_ctx.ir_base + i; i < new_ctx.insns_count;) {
n = ir_input_edges_count(&new_ctx, insn);
for (j = 1, p = insn->ops + 1; j <= n; j++, p++) {
ref = *p;
if (ref > 0) {
ir_use_list *use_list = &lists[ref];
if (use_list->refs == -1) {
use_list->refs = edges_count;
edges_count += use_list->count;
use_list->count = 0;
}
edges[use_list->refs + use_list->count++] = i;
}
}
n = 1 + (n >> 2); // support for multi-word instructions like MERGE and PHI
i += n;
insn += n;
}
new_ctx.use_edges = edges;
new_ctx.use_lists = lists;
if (ctx->cfg_blocks) {
uint32_t b;
ir_block *bb;
new_ctx.cfg_blocks_count = ctx->cfg_blocks_count;
if (ctx->cfg_edges_count) {
new_ctx.cfg_edges_count = ctx->cfg_edges_count;
new_ctx.cfg_edges = ir_mem_malloc(ctx->cfg_edges_count * sizeof(uint32_t));
memcpy(new_ctx.cfg_edges, ctx->cfg_edges, ctx->cfg_edges_count * sizeof(uint32_t));
}
new_ctx.cfg_blocks = ir_mem_malloc((ctx->cfg_blocks_count + 1) * sizeof(ir_block));
memcpy(new_ctx.cfg_blocks, ctx->cfg_blocks, (ctx->cfg_blocks_count + 1) * sizeof(ir_block));
for (b = 1, bb = new_ctx.cfg_blocks + 1; b <= new_ctx.cfg_blocks_count; b++, bb++) {
bb->start = _xlat[bb->start];
bb->end = _xlat[bb->end];
}
}
_xlat -= ctx->consts_count;
ir_mem_free(_xlat);
ir_free(ctx);
IR_ASSERT(new_ctx.consts_count == new_ctx.consts_limit);
IR_ASSERT(new_ctx.insns_count == new_ctx.insns_limit);
memcpy(ctx, &new_ctx, sizeof(ir_ctx));
ctx->flags |= IR_LINEAR;
ir_mem_free(_next);
return 1;
}