Files
compiler-explorer/lib/cfg/cfg-parsers/oat.ts
2025-02-02 17:54:31 +00:00

309 lines
12 KiB
TypeScript

// Copyright (c) 2024, Compiler Explorer Authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
import _ from 'underscore';
import {EdgeColor} from '../../../types/compilation/cfg.interfaces.js';
import {logger} from '../../logger.js';
import {BaseInstructionSetInfo, InstructionType} from '../instruction-sets/base.js';
import {AssemblyLine, BBRange, BaseCFGParser, CanonicalBB, Edge, Range} from './base.js';
// This currently only covers the default arm64 output. To support dex2oat's
// other ISAs, we just need to make sure the correct isJmpInstruction() is being
// used, and parse out the destination addrs differently. For example:
// - x86/x86_64 0x00004071 jmp +34 (0x00004098)
// - arm i 0x00004052 b 0x00004078
// - riscv64 0x0000411c bltz t6, +40 ; 0x00004144
/**
 * Control-flow-graph parser for dex2oat/oatdump style disassembly.
 *
 * Instruction lines are expected to look like
 * `0x00004144 b #+0x28 (addr 0x416c)`: the first whitespace-separated token
 * is the program counter, the second is the opcode, and branch instructions
 * carry their absolute destination in a trailing `(addr 0x...)` annotation.
 * Functions are separated by blank lines, and every CFG node is named after
 * the shortened address of its first instruction.
 */
export class OatCFGParser extends BaseCFGParser {
    /** Copy of the assembly most recently handed to generateFunctionCfg(). */
    code: AssemblyLine[];
    /** Captures the destination inside a `(addr 0x...)` annotation. */
    jmpAddrRegex: RegExp;
    /**
     * Captures whatever follows the leading zeros of a `0x`-prefixed value.
     * Note that an all-zero value leaves an empty capture.
     */
    hexRegex: RegExp;

    constructor(instructionSetInfo: BaseInstructionSetInfo) {
        super(instructionSetInfo);
        this.code = [];
        this.jmpAddrRegex = /.*\(addr (0x.*)\)/;
        this.hexRegex = /0x0+(.*)/;
    }

    static override get key() {
        return 'oat';
    }

    // Generally the same as the base filterData(), but we keep empty lines
    // because there are no other indicators for the end of a function.
    override filterData(assembly: AssemblyLine[]) {
        const isCode = (x: AssemblyLine) => x && (x.source !== null || this.isFunctionName(x));
        return this.filterTextSection(assembly).map(_.clone).filter(isCode);
    }

    // For filtering through the 'Instruction set', 'Instruction set features',
    // and 'Compiler filter' lines at the top of the compiler output.
    isHeaderInfo(text: string) {
        return text.startsWith('Instruction ') || text.startsWith('Compiler ');
    }

    /**
     * Splits the listing into one half-open [start, end) range per function.
     *
     * Leading blank and header lines are skipped; after that every line for
     * which isFunctionEnd() holds closes the current range. Empty ranges are
     * never emitted.
     *
     * @param asmArr filtered assembly lines
     * @returns the function ranges, in source order
     */
    override splitToFunctions(asmArr: AssemblyLine[]) {
        const result: Range[] = [];
        const last = asmArr.length;

        // Bounds-checked so that input consisting solely of blank/header
        // lines yields no functions instead of indexing past the end.
        let first = 0;
        while (first < last && (!asmArr[first].text || this.isHeaderInfo(asmArr[first].text))) {
            ++first;
        }

        const fnRange: Range = {start: first, end: first};
        for (; first < last; ++first) {
            if (this.isFunctionEnd(asmArr[first].text)) {
                fnRange.end = first;
                if (fnRange.end > fnRange.start) {
                    result.push(_.clone(fnRange));
                }
                fnRange.start = first + 1;
            }
        }

        // The final function may not be followed by a blank line.
        fnRange.end = last;
        if (fnRange.end > fnRange.start) {
            result.push(_.clone(fnRange));
        }
        return result;
    }

    // In these examples, '0x416c' and '0x8074' will be returned.
    //     0x00004144    b #+0x28 (addr 0x416c)
    //     0x00008050    b.hs #+0x24 (addr 0x00008074)
    // Returns '' when the line carries no '(addr ...)' annotation.
    getJmpAddr(inst: string): string {
        const match = inst.match(this.jmpAddrRegex);
        if (match) return this.shortenHex(match[1]);
        return '';
    }

    // In this example, '0x0000416c' will be returned.
    //     0x0000416c    ret
    getPc(inst: string) {
        return inst.trim().split(/\s+/)[0].toLowerCase();
    }

    // In this example, 'add' will be returned.
    //     0x00004168    add w0, w1, #0x3 (3)
    // Lines with fewer than two tokens have no opcode and yield ''.
    getOpcode(inst: string) {
        const opcode = inst.trim().split(/\s+/)[1];
        return opcode ? opcode.toLowerCase() : '';
    }

    /** True when the instruction's shortened address is one of `jmpAddrs`. */
    isJmpTarget(inst: string, jmpAddrs: string[]) {
        return jmpAddrs.includes(this.shortenHex(this.getPc(inst)));
    }

    // '0x00004168' -> '0x4168'
    // Values that hexRegex does not match are returned unchanged.
    shortenHex(pc: string) {
        const match = pc.match(this.hexRegex);
        if (match) return '0x' + match[1];
        return pc;
    }

    /**
     * Splits one function into basic-block ranges.
     *
     * A new block starts at every instruction that is the destination of a
     * branch within the function, and after every line for which the base
     * class's isBasicBlockEnd() holds. Branch instructions are recorded in
     * the owning block's `actionPos` so the block can later be split into
     * canonical blocks.
     *
     * @param asmArr filtered assembly lines
     * @param range half-open range of the function; `range.start` is the
     *     function-name line
     * @returns the basic blocks, in source order
     */
    override splitToBasicBlocks(asmArr: AssemblyLine[], range: Range) {
        const last = range.end;
        if (range.start === last) return [];

        // Collect branch targets so we know where to start new blocks.
        const jmpAddrs: string[] = [];
        for (let i = range.start; i < last; ++i) {
            const text = asmArr[i].text;
            if (text.includes('(addr ')) {
                const addr = this.getJmpAddr(text);
                if (addr) jmpAddrs.push(addr);
            }
        }

        // range.start is the function name; we want blocks' ranges to begin
        // with the first instruction.
        let first = range.start + 1;
        // A function without any instruction lines has no basic blocks.
        if (first >= last) return [];

        let rangeBb: BBRange = {
            nameId: this.extractNodeName(asmArr[first].text),
            start: first,
            end: 0,
            actionPos: [],
        };
        const newRangeWith = (oldRange: BBRange, nameId: string, start: number): BBRange => ({
            nameId: nameId,
            start: start,
            actionPos: [],
            end: oldRange.end,
        });

        const result: BBRange[] = [];
        for (; first < last; ++first) {
            const inst = asmArr[first].text;
            const prevInst = asmArr[first - 1] ? asmArr[first - 1].text : '';

            if (this.isBasicBlockEnd(inst, prevInst)) {
                rangeBb.end = first;
                result.push(_.clone(rangeBb));
                rangeBb = newRangeWith(rangeBb, this.extractNodeName(inst), first + 1);
                continue;
            }

            if (this.isJmpTarget(inst, jmpAddrs)) {
                // Close the running block only if it already holds at least
                // one line; otherwise we would emit an empty block that
                // duplicates this node's name.
                if (first > rangeBb.start) {
                    rangeBb.end = first;
                    result.push(_.clone(rangeBb));
                }
                rangeBb = newRangeWith(rangeBb, this.extractNodeName(inst), first);
            }

            // Checked independently of the target test above: an instruction
            // can be both a branch and the destination of another branch.
            if (this.instructionSetInfo.isJmpInstruction(this.getOpcode(inst))) {
                rangeBb.actionPos.push(first);
            }
        }

        rangeBb.end = last;
        result.push(_.clone(rangeBb));
        return result;
    }

    // Empty lines indicate the end of a function.
    override isFunctionEnd(text: string) {
        return text.trim().length === 0;
    }

    // All nodes are named after the address of their first instruction.
    override extractNodeName(inst: string) {
        return this.shortenHex(this.getPc(inst));
    }

    /**
     * Splits a basic block after each of its branch instructions so that
     * every resulting block ends with at most one branch. Mirrors the base
     * class's splitToCanonicalBasicBlock(), but names each piece after the
     * address of its first instruction.
     *
     * @param code unused; instruction text is read from `this.code`, which
     *     generateFunctionCfg() populates
     * @param basicBlock block produced by splitToBasicBlocks()
     * @returns the canonical blocks, in source order
     */
    splitToCanonicalBasicBlockOat(code: AssemblyLine[], basicBlock: BBRange): CanonicalBB[] {
        const actionPos = basicBlock.actionPos;

        // A branch that is already the block's final line needs no split.
        let actPosSz = actionPos.length;
        if (actPosSz > 0 && actionPos[actPosSz - 1] + 1 === basicBlock.end) {
            --actPosSz;
        }

        const result: CanonicalBB[] = [];
        let nameId = basicBlock.nameId;
        let start = basicBlock.start;
        for (let i = 0; i < actPosSz; ++i) {
            // Each piece ends just past its branch; the next piece starts
            // there and takes its name from that instruction's address.
            const end = actionPos[i] + 1;
            result.push({nameId: nameId, start: start, end: end});
            nameId = this.extractNodeName(this.code[end].text);
            start = end;
        }
        result.push({nameId: nameId, start: start, end: basicBlock.end});
        return result;
    }

    /**
     * Builds the edges between canonical blocks from each block's final
     * instruction: blue for unconditional branches, green/red for the taken
     * and not-taken sides of conditional branches, and grey for plain
     * fall-through into the next block.
     *
     * @param asmArr filtered assembly lines
     * @param arrOfCanonicalBasicBlock blocks to connect
     * @returns the edges, in block order
     */
    override makeEdges(asmArr: AssemblyLine[], arrOfCanonicalBasicBlock: CanonicalBB[]) {
        const edges: Edge[] = [];
        const setEdge = (sourceNode: string, targetNode: string, color: EdgeColor) => ({
            from: sourceNode,
            to: targetNode,
            arrows: 'to',
            color: color,
        });

        for (const x of arrOfCanonicalBasicBlock) {
            const lastLine = asmArr[x.end - 1];
            // Nothing to inspect for a block that ends before the listing starts.
            if (!lastLine) continue;

            const lastInst = lastLine.text;
            // The line following the block, if any, is the fall-through target.
            const nextLine = asmArr[x.end];
            const opcode = this.getOpcode(lastInst);
            switch (this.instructionSetInfo.getInstructionType(opcode)) {
                case InstructionType.jmp: {
                    const targetNode = this.shortenHex(this.getJmpAddr(lastInst));
                    edges.push(setEdge(x.nameId, targetNode, 'blue'));
                    break;
                }
                case InstructionType.conditionalJmpInst: {
                    // Branch taken
                    const takenNode = this.shortenHex(this.getJmpAddr(lastInst));
                    edges.push(setEdge(x.nameId, takenNode, 'green'));
                    // Branch not taken; guarded because the branch may be
                    // the very last line of the listing.
                    if (nextLine) {
                        const notTakenNode = this.extractNodeName(nextLine.text);
                        edges.push(setEdge(x.nameId, notTakenNode, 'red'));
                    }
                    break;
                }
                case InstructionType.notRetInst: {
                    // No jmp, but the next instruction is in a different basic
                    // block because it is the target of another jmp.
                    if (nextLine) {
                        const targetNode = this.extractNodeName(nextLine.text);
                        edges.push(setEdge(x.nameId, targetNode, 'grey'));
                    }
                    break;
                }
                case InstructionType.retInst: {
                    break;
                }
            }
        }
        logger.debug(edges);
        return edges;
    }

    /**
     * Produces the CFG (nodes and edges) for a single function.
     *
     * @param code filtered assembly lines
     * @param fn half-open range of the function within `code`
     */
    override generateFunctionCfg(code: AssemblyLine[], fn: Range) {
        // splitToCanonicalBasicBlockOat() reads instruction text from this.code.
        this.code = _.clone(code);
        const basicBlocks = this.splitToBasicBlocks(code, fn);
        const arrOfCanonicalBasicBlock: CanonicalBB[] = [];
        for (const bb of basicBlocks) {
            // We don't want to use the base class's split method.
            arrOfCanonicalBasicBlock.push(...this.splitToCanonicalBasicBlockOat(code, bb));
        }
        return {
            nodes: this.makeNodes(code, arrOfCanonicalBasicBlock),
            edges: this.makeEdges(code, arrOfCanonicalBasicBlock),
        };
    }
}