// compiler-explorer/lib/cfg/cfg-parsers/oat.ts

// Copyright (c) 2024, Compiler Explorer Authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//     * Redistributions of source code must retain the above copyright notice,
//       this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

import _ from 'underscore';

import {EdgeColor} from '../../../types/compilation/cfg.interfaces.js';
import {logger} from '../../logger.js';
import {BaseInstructionSetInfo, InstructionType} from '../instruction-sets/base.js';

import {AssemblyLine, BBRange, BaseCFGParser, CanonicalBB, Edge, Range} from './base.js';

// This currently only covers the default arm64 output. To support dex2oat's
// other ISAs, we just need to make sure the correct isJmpInstruction() is being
// used, and parse out the destination addrs differently. For example:
// - x86/x86_64      0x00004071    jmp +34 (0x00004098)
// - arm             0x00004052    b 0x00004078
// - riscv64         0x0000411c    bltz t6, +40 ; 0x00004144
export class OatCFGParser extends BaseCFGParser {
    // Copy of the assembly most recently handed to generateFunctionCfg().
    code: AssemblyLine[];

    // Captures the absolute destination printed as '(addr 0x...)' on a branch.
    jmpAddrRegex: RegExp;
    // Captures a hex literal's digits with the redundant leading zeros removed.
    hexRegex: RegExp;

    constructor(instructionSetInfo: BaseInstructionSetInfo) {
        super(instructionSetInfo);
        this.code = [];
        // Only hex digits are captured so that trailing text containing ')'
        // cannot leak into the address.
        this.jmpAddrRegex = /\(addr (0x[\da-f]+)\)/i;
        // At least one digit is always kept, so '0x0000' becomes '0x0' rather
        // than a bare '0x'.
        this.hexRegex = /^0x0+([\da-f].*)$/i;
    }

    static override get key() {
        return 'oat';
    }

    // Generally the same as the base filterData(), but we keep empty lines
    // because there are no other indicators for the end of a function.
    override filterData(assembly: AssemblyLine[]) {
        const isCode = (x: AssemblyLine) => x && (x.source !== null || this.isFunctionName(x));
        return this.filterTextSection(assembly).map(_.clone).filter(isCode);
    }

    // For filtering through the 'Instruction set', 'Instruction set features',
    // and 'Compiler filter' lines at the top of the compiler output.
    isHeaderInfo(text: string) {
        return text.startsWith('Instruction ') || text.startsWith('Compiler ');
    }

    // Splits the listing into one half-open [start, end) range per function.
    // Leading header/blank lines are skipped, and every blank line afterwards
    // terminates the function that precedes it. Returns an empty array when
    // the input holds nothing but header or blank lines.
    override splitToFunctions(asmArr: AssemblyLine[]) {
        const result: Range[] = [];
        const last = asmArr.length;

        // Bounded by `last` so header-only input cannot index past the array.
        let start = 0;
        while (start < last && (!asmArr[start].text || this.isHeaderInfo(asmArr[start].text))) {
            ++start;
        }

        for (let i = start; i < last; ++i) {
            if (this.isFunctionEnd(asmArr[i].text)) {
                // Consecutive blank lines would otherwise yield empty ranges.
                if (i > start) result.push({start, end: i});
                start = i + 1;
            }
        }

        // The final function may not be followed by a blank line.
        if (last > start) result.push({start, end: last});

        return result;
    }

    // In these examples, '0x416c' and '0x8074' will be returned.
    //     0x00004144    b #+0x28 (addr 0x416c)
    //     0x00008050    b.hs #+0x24 (addr 0x00008074)
    // Returns '' when the instruction carries no '(addr ...)' annotation.
    getJmpAddr(inst: string): string {
        const match = inst.match(this.jmpAddrRegex);
        // Lowercased so the result compares equal to getPc()-derived names.
        return match ? this.shortenHex(match[1].toLowerCase()) : '';
    }

    // In this example, '0x0000416c' will be returned.
    //     0x0000416c    ret
    getPc(inst: string) {
        return inst.trim().split(/\s+/)[0].toLowerCase();
    }

    // In this example, 'add' will be returned.
    //     0x00004168    add w0, w1, #0x3 (3)
    // Returns '' for lines that have no second token (e.g. blank lines).
    getOpcode(inst: string): string {
        return inst.trim().split(/\s+/)[1]?.toLowerCase() ?? '';
    }

    // True when the instruction's own address is one of the collected branch
    // destinations.
    isJmpTarget(inst: string, jmpAddrs: string[]) {
        return jmpAddrs.includes(this.shortenHex(this.getPc(inst)));
    }

    // '0x00004168' -> '0x4168'
    // Strings that do not start with '0x0' are returned unchanged.
    shortenHex(pc: string) {
        const match = pc.match(this.hexRegex);
        if (match) return '0x' + match[1];
        return pc;
    }

    // Splits one function (range.start is its name line) into basic-block
    // ranges. A new block begins wherever the base class reports a block end
    // and at every instruction that is the destination of a branch inside the
    // function. Branch instructions are recorded in actionPos so that
    // splitToCanonicalBasicBlockOat() can cut the block after each of them.
    // Returns an empty array when the function has no instruction lines.
    override splitToBasicBlocks(asmArr: AssemblyLine[], range: Range) {
        // range.start is the function name; we want blocks' ranges to begin
        // with the first instruction.
        const firstInst = range.start + 1;
        const last = range.end;
        if (firstInst >= last) return [];

        // Collect branch targets so we know where to start new blocks.
        const jmpAddrs: string[] = [];
        for (let i = range.start; i < last; ++i) {
            const addr = this.getJmpAddr(asmArr[i].text);
            if (addr) jmpAddrs.push(addr);
        }

        // `end` is a placeholder; it is always assigned before a block is
        // stored in the result.
        const startBlock = (nameId: string, start: number): BBRange => ({nameId, start, end: 0, actionPos: []});

        const result: BBRange[] = [];
        let rangeBb = startBlock(this.extractNodeName(asmArr[firstInst].text), firstInst);

        for (let i = firstInst; i < last; ++i) {
            const inst = asmArr[i].text;
            const prevInst = asmArr[i - 1] ? asmArr[i - 1].text : '';

            if (this.isBasicBlockEnd(inst, prevInst)) {
                // NOTE(review): the next block is named after `inst` but starts
                // on the following line; presumably inherited from how the base
                // parser treats label lines - confirm this is intended here.
                rangeBb.end = i;
                result.push(rangeBb);
                rangeBb = startBlock(this.extractNodeName(inst), i + 1);
                continue;
            }

            // A branch target opens a new block, unless the current block
            // already begins at this very instruction under the same name
            // (splitting then would only emit an empty duplicate node).
            const nameId = this.extractNodeName(inst);
            const alreadyStartsHere = rangeBb.start === i && rangeBb.nameId === nameId;
            if (!alreadyStartsHere && this.isJmpTarget(inst, jmpAddrs)) {
                rangeBb.end = i;
                result.push(rangeBb);
                rangeBb = startBlock(nameId, i);
            }

            // Checked independently of the target test above: an instruction
            // can be both a branch and the destination of another branch.
            if (this.instructionSetInfo.isJmpInstruction(this.getOpcode(inst))) {
                rangeBb.actionPos.push(i);
            }
        }

        rangeBb.end = last;
        result.push(rangeBb);
        return result;
    }

    // Empty lines indicate the end of a function.
    override isFunctionEnd(text: string) {
        return text.trim().length === 0;
    }

    // All nodes are named after the address of their first instruction.
    override extractNodeName(inst: string) {
        return this.shortenHex(this.getPc(inst));
    }

    // Cuts a basic block after each branch recorded in actionPos, so every
    // resulting block contains at most one branch and it is the block's last
    // instruction. The first piece keeps the block's name; each later piece is
    // named after the address of its first instruction, read from `code`.
    splitToCanonicalBasicBlockOat(code: AssemblyLine[], basicBlock: BBRange): CanonicalBB[] {
        const actionPos = basicBlock.actionPos;
        let actPosSz = actionPos.length;
        // A branch that already ends the block needs no cut after it.
        if (actionPos[actPosSz - 1] + 1 === basicBlock.end) {
            --actPosSz;
        }

        const result: CanonicalBB[] = [];
        let nameId = basicBlock.nameId;
        let start = basicBlock.start;
        for (let i = 0; i < actPosSz; ++i) {
            const end = actionPos[i] + 1;
            result.push({nameId, start, end});
            start = end;
            nameId = this.extractNodeName(code[start].text);
        }
        result.push({nameId, start, end: basicBlock.end});

        return result;
    }

    // Name of the node that execution falls through to at `index`, or '' when
    // there is no such line or the line is the blank function terminator.
    private getFallthroughTarget(asmArr: AssemblyLine[], index: number): string {
        const next = asmArr[index];
        if (!next || this.isFunctionEnd(next.text)) return '';
        return this.extractNodeName(next.text);
    }

    // Builds the edges leaving each canonical block, based on the type of the
    // block's last instruction:
    //   - unconditional jump: blue edge to the jump target
    //   - conditional jump:   green edge (taken) and red edge (not taken)
    //   - anything else that is not a return: grey fall-through edge
    // Edges whose destination cannot be determined are omitted.
    override makeEdges(asmArr: AssemblyLine[], arrOfCanonicalBasicBlock: CanonicalBB[]) {
        const edges: Edge[] = [];

        const addEdge = (sourceNode: string, targetNode: string, color: EdgeColor) => {
            // An empty name means the target address could not be parsed
            // (e.g. an indirect branch) or that there is no next instruction.
            if (targetNode) {
                edges.push({from: sourceNode, to: targetNode, arrows: 'to', color: color});
            }
        };

        for (const x of arrOfCanonicalBasicBlock) {
            const lastLine = asmArr[x.end - 1];
            if (!lastLine) continue;
            const lastInst = lastLine.text;
            const opcode = this.getOpcode(lastInst);
            switch (this.instructionSetInfo.getInstructionType(opcode)) {
                case InstructionType.jmp: {
                    addEdge(x.nameId, this.getJmpAddr(lastInst), 'blue');
                    break;
                }
                case InstructionType.conditionalJmpInst: {
                    // Branch taken
                    addEdge(x.nameId, this.getJmpAddr(lastInst), 'green');
                    // Branch not taken
                    addEdge(x.nameId, this.getFallthroughTarget(asmArr, x.end), 'red');
                    break;
                }
                case InstructionType.notRetInst: {
                    // No jmp, but the next instruction is in a different basic
                    // block because it is the target of another jmp.
                    addEdge(x.nameId, this.getFallthroughTarget(asmArr, x.end), 'grey');
                    break;
                }
                case InstructionType.retInst: {
                    break;
                }
            }
        }
        logger.debug(edges);
        return edges;
    }

    // Produces the CFG (nodes and edges) for the function occupying `fn`
    // within `code`.
    override generateFunctionCfg(code: AssemblyLine[], fn: Range) {
        this.code = _.clone(code);
        const arrOfCanonicalBasicBlock: CanonicalBB[] = [];
        for (const bb of this.splitToBasicBlocks(code, fn)) {
            // We don't want to use the base class's split method.
            arrOfCanonicalBasicBlock.push(...this.splitToCanonicalBasicBlockOat(code, bb));
        }
        return {
            nodes: this.makeNodes(code, arrOfCanonicalBasicBlock),
            edges: this.makeEdges(code, arrOfCanonicalBasicBlock),
        };
    }
}