mirror of
https://github.com/compiler-explorer/compiler-explorer.git
synced 2025-12-27 10:33:59 -05:00
Fixes test failures in assembly documentation generation. ## Import fixes All docenizers: Change imports from `../base.js` to `../../../types/assembly-docs.interfaces.js` with `import type`. ## AMD64: Fix missing SHR instruction The docenizer only read the first `<table>` in HTML files. Files like `SAL:SAR:SHL:SHR.html` have multiple tables - SHR was in table 2. **Fix**: Read all tables and combine results. ## PTX: Fix broken documentation extraction The PTX website structure changed. The old code looked for navigation links like "Instructions: add, sub, mul" which no longer exist. **Fix**: - Find instructions by scanning `<code>` blocks - Map each to its documentation section - Extract text from Description paragraphs - Ensure common instructions (add, sub) map to their definition sections, not changelogs All asm-docs tests now pass. --------- Co-authored-by: Claude <noreply@anthropic.com>
240 lines
8.8 KiB
Python
240 lines
8.8 KiB
Python
#! /usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import tarfile
|
|
import urllib
|
|
from urllib import request, parse
|
|
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
raise ImportError("Please install BeautifulSoup (apt-get install python3-bs4 or pip install beautifulsoup4 should do it)")
|
|
|
|
# Command-line interface of the docenizer. Parsed inside docenizer().
parser = argparse.ArgumentParser(
    description='Docenizes XML version of the official ARM documents',
)
parser.add_argument(
    '-i', '--inputfolder',
    type=str,
    default='asm-docs-arm',
    help='Folder where the input files reside as .xml. Default is ./asm-docs-arm/',
)
parser.add_argument(
    '-o', '--outputpath',
    type=str,
    default='./asm-docs-arm32.ts',
    help='Final path of the .ts file. Default is ./asm-docs-arm32.ts',
)
parser.add_argument(
    '-d', '--downloadfolder',
    type=str,
    default='asm-docs-arm',
    help='Folder where the archive will be downloaded and extracted',
)
# NOTE: the option is required, so argparse never falls back to the default.
parser.add_argument(
    '-c', '--configfile',
    type=str,
    default='arm32.json',
    required=True,
    help='Json configuration file with contants',
)
|
|
|
|
# The maximum number of paragraphs from the description to copy.
MAX_DESC_PARAS = 5
# Removed from a section title before it is split into names: an optional
# parenthesised qualifier and an optional "-- ..." trailer (with surrounding
# whitespace).
STRIP_SUFFIX = re.compile(r'\s*(\(.*\))?\s*(--.*)?')

# Name expansion: each pattern captures (prefix, placeholder, suffix). When a
# name matches, parse() additionally registers prefix + element + suffix for
# every element of the paired set.

# arm32
FLDMX_RE = re.compile(r'^(FLDM)(\*)(X)')
FLDMX_SET = {'DB', 'IA'}

# aarch64
CONDITION_RE = re.compile(r'^([A-Z][A-Z0-9]*\.?)(cond|<cc>)()')
CONDITION_SET = {'EQ', 'NE', 'CS', 'CC', 'MI', 'PL', 'VS', 'VC', 'HI', 'LS', 'GE', 'LT', 'GT', 'LE', 'AL'}
FRINT_RE = re.compile(r'^(FRINT)(<r>)()')
# Rounding-mode letters substituted for <r>. 'Z' was missing (the list named
# 'A' twice), so FRINTZ was never produced from the FRINT<r> title.
FRINT_SET = {'N', 'A', 'M', 'P', 'Z', 'I', 'X'}

EXPAND_RE = [(FLDMX_RE, FLDMX_SET), (CONDITION_RE, CONDITION_SET), (FRINT_RE, FRINT_SET)]

# Some instructions are so broken we just take their names from the filename
UNPARSEABLE_INSTR_NAMES = []
# Some files contain instructions which cannot be parsed and which compilers are unlikely to emit
IGNORED_FILE_NAMES = []
# Some instructions are defined in multiple files. We ignore a specific set of the
# duplicates here.
IGNORED_DUPLICATES = []
|
|
|
|
|
|
class Config:
    """Settings read from the JSON configuration file.

    Instances are created from keyword arguments only, typically as
    ``Config(**json.load(f))``. The ``archive`` mapping is expanded into a
    nested ``Config.Archive`` and ``documentation`` is coerced to ``str``.
    """

    class Archive:
        """Description of the documentation archive.

        ``url`` is what gets downloaded, ``name`` is the archive's file name
        inside the download folder, and ``subdir`` is the folder (relative to
        the input folder) that holds the XML files. All three are coerced
        to ``str``.
        """

        url: str
        name: str
        subdir: str

        def __init__(self, *, url, name, subdir):
            self.url, self.name, self.subdir = str(url), str(name), str(subdir)

    archive: Archive
    documentation: str

    def __init__(self, *, archive, documentation):
        # `archive` is a mapping whose keys must be exactly url/name/subdir.
        self.archive = Config.Archive(**archive)
        self.documentation = str(documentation)
|
|
|
|
|
|
class Instruction:
    """Documentation extracted for one instruction file.

    Attributes:
        name: identifier of the source file (its name without extension).
        names: collection of mnemonics this entry documents.
        tooltip: short text, with trailing ':', ' ' and ',' characters removed.
        body: HTML string with the description paragraphs.
    """

    def __init__(self, name, names, tooltip, body):
        self.name = name
        self.names = names
        # Titles often end in punctuation that looks odd in a tooltip.
        self.tooltip = tooltip.rstrip(': ,')
        self.body = body

    def __str__(self):
        return f"{self.names} = {self.tooltip}\n{self.body}"
|
|
|
|
|
|
def get_url_for_instruction(instr):
    """Return the documentation URL to associate with *instr*.

    The same URL, taken from the module-level ``config``, is returned for
    every instruction; the argument itself is not inspected.
    """
    documentation_url = config.documentation
    return documentation_url
|
|
|
|
|
|
def download_asm_doc_archive(downloadfolder):
    """Download the configured archive into *downloadfolder*.

    The folder is created when it does not exist. If the path exists but is
    not a directory, an error is printed and the process exits with status 1.
    The archive URL and file name come from the module-level ``config``.
    """
    if os.path.exists(downloadfolder):
        if not os.path.isdir(downloadfolder):
            print(f"Error: download folder {downloadfolder} is not a directory")
            sys.exit(1)
    else:
        print(f"Creating {downloadfolder} as download folder")
        os.makedirs(downloadfolder)

    destination = os.path.join(downloadfolder, config.archive.name)
    print("Downloading archive...")
    urllib.request.urlretrieve(config.archive.url, destination)
|
|
|
|
|
|
def extract_asm_doc_archive(downloadfolder, inputfolder):
    """Unpack the downloaded archive from *downloadfolder* into *inputfolder*.

    Any ``.xml`` files already present under the configured sub-directory of
    *inputfolder* are deleted first, so files from a previous extraction do
    not linger. The archive name and sub-directory come from the module-level
    ``config``.
    """
    print("Extracting file...")
    docs_dir = os.path.join(inputfolder, config.archive.subdir)
    if os.path.isdir(docs_dir):
        for root, _dirs, files in os.walk(docs_dir):
            for filename in files:
                if os.path.splitext(filename)[1] == ".xml":
                    os.remove(os.path.join(root, filename))

    archive_path = os.path.join(downloadfolder, config.archive.name)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(archive_path) as tar:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from the network: refuse members that would
            # escape the destination or create special files. The feature
            # test is the one recommended by the tarfile documentation.
            tar.extractall(path=inputfolder, filter='data')
        else:
            tar.extractall(path=inputfolder)
|
|
|
|
def get_description_paragraphs(document_soup, part):
    """Collect the leading description paragraphs of *part* as ``<p>`` tags.

    Args:
        document_soup: the BeautifulSoup document, used to create new tags.
        part: the element to read paragraphs from, or None.

    Returns:
        None when *part* is None; otherwise a list of at most MAX_DESC_PARAS
        ``<p>`` tags (possibly empty), each replacing one ``<para>`` element.

    Note that *part* is modified in place: its ``<image>`` and ``<table>``
    elements are destroyed and the selected ``<para>`` elements are rewritten.
    """
    if part is None:
        return None
    # Images first, then tables, so neither ends up in the copied HTML.
    for tag_name in ('image', 'table'):
        for unwanted in part.find_all(tag_name):
            unwanted.decompose()

    description_paragraphs = []
    # Use the shared limit rather than a literal that can drift from it.
    for para in part.find_all('para')[:MAX_DESC_PARAS]:
        # Wrap <para> in a fresh <p>, then drop the <para> tag itself so only
        # its children remain inside the <p>.
        html_paragraph = para.wrap(document_soup.new_tag('p'))
        html_paragraph.para.unwrap()
        description_paragraphs.append(html_paragraph)
    return description_paragraphs
|
|
|
|
# Printed by docenizer() after parsing; nothing visible in this file adds to it.
instrclasses = set()
|
|
|
|
def parse(filename, f):
    """Parse one instruction XML file into an Instruction.

    Args:
        filename: file name without extension; stored as the Instruction name
            and used as a prefix in diagnostics.
        f: open file (or markup string) handed to BeautifulSoup.

    Returns:
        An Instruction, or None when the document has no
        ``<instructionsection>``, no ``<desc>``, or no description paragraph.
    """
    doc = BeautifulSoup(f, 'html.parser')
    instructionsection = doc.instructionsection
    if instructionsection is None:
        print(filename + ": Failed to find instructionsection")
        return None

    # The title is a comma-separated list of mnemonics, possibly followed by
    # a parenthesised qualifier and/or a "-- ..." trailer.
    names = set()
    for name in STRIP_SUFFIX.sub('', instructionsection['title']).split(','):
        name = name.strip()
        names.add(name)
        # Placeholder names (e.g. "<cc>") also register each concrete variant.
        for pattern, expansions in EXPAND_RE:
            match = pattern.match(name)
            if match:
                names.update(match.group(1) + elt + match.group(3) for elt in expansions)

    desc = instructionsection.desc
    if desc is None:
        # Previously this raised AttributeError and aborted the whole run.
        print(filename + ": Failed to find desc")
        return None

    # Prefer the <authored> text and fall back to <description>. An empty
    # paragraph list is treated like a missing element, because indexing
    # body[0] below would otherwise raise IndexError.
    body = get_description_paragraphs(doc, desc.authored)
    if not body:
        body = get_description_paragraphs(doc, desc.description)
    if not body:
        return None

    return Instruction(
        filename,
        names,
        body[0].text.strip(),
        ''.join(map(str, body)).strip())
|
|
|
|
def parse_xml(directory):
    """Parse every instruction XML file found under *directory*.

    The tree is walked recursively. Files that do not end in ``.xml``, the
    aggregate ``onebigfile.xml``, and files whose base name appears in
    IGNORED_DUPLICATES or IGNORED_FILE_NAMES are skipped, as are files for
    which parse() yields nothing.

    Returns:
        A list of the parsed instructions, in directory-walk order.
    """
    print("Parsing instructions...")
    parsed = []
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(".xml") or filename == "onebigfile.xml":
                continue
            with open(os.path.join(root, filename), encoding='utf-8') as handle:
                stem = os.path.splitext(filename)[0]
                if stem in IGNORED_DUPLICATES or stem in IGNORED_FILE_NAMES:
                    continue
                instruction = parse(stem, handle)
                if instruction:
                    parsed.append(instruction)
    return parsed
|
|
|
|
|
|
def self_test(instructions, directory):
    """Check that every instruction maps back to an XML file on disk.

    For each instruction, ``<directory>/<config.archive.subdir>/<name>.xml``
    must exist. A warning is printed for every instruction without one.

    Returns:
        True when all files exist, False otherwise.
    """
    docs_dir = os.path.join(directory, config.archive.subdir)
    orphaned = [
        inst.name
        for inst in instructions
        if not os.path.isfile(os.path.join(docs_dir, inst.name + ".xml"))
    ]
    for name in orphaned:
        print("Warning: {} has not file associated".format(name))
    return not orphaned
|
|
|
|
|
|
def docenizer():
    """Entry point: generate the TypeScript opcode lookup from the ARM XML docs.

    Steps:
      1. Parse the command line and load the JSON config into the
         module-level ``config``.
      2. If the XML folder is missing, extract it from the archive,
         downloading the archive first when it is not on disk either.
      3. Parse all XML files, sort them by name and run self_test().
      4. Write one ``case`` per mnemonic to the output file. An instruction
         whose names overlap with an already-written one is dropped.

    Exits with status 1 when the download/extract step raises IOError and
    with status 3 when the self test fails (no output is written then).
    """
    global config
    args = parser.parse_args()
    print("Called with: {}".format(args))

    # Explicit UTF-8 so the result does not depend on the locale, matching
    # how the XML files are read.
    with open(args.configfile, encoding='utf-8') as f:
        config = Config(**json.load(f))
    print("Use configs: {}".format(json.dumps(config, default=lambda o: o.__dict__)))

    # If we don't have the html folder already...
    if not os.path.isdir(os.path.join(args.inputfolder, config.archive.subdir)):
        # We don't, try with the compressed file
        if not os.path.isfile(os.path.join(args.downloadfolder, config.archive.name)):
            # We can't find that either. Download it
            try:
                download_asm_doc_archive(args.downloadfolder)
                extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
            except IOError as e:
                print("Error when downloading archive:")
                print(e)
                sys.exit(1)
        else:
            # We have a file already downloaded
            extract_asm_doc_archive(args.downloadfolder, args.inputfolder)

    instructions = parse_xml(os.path.join(args.inputfolder, config.archive.subdir))
    print(instrclasses)
    instructions.sort(key=lambda b: b.name)
    # Run the check once; it used to run twice and print each warning twice.
    if not self_test(instructions, args.inputfolder):
        print("Tests do not pass. Not writing output file. Aborting.")
        sys.exit(3)

    print("Writing {} instructions".format(len(instructions)))
    with open(args.outputpath, 'w', encoding='utf-8') as f:
        f.write("""
import type {AssemblyInstructionInfo} from '../../../types/assembly-docs.interfaces.js';

export function getAsmOpcode(opcode: string | undefined): AssemblyInstructionInfo | undefined {
if (!opcode) return;
switch (opcode) {
""".lstrip())
        all_inst = set()
        for inst in instructions:
            if not all_inst.isdisjoint(inst.names):
                print("Overlap in instruction names: {} for {} - dropping".format(
                    inst.names.intersection(all_inst), inst.name))
                continue
            all_inst.update(inst.names)
            # All names of one instruction fall through to a single return.
            for name in sorted(inst.names):
                f.write(' case "{}":\n'.format(name))
            # [:-1] removes the closing brace emitted by json.dumps so that it
            # can be re-emitted together with the statement terminator.
            f.write(' return {}'.format(json.dumps({
                "tooltip": inst.tooltip,
                "html": inst.body,
                "url": get_url_for_instruction(inst)
            }, indent=16, separators=(',', ': ')))[:-1] + ' };\n\n')
        f.write("""
}
}
""")
|
|
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    docenizer()
|