Files
compiler-explorer/etc/scripts/docenizers/docenizer-arm.py
Matt Godbolt 3160f628cc Fix AMD64 SHR missing and PTX documentation extraction (#8141)
Fixes test failures in assembly documentation generation.

## Import fixes

All docenizers: Change imports from `../base.js` to
`../../../types/assembly-docs.interfaces.js` with `import type`.

## AMD64: Fix missing SHR instruction

The docenizer only read the first `<table>` in HTML files. Files like
`SAL:SAR:SHL:SHR.html` have multiple tables - SHR was in table 2.

**Fix**: Read all tables and combine results.

## PTX: Fix broken documentation extraction

The PTX website structure changed. The old code looked for navigation
links like "Instructions: add, sub, mul" which no longer exist.

**Fix**: 
- Find instructions by scanning `<code>` blocks
- Map each to its documentation section
- Extract text from Description paragraphs
- Ensure common instructions (add, sub) map to their definition
sections, not changelogs

All asm-docs tests now pass.

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-09-26 15:07:59 -05:00

240 lines
8.8 KiB
Python

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import sys
import tarfile
import urllib
from urllib import request, parse
try:
from bs4 import BeautifulSoup
except ImportError:
raise ImportError("Please install BeautifulSoup (apt-get install python3-bs4 or pip install beautifulsoup4 should do it)")
parser = argparse.ArgumentParser(description='Docenizes XML version of the official ARM documents')
parser.add_argument('-i', '--inputfolder', type=str,
help='Folder where the input files reside as .xml. Default is ./asm-docs-arm/',
default='asm-docs-arm')
parser.add_argument('-o', '--outputpath', type=str, help='Final path of the .ts file. Default is ./asm-docs-arm32.ts',
default='./asm-docs-arm32.ts')
parser.add_argument('-d', '--downloadfolder', type=str,
help='Folder where the archive will be downloaded and extracted', default='asm-docs-arm')
parser.add_argument('-c', '--configfile', type=str, help='Json configuration file with contants', default='arm32.json', required=True)
# The maximum number of paragraphs from the description to copy.
MAX_DESC_PARAS = 5
STRIP_SUFFIX = re.compile(r'\s*(\(.*\))?\s*(--.*)?')
#arm32
FLDMX_RE = re.compile(r'^(FLDM)(\*)(X)')
FLDMX_SET = set(['DB', 'IA'])
#aarch64
CONDITION_RE = re.compile(r'^([A-Z][A-Z0-9]*\.?)(cond|<cc>)()')
CONDITION_SET = set(['EQ', 'NE', 'CS', 'CC', 'MI', 'PL', 'VS', 'VC', 'HI', 'LS', 'GE', 'LT', 'GT', 'LE', 'AL'])
FRINT_RE = re.compile(r'^(FRINT)(<r>)()')
FRINT_SET = set(['N', 'A', 'M', 'P', 'A', 'I', 'X'])
EXPAND_RE = [(FLDMX_RE, FLDMX_SET), (CONDITION_RE, CONDITION_SET), (FRINT_RE, FRINT_SET)]
# Some instructions are so broken we just take their names from the filename
UNPARSEABLE_INSTR_NAMES = []
# Some files contain instructions which cannot be parsed and which compilers are unlikely to emit
IGNORED_FILE_NAMES = [ ]
# Some instructions are defined in multiple files. We ignore a specific set of the
# duplicates here.
IGNORED_DUPLICATES = []
class Config:
class Archive:
url : str
name : str
subdir : str
def __init__(self, *, url, name, subdir):
self.url = str(url)
self.name = str(name)
self.subdir = str(subdir)
archive : Archive
documentation : str
def __init__(self, *, archive, documentation):
self.archive = Config.Archive(**archive)
self.documentation = str(documentation)
class Instruction(object):
def __init__(self, name, names, tooltip, body):
self.name = name
self.names = names
self.tooltip = tooltip.rstrip(': ,')
self.body = body
def __str__(self):
return "{} = {}\n{}".format(self.names, self.tooltip, self.body)
def get_url_for_instruction(instr):
return config.documentation
def download_asm_doc_archive(downloadfolder):
if not os.path.exists(downloadfolder):
print("Creating {} as download folder".format(downloadfolder))
os.makedirs(downloadfolder)
elif not os.path.isdir(downloadfolder):
print("Error: download folder {} is not a directory".format(downloadfolder))
sys.exit(1)
archive_name = os.path.join(downloadfolder, config.archive.name)
print("Downloading archive...")
urllib.request.urlretrieve(config.archive.url, archive_name)
def extract_asm_doc_archive(downloadfolder, inputfolder):
print("Extracting file...")
if os.path.isdir(os.path.join(inputfolder, config.archive.subdir)):
for root, dirs, files in os.walk(os.path.join(inputfolder, config.archive.subdir)):
for file in files:
if os.path.splitext(file)[1] == ".xml":
os.remove(os.path.join(root, file))
tar = tarfile.open(os.path.join(downloadfolder, config.archive.name))
tar.extractall(path=inputfolder)
def get_description_paragraphs(document_soup, part):
if part is None:
return None
for image in part.find_all('image'):
image.decompose()
for table in part.find_all('table'):
table.decompose()
paragraphs = part.find_all('para')[:5]
description_paragraphs = []
for paragraph in paragraphs:
paragraph = paragraph.wrap(document_soup.new_tag('p'))
paragraph.para.unwrap()
description_paragraphs.append(paragraph)
return description_paragraphs
instrclasses = set()
def parse(filename, f):
doc = BeautifulSoup(f, 'html.parser')
if doc.instructionsection is None:
print(filename + ": Failed to find instructionsection")
return None
instructionsection = doc.instructionsection
names = set()
for name in STRIP_SUFFIX.sub('',instructionsection['title']).split(','):
name = name.strip()
names.add(name)
for RE, SET in EXPAND_RE:
match = RE.match(name)
if match:
for elt in SET:
names.add(match.group(1) + elt + match.group(3))
body = get_description_paragraphs(doc, instructionsection.desc.authored)
if body is None:
body = get_description_paragraphs(doc, instructionsection.desc.description)
if body is None:
return None
return Instruction(
filename,
names,
body[0].text.strip(),
''.join(map(lambda x: str(x), body)).strip())
def parse_xml(directory):
print("Parsing instructions...")
instructions = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith(".xml") and file != "onebigfile.xml":
with open(os.path.join(root, file), encoding='utf-8') as f2:
name = os.path.splitext(file)[0]
if name in IGNORED_DUPLICATES or name in IGNORED_FILE_NAMES:
continue
instruction = parse(name, f2)
if not instruction:
continue
instructions.append(instruction)
return instructions
def self_test(instructions, directory):
# For each generated instruction, check that there is a path to a file in
# the documentation.
directory = os.path.join(directory, config.archive.subdir)
ok = True
for inst in instructions:
if not os.path.isfile(os.path.join(directory, inst.name + ".xml")):
print("Warning: {} has not file associated".format(inst.name))
ok = False
return ok
def docenizer():
global config
args = parser.parse_args()
print("Called with: {}".format(args))
with open(args.configfile) as f:
config = Config(**json.load(f))
print("Use configs: {}".format(json.dumps(config, default=lambda o: o.__dict__)))
# If we don't have the html folder already...
if not os.path.isdir(os.path.join(args.inputfolder, config.archive.subdir)):
# We don't, try with the compressed file
if not os.path.isfile(os.path.join(args.downloadfolder, config.archive.name)):
# We can't find that either. Download it
try:
download_asm_doc_archive(args.downloadfolder)
extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
except IOError as e:
print("Error when downloading archive:")
print(e)
sys.exit(1)
else:
# We have a file already downloaded
extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
instructions = parse_xml(os.path.join(args.inputfolder, config.archive.subdir))
print(instrclasses)
instructions.sort(key=lambda b: b.name)
self_test(instructions, args.inputfolder)
if not self_test(instructions, args.inputfolder):
print("Tests do not pass. Not writing output file. Aborting.")
sys.exit(3)
print("Writing {} instructions".format(len(instructions)))
with open(args.outputpath, 'w') as f:
f.write("""
import type {AssemblyInstructionInfo} from '../../../types/assembly-docs.interfaces.js';
export function getAsmOpcode(opcode: string | undefined): AssemblyInstructionInfo | undefined {
if (!opcode) return;
switch (opcode) {
""".lstrip())
all_inst = set()
for inst in instructions:
if not all_inst.isdisjoint(inst.names):
print("Overlap in instruction names: {} for {} - dropping".format(
inst.names.intersection(all_inst), inst.name))
continue
all_inst = all_inst.union(inst.names)
for name in sorted(inst.names):
f.write(' case "{}":\n'.format(name))
f.write(' return {}'.format(json.dumps({
"tooltip": inst.tooltip,
"html": inst.body,
"url": get_url_for_instruction(inst)
}, indent=16, separators=(',', ': ')))[:-1] + ' };\n\n')
f.write("""
}
}
""")
if __name__ == '__main__':
docenizer()