mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-12-27 13:30:45 -05:00
scripts/kernel-doc.py: properly handle struct_group macros
Handing nested parenthesis with regular expressions is not an easy task. It is even harder with Python's re module, as it has a limited subset of regular expressions, missing more advanced features. We might use instead Python regex module, but still the regular expressions are very hard to understand. So, instead, add a logic to properly match delimiters. Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org> Signed-off-by: Jonathan Corbet <corbet@lwn.net> Link: https://lore.kernel.org/r/74dee485f70b7ce85e90496bfdd360283a677a58.1744106241.git.mchehab+huawei@kernel.org
This commit is contained in:
committed by
Jonathan Corbet
parent
3592385668
commit
01d3235dde
@@ -167,6 +167,172 @@ class Re:
|
||||
def group(self, num):
|
||||
return self.last_match.group(num)
|
||||
|
||||
class NestedMatch:
|
||||
"""
|
||||
Finding nested delimiters is hard with regular expressions. It is
|
||||
even harder on Python with its normal re module, as there are several
|
||||
advanced regular expressions that are missing.
|
||||
|
||||
This is the case of this pattern:
|
||||
|
||||
'\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'
|
||||
|
||||
which is used to properly match open/close parenthesis of the
|
||||
string search STRUCT_GROUP(),
|
||||
|
||||
Add a class that counts pairs of delimiters, using it to match and
|
||||
replace nested expressions.
|
||||
|
||||
The original approach was suggested by:
|
||||
https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
|
||||
|
||||
Although I re-implemented it to make it more generic and match 3 types
|
||||
of delimiters. The logic checks if delimiters are paired. If not, it
|
||||
will ignore the search string.
|
||||
"""
|
||||
|
||||
# TODO:
|
||||
# Right now, regular expressions to match it are defined only up to
|
||||
# the start delimiter, e.g.:
|
||||
#
|
||||
# \bSTRUCT_GROUP\(
|
||||
#
|
||||
# is similar to: STRUCT_GROUP\((.*)\)
|
||||
# except that the content inside the match group is delimiter's aligned.
|
||||
#
|
||||
# The content inside parenthesis are converted into a single replace
|
||||
# group (e.g. r`\1').
|
||||
#
|
||||
# It would be nice to change such definition to support multiple
|
||||
# match groups, allowing a regex equivalent to.
|
||||
#
|
||||
# FOO\((.*), (.*), (.*)\)
|
||||
#
|
||||
# it is probably easier to define it not as a regular expression, but
|
||||
# with some lexical definition like:
|
||||
#
|
||||
# FOO(arg1, arg2, arg3)
|
||||
|
||||
|
||||
DELIMITER_PAIRS = {
|
||||
'{': '}',
|
||||
'(': ')',
|
||||
'[': ']',
|
||||
}
|
||||
|
||||
RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')
|
||||
|
||||
def _search(self, regex, line):
|
||||
"""
|
||||
Finds paired blocks for a regex that ends with a delimiter.
|
||||
|
||||
The suggestion of using finditer to match pairs came from:
|
||||
https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
|
||||
but I ended using a different implementation to align all three types
|
||||
of delimiters and seek for an initial regular expression.
|
||||
|
||||
The algorithm seeks for open/close paired delimiters and place them
|
||||
into a stack, yielding a start/stop position of each match when the
|
||||
stack is zeroed.
|
||||
|
||||
The algorithm shoud work fine for properly paired lines, but will
|
||||
silently ignore end delimiters that preceeds an start delimiter.
|
||||
This should be OK for kernel-doc parser, as unaligned delimiters
|
||||
would cause compilation errors. So, we don't need to rise exceptions
|
||||
to cover such issues.
|
||||
"""
|
||||
|
||||
stack = []
|
||||
|
||||
for match_re in regex.finditer(line):
|
||||
start = match_re.start()
|
||||
offset = match_re.end()
|
||||
|
||||
d = line[offset -1]
|
||||
if d not in self.DELIMITER_PAIRS:
|
||||
continue
|
||||
|
||||
end = self.DELIMITER_PAIRS[d]
|
||||
stack.append(end)
|
||||
|
||||
for match in self.RE_DELIM.finditer(line[offset:]):
|
||||
pos = match.start() + offset
|
||||
|
||||
d = line[pos]
|
||||
|
||||
if d in self.DELIMITER_PAIRS:
|
||||
end = self.DELIMITER_PAIRS[d]
|
||||
|
||||
stack.append(end)
|
||||
continue
|
||||
|
||||
# Does the end delimiter match what it is expected?
|
||||
if stack and d == stack[-1]:
|
||||
stack.pop()
|
||||
|
||||
if not stack:
|
||||
yield start, offset, pos + 1
|
||||
break
|
||||
|
||||
def search(self, regex, line):
|
||||
"""
|
||||
This is similar to re.search:
|
||||
|
||||
It matches a regex that it is followed by a delimiter,
|
||||
returning occurrences only if all delimiters are paired.
|
||||
"""
|
||||
|
||||
for t in self._search(regex, line):
|
||||
|
||||
yield line[t[0]:t[2]]
|
||||
|
||||
def sub(self, regex, sub, line, count=0):
|
||||
"""
|
||||
This is similar to re.sub:
|
||||
|
||||
It matches a regex that it is followed by a delimiter,
|
||||
replacing occurrences only if all delimiters are paired.
|
||||
|
||||
if r'\1' is used, it works just like re: it places there the
|
||||
matched paired data with the delimiter stripped.
|
||||
|
||||
If count is different than zero, it will replace at most count
|
||||
items.
|
||||
"""
|
||||
out = ""
|
||||
|
||||
cur_pos = 0
|
||||
n = 0
|
||||
|
||||
found = False
|
||||
for start, end, pos in self._search(regex, line):
|
||||
out += line[cur_pos:start]
|
||||
|
||||
# Value, ignoring start/end delimiters
|
||||
value = line[end:pos - 1]
|
||||
|
||||
# replaces \1 at the sub string, if \1 is used there
|
||||
new_sub = sub
|
||||
new_sub = new_sub.replace(r'\1', value)
|
||||
|
||||
out += new_sub
|
||||
|
||||
# Drop end ';' if any
|
||||
if line[pos] == ';':
|
||||
pos += 1
|
||||
|
||||
cur_pos = pos
|
||||
n += 1
|
||||
|
||||
if count and count >= n:
|
||||
break
|
||||
|
||||
# Append the remaining string
|
||||
l = len(line)
|
||||
out += line[cur_pos:l]
|
||||
|
||||
return out
|
||||
|
||||
#
|
||||
# Regular expressions used to parse kernel-doc markups at KernelDoc class.
|
||||
#
|
||||
@@ -738,22 +904,49 @@ class KernelDoc:
|
||||
(Re(r'\s*____cacheline_aligned_in_smp', re.S), ' '),
|
||||
(Re(r'\s*____cacheline_aligned', re.S), ' '),
|
||||
|
||||
# Unwrap struct_group() based on this definition:
|
||||
# Unwrap struct_group macros based on this definition:
|
||||
# __struct_group(TAG, NAME, ATTRS, MEMBERS...)
|
||||
# which has variants like: struct_group(NAME, MEMBERS...)
|
||||
# Only MEMBERS arguments require documentation.
|
||||
#
|
||||
# Parsing them happens on two steps:
|
||||
#
|
||||
# 1. drop struct group arguments that aren't at MEMBERS,
|
||||
# storing them as STRUCT_GROUP(MEMBERS)
|
||||
#
|
||||
# 2. remove STRUCT_GROUP() ancillary macro.
|
||||
#
|
||||
# The original logic used to remove STRUCT_GROUP() using an
|
||||
# advanced regex:
|
||||
#
|
||||
# \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;
|
||||
#
|
||||
# with two patterns that are incompatible with
|
||||
# Python re module, as it has:
|
||||
#
|
||||
# - a recursive pattern: (?1)
|
||||
# - an atomic grouping: (?>...)
|
||||
#
|
||||
# I tried a simpler version: but it didn't work either:
|
||||
# \bSTRUCT_GROUP\(([^\)]+)\)[^;]*;
|
||||
#
|
||||
# As it doesn't properly match the end parenthesis on some cases.
|
||||
#
|
||||
# So, a better solution was crafted: there's now a NestedMatch
|
||||
# class that ensures that delimiters after a search are properly
|
||||
# matched. So, the implementation to drop STRUCT_GROUP() will be
|
||||
# handled in separate.
|
||||
|
||||
(Re(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('),
|
||||
(Re(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('),
|
||||
(Re(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('),
|
||||
(Re(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('),
|
||||
|
||||
# This is incompatible with Python re, as it uses:
|
||||
# recursive patterns ((?1)) and atomic grouping ((?>...)):
|
||||
# '\bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;'
|
||||
# Let's see if this works instead:
|
||||
(Re(r'\bSTRUCT_GROUP\(([^\)]+)\)[^;]*;', re.S), r'\1'),
|
||||
|
||||
# Replace macros
|
||||
#
|
||||
# TODO: it is better to also move those to the NestedMatch logic,
|
||||
# to ensure that parenthesis will be properly matched.
|
||||
|
||||
(Re(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)', re.S), r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'),
|
||||
(Re(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^\)]+)\)', re.S), r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'),
|
||||
(Re(r'DECLARE_BITMAP\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'unsigned long \1[BITS_TO_LONGS(\2)]'),
|
||||
@@ -765,9 +958,22 @@ class KernelDoc:
|
||||
(Re(r'DEFINE_DMA_UNMAP_LEN\s*\(' + args_pattern + r'\)', re.S), r'__u32 \1'),
|
||||
]
|
||||
|
||||
# Regexes here are guaranteed to have the end limiter matching
|
||||
# the start delimiter. Yet, right now, only one replace group
|
||||
# is allowed.
|
||||
|
||||
sub_nested_prefixes = [
|
||||
(re.compile(r'\bSTRUCT_GROUP\('), r'\1'),
|
||||
]
|
||||
|
||||
for search, sub in sub_prefixes:
|
||||
members = search.sub(sub, members)
|
||||
|
||||
nested = NestedMatch()
|
||||
|
||||
for search, sub in sub_nested_prefixes:
|
||||
members = nested.sub(search, sub, members)
|
||||
|
||||
# Keeps the original declaration as-is
|
||||
declaration = members
|
||||
|
||||
|
||||
Reference in New Issue
Block a user