From 2f3f6501e92949a9e98f9359bd986849e8b33f5e Mon Sep 17 00:00:00 2001
From: Eero Vaher <eero.vaher@fysik.lu.se>
Date: Thu, 7 May 2026 18:43:30 +0200
Subject: [PATCH] Simplify extracting arguments from C doc comments

Previously when `FunctionDoc._get_arg_doc_list()` extracted arguments
from C doc comments it also extracted struct fields, which then had to
be removed after the fact. Using a better regex avoids extracting struct
fields to begin with. Turns out that the `ArgumentDoc` class is not
needed at all anymore.
---
 erfa_generator.py | 83 +++++++++++++++--------------------------------
 1 file changed, 27 insertions(+), 56 deletions(-)

diff --git a/erfa_generator.py b/erfa_generator.py
index 91a4381..6c6b1c9 100644
--- a/erfa_generator.py
+++ b/erfa_generator.py
@@ -27,50 +27,36 @@ def __init__(self, doc: str, pyname: str) -> None:
             doc = doc.replace("\n* ", "\n** ", 2).replace("\n*\n", "\n**\n", 1)
         self.doc: Final = doc.replace("\n**", "\n").removeprefix("\n")
 
-        inout = self._get_arg_doc_list("Given and returned:\n(.+?)\n\n")
-        self.input: Final = self._get_arg_doc_list("Given.*?\n(.+?)\n\n") + inout
-        self.output: Final = inout + self._get_arg_doc_list("Returned.*?\n(.+?)\n\n")
+        get_arg_doc_list = functools.partial(
+            self._get_arg_doc_list, n_spaces=4 if pyname in ("ab", "refco") else 5
+        )
+        inout = get_arg_doc_list("Given and returned:\n(.+?)\n\n")
+        self.input: Final = get_arg_doc_list("Given.*?\n(.+?)\n\n") | inout
+        self.output: Final = inout | get_arg_doc_list("Returned.*?\n(.+?)\n\n")
 
-    def _get_arg_doc_list(self, regex: str) -> list["ArgumentDoc"]:
+    def _get_arg_doc_list(self, regex: str, n_spaces: int) -> frozenset[str]:
         """Parse input/output doc section lines, getting arguments from them.
 
-        Ensure all elements of eraASTROM and eraLDBODY are left out, as those
-        are not input or output arguments themselves.  Also remove the nb
-        argument in from of eraLDBODY, as we infer nb from the python array.
+        Also remove the nb argument in front of eraLDBODY, as we infer nb from
+        the python array.
         """
         result = re.search(regex, self.doc, re.DOTALL)
         if result is None:
-            return []
+            return frozenset()
         doc_list = []
-        skip = []
-        for arg_doc in map(ArgumentDoc, result.group(1).splitlines()):
-            if arg_doc.name is not None:
-                if skip:
-                    if skip[0] == arg_doc.name:
-                        skip.pop(0)
-                        continue
-                    raise RuntimeError(
-                        f"We whould be skipping {skip[0]} but {arg_doc.name} encountered."
-                    )
-
-                if arg_doc.type.startswith('eraLDBODY'):
-                    # Special-case LDBODY: for those, the previous argument
-                    # is always the number of bodies, but we don't need it
-                    # as an input argument for the ufunc since we're going
-                    # to determine this from the array itself. Also skip
-                    # the description of its contents; those are not arguments.
-                    doc_list.pop()
-                    skip = ['bm', 'dl', 'pv']
-                elif arg_doc.type.startswith('eraASTROM'):
-                    # Special-case ASTROM: need to skip the description
-                    # of its contents; those are not arguments.
-                    skip = ['pmt', 'eb', 'eh', 'em', 'v', 'bm1',
-                            'bpn', 'along', 'xpl', 'ypl', 'sphi',
-                            'cphi', 'diurab', 'eral', 'refa', 'refb']
-
-                doc_list.append(arg_doc)
-
-        return doc_list
+        for name, c_type in re.findall(
+            rf"^{n_spaces * ' '}([\w\*,]+) +([\w\[\]\*]+) +.+?",
+            result.group(1),
+            re.MULTILINE,
+        ):
+            if c_type.startswith("eraLDBODY"):
+                # Special-case LDBODY: for those, the previous argument
+                # is always the number of bodies, but we don't need it
+                # as an input argument for the ufunc since we're going
+                # to determine this from the array itself.
+                doc_list.pop()
+            doc_list.extend(name.replace("*", "").split(","))
+        return frozenset(doc_list)
 
     @property
     def title(self):
@@ -91,19 +77,6 @@ def title(self):
         return '\n    '.join(description)
 
 
-class ArgumentDoc:
-
-    def __init__(self, doc):
-        if (match := re.search("^ +([^ ]+)[ ]+([^ ]+)[ ]+.+", doc)) is not None:
-            self.name = match.group(1)
-            if self.name.startswith('*'):  # Easier than getting the regex to behave...
-                self.name = self.name.replace('*', '')
-            self.type = match.group(2)
-        else:
-            self.name = None
-            self.type = None
-
-
 class Variable:
     """Properties shared by Argument, Return and StatusCode."""
 
@@ -145,12 +118,10 @@ def __init__(self, definition: str, doc: FunctionDoc) -> None:
     @functools.cached_property
     def inout_state(self) -> str:
         inout_state = ""
-        for i in self.doc.input:
-            if self.name in i.name.split(","):
-                inout_state = "in"
-        for o in self.doc.output:
-            if self.name in o.name.split(","):
-                inout_state = "inout" if inout_state == "in" else "out"
+        if self.name in self.doc.input:
+            inout_state = "in"
+        if self.name in self.doc.output:
+            inout_state = "inout" if inout_state == "in" else "out"
         return inout_state
 
     @property