我正在尝试以编程方式解析(即“反射”)Google 风格的文档字符串。我正在使用 `sphinx.ext.napoleon`,因为似乎没有多少工具可以做到这一点。我参照这个示例,编写了以下函数和调用代码:
from sphinx.ext.napoleon import Config, GoogleDocstring
def foo(arg: int | None = 5) -> None:
"""Stub summary.
Args:
arg(int): Optional integer defaulted to 5.
"""
# Hand the function's raw ``__doc__`` attribute to napoleon's GoogleDocstring
# and print whatever it renders.
docstring = GoogleDocstring(foo.__doc__)
print(docstring)
但是,我的用法不会像 Sphinx 示例那样自动将打印输出转换为 reST 样式。
这引出了我的问题:如何以编程方式从 Google 风格的文档字符串中提取摘要、扩展描述、参数名称和参数描述?理想情况下,它们会被转换为某种数据结构(例如 `dict` 或 dataclass)。
不过,您可以改用内置的 `inspect` 模块来获取文档字符串(`inspect.getdoc` 会清理文档字符串的缩进),如下所示:
import inspect
# Fetch the docstring through inspect.getdoc() rather than reading
# foo.__doc__ directly, then render and print it with GoogleDocstring.
docstring = GoogleDocstring(inspect.getdoc(foo))
print(docstring)
这将以以下格式打印:
Stub summary.
:param arg: Optional integer defaulted to 5.
:type arg: int
然后,您可以编写自己的解析器,或使用 doctrans、docstring_parser 等第三方库。为了便于演示并保持简单,我采用了 doctrans 源码中的以下方案。由于它支持的功能超出了本问题的需求,而且我不想为此安装它而污染系统环境,所以直接复制了相关代码:
import re
import sys
# Finds the first ``:param``/``:return``/``:returns`` field marker, i.e. the
# point where the free-form description stops.
PARAM_OR_RETURNS_REGEX = re.compile(r":(?:param|returns?)")

# Captures everything that follows a ``:return:`` / ``:returns:`` field;
# DOTALL lets the capture span multiple lines.
RETURNS_REGEX = re.compile(r":returns?: (?P<doc>.*)", re.DOTALL)

# Captures each ``:param <name>:`` field with its text, stopping (lazily) at
# the next ``:param``, ``:return`` or ``:raises`` marker or at end of input.
PARAM_REGEX = re.compile(
    r":param (?P<name>[\*\w]+): (?P<doc>.*?)"
    r"(?:(?=:param)|(?=:return)|(?=:raises)|\Z)",
    re.DOTALL,
)
def trim(docstring):
    """Normalize a docstring's indentation (trim function from PEP-257).

    Tabs are expanded, the first line is stripped, the smallest indentation
    found on the non-blank lines after the first is removed from every
    following line, and leading/trailing blank lines are dropped.

    :param docstring: raw docstring text; may be ``None`` or empty.
    :returns: the cleaned text. An empty string is returned for falsy input.
        When the input contained a newline, the result ends with ``"\\n"``.
    """
    if not docstring:
        return ""

    # Convert tabs to spaces (following the normal Python rules)
    # and split into a list of lines.
    lines = docstring.expandtabs().splitlines()

    # Minimum indentation over non-blank lines; the first line doesn't count
    # because it starts right after the opening quotes.
    indent = min(
        (len(line) - len(line.lstrip()) for line in lines[1:] if line.strip()),
        default=sys.maxsize,
    )

    # Remove indentation (first line is special). If every later line is
    # blank there is nothing to dedent and those lines are dropped.
    trimmed = [lines[0].strip()]
    if indent < sys.maxsize:
        trimmed.extend(line[indent:].rstrip() for line in lines[1:])

    # Strip off trailing and leading blank lines.
    while trimmed and not trimmed[-1]:
        trimmed.pop()
    while trimmed and not trimmed[0]:
        trimmed.pop(0)

    # Callers/unit tests expect a line return at the end of multiline
    # docstrings, so re-add one empty trailing entry.
    if "\n" in docstring:
        trimmed.append("")

    return "\n".join(trimmed)
def reindent(string):
    """Strip surrounding whitespace from the text and from each of its lines.

    :param string: text to normalize.
    :returns: the text with outer whitespace removed and every line
        individually stripped; interior blank lines are kept.
    """
    return "\n".join(line.strip() for line in string.strip().split("\n"))
def doc_to_type_doc(name, doc):
    """Split one parameter's text into its description and its ``:type`` part.

    Lines before a ``:type <name>:`` field form the description; the remainder
    of that field line and every line after it form the type.

    :param name: parameter name the ``:type`` field is expected to refer to.
    :param doc: text captured for the parameter (description, optionally
        followed by a ``:type <name>: ...`` line).
    :returns: ``{'doc': ...}``, plus a ``'typ'`` key only when a ``:type``
        field was found.
    :raises AssertionError: when the ``:type`` field names another parameter.
    """
    docs, typ = [], []
    for line in trim(doc).splitlines():
        if line.startswith(':type'):
            line = line[len(':type '):]
            colon_at = line.find(':')
            found_name = line[:colon_at]
            # Raised explicitly (not via ``assert``) so the check still runs
            # when Python is started with -O.
            if name != found_name:
                raise AssertionError('{!r} != {!r}'.format(name, found_name))
            # Skip the colon and the single space that follows it.
            line = line[colon_at + 2:]
            # Drop a surrounding triple-backtick fence from the type, if any.
            if line.startswith('```') and line.endswith('```'):
                line = line[3:-3]
            typ.append(line)
        elif typ:
            # Once the type has started, following lines continue it.
            typ.append(line)
        else:
            docs.append(line)

    result = {'doc': '\n'.join(docs)}
    if typ:
        result['typ'] = '\n'.join(typ)
    return result
def parse_docstring(docstring):
    """Parse a reST-style docstring into its components.

    The first line is the short description. Text up to the first
    ``:param``/``:return(s)`` marker is the long description; the rest is
    parsed for parameter and return fields.

    :param docstring: docstring text using ``:param``/``:type``/``:returns``/
        ``:rtype`` fields; may be ``None`` or empty.
    :returns: a dictionary of form
        {
            'short_description': ...,
            'long_description': ...,
            'params': [{'name': ..., 'doc': ..., 'typ': ...}, ...],
            'returns': {'name': ..., 'typ': ...}
        }
        ``'typ'`` keys are present only when a type was documented, the
        ``'name'`` entry of ``'returns'`` holds the return description text,
        and ``'returns'`` is ``""`` when no return field was found.
    """
    short_description = long_description = returns = ""
    params = []

    if docstring:
        docstring = trim(docstring.lstrip('\n'))

        lines = docstring.split('\n', 1)
        short_description = lines[0]

        if len(lines) > 1:
            long_description = lines[1].strip()

            params_returns_desc = None

            # Everything from the first field marker onwards belongs to the
            # parameter/return section, not to the long description.
            match = PARAM_OR_RETURNS_REGEX.search(long_description)
            if match:
                long_desc_end = match.start()
                params_returns_desc = long_description[long_desc_end:].strip()
                long_description = long_description[:long_desc_end].rstrip()

            if params_returns_desc:
                params = [
                    dict(name=name, **doc_to_type_doc(name, doc))
                    for name, doc in PARAM_REGEX.findall(params_returns_desc)
                ]

                match = RETURNS_REGEX.search(params_returns_desc)
                if match:
                    returns = reindent(match.group('doc'))
                if returns:
                    # Split on the ``:rtype:`` marker itself rather than on
                    # the first colon, so a colon inside the description does
                    # not corrupt the description or the type.
                    marker = ':rtype:'
                    marker_at = returns.find(marker)
                    if marker_at == -1:
                        r_dict = {'name': returns.rstrip()}
                    else:
                        typ = returns[marker_at + len(marker):].strip()
                        # Drop a surrounding triple-backtick fence, if any.
                        if typ.startswith('```') and typ.endswith('```'):
                            typ = typ[3:-3]
                        r_dict = {
                            'name': returns[:marker_at].rstrip(),
                            'typ': typ,
                        }
                    returns = r_dict

    return {
        'short_description': short_description,
        'long_description': long_description,
        'params': params,
        'returns': returns
    }
# Join the lines returned by ``docstring.lines()`` into one string and parse it.
# NOTE(review): presumably ``docstring`` is the GoogleDocstring created earlier — confirm.
# NOTE(review): the returned dict is discarded here; wrap the call in print()
# (or assign it) when running this as a script.
parse_docstring("\n".join(docstring.lines()))