我一直在尝试编写一个工具,以查看断开链接的指定路径中的Word文档。 我放弃了让它搜索文件夹,以为我只需要让它首先做文档即可。我的技能有限,阅读并尝试了Copilot的一些建议。 (将其分解为单个任务)我将其放在一起:
import docx
import os
import url
doc = docx.Document('C:/Users/demo.docx')
allText = []
def find_hyperlinks(doc):
hyperlinks = []
rels = doc.part.rels
for rel in rels:
if "hyperlink" in rels[rel].target_ref:
hyperlinks.append(rels[rel].target_ref)
return hyperlinks
def find_broken_links_in_docx(doc):
broken_links = []
hyperlinks = find_hyperlinks(doc)
for url in hyperlinks:
try:
response = requests.head(url, allow_redirects=True)
if response.status_code >= 400:
broken_links.append(url)
except requests.RequestException:
broken_links.append(url)
return broken_links
def write_report(report, output_file):
with open(output_file, 'w') as f:
for file_path, links in report.items():
f.write(f"File: {file_path}\n")
for link in links:
f.write(f" Broken link: {link}\n")
f.write("\n")
if __name__ == "__main__":
output_file = "C:/Results/broken_links_report.txt"
report = find_broken_links_in_docx(doc)
write_report(report, output_file)
print(f"Report written to {output_file}")
File "c:\Users\Scripts\playground\openinganddocx.py", line 41, in <module>
write_report(report, output_file)
File "c:\Users\Scripts\playground\openinganddocx.py", line 31, in write_report
for file_path, links in report.items():
AttributeError: 'list' object has no attribute 'items'
为参考: 线31
f.write(f"File: {file_path}\n")
线41
print(f"Report written to {output_file}")
dict
(报告),事实并非如此。这就是为什么您要得到
AttributeError: list has no attribute 'items'
。您想要的是具有结构
{'filepath': [<urls>]}
的字典。所以从那里开始:
def find_hyperlinks(doc_path: str):
doc = docx.Document(doc_path)
hyperlinks = []
rels = doc.part.rels
for rel in rels:
if "hyperlink" in rels[rel].target_ref:
hyperlinks.append(rels[rel].target_ref)
# here is an example of how I might return that value
return {doc_path: hyperlinks}
# from here, prune the hyperlinks that work
broken_links = {}
for doc_path, links in links_dict.items():
broken = []
for link in links:
if link_works(link):
continue
broken.append(link)
broken_links[doc_path] = broken
# etc