我想使用单元格中包含嵌套表格的表格转换任何 html。
当我尝试执行此操作时,嵌套表所在的行后面会出现其他行。 嵌入表格后的行数等于前一行嵌入表格的行数之和,而我只需要他将表格插入单元格即可
这是我的转换代码:
from docx import Document
from bs4 import BeautifulSoup
class HtmlToDocx:
def __init__(self):
self.document = Document()
def handle_table(self, table_soup, parent_docx_table=None, row_idx=0, col_idx=0):
rows, cols = self.get_table_dimensions(table_soup)
if parent_docx_table:
cell = parent_docx_table.cell(row_idx, col_idx)
docx_table = cell.add_table(rows=rows, cols=cols)
else:
docx_table = self.document.add_table(rows=rows, cols=cols)
rows = self.get_table_rows(table_soup)
cell_row = 0
for row in rows:
cols = self.get_table_columns(row)
cell_col = 0
for col in cols:
nested_table = col.find('table')
docx_cell = docx_table.cell(cell_row, cell_col)
if nested_table:
self.handle_table(nested_table, docx_table, cell_row, cell_col)
cell_col += 1
continue
cell_html = self.get_cell_html(col)
docx_cell.text = cell_html
cell_col += 1
cell_row += 1
def get_table_rows(self, table_soup):
return table_soup.find_all('tr')
def get_table_columns(self, row):
return row.find_all(['th', 'td'], recursive=False)
def get_cell_html(self, soup):
return ''.join([str(i) for i in soup.contents if not (i.name == 'table' or isinstance(i, BeautifulSoup))])
def get_table_dimensions(self, table_soup):
rows = self.get_table_rows(table_soup)
cols = self.get_table_columns(rows[0]) if rows else []
return len(rows), len(cols)
def add_html_to_docx(self, html_content, output_file):
self.soup = BeautifulSoup(html_content, "html.parser")
top_tables = self.soup.find_all('table', recursive=False)
for table_soup in top_tables:
self.handle_table(table_soup)
self.document.save(output_file)
html_content = """
<table style="border-collapse: collapse; width: 100%; height: 232px;" border="1">
<tbody>
<tr style="height: 22px;">
<td style="width: 23.7413%; height: 22px;"> </td>
<td style="width: 23.7413%; height: 22px;"> </td>
<td style="width: 23.7413%; height: 22px;"> </td>
<td style="width: 23.7413%; height: 22px;"> </td>
</tr>
<tr style="height: 188px;">
<td style="width: 23.7413%; height: 188px;"> </td>
<td style="width: 23.7413%; height: 188px;">
</td>
<td style="width: 23.7413%; height: 188px;">
<table style="border-collapse: collapse; width: 95.0877%;" border="1">
<tbody>
<tr>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6246%;"> </td>
</tr>
<tr>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6073%;">ааа</td>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6246%;"> </td>
</tr>
<tr>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6073%;"> </td>
<td style="width: 14.6246%;"> </td>
</tr>
</tbody>
</table>
</td>
<td style="width: 23.7413%; height: 188px;">
<table style="border-collapse: collapse; width: 95.0877%;" border="1">
<tbody>
<tr>
<td style="width: 27.8655%;">е</td>
<td style="width: 27.8655%;"> </td>
<td style="width: 27.8742%;"> </td>
</tr>
<tr>
<td style="width: 27.8655%;"> </td>
<td style="width: 27.8655%;">еее</td>
<td style="width: 27.8742%;"> </td>
</tr>
</tbody>
</table>
</td>
</tr>
<tr style="height: 22px;">
<td style="width: 23.7413%; height: 22px;"> </td>
<td style="width: 23.7413%; height: 22px;"> </td>
<td style="width: 23.7413%; height: 22px;">
<table style="border-collapse: collapse; width: 95.0877%;" border="1">
<tbody>
<tr>
<td style="width: 19.5803%;">11</td>
<td style="width: 19.5803%;"> </td>
<td style="width: 19.5803%;"> </td>
<td style="width: 19.5889%;"> </td>
</tr>
<tr>
<td style="width: 19.5803%;"> </td>
<td style="width: 19.5803%;"> </td>
<td style="width: 19.5803%;">222</td>
<td style="width: 19.5889%;">333</td>
</tr>
</tbody>
</table>
</td>
<td style="width: 23.7413%; height: 22px;"> </td>
</tr>
</tbody>
</table>
"""
converter = HtmlToDocx()
converter.add_html_to_docx(html_content, "output.docx")
图片中是我得到的: 在此输入图片描述
只需要传递每个嵌套表的 id
def add_unique_ids_to_tables(self, soup):
tables = soup.find_all('table')
for table in tables:
table['data-table-id'] = str(uuid.uuid4())
def get_table_rows(self, table_soup):
"""
Returns only the <tr> rows that belong to the specified table, ignoring nested tables.
"""
table_id = table_soup.get('data-table-id')
if not table_id:
return []
rows = []
for tr in table_soup.find_all('tr'):
parent_table = tr.find_parent('table')
parent_table_id = parent_table.get('data-table-id') if parent_table else None
if parent_table_id == table_id:
rows.append(tr)
return rows
def get_table_columns(self, row):
return row.find_all(['th', 'td'], recursive=False)
def get_cell_html(self, soup):
return ''.join([str(i) for i in soup.contents if i.name != 'table'])
def get_table_dimensions(self, table_soup):
"""
Gets the dimensions of the table, ignoring nested tables.
"""
rows = self.get_table_rows(table_soup)
if not rows:
return 0, 0
cols = self.get_table_columns(rows[0])
return len(rows), len(cols)
def process_table(self, table_element, docx_table=None, row_idx=0, col_idx=0):
rows_count, cols_count = self.get_table_dimensions(table_element)
if docx_table:
docx_cell = docx_table.cell(row_idx, col_idx)
table = docx_cell.add_table(rows=rows_count, cols=cols_count)
table.style = 'Table Grid'
else:
# Основная таблица
table = self.doc.add_table(rows=rows_count, cols=cols_count)
table.style = 'Table Grid'
rows = self.get_table_rows(table_element)
for row_idx, row in enumerate(rows):
cells = self.get_table_columns(row)
for col_idx, cell_soup in enumerate(cells):
docx_cell = table.cell(row_idx, col_idx)
docx_cell.text = ''
for element in cell_soup.contents:
if element.name == 'table':
self.process_table(element, docx_table=table, row_idx=row_idx, col_idx=col_idx)
else:
# Обработка других элементов
cell_paragraph = docx_cell.paragraphs[0]
self.process_element(element, paragraph=cell_paragraph)
if row_idx == 0:
for run in docx_cell.paragraphs[0].runs:
run.bold = False
if not docx_table:
table.style = 'Table Grid'