如何将带有嵌套表格的表格的html转换为docx?

问题描述 投票:0回答:1

我想使用单元格中包含嵌套表格的表格转换任何 html。

当我尝试执行此操作时,嵌套表所在的行后面会出现其他行。 嵌入表格后的行数等于前一行嵌入表格的行数之和,而我只需要他将表格插入单元格即可

这是我的转换代码:

from docx import Document
from bs4 import BeautifulSoup

class HtmlToDocx:
    def __init__(self):
        self.document = Document()

    def handle_table(self, table_soup, parent_docx_table=None, row_idx=0, col_idx=0):
        
        rows, cols = self.get_table_dimensions(table_soup)
        
      
        if parent_docx_table:
            cell = parent_docx_table.cell(row_idx, col_idx)
            docx_table = cell.add_table(rows=rows, cols=cols)
        else:
            docx_table = self.document.add_table(rows=rows, cols=cols)
    
        rows = self.get_table_rows(table_soup)
        cell_row = 0
        for row in rows:
            cols = self.get_table_columns(row)
            cell_col = 0
            for col in cols:
                nested_table = col.find('table') 
                docx_cell = docx_table.cell(cell_row, cell_col)
    
                if nested_table:
                    
                    self.handle_table(nested_table, docx_table, cell_row, cell_col)
                    
                    cell_col += 1  
                    continue  

                cell_html = self.get_cell_html(col)
                docx_cell.text = cell_html
    
                cell_col += 1
            cell_row += 1


    def get_table_rows(self, table_soup):
        return table_soup.find_all('tr')

    def get_table_columns(self, row):
        return row.find_all(['th', 'td'], recursive=False)

    def get_cell_html(self, soup):
        return ''.join([str(i) for i in soup.contents if not (i.name == 'table' or isinstance(i, BeautifulSoup))])

    def get_table_dimensions(self, table_soup):
        rows = self.get_table_rows(table_soup)
        cols = self.get_table_columns(rows[0]) if rows else []
        return len(rows), len(cols)


    def add_html_to_docx(self, html_content, output_file):
        self.soup = BeautifulSoup(html_content, "html.parser")
        top_tables = self.soup.find_all('table', recursive=False)
        for table_soup in top_tables:
            self.handle_table(table_soup)
        self.document.save(output_file)

html_content = """
   
    <table style="border-collapse: collapse; width: 100%; height: 232px;" border="1">
        <tbody>
        <tr style="height: 22px;">
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        </tr>
        <tr style="height: 188px;">
        <td style="width: 23.7413%; height: 188px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 188px;">
        </td>
        <td style="width: 23.7413%; height: 188px;">
        <table style="border-collapse: collapse; width: 95.0877%;" border="1">
        <tbody>
        <tr>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6246%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">ааа</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6246%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6246%;">&nbsp;</td>
        </tr>
        </tbody>
        </table>
        </td>
        <td style="width: 23.7413%; height: 188px;">
        <table style="border-collapse: collapse; width: 95.0877%;" border="1">
        <tbody>
        <tr>
        <td style="width: 27.8655%;">е</td>
        <td style="width: 27.8655%;">&nbsp;</td>
        <td style="width: 27.8742%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 27.8655%;">&nbsp;</td>
        <td style="width: 27.8655%;">еее</td>
        <td style="width: 27.8742%;">&nbsp;</td>
        </tr>
        </tbody>
        </table>
        </td>
        </tr>
        <tr style="height: 22px;">
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">
        <table style="border-collapse: collapse; width: 95.0877%;" border="1">
        <tbody>
        <tr>
        <td style="width: 19.5803%;">11</td>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5889%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5803%;">222</td>
        <td style="width: 19.5889%;">333</td>
        </tr>
        </tbody>
        </table>
        </td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        </tr>
        </tbody>
        </table>
"""
converter = HtmlToDocx()
converter.add_html_to_docx(html_content, "output.docx")

图片中是我得到的: 在此输入图片描述

html beautifulsoup docx python-docx
1个回答
0
投票

只需要传递每个嵌套表的 id

def add_unique_ids_to_tables(self, soup):
    tables = soup.find_all('table')
    for table in tables:
        table['data-table-id'] = str(uuid.uuid4())

def get_table_rows(self, table_soup):
    """
    Returns only the <tr> rows that belong to the specified table, ignoring nested tables.
    """
    table_id = table_soup.get('data-table-id')
    if not table_id:
        return []

    rows = []
    for tr in table_soup.find_all('tr'):
        parent_table = tr.find_parent('table')
        parent_table_id = parent_table.get('data-table-id') if parent_table else None
        if parent_table_id == table_id:
            rows.append(tr)
    return rows

def get_table_columns(self, row):
    return row.find_all(['th', 'td'], recursive=False)

def get_cell_html(self, soup):
    return ''.join([str(i) for i in soup.contents if i.name != 'table'])

def get_table_dimensions(self, table_soup):
    """
    Gets the dimensions of the table, ignoring nested tables.
    """
    rows = self.get_table_rows(table_soup)
    if not rows:
        return 0, 0
    cols = self.get_table_columns(rows[0])
    return len(rows), len(cols)

def process_table(self, table_element, docx_table=None, row_idx=0, col_idx=0):
    rows_count, cols_count = self.get_table_dimensions(table_element)

    if docx_table:
        docx_cell = docx_table.cell(row_idx, col_idx)
        table = docx_cell.add_table(rows=rows_count, cols=cols_count)
        table.style = 'Table Grid'
    else:
        # Основная таблица
        table = self.doc.add_table(rows=rows_count, cols=cols_count)
        table.style = 'Table Grid'

    rows = self.get_table_rows(table_element)

    for row_idx, row in enumerate(rows):
        cells = self.get_table_columns(row)
        for col_idx, cell_soup in enumerate(cells):
            docx_cell = table.cell(row_idx, col_idx)
            docx_cell.text = ''  

            for element in cell_soup.contents:
                if element.name == 'table':
                    self.process_table(element, docx_table=table, row_idx=row_idx, col_idx=col_idx)
                else:
                    # Обработка других элементов
                    cell_paragraph = docx_cell.paragraphs[0]
                    self.process_element(element, paragraph=cell_paragraph)

        if row_idx == 0:
            for run in docx_cell.paragraphs[0].runs:
                run.bold = False

    if not docx_table:
        table.style = 'Table Grid'
© www.soinside.com 2019 - 2024. All rights reserved.