Source code for data_juicer.ops.mapper.extract_tables_from_html_mapper

import bs4

from data_juicer.utils.constant import Fields, MetaKeys

from ..base_op import OPERATORS, TAGGING_OPS, Mapper

OP_NAME = 'extract_tables_from_html_mapper'


[docs] @TAGGING_OPS.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class ExtractTablesFromHtmlMapper(Mapper): """Mapper to extract tables from HTML content."""
[docs] def __init__(self, tables_field_name: str = MetaKeys.html_tables, retain_html_tags: bool = False, include_header: bool = True, *args, **kwargs): """ Initialization method. :param tables_field_name: Field name to store the extracted tables. :param retain_html_tags: If True, retains HTML tags in the tables; otherwise, removes them. :param include_header: If True, includes the table header; otherwise, excludes it. This parameter is effective only when `retain_html_tags` is False and applies solely to the extracted table content. """ super().__init__(*args, **kwargs) self._init_parameters = self.remove_extra_parameters(locals()) self.tables_field_name = tables_field_name self.retain_html_tags = retain_html_tags self.include_header = include_header
[docs] def process_single(self, sample): # check if it's generated already if self.tables_field_name in sample[Fields.meta]: return sample # parse the HTML content using BeautifulSoup soup = bs4.BeautifulSoup(sample[self.text_key], 'html.parser') tables = soup.find_all('table') # if no tables are found, return an empty list if not tables: sample[Fields.meta][self.tables_field_name] = [] return sample # if retaining HTML tags, store the raw table elements if self.retain_html_tags: sample[Fields.meta][self.tables_field_name] = [ str(table) for table in tables ] return sample # extract table data without HTML tags extracted_tables = [] for table in tables: extracted_rows = [] for row in table.find_all('tr'): is_header_row = row.find('th', recursive=False) is not None # skip rows based on the include_header flag if not self.include_header and is_header_row: continue # extract text content from cells row_data = [ cell.get_text(strip=True) for cell in row.find_all(['td', 'th'], recursive=False) ] if row_data: extracted_rows.append(row_data) if extracted_rows: extracted_tables.append(extracted_rows) sample[Fields.meta][self.tables_field_name] = extracted_tables return sample