diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 33ae51f98..a8a0638ab 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -103,6 +103,7 @@ from .omnispatialbench import OmniSpatialBench from .omtgbench import OMTGBench from .ost_bench import OSTDataset +from .parsebench import ParseBench from .plotqa import PlotQA from .qbench_video import QBench_Video, QBench_Video_MCQ, QBench_Video_VQA from .reasonmap_plus import ReasonMap_Plus @@ -300,6 +301,7 @@ def evaluate(self, eval_file, **judge_kwargs): Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning, MMOral_OPG_OPEN, MMOral_OPG_CLOSED, # noqa: E501 SciDocBench, MMRarebenchDiagnosis, MMRarebenchTreatment, MMRarebenchCrossmodal, MMRarebenchExamination, + ParseBench, ] # add by EASI team diff --git a/vlmeval/dataset/parsebench.py b/vlmeval/dataset/parsebench.py new file mode 100644 index 000000000..a30a30a79 --- /dev/null +++ b/vlmeval/dataset/parsebench.py @@ -0,0 +1,391 @@ +import hashlib +import json +import os +import os.path as osp +import re +from datetime import datetime +from pathlib import Path + +import numpy as np +import pandas as pd + +from vlmeval.smp import LMUDataRoot, d2df, dump, get_intermediate_file_path, load +from .image_base import ImageBaseDataset + + +class ParseBench(ImageBaseDataset): + """ParseBench document parsing benchmark. + + This adapter evaluates VLMs as page-level document parsers. It reuses the + official ParseBench test cases and deterministic evaluators when the + ``parse_bench`` package is installed. + """ + + MODALITY = 'IMAGE' + TYPE = 'QA' + + DATASET_URL = { + 'ParseBench': '', + 'ParseBench_TEST': '', + } + DATASET_MD5 = {} + + SYSTEM_PROMPT = """Convert the document page image into structured parsing output. + +Return only valid JSON with this schema: +{ + "markdown": "complete Markdown transcription of the page", + "pages": [{"page_index": 0, "markdown": "page Markdown"}], + "layout_pages": [ + { + "page_number": 1, + "width": image_width, + "height": image_height, + "items": [ + { + "type": "text|title|table|figure|list|formula|other", + "md": "item Markdown", + "bbox": {"x": x, "y": y, "w": width, "h": height} + } + ] + } + ] +} + +If exact layout coordinates are unavailable, still return accurate Markdown and +use an empty layout_pages list. Do not include explanations or code fences.""" + + @classmethod + def supported_datasets(cls): + return list(cls.DATASET_URL) + + def load_data(self, dataset): + data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv') + self.data_path = data_path + + if osp.exists(data_path) and not os.environ.get('PARSEBENCH_REBUILD'): + return load(data_path) + + data_dir = self._resolve_data_dir(dataset) + test_cases = self._load_parsebench_cases(data_dir) + rows = [] + for test_case in test_cases: + if getattr(test_case, 'group', None) not in { + 'chart', 'layout', 'table', 'text_content', 'text_formatting' + }: + continue + image_path = self._render_first_page(Path(test_case.file_path), dataset) + rows.append({ + 'index': test_case.test_id, + 'image_path': image_path, + 'question': self.SYSTEM_PROMPT, + 'test_id': test_case.test_id, + 'group': test_case.group, + 'source_file_path': str(test_case.file_path), + }) + + if not rows: + raise RuntimeError(f'No ParseBench test cases found under {data_dir}') + + data = pd.DataFrame(rows) + dump(data, data_path) + return data + + def build_prompt(self, line): + if isinstance(line, int): + line = self.data.iloc[line] + image_path = self.dump_image(line)[0] + return [ + dict(type='image', value=image_path), + dict(type='text', value=line['question']), + ] + + def evaluate(self, eval_file, **judge_kwargs): + data = load(eval_file) + required = {'prediction', 'test_id', 'source_file_path', 'group'} + missing = required - set(data.columns) + if missing: + raise ValueError(f'ParseBench evaluation requires columns: {sorted(missing)}') + + data_dir = self._resolve_data_dir(self.dataset_name) + test_cases = self._load_parsebench_cases(data_dir) + test_case_map = {tc.test_id: tc for tc in test_cases} + + results = [] + detailed_rows = [] + pipeline_name = osp.splitext(osp.basename(eval_file))[0].replace(f'_{self.dataset_name}', '') + + for _, line in data.iterrows(): + test_id = str(line['test_id']) + test_case = test_case_map.get(test_id) + if test_case is None: + detailed_rows.append({ + 'test_id': test_id, + 'group': line['group'], + 'success': False, + 'error': 'No matching ParseBench test case', + }) + continue + + try: + inference_result = self._build_inference_result(line, pipeline_name) + result = self._evaluate_one(inference_result, test_case) + results.append(result) + metric_values = {m.metric_name: m.value for m in result.metrics} + detailed_rows.append({ + 'test_id': test_id, + 'group': line['group'], + 'success': result.success, + 'error': result.error, + **metric_values, + }) + except Exception as e: + detailed_rows.append({ + 'test_id': test_id, + 'group': line['group'], + 'success': False, + 'error': str(e), + }) + + detailed = pd.DataFrame(detailed_rows) + detail_file = get_intermediate_file_path(eval_file, '_parsebench_eval') + dump(detailed, detail_file) + + summary = self._summarize_results(results, detailed) + ret = d2df(summary) + score_file = get_intermediate_file_path(eval_file, '_score') + dump(ret, score_file) + return ret + + def _resolve_data_dir(self, dataset): + env_dir = os.environ.get('PARSEBENCH_DATA_DIR') + if env_dir: + return Path(env_dir).expanduser().resolve() + + try: + from parse_bench.data.download import download_dataset + except ImportError as e: + raise ImportError( + 'ParseBench requires the official parse_bench package. ' + 'Install it from the ParseBench repository, or set PARSEBENCH_DATA_DIR ' + 'to an already downloaded ParseBench dataset directory.' + ) from e + + root = Path(LMUDataRoot()) / 'ParseBench' + return download_dataset(root / ('test' if dataset.endswith('_TEST') else 'full'), + test=dataset.endswith('_TEST')) + + @staticmethod + def _load_parsebench_cases(data_dir): + try: + from parse_bench.test_cases import load_test_cases + except ImportError as e: + raise ImportError( + 'ParseBench evaluation requires parse_bench. ' + 'Install the ParseBench package before building or evaluating this dataset.' + ) from e + return load_test_cases(root_dir=Path(data_dir), require_test_json=False, product_type='parse') + + def _render_first_page(self, source_file, dataset): + image_root = Path(LMUDataRoot()) / 'images' / dataset + image_root.mkdir(parents=True, exist_ok=True) + path_hash = hashlib.md5(str(source_file).encode('utf-8')).hexdigest()[:10] + out_path = image_root / f'{source_file.stem}_{path_hash}.png' + if out_path.exists(): + return str(out_path) + + suffix = source_file.suffix.lower() + if suffix in {'.png', '.jpg', '.jpeg', '.jfif'}: + return str(source_file) + if suffix != '.pdf': + raise ValueError(f'Unsupported ParseBench source file: {source_file}') + + try: + import fitz + except ImportError as e: + raise ImportError('ParseBench PDF rendering requires pymupdf. Install with `pip install pymupdf`.') from e + + with fitz.open(str(source_file)) as doc: + page = doc.load_page(0) + pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False) + pix.save(str(out_path)) + return str(out_path) + + @staticmethod + def _extract_json_object(text): + text = str(text).strip() + fence = re.search(r'```(?:json)?\s*(.*?)```', text, flags=re.S | re.I) + if fence: + text = fence.group(1).strip() + try: + return json.loads(text) + except Exception: + pass + + start = text.find('{') + end = text.rfind('}') + if start >= 0 and end > start: + try: + return json.loads(text[start:end + 1]) + except Exception: + return None + return None + + def _prediction_to_parse_output(self, prediction, example_id, pipeline_name): + from parse_bench.schemas.parse_output import PageIR, ParseOutput, ParseLayoutPageIR + + obj = self._extract_json_object(prediction) + if isinstance(obj, dict): + markdown = str(obj.get('markdown') or obj.get('md') or '') + pages_payload = obj.get('pages') or [] + layout_payload = obj.get('layout_pages') or obj.get('layout') or [] + if not markdown and pages_payload: + markdown = '\n\n'.join(str(p.get('markdown') or p.get('md') or '') for p in pages_payload) + if not markdown: + markdown = str(prediction) + + pages = [] + for i, page in enumerate(pages_payload): + if isinstance(page, dict): + pages.append(PageIR( + page_index=int(page.get('page_index', i)), + markdown=str(page.get('markdown') or page.get('md') or ''), + )) + if not pages: + pages = [PageIR(page_index=0, markdown=markdown)] + + layout_pages = [] + for page in layout_payload if isinstance(layout_payload, list) else []: + if isinstance(page, dict): + layout_pages.append(ParseLayoutPageIR.model_validate(page)) + + return ParseOutput( + example_id=example_id, + pipeline_name=pipeline_name, + pages=pages, + layout_pages=layout_pages, + markdown=markdown, + ) + + markdown = str(prediction) + return ParseOutput( + example_id=example_id, + pipeline_name=pipeline_name, + pages=[PageIR(page_index=0, markdown=markdown)], + markdown=markdown, + ) + + def _build_inference_result(self, line, pipeline_name): + from parse_bench.schemas.pipeline_io import InferenceRequest, InferenceResult + from parse_bench.schemas.product import ProductType + + test_id = str(line['test_id']) + now = datetime.now() + output = self._prediction_to_parse_output(line['prediction'], test_id, pipeline_name) + return InferenceResult( + request=InferenceRequest( + example_id=test_id, + source_file_path=str(line['source_file_path']), + product_type=ProductType.PARSE, + ), + pipeline_name=pipeline_name, + product_type=ProductType.PARSE, + raw_output={'prediction': str(line['prediction'])}, + output=output, + started_at=now, + completed_at=now, + latency_in_ms=0, + ) + + @staticmethod + def _evaluate_one(inference_result, test_case): + from parse_bench.evaluation.evaluators.layoutdet import LayoutDetectionEvaluator + from parse_bench.evaluation.evaluators.parse import ParseEvaluator + from parse_bench.evaluation.layout_adapters import create_layout_adapter_for_result + from parse_bench.schemas.pipeline_io import InferenceResult + from parse_bench.schemas.product import ProductType + from parse_bench.test_cases.schema import LayoutDetectionTestCase + + if isinstance(test_case, LayoutDetectionTestCase): + adapter = create_layout_adapter_for_result(inference_result) + layout_output = adapter.to_layout_output( + inference_result, + page_filter=getattr(test_case, 'page_index', 0) + 1, + ) + layout_result = InferenceResult( + request=inference_result.request, + pipeline_name=inference_result.pipeline_name, + product_type=ProductType.LAYOUT_DETECTION, + raw_output=inference_result.raw_output, + output=layout_output, + started_at=inference_result.started_at, + completed_at=inference_result.completed_at, + latency_in_ms=inference_result.latency_in_ms, + ) + return LayoutDetectionEvaluator().evaluate(layout_result, test_case) + + return ParseEvaluator().evaluate(inference_result, test_case) + + @staticmethod + def _summarize_results(results, detailed): + metric_rows = [] + for result in results: + if not result.success: + continue + group = result.test_id.split('/')[0] + for metric in result.metrics: + metric_rows.append({ + 'group': group, + 'metric': metric.metric_name, + 'value': metric.value, + }) + + summary = { + 'Overall': 0.0, + 'Tables': 0.0, + 'Charts': 0.0, + 'Content Faithfulness': 0.0, + 'Semantic Formatting': 0.0, + 'Visual Grounding': 0.0, + 'Evaluated': int(len(results)), + 'Failed': int((detailed['success'] == False).sum()) if 'success' in detailed else 0, # noqa: E712 + } + if not metric_rows: + return summary + + metrics = pd.DataFrame(metric_rows) + group_name = { + 'table': 'Tables', + 'chart': 'Charts', + 'text_content': 'Content Faithfulness', + 'text_formatting': 'Semantic Formatting', + 'layout': 'Visual Grounding', + } + preferred = { + 'table': ('grits_trm_composite', 'table_record_match', 'grits_con'), + 'chart': ('rule_pass_rate', 'chart_data', 'content_faithfulness'), + 'text_content': ('content_faithfulness', 'rule_based'), + 'text_formatting': ('semantic_formatting', 'rule_based'), + 'layout': ('parse_field_element_pass_rate', 'element_pass_rate', 'f1'), + } + + group_scores = [] + for group, label in group_name.items(): + sub = metrics[metrics['group'] == group] + if sub.empty: + continue + chosen = None + for key in preferred[group]: + cand = sub[sub['metric'].str.lower().str.contains(key)] + if not cand.empty: + chosen = cand + break + if chosen is None: + chosen = sub + score = float(np.mean(chosen['value'])) * 100 + summary[label] = round(score, 2) + group_scores.append(score) + + if group_scores: + summary['Overall'] = round(float(np.mean(group_scores)), 2) + return summary