From ad360b63ef57447261dc1b3ca7e9c06b24ef9436 Mon Sep 17 00:00:00 2001 From: TianhaoLiang2000 <2662248501@qq.com> Date: Wed, 6 May 2026 10:51:25 +0800 Subject: [PATCH 1/2] Centralize default judge model selection --- run.py | 71 ++++---------------------------------------- vlmeval/judge.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ vlmeval/tools.py | 14 +++------ 3 files changed, 87 insertions(+), 75 deletions(-) create mode 100644 vlmeval/judge.py diff --git a/run.py b/run.py index 45558c765..880092207 100644 --- a/run.py +++ b/run.py @@ -58,10 +58,11 @@ def get_gpu_list(): from vlmeval.inference import infer_data_job from vlmeval.inference_mt import infer_data_job_mt from vlmeval.inference_video import infer_data_job_video +from vlmeval.judge import get_default_judge_model from vlmeval.smp import (MMBenchOfficialServer, build_eval_id, collect_run_benchmark_report, get_eval_file_format, get_logger, get_pred_file_format, - get_pred_file_path, githash, is_prediction_complete, listinstr, load, - load_env, prepare_reuse_files, proxy_set, setup_logger, timestr, + get_pred_file_path, githash, is_prediction_complete, load, load_env, + prepare_reuse_files, proxy_set, setup_logger, timestr, upsert_dataset_status, upsert_run_status) from vlmeval.utils.result_transfer import MMMU_result_transfer, MMTBench_result_transfer @@ -256,69 +257,9 @@ def get_judge_kwargs(dataset_name, dataset_type, args): if args.judge is not None: judge_kwargs['model'] = args.judge else: - if dataset_type in ['MCQ', 'Y/N', 'MCQ_MMMU_Pro'] or listinstr( - ['moviechat1k', 'mme-reasoning'], dataset_name.lower() - ): - if listinstr(['WeMath', 'MME-Reasoning'], dataset_name): - judge_kwargs['model'] = 'gpt-4o-mini' - elif listinstr(['VisualPuzzles'], dataset_name): - judge_kwargs['model'] = 'exact_matching' - elif listinstr(['PuzzleVQA'], dataset_name): - judge_kwargs['model'] = 'exact_matching' - elif listinstr(['VisuLogic'], dataset_name): - judge_kwargs['model'] = 'exact_matching' - else: - judge_kwargs['model'] = 'gpt-4o-mini' - elif listinstr(['MMVet', 'LLaVABench', 'MMBench_Video'], dataset_name): - if listinstr(['LLaVABench_KO'], dataset_name): - judge_kwargs['model'] = 'gpt-4o-0806' - else: - judge_kwargs['model'] = 'gpt-4-turbo' - elif listinstr(['VGRPBench'], dataset_name): - judge_kwargs['model'] = 'gpt-4o' - elif listinstr( - ['MathVista', 'MathVerse', 'MathVision', 'LENS', 'DynaMath', 'VL-RewardBench', - 'LogicVista', 'MOAT', 'OCR_Reasoning', 'VTCBench', 'Asclepius', - 'MMSafetyBench', 'MSSBench', 'SIUO', 'SIUO_GEN', 'XSTest', 'Flames'], dataset_name - ): - judge_kwargs['model'] = 'gpt-4o-mini' - elif listinstr(['OlympiadBench'], dataset_name): - use_api_judger = judge_kwargs.get("olympiad_use_api_judger", False) - if use_api_judger: - judge_kwargs['model'] = 'gpt-4o-mini' - elif listinstr( - ['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', - 'WildVision', 'MMAlignBench', 'MM-IFEval'], dataset_name - ): - judge_kwargs['model'] = 'gpt-4o' - elif listinstr(['ChartMimic'], dataset_name): - judge_kwargs['model'] = 'gpt-4o' - elif listinstr(['VDC'], dataset_name): - judge_kwargs['model'] = 'llama31-8b' - elif listinstr(['Video_MMLU_QA', 'Video_MMLU_CAP'], dataset_name): - judge_kwargs['model'] = 'qwen-72b' - elif listinstr(['MMVMBench'], dataset_name): - judge_kwargs['model'] = 'gpt-4o' - elif listinstr(['CVQA_EN', 'CVQA_LOC'], dataset_name): - judge_kwargs['model'] = 'gpt-4.1' - elif listinstr(['M4Bench'], dataset_name): - judge_kwargs['model'] = 'gpt-4o' - elif listinstr(['AyaVisionBench'], dataset_name): - judge_kwargs['model'] = 'gpt-4.1' - elif listinstr(['MathCanvas'], dataset_name): - judge_kwargs['model'] = 'gpt-4.1-2025-04-14' - elif listinstr(['MMReason'], dataset_name): - judge_kwargs['model'] = 'gpt-4.1' - elif listinstr(['CoreCognition'], dataset_name): - judge_kwargs['model'] = 'gpt-4.1' - elif listinstr(['WorldVQA'], dataset_name): - judge_kwargs['model'] = 'gpt-4o-1120' - elif listinstr(['Video-MME'], dataset_name): - judge_kwargs['model'] = 'gpt-4o-mini' - elif listinstr(['MaCBench'], dataset_name): - judge_kwargs['model'] = 'gpt-4o-mini' - elif listinstr(['SciDocBench'], dataset_name): - judge_kwargs['model'] = 'gpt-4o-mini' + judge_model = get_default_judge_model(dataset_name, dataset_type, judge_kwargs) + if judge_model is not None: + judge_kwargs['model'] = judge_model if args.use_verifier: judge_kwargs['use_verifier'] = True diff --git a/vlmeval/judge.py b/vlmeval/judge.py new file mode 100644 index 000000000..5e648b580 --- /dev/null +++ b/vlmeval/judge.py @@ -0,0 +1,77 @@ +def listinstr(lst, s): + return any(item in s for item in lst) + + +def get_default_judge_model(dataset_name, dataset_type, judge_kwargs=None): + """Return the default judge model for a dataset, or None if not specified.""" + judge_kwargs = judge_kwargs or {} + + if dataset_type in ['MCQ', 'Y/N', 'MCQ_MMMU_Pro'] or listinstr( + ['moviechat1k', 'mme-reasoning'], dataset_name.lower() + ): + if listinstr(['WeMath', 'MME-Reasoning'], dataset_name): + return 'gpt-4o-mini' + if listinstr(['VisualPuzzles'], dataset_name): + return 'exact_matching' + if listinstr(['PuzzleVQA'], dataset_name): + return 'exact_matching' + if listinstr(['VisuLogic'], dataset_name): + return 'exact_matching' + return 'gpt-4o-mini' + + if listinstr(['MMVet', 'LLaVABench', 'MMBench_Video', 'MMBench-Video'], dataset_name): + if listinstr(['LLaVABench_KO'], dataset_name): + return 'gpt-4o-0806' + return 'gpt-4-turbo' + + if listinstr(['VGRPBench'], dataset_name): + return 'gpt-4o' + + if listinstr( + ['MathVista', 'MathVerse', 'MathVision', 'LENS', 'DynaMath', 'VL-RewardBench', + 'LogicVista', 'MOAT', 'OCR_Reasoning', 'VTCBench', 'Asclepius', + 'MMSafetyBench', 'MSSBench', 'SIUO', 'SIUO_GEN', 'XSTest', 'Flames'], dataset_name + ): + return 'gpt-4o-mini' + + if listinstr(['OlympiadBench'], dataset_name): + if judge_kwargs.get('olympiad_use_api_judger', False): + return 'gpt-4o-mini' + return None + + if listinstr( + ['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', + 'WildVision', 'MMAlignBench', 'MM-IFEval'], dataset_name + ): + return 'gpt-4o' + + if listinstr(['ChartMimic'], dataset_name): + return 'gpt-4o' + if listinstr(['VDC'], dataset_name): + return 'llama31-8b' + if listinstr(['Video_MMLU_QA', 'Video_MMLU_CAP'], dataset_name): + return 'qwen-72b' + if listinstr(['MMVMBench'], dataset_name): + return 'gpt-4o' + if listinstr(['CVQA_EN', 'CVQA_LOC'], dataset_name): + return 'gpt-4.1' + if listinstr(['M4Bench'], dataset_name): + return 'gpt-4o' + if listinstr(['AyaVisionBench'], dataset_name): + return 'gpt-4.1' + if listinstr(['MathCanvas'], dataset_name): + return 'gpt-4.1-2025-04-14' + if listinstr(['MMReason'], dataset_name): + return 'gpt-4.1' + if listinstr(['CoreCognition'], dataset_name): + return 'gpt-4.1' + if listinstr(['WorldVQA'], dataset_name): + return 'gpt-4o-1120' + if listinstr(['Video-MME'], dataset_name): + return 'gpt-4o-mini' + if listinstr(['MaCBench'], dataset_name): + return 'gpt-4o-mini' + if listinstr(['SciDocBench'], dataset_name): + return 'gpt-4o-mini' + + return None diff --git a/vlmeval/tools.py b/vlmeval/tools.py index b8c7e121a..ca831dba3 100644 --- a/vlmeval/tools.py +++ b/vlmeval/tools.py @@ -17,6 +17,7 @@ supported_VLM, vila_series, wemm_series, xcomposer_series, xtuner_series, yivl_series) from vlmeval.dataset import SUPPORTED_DATASETS +from vlmeval.judge import get_default_judge_model from vlmeval.smp import (dump, get_logger, get_pred_file_format, listinstr, load, load_env, localize_df, ls, md5, mrlines, mwlines) @@ -446,16 +447,9 @@ def EVAL(dataset_name, data_file, **kwargs): # Set the judge kwargs first before evaluation or dumping judge_kwargs = {'nproc': 4, 'verbose': True} if 'model' not in kwargs: - if dataset.TYPE in ['MCQ', 'Y/N', 'MCQ_MMMU_Pro']: - judge_kwargs['model'] = 'chatgpt-0125' - elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name): - judge_kwargs['model'] = 'gpt-4-turbo' - elif listinstr(['MMLongBench', 'MMDU'], dataset_name): - judge_kwargs['model'] = 'gpt-4o' - elif listinstr(['DynaMath', 'MathVerse', 'MathVista', 'MathVision'], dataset_name): - judge_kwargs['model'] = 'gpt-4o-mini' - elif listinstr(['SFE'], dataset_name): - judge_kwargs['model'] = 'gpt-4o-1120' + judge_model = get_default_judge_model(dataset_name, dataset.TYPE, judge_kwargs) + if judge_model is not None: + judge_kwargs['model'] = judge_model else: judge_kwargs['model'] = kwargs['model'] judge_kwargs['nproc'] = kwargs.get('nproc', 4) From 97518772de8251349606df7f3bc0548cdffdee09 Mon Sep 17 00:00:00 2001 From: TianhaoLiang2000 <2662248501@qq.com> Date: Fri, 15 May 2026 16:12:41 +0800 Subject: [PATCH 2/2] Move judge defaults to benchmark classes --- run.py | 8 +-- vlmeval/dataset/asclepius.py | 1 + vlmeval/dataset/chartmimic.py | 1 + vlmeval/dataset/dude.py | 1 + vlmeval/dataset/dynamath.py | 1 + vlmeval/dataset/flames.py | 1 + vlmeval/dataset/image_mcq.py | 4 ++ vlmeval/dataset/image_mt.py | 1 + vlmeval/dataset/image_vqa.py | 18 +++++++ vlmeval/dataset/m4bench.py | 1 + vlmeval/dataset/macbench.py | 1 + vlmeval/dataset/miabench.py | 1 + vlmeval/dataset/mmalignbench.py | 1 + vlmeval/dataset/mmbench_video.py | 1 + vlmeval/dataset/mmifeval.py | 1 + vlmeval/dataset/mmlongbench.py | 1 + vlmeval/dataset/mmsafetybench.py | 1 + vlmeval/dataset/moat.py | 1 + vlmeval/dataset/moviechat1k.py | 1 + vlmeval/dataset/mssbench.py | 1 + vlmeval/dataset/scidocbench.py | 1 + vlmeval/dataset/siuo.py | 1 + vlmeval/dataset/siuo_gen.py | 1 + vlmeval/dataset/slidevqa.py | 1 + vlmeval/dataset/vdc.py | 1 + vlmeval/dataset/video_mmlu.py | 2 + vlmeval/dataset/videomme.py | 1 + vlmeval/dataset/vl_rewardbench.py | 1 + vlmeval/dataset/wildvision.py | 1 + vlmeval/dataset/worldvqa.py | 1 + vlmeval/dataset/xstest.py | 1 + vlmeval/judge.py | 86 ++++++------------------------- vlmeval/tools.py | 2 +- 33 files changed, 71 insertions(+), 76 deletions(-) diff --git a/run.py b/run.py index 880092207..6a87978db 100644 --- a/run.py +++ b/run.py @@ -222,7 +222,7 @@ def build_model_from_base_url(args): return model_args -def get_judge_kwargs(dataset_name, dataset_type, args): +def get_judge_kwargs(dataset_name, dataset_type, args, dataset=None): """Determine judge kwargs based on dataset name and type. Uses run.py's logic as the canonical source for dataset-specific judge model @@ -257,7 +257,7 @@ def get_judge_kwargs(dataset_name, dataset_type, args): if args.judge is not None: judge_kwargs['model'] = args.judge else: - judge_model = get_default_judge_model(dataset_name, dataset_type, judge_kwargs) + judge_model = get_default_judge_model(dataset, dataset_type, judge_kwargs) if judge_model is not None: judge_kwargs['model'] = judge_model @@ -596,7 +596,7 @@ def run_local_mode(args): ) continue - judge_kwargs = get_judge_kwargs(dataset_name, dataset.TYPE, args) + judge_kwargs = get_judge_kwargs(dataset_name, dataset.TYPE, args, dataset=dataset) judge_model = judge_kwargs.get('model', '') if RANK == 0: @@ -982,7 +982,7 @@ def run_api_mode(args): logger.info(f'{ds_name} requires special handling, skipped in pipeline.') continue - judge_kwargs = get_judge_kwargs(ds_name, dataset.TYPE, args) + judge_kwargs = get_judge_kwargs(ds_name, dataset.TYPE, args, dataset=dataset) judge_model = judge_kwargs.get('model', '') logger.info(f'Judge kwargs: {judge_kwargs}') diff --git a/vlmeval/dataset/asclepius.py b/vlmeval/dataset/asclepius.py index b10344eb2..9e4d22259 100644 --- a/vlmeval/dataset/asclepius.py +++ b/vlmeval/dataset/asclepius.py @@ -24,6 +24,7 @@ class Asclepius(ImageVQADataset): """ TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' MODALITY = 'IMAGE' DATASET_URL = { diff --git a/vlmeval/dataset/chartmimic.py b/vlmeval/dataset/chartmimic.py index f02423122..182aaa18f 100644 --- a/vlmeval/dataset/chartmimic.py +++ b/vlmeval/dataset/chartmimic.py @@ -419,6 +419,7 @@ def _judge_one_item(item): class ChartMimic(ImageBaseDataset): TYPE = "VQA" + DEFAULT_JUDGE_MODEL = 'gpt-4o' # TODO: add dataset url and md5 DATASET_URL = { diff --git a/vlmeval/dataset/dude.py b/vlmeval/dataset/dude.py index 1fffb8f51..d8e82d790 100644 --- a/vlmeval/dataset/dude.py +++ b/vlmeval/dataset/dude.py @@ -51,6 +51,7 @@ def DUDE_acc(result_file): class DUDE(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = { 'DUDE': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE.tsv', diff --git a/vlmeval/dataset/dynamath.py b/vlmeval/dataset/dynamath.py index ba63df4db..9588cfb7c 100644 --- a/vlmeval/dataset/dynamath.py +++ b/vlmeval/dataset/dynamath.py @@ -112,6 +112,7 @@ def DynaMath_auxeval(model, line): class Dynamath(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv', 'DynaMath_noprompt': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv', diff --git a/vlmeval/dataset/flames.py b/vlmeval/dataset/flames.py index 955dbf4c0..950bdcbdd 100644 --- a/vlmeval/dataset/flames.py +++ b/vlmeval/dataset/flames.py @@ -55,6 +55,7 @@ def _flames_judge(model, dimension, question, response): class FlamesDataset(TextBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' MODALITY = 'TEXT' DATASET_URL = {'Flames': 'https://opencompass.openxlab.space/utils/VLMEval/Flames.tsv'} DATASET_MD5 = {'Flames': 'b567b6c96717c9e6c8bb9b458a85635a'} diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 17605d510..1f8f90fa6 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -1674,6 +1674,7 @@ def build_prompt(self, line): class VisualPuzzles(ImageMCQDataset): TYPE = "MCQ" + DEFAULT_JUDGE_MODEL = 'exact_matching' DATASET_URL = { 'VisualPuzzles': 'https://opencompass.openxlab.space/utils/VLMEval/VisualPuzzles.tsv' } @@ -1771,6 +1772,7 @@ def evaluate(self, eval_file, **judge_kwargs): class PuzzleVQA(ImageMCQDataset): TYPE = "MCQ" + DEFAULT_JUDGE_MODEL = 'exact_matching' DATASET_URL = { 'PuzzleVQA': 'https://opencompass.openxlab.space/utils/VLMEval/PuzzleVQA.tsv' } @@ -1842,6 +1844,7 @@ def evaluate(self, eval_file, **judge_kwargs): class VisuLogic(ImageMCQDataset): TYPE = "MCQ" + DEFAULT_JUDGE_MODEL = 'exact_matching' DATASET_URL = { 'VisuLogic': 'https://opencompass.openxlab.space/utils/VLMEval/VisuLogic.tsv' } @@ -3044,6 +3047,7 @@ def compute_iou(box1, box2): class CVQA(ImageMCQDataset): + DEFAULT_JUDGE_MODEL = 'gpt-4.1' @classmethod def supported_datasets(cls): diff --git a/vlmeval/dataset/image_mt.py b/vlmeval/dataset/image_mt.py index 205a7e572..bd33da8be 100644 --- a/vlmeval/dataset/image_mt.py +++ b/vlmeval/dataset/image_mt.py @@ -57,6 +57,7 @@ def build_prompt(self, line): class MMDUDataset(ImageMTDataset): + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'} DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'} DIMS = [ diff --git a/vlmeval/dataset/image_vqa.py b/vlmeval/dataset/image_vqa.py index 576fb50c4..25ef12458 100644 --- a/vlmeval/dataset/image_vqa.py +++ b/vlmeval/dataset/image_vqa.py @@ -241,6 +241,7 @@ def evaluate(self, eval_file, **judge_kwargs): class VTCBench(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' _DATASET_PATH = "https://huggingface.co/datasets/MLLM-CL/VTCBench" # Dataset URL mapping - points to different splits of HuggingFace dataset DATASET_URL = { @@ -571,6 +572,7 @@ def evaluate(self, eval_file, **judge_kwargs): class MathVista(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv' @@ -700,6 +702,7 @@ def MathVista_acc_verifier(result_file): class MathVerse(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'MathVerse_MINI': 'https://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa @@ -842,6 +845,7 @@ def evaluate(self, eval_file, **judge_kwargs): class MathVision(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv', @@ -984,6 +988,7 @@ def report_primary_metric(cls, metrics: dict | None) -> dict: class LENS(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'LENS-CN-QA': 'https://huggingface.co/datasets/songlier/LENS/resolve/main/LENS-CN-QA.tsv', @@ -1272,6 +1277,7 @@ def evaluate(self, eval_file, **judge_kwargs): class OlympiadBench(ImageBaseDataset): TYPE = 'VQA_ex_prompt' + DEFAULT_JUDGE_MODEL = {'olympiad_use_api_judger': 'gpt-4o-mini'} DATASET_URL = { 'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv', @@ -1702,6 +1708,7 @@ def evaluate(self, eval_file, **judge_kwargs): class LogicVista(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'LogicVista': 'https://opencompass.openxlab.space/utils/VLMEval/LogicVista.tsv' @@ -1891,6 +1898,7 @@ def evaluate(self, eval_file, **judge_kwargs): class LLaVABench(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4-turbo' DATASET_URL = { 'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv' @@ -1932,6 +1940,7 @@ def evaluate(self, eval_file, **judge_kwargs): class LLaVABench_KO(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-0806' DATASET_URL = { 'LLaVABench_KO': 'https://huggingface.co/datasets/NCSOFT/K-LLaVA-W/resolve/main/LLaVABench_KO.tsv' @@ -1974,6 +1983,7 @@ def evaluate(self, eval_file, **judge_kwargs): class VGRPBench(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = { 'VGRPBench': @@ -2036,6 +2046,7 @@ def evaluate(self, eval_file, **judge_kwargs): class MMVet(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4-turbo' DATASET_URL = { 'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv', @@ -3315,6 +3326,7 @@ def evaluate(self, eval_file, **judge_kwargs): class OCR_Reasoning(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'OCR_Reasoning': 'https://opencompass.openxlab.space/utils/VLMEval/OCR_Reasoning.tsv' @@ -3607,6 +3619,7 @@ def evaluate(self, eval_file, **judge_kwargs): class MMEReasoning(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = {'MME-Reasoning': 'https://huggingface.co/datasets/U4R/MME-Reasoning/blob/main/MME_Reasoning.tsv'} DATASET_MD = {'MME-Reasoning': 'b243f44778782d3821523689f6b40a1e'} @@ -3802,6 +3815,7 @@ def evaluate(self, eval_file, **judge_kwargs): class MMVMBench(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = { 'MMVMBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMVMBench.tsv' @@ -3984,6 +3998,7 @@ def evaluate(self, eval_file, **judge_kwargs): class AyaVisionBench(ImageVQADataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4.1' DATASET_URL = { "AyaVisionBench": "https://huggingface.co/datasets/timothycdc/" @@ -4063,6 +4078,7 @@ def evaluate(self, eval_file, **judge_kwargs): class MathCanvas(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4.1-2025-04-14' DATASET_URL = { "MathCanvas-Bench": "https://huggingface.co/datasets/shiwk24/MathCanvas-Bench/resolve/main/MathCanvas_Bench_VLMEvalKit.tsv" @@ -4161,6 +4177,7 @@ def evaluate(self, eval_file, **judge_kwargs): class MMReason(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4.1' mini_path = 'https://huggingface.co/datasets/HuanjinYao/MMReason/resolve/main/MMReason_testmini.tsv?download=true' DATASET_URL = { 'MMReason_testmini': mini_path, @@ -4255,6 +4272,7 @@ def evaluate(self, eval_file, **judge_kwargs): class CoreCognition(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4.1' DATASET_URL = { 'CoreCognition': 'https://huggingface.co/datasets/ZTWHHH/CoreCognition/resolve/main/CoreCognition.tsv' diff --git a/vlmeval/dataset/m4bench.py b/vlmeval/dataset/m4bench.py index c215640d1..142f99f35 100644 --- a/vlmeval/dataset/m4bench.py +++ b/vlmeval/dataset/m4bench.py @@ -16,6 +16,7 @@ class M4Bench(ImageBaseDataset): Dataset class for M4Bench, handling single and dual image inputs. """ TYPE = 'M4Bench' + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = { "State_Invariance": "https://huggingface.co/datasets/Anonymous8976/M4Bench/resolve/main/State_Invariance.tsv", # noqa: E501 diff --git a/vlmeval/dataset/macbench.py b/vlmeval/dataset/macbench.py index 7b5090e81..b1f36c46a 100644 --- a/vlmeval/dataset/macbench.py +++ b/vlmeval/dataset/macbench.py @@ -140,6 +140,7 @@ def macbench_auxeval(model, line): class MaCBench(ImageBaseDataset): TYPE = 'MaCBench' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = {'MaCBench': ''} DATASET_MD5 = {'MaCBench': '0e163396dd28886fd828e101f24afdf6'} diff --git a/vlmeval/dataset/miabench.py b/vlmeval/dataset/miabench.py index 57cff8e8a..f5a09f4e4 100644 --- a/vlmeval/dataset/miabench.py +++ b/vlmeval/dataset/miabench.py @@ -101,6 +101,7 @@ def get_score_dict(data, score_raw): class MIABench(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = { 'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv', diff --git a/vlmeval/dataset/mmalignbench.py b/vlmeval/dataset/mmalignbench.py index 909e7a207..905183b06 100644 --- a/vlmeval/dataset/mmalignbench.py +++ b/vlmeval/dataset/mmalignbench.py @@ -128,6 +128,7 @@ def MMAlignBench_auxeval(model, line): class MMAlignBench(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = {'MMAlignBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMAlignBench.tsv'} DATASET_MD5 = {'MMAlignBench': 'd00d8e61c99257cbaf76d8d5e926f01e'} diff --git a/vlmeval/dataset/mmbench_video.py b/vlmeval/dataset/mmbench_video.py index cada64bee..62881babc 100644 --- a/vlmeval/dataset/mmbench_video.py +++ b/vlmeval/dataset/mmbench_video.py @@ -68,6 +68,7 @@ class MMBenchVideo(VideoBaseDataset): """ TYPE = 'Video-VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4-turbo' def __init__(self, dataset='MMBench-Video', pack=False, nframe=0, fps=-1): super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps) diff --git a/vlmeval/dataset/mmifeval.py b/vlmeval/dataset/mmifeval.py index 84d8a0ca3..262c85ae0 100644 --- a/vlmeval/dataset/mmifeval.py +++ b/vlmeval/dataset/mmifeval.py @@ -332,6 +332,7 @@ def judge_one_item(item, retry=3): class MMIFEval(ImageBaseDataset): TYPE = "VQA" + DEFAULT_JUDGE_MODEL = 'gpt-4o' # TODO: add dataset url and md5 DATASET_URL = {"MM-IFEval": 'https://opencompass.openxlab.space/utils/VLMEval/MM-IFEval.tsv'} diff --git a/vlmeval/dataset/mmlongbench.py b/vlmeval/dataset/mmlongbench.py index a85f6049e..7553da580 100644 --- a/vlmeval/dataset/mmlongbench.py +++ b/vlmeval/dataset/mmlongbench.py @@ -435,6 +435,7 @@ def MMLongBench_acc(result_file): class MMLongBench(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = { 'MMLongBench_DOC': 'https://opencompass.openxlab.space/utils/VLMEval/MMLongBench_DOC.tsv', diff --git a/vlmeval/dataset/mmsafetybench.py b/vlmeval/dataset/mmsafetybench.py index 241c9f495..64094fe0b 100644 --- a/vlmeval/dataset/mmsafetybench.py +++ b/vlmeval/dataset/mmsafetybench.py @@ -95,6 +95,7 @@ class MMSafetyBenchDataset(ImageBaseDataset): - attack_rate = unsafe / total """ + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' TYPE = 'VQA' MODALITY = 'IMAGE' diff --git a/vlmeval/dataset/moat.py b/vlmeval/dataset/moat.py index 39e66aa77..14d908357 100644 --- a/vlmeval/dataset/moat.py +++ b/vlmeval/dataset/moat.py @@ -48,6 +48,7 @@ def str2json(s: str): class MOAT(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'MOAT': "https://huggingface.co/datasets/waltsun/MOAT/resolve/main/MOAT.tsv", } diff --git a/vlmeval/dataset/moviechat1k.py b/vlmeval/dataset/moviechat1k.py index 1c684b419..8accdb0ae 100644 --- a/vlmeval/dataset/moviechat1k.py +++ b/vlmeval/dataset/moviechat1k.py @@ -22,6 +22,7 @@ class MovieChat1k(VideoBaseDataset): MD5 = '7c0aa7e10de1cddb37af42b4abc9a2dd' TYPE = 'Video-VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' def __init__(self, dataset='MovieChat1k', pack=False, nframe=0, fps=-1, subset='all', limit=1.0): super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps) diff --git a/vlmeval/dataset/mssbench.py b/vlmeval/dataset/mssbench.py index fda58b8a6..220f16262 100644 --- a/vlmeval/dataset/mssbench.py +++ b/vlmeval/dataset/mssbench.py @@ -59,6 +59,7 @@ def _mss_behavior_judge(model, pred): class MSSBenchDataset(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' MODALITY = 'IMAGE' DATASET_URL = {'MSSBench': 'https://opencompass.openxlab.space/utils/VLMEval/MSSBench.tsv'} DATASET_MD5 = {'MSSBench': 'f5398724ede5cb8d1c725fc01c96241b'} diff --git a/vlmeval/dataset/scidocbench.py b/vlmeval/dataset/scidocbench.py index 843547b67..142fae051 100644 --- a/vlmeval/dataset/scidocbench.py +++ b/vlmeval/dataset/scidocbench.py @@ -371,6 +371,7 @@ def _eval_one_item(item_json): class SciDocBench(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'SciDocBench': 'https://opencompass.openxlab.space/utils/VLMEvalKit/SciDocBench.tsv', diff --git a/vlmeval/dataset/siuo.py b/vlmeval/dataset/siuo.py index 4ea48c3da..6bacf0ebb 100644 --- a/vlmeval/dataset/siuo.py +++ b/vlmeval/dataset/siuo.py @@ -20,6 +20,7 @@ class SIUODataset(ImageBaseDataset): """ TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' MODALITY = 'IMAGE' SUB_DATASETS = ['SIUO_GEN', 'SIUO_MCQ'] diff --git a/vlmeval/dataset/siuo_gen.py b/vlmeval/dataset/siuo_gen.py index 96f9b2db0..346cf546c 100644 --- a/vlmeval/dataset/siuo_gen.py +++ b/vlmeval/dataset/siuo_gen.py @@ -84,6 +84,7 @@ def _siuo_eff_judge(model, q, pred): class SIUOGenDataset(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' MODALITY = 'IMAGE' DATASET_URL = {'SIUO_GEN': 'https://opencompass.openxlab.space/utils/VLMEval/SIUO_GEN.tsv'} DATASET_MD5 = {'SIUO_GEN': '74a41eadede71e932cce9004442cf1a7'} diff --git a/vlmeval/dataset/slidevqa.py b/vlmeval/dataset/slidevqa.py index 6dd79c019..b5a8422f3 100644 --- a/vlmeval/dataset/slidevqa.py +++ b/vlmeval/dataset/slidevqa.py @@ -65,6 +65,7 @@ def SlideVQA_acc(result_file): class SlideVQA(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = { 'SLIDEVQA_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA_MINI.tsv', diff --git a/vlmeval/dataset/vdc.py b/vlmeval/dataset/vdc.py index 72d337130..11f2e7459 100644 --- a/vlmeval/dataset/vdc.py +++ b/vlmeval/dataset/vdc.py @@ -128,6 +128,7 @@ class VDC(VideoBaseDataset): MD5 = '' TYPE = 'Video-VQA' + DEFAULT_JUDGE_MODEL = 'llama31-8b' def __init__(self, dataset='VDC', pack=False, nframe=0, fps=-1, subset='all', limit=1.0): super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps) diff --git a/vlmeval/dataset/video_mmlu.py b/vlmeval/dataset/video_mmlu.py index 4b2928e98..fd7d0f44c 100644 --- a/vlmeval/dataset/video_mmlu.py +++ b/vlmeval/dataset/video_mmlu.py @@ -53,6 +53,7 @@ class Video_MMLU_CAP(VideoBaseDataset): MD5 = '' TYPE = 'Video-VQA' + DEFAULT_JUDGE_MODEL = 'qwen-72b' MODALITY = 'VIDEO' def __init__(self, dataset='Video_MMLU_CAP', pack=False, nframe=0, fps=-1, subset='all', limit=1.0): @@ -369,6 +370,7 @@ class Video_MMLU_QA(VideoBaseDataset): MD5 = '' TYPE = 'Video-VQA' + DEFAULT_JUDGE_MODEL = 'qwen-72b' MODALITY = 'VIDEO' def __init__(self, dataset='Video_MMLU_QA', pack=False, nframe=0, fps=-1, subset='all', limit=1.0): diff --git a/vlmeval/dataset/videomme.py b/vlmeval/dataset/videomme.py index 998c23e82..464fccae3 100644 --- a/vlmeval/dataset/videomme.py +++ b/vlmeval/dataset/videomme.py @@ -58,6 +58,7 @@ class VideoMME(VideoBaseDataset): """ TYPE = 'Video-MCQ' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DEFAULT_JUDGE = ['chatgpt-0125', 'gpt-4-0125'] def __init__(self, dataset='Video-MME', use_subtitle=False, nframe=0, fps=-1): diff --git a/vlmeval/dataset/vl_rewardbench.py b/vlmeval/dataset/vl_rewardbench.py index a6b872662..8692c5838 100644 --- a/vlmeval/dataset/vl_rewardbench.py +++ b/vlmeval/dataset/vl_rewardbench.py @@ -74,6 +74,7 @@ def VLRewardBench_eval_answer(model, line): class VLRewardBench(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' DATASET_URL = { 'VL-RewardBench': 'https://huggingface.co/datasets/MMInstruction/VL-RewardBench/resolve/main/vl_rewardbench.tsv' } diff --git a/vlmeval/dataset/wildvision.py b/vlmeval/dataset/wildvision.py index a423aefd2..3c4b36dd6 100644 --- a/vlmeval/dataset/wildvision.py +++ b/vlmeval/dataset/wildvision.py @@ -95,6 +95,7 @@ def WildVision_auxeval(model, line): class WildVision(ImageBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o' DATASET_URL = { 'WildVision': 'https://opencompass.openxlab.space/utils/VLMEval/WildVision.tsv' } diff --git a/vlmeval/dataset/worldvqa.py b/vlmeval/dataset/worldvqa.py index 29c2929ad..cc198a045 100644 --- a/vlmeval/dataset/worldvqa.py +++ b/vlmeval/dataset/worldvqa.py @@ -234,6 +234,7 @@ def auxeval(judge_model: Any, line: pd.Series, **kwargs: Any) -> Dict[str, Any]: # -------------------- dataset -------------------- class WorldVQA(ImageBaseDataset): TYPE = "VQA" + DEFAULT_JUDGE_MODEL = 'gpt-4o-1120' DATASET_URL = { # 改成你自己的 tsv 路径或挂载路径 "WorldVQA": "https://huggingface.co/datasets/moonshotai/WorldVQA/blob/main/WorldVQA.tsv", diff --git a/vlmeval/dataset/xstest.py b/vlmeval/dataset/xstest.py index d9c9fefce..f981037f3 100644 --- a/vlmeval/dataset/xstest.py +++ b/vlmeval/dataset/xstest.py @@ -61,6 +61,7 @@ def _xstest_judge_llm(model, q, pred): class XSTestDataset(TextBaseDataset): TYPE = 'VQA' + DEFAULT_JUDGE_MODEL = 'gpt-4o-mini' MODALITY = 'TEXT' DATASET_URL = {'XSTest': 'https://opencompass.openxlab.space/utils/VLMEval/XSTest.tsv'} DATASET_MD5 = {'XSTest': 'd33f7ff1bc362c2b8d8deb8021959f3c'} diff --git a/vlmeval/judge.py b/vlmeval/judge.py index 5e648b580..2eba9e5a5 100644 --- a/vlmeval/judge.py +++ b/vlmeval/judge.py @@ -1,77 +1,21 @@ -def listinstr(lst, s): - return any(item in s for item in lst) +DEFAULT_TYPE_JUDGE_MODELS = { + 'MCQ': 'gpt-4o-mini', + 'Y/N': 'gpt-4o-mini', + 'MCQ_MMMU_Pro': 'gpt-4o-mini', +} -def get_default_judge_model(dataset_name, dataset_type, judge_kwargs=None): +def get_default_judge_model(dataset, dataset_type=None, judge_kwargs=None): """Return the default judge model for a dataset, or None if not specified.""" judge_kwargs = judge_kwargs or {} - - if dataset_type in ['MCQ', 'Y/N', 'MCQ_MMMU_Pro'] or listinstr( - ['moviechat1k', 'mme-reasoning'], dataset_name.lower() - ): - if listinstr(['WeMath', 'MME-Reasoning'], dataset_name): - return 'gpt-4o-mini' - if listinstr(['VisualPuzzles'], dataset_name): - return 'exact_matching' - if listinstr(['PuzzleVQA'], dataset_name): - return 'exact_matching' - if listinstr(['VisuLogic'], dataset_name): - return 'exact_matching' - return 'gpt-4o-mini' - - if listinstr(['MMVet', 'LLaVABench', 'MMBench_Video', 'MMBench-Video'], dataset_name): - if listinstr(['LLaVABench_KO'], dataset_name): - return 'gpt-4o-0806' - return 'gpt-4-turbo' - - if listinstr(['VGRPBench'], dataset_name): - return 'gpt-4o' - - if listinstr( - ['MathVista', 'MathVerse', 'MathVision', 'LENS', 'DynaMath', 'VL-RewardBench', - 'LogicVista', 'MOAT', 'OCR_Reasoning', 'VTCBench', 'Asclepius', - 'MMSafetyBench', 'MSSBench', 'SIUO', 'SIUO_GEN', 'XSTest', 'Flames'], dataset_name - ): - return 'gpt-4o-mini' - - if listinstr(['OlympiadBench'], dataset_name): - if judge_kwargs.get('olympiad_use_api_judger', False): - return 'gpt-4o-mini' + judge_model = getattr(dataset, 'DEFAULT_JUDGE_MODEL', None) + if isinstance(judge_model, dict): + for judge_arg, model in judge_model.items(): + if judge_kwargs.get(judge_arg, False): + return model return None + if judge_model is not None: + return judge_model - if listinstr( - ['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', - 'WildVision', 'MMAlignBench', 'MM-IFEval'], dataset_name - ): - return 'gpt-4o' - - if listinstr(['ChartMimic'], dataset_name): - return 'gpt-4o' - if listinstr(['VDC'], dataset_name): - return 'llama31-8b' - if listinstr(['Video_MMLU_QA', 'Video_MMLU_CAP'], dataset_name): - return 'qwen-72b' - if listinstr(['MMVMBench'], dataset_name): - return 'gpt-4o' - if listinstr(['CVQA_EN', 'CVQA_LOC'], dataset_name): - return 'gpt-4.1' - if listinstr(['M4Bench'], dataset_name): - return 'gpt-4o' - if listinstr(['AyaVisionBench'], dataset_name): - return 'gpt-4.1' - if listinstr(['MathCanvas'], dataset_name): - return 'gpt-4.1-2025-04-14' - if listinstr(['MMReason'], dataset_name): - return 'gpt-4.1' - if listinstr(['CoreCognition'], dataset_name): - return 'gpt-4.1' - if listinstr(['WorldVQA'], dataset_name): - return 'gpt-4o-1120' - if listinstr(['Video-MME'], dataset_name): - return 'gpt-4o-mini' - if listinstr(['MaCBench'], dataset_name): - return 'gpt-4o-mini' - if listinstr(['SciDocBench'], dataset_name): - return 'gpt-4o-mini' - - return None + dataset_type = dataset_type or getattr(dataset, 'TYPE', None) + return DEFAULT_TYPE_JUDGE_MODELS.get(dataset_type) diff --git a/vlmeval/tools.py b/vlmeval/tools.py index ca831dba3..c005bf5ba 100644 --- a/vlmeval/tools.py +++ b/vlmeval/tools.py @@ -447,7 +447,7 @@ def EVAL(dataset_name, data_file, **kwargs): # Set the judge kwargs first before evaluation or dumping judge_kwargs = {'nproc': 4, 'verbose': True} if 'model' not in kwargs: - judge_model = get_default_judge_model(dataset_name, dataset.TYPE, judge_kwargs) + judge_model = get_default_judge_model(dataset, dataset.TYPE, judge_kwargs) if judge_model is not None: judge_kwargs['model'] = judge_model else: