From b891c47323c5b05abde7822b332fab566cc26488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=8C=E8=95=8A?= Date: Tue, 9 Jun 2026 14:42:29 +0800 Subject: [PATCH] feat: add support for flyai-vl --- vlmeval/api/__init__.py | 4 +- vlmeval/api/flyai_vl_api.py | 280 ++++++++++++++++++++++++++++++++++++ vlmeval/config.py | 13 ++ 3 files changed, 295 insertions(+), 2 deletions(-) create mode 100644 vlmeval/api/flyai_vl_api.py diff --git a/vlmeval/api/__init__.py b/vlmeval/api/__init__.py index 38a5f1d1e..0c844e2c8 100644 --- a/vlmeval/api/__init__.py +++ b/vlmeval/api/__init__.py @@ -32,7 +32,7 @@ from .telemm_thinking import TeleMM2Thinking_API from .together import TogetherAPI from .video_chat_online_v2 import VideoChatOnlineV2API - +from .flyai_vl_api import FlyAIVLAPI __all__ = [ 'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V', 'Gemini', 'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 'Reka', 'GLMVisionAPI', 'CWWrapper', @@ -42,5 +42,5 @@ 'TaichuVLAPI', 'TaichuVLRAPI', 'DoubaoVL', "MUGUAPI", 'KimiVLAPIWrapper', 'KimiVLAPI', 'RBdashMMChat3_API', 'RBdashChat3_5_API', 'RBdashMMChat3_78B_API', 'RBdashMMChat3_5_38B_API', 'VideoChatOnlineV2API', 'TeleMM2_API', 'TeleMM2Thinking_API', 'TogetherAPI', 'GCPVertexAPI', - 'BedrockAPI', 'SenseChatVisionV2API', 'MiniMaxAPI', 'LiteLLMAPI', + 'BedrockAPI', 'SenseChatVisionV2API', 'MiniMaxAPI', 'LiteLLMAPI', 'FlyAIVLAPI' ] diff --git a/vlmeval/api/flyai_vl_api.py b/vlmeval/api/flyai_vl_api.py new file mode 100644 index 000000000..bcbae1bb7 --- /dev/null +++ b/vlmeval/api/flyai_vl_api.py @@ -0,0 +1,280 @@ +from __future__ import annotations +import os +import sys +import warnings + +import numpy as np + +from vlmeval.api.base import BaseAPI +from vlmeval.smp import get_logger, proxy_set +import time +from http import HTTPStatus +import requests +import base64 +import json +import socket + +from requests.adapters import HTTPAdapter +from urllib3.connection import HTTPConnection +from urllib3.poolmanager 
import PoolManager
+import asyncio
+CONNECT_TIMEOUT_SEC = 10
+# Upstream Python inference can run for a long time, so allow 20 minutes.
+READ_TIMEOUT_SEC = 20 * 60
+
+# Match curl's defaults: start sending keep-alive probes after 60s of idle time.
+TCP_KEEPIDLE_SEC = 60
+TCP_KEEPINTERVAL_SEC = 15
+TCP_KEEPCOUNT = 4
+
+# Budgets (seconds) used by FlyAIVLWrapper.generate_inner for the
+# submit request, each status query, the pause between queries and the
+# overall polling deadline.
+SUBMIT_TIMEOUT_SEC = 50
+QUERY_TIMEOUT_SEC = 200
+POLL_INTERVAL_SEC = 15
+POLL_DEADLINE_SEC = 3600
+
+# Service address used when the caller does not pass ``api_base``.
+DEFAULT_API_BASE = "https://fliggy-evaluate-platform.alibaba-inc.com"
+# Environment variable consulted when the caller does not pass ``key``.
+API_KEY_ENV_VAR = "FLYAI_API_KEY"
+
+logger = get_logger(__name__)
+
+
+def _build_keepalive_socket_options():
+    """Return urllib3 socket options that turn on TCP keep-alive.
+
+    Starts from urllib3's default socket options, enables ``SO_KEEPALIVE``
+    and adds whichever idle / interval / count tunables the running
+    platform's ``socket`` module exposes.
+    """
+    opts = list(HTTPConnection.default_socket_options) + [
+        (socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1),
+    ]
+    # Linux: TCP_KEEPIDLE/INTVL/CNT all live under IPPROTO_TCP.
+    if hasattr(socket, "TCP_KEEPIDLE"):
+        opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, TCP_KEEPIDLE_SEC))
+    # macOS: TCP_KEEPALIVE carries the idle time in seconds.
+    elif hasattr(socket, "TCP_KEEPALIVE"):
+        opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPALIVE, TCP_KEEPIDLE_SEC))
+    if hasattr(socket, "TCP_KEEPINTVL"):
+        opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, TCP_KEEPINTERVAL_SEC))
+    if hasattr(socket, "TCP_KEEPCNT"):
+        opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPCNT, TCP_KEEPCOUNT))
+    return opts
+
+
+class KeepAliveAdapter(HTTPAdapter):
+    """``requests`` transport adapter whose pooled sockets use TCP keep-alive."""
+
+    def init_poolmanager(self, connections, maxsize, block=False, **kwargs):
+        """Create the pool manager with the keep-alive socket options injected."""
+        kwargs["socket_options"] = _build_keepalive_socket_options()
+        # Delegate to HTTPAdapter so its own pool bookkeeping stays intact.
+        super().init_poolmanager(connections, maxsize, block=block, **kwargs)
+
+
+def ensure_image_url(image: str) -> str:
+    """Return *image* as a URL.
+
+    Strings that already look like an http(s)/file/data URL are returned
+    unchanged; an existing local path is prefixed with ``file://``.
+
+    Raises:
+        ValueError: if *image* is neither a recognised URL nor an existing path.
+    """
+    # 'data:image' (no trailing ';') so that 'data:image/png;base64,...' matches.
+    prefixes = ('http://', 'https://', 'file://', 'data:image')
+    if image.startswith(prefixes):
+        return image
+    if os.path.exists(image):
+        return 'file://' + image
+    raise ValueError(f'Invalid image: {image}')
+
+
+def ensure_video_url(video: str) -> str:
+    """Return *video* as a URL, using the same rules as ``ensure_image_url``.
+
+    Raises:
+        ValueError: if *video* is neither a recognised URL nor an existing path.
+    """
+    prefixes = ('http://', 'https://', 'file://', 'data:video')
+    if video.startswith(prefixes):
+        return video
+    if os.path.exists(video):
+        return 'file://' + video
+    raise ValueError(f'Invalid video: {video}')
+
+
+class FlyAIVLWrapper(BaseAPI):
+    """Client for the FlyAI-VL service: submit a task, then poll for its result."""
+
+    is_api: bool = True
+    # Frame counts for short videos are rounded down to a multiple of this.
+    # NOTE(review): 2 is an assumption; the original code read
+    # self.FRAME_FACTOR without ever defining it. Confirm the right value.
+    FRAME_FACTOR: int = 2
+
+    # File extension -> MIME type used when inlining local images.
+    _MIME_BY_EXT = {
+        'jpg': 'image/jpeg',
+        'jpeg': 'image/jpeg',
+        'png': 'image/png',
+        'gif': 'image/gif',
+        'bmp': 'image/bmp',
+        'webp': 'image/webp',
+        'svg': 'image/svg+xml',
+    }
+
+    def __init__(
+        self,
+        model: str = 'flyai-vl',
+        key: str | None = None,
+        min_pixels: int | None = None,
+        max_pixels: int | None = None,
+        total_pixels: int | None = None,
+        max_new_tokens=32768,
+        temperature=0.01,
+        top_p=0.001,
+        top_k=1,
+        presence_penalty=0.0,
+        retry: int = 5,
+        use_custom_prompt: bool = False,
+        use_vllm: bool = True,
+        api_base: str | None = None,
+        **kwargs,
+    ):
+        """Store request defaults and initialise the base API class.
+
+        Args:
+            model: Stored on ``self.model``.
+            key: Bearer token; falls back to the ``FLYAI_API_KEY`` environment
+                variable when omitted.
+            min_pixels, max_pixels, total_pixels: Optional per-item pixel
+                limits attached to image/video content items.
+            max_new_tokens, temperature, top_p, top_k, presence_penalty:
+                Default generation arguments sent with every request.
+            retry: Forwarded to the base class.
+            use_custom_prompt: Forwarded to the base class.
+            use_vllm: Accepted for configuration compatibility; not used here.
+            api_base: Service root URL; defaults to ``DEFAULT_API_BASE``.
+            **kwargs: ``fps`` / ``nframe`` are consumed as video sampling
+                defaults; everything else is forwarded to the base class.
+        """
+        self.model = model
+        # Never embed credentials in source: take them from the caller or env.
+        self.key = key if key is not None else os.environ.get(API_KEY_ENV_VAR)
+        self.api_base = (api_base or DEFAULT_API_BASE).rstrip('/')
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.total_pixels = total_pixels
+        # _prepare_content reads these for video inputs; they were never set
+        # before, which made every single-file video input raise AttributeError.
+        self.fps = kwargs.pop('fps', None)
+        self.nframe = kwargs.pop('nframe', None)
+        self.generate_kwargs = dict(
+            max_new_tokens=max_new_tokens,
+            top_p=top_p,
+            top_k=top_k,
+            temperature=temperature,
+            presence_penalty=presence_penalty,
+        )
+
+        # ``retry`` used to be swallowed by this signature and never reached
+        # the base class, so the configured value had no effect.
+        super().__init__(retry=retry, use_custom_prompt=use_custom_prompt, **kwargs)
+
+    def _encode_local_image(self, location: str) -> str:
+        """Return a base64 ``data:`` URL for a local image.
+
+        *location* is returned unchanged when it is a remote/data URL, when
+        the file does not exist, or when it cannot be read.
+        """
+        if location.startswith(('http://', 'https://', 'data:')):
+            # Not a local file: nothing to inline.
+            return location
+
+        file_path = location[len('file://'):] if location.startswith('file://') else location
+        if not os.path.exists(file_path):
+            logger.warning(f'File not found: {file_path}')
+            return location
+
+        ext = os.path.splitext(file_path)[1].lstrip('.').lower()
+        mime_type = self._MIME_BY_EXT.get(ext)
+        if mime_type is None:
+            # Unknown extension: ask the stdlib instead of failing the encode.
+            import mimetypes
+            mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'
+
+        try:
+            with open(file_path, 'rb') as f:
+                raw_bytes = f.read()
+        except OSError as e:
+            logger.error(f'Failed to encode {file_path}: {e}')
+            return location
+
+        b64_str = base64.b64encode(raw_bytes).decode('ascii')
+        return f"data:{mime_type};base64,{b64_str}"
+
+    def _convert_image_to_base64(self, data):
+        """Return a copy of *data* with local ``'image'`` values inlined.
+
+        Walks nested dicts and lists; every string stored under the key
+        ``'image'`` goes through ``_encode_local_image``. Other values are
+        returned as they are.
+        """
+        if isinstance(data, dict):
+            return {
+                key: (
+                    self._encode_local_image(value)
+                    if key == 'image' and isinstance(value, str)
+                    else self._convert_image_to_base64(value)
+                )
+                for key, value in data.items()
+            }
+        if isinstance(data, list):
+            return [self._convert_image_to_base64(item) for item in data]
+        return data
+
+    def _prepare_content(self, inputs: list[dict[str, str]], dataset: str | None = None) -> list[dict[str, str]]:
+        """Translate evaluation inputs into the service's content items.
+
+        Args:
+            inputs: Items with a ``'type'`` of ``image``, ``video``, ``audio``
+                or ``text`` and the payload under ``'value'``.
+            dataset: Dataset name; ``'OCRBench'`` forces a fixed ``min_pixels``.
+
+        Raises:
+            ValueError: for an unknown item type or an invalid image/video path.
+        """
+        content = []
+        for s in inputs:
+            if s['type'] == 'image':
+                item = {'type': 'image', 'image': ensure_image_url(s['value'])}
+                if dataset == 'OCRBench':
+                    item['min_pixels'] = 10 * 10 * 32 * 32
+                    warnings.warn(f"OCRBench dataset uses custom min_pixels={item['min_pixels']}")
+                    if self.max_pixels is not None:
+                        item['max_pixels'] = self.max_pixels
+                else:
+                    if self.min_pixels is not None:
+                        item['min_pixels'] = self.min_pixels
+                    if self.max_pixels is not None:
+                        item['max_pixels'] = self.max_pixels
+                # NOTE(review): nesting reconstructed from a whitespace-collapsed
+                # patch; total_pixels is assumed to apply to both branches.
+                if self.total_pixels is not None:
+                    item['total_pixels'] = self.total_pixels
+                # Per-item settings override the instance-wide defaults.
+                for key in ['min_pixels', 'max_pixels', 'total_pixels', 'resized_height', 'resized_width']:
+                    if key in s and s[key] is not None:
+                        item[key] = s[key]
+            elif s['type'] == 'video':
+                value = s['value']
+                if isinstance(value, list):
+                    # A list is treated as pre-extracted frames.
+                    item = {
+                        'type': 'video',
+                        'video': [ensure_image_url(v) for v in value],
+                    }
+                else:
+                    item = {'type': 'video', 'video': ensure_video_url(value)}
+                if self.min_pixels is not None:
+                    item['min_pixels'] = self.min_pixels
+                if self.max_pixels is not None:
+                    item['max_pixels'] = self.max_pixels
+                if self.total_pixels is not None:
+                    item['total_pixels'] = self.total_pixels
+                for key in ['resized_height', 'resized_width', 'fps', 'nframes', 'sample_fps']:
+                    if key in s and s[key] is not None:
+                        item[key] = s[key]
+                if not isinstance(value, list):
+                    if self.fps is not None and 'fps' not in item:
+                        item['fps'] = self.fps
+                    elif self.nframe is not None and 'nframes' not in item:
+                        import cv2
+                        video = cv2.VideoCapture(s['value'])
+                        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+                        video.release()
+                        if frame_count < self.nframe:
+                            # Fewer frames than requested: use what exists,
+                            # rounded down to a multiple of FRAME_FACTOR.
+                            new_frame_count = frame_count // self.FRAME_FACTOR * self.FRAME_FACTOR
+                            logger.info(f"use {new_frame_count} for {s['value']}")
+                            item['nframes'] = new_frame_count
+                        else:
+                            item['nframes'] = self.nframe
+            elif s['type'] == 'audio':
+                item = {'type': 'audio', 'audio': s['value']}
+            elif s['type'] == 'text':
+                item = {'type': 'text', 'text': s['value']}
+            else:
+                raise ValueError(f"Invalid message type: {s['type']}, {s}")
+            content.append(item)
+        return content
+
+    def generate_inner(self, inputs, **kwargs) -> tuple[int, str, str]:
+        """Submit one request and poll until the task finishes.
+
+        Args:
+            inputs: Items accepted by ``_prepare_content``.
+            **kwargs: ``dataset`` is extracted; the rest overrides the default
+                generation arguments for this call.
+
+        Returns:
+            ``(0, answer, 'Succeeded! ')`` on success, ``(-1, '', '')`` when no
+            API key is configured, the request fails, the task reports
+            ``FAILED`` or polling exceeds ``POLL_DEADLINE_SEC``.
+        """
+        dataset = kwargs.pop('dataset', None)
+
+        messages = []
+        if self.system_prompt is not None:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+        messages.append(
+            {'role': 'user', 'content': self._prepare_content(inputs, dataset=dataset)}
+        )
+        generation_kwargs = {**self.generate_kwargs, **kwargs}
+        logger.debug(f'generation kwargs: {generation_kwargs}')
+
+        if not self.key:
+            logger.error(f'No API key configured: pass key=... or set {API_KEY_ENV_VAR}.')
+            return -1, '', ''
+
+        request_data = {
+            # NOTE(review): hard-coded instead of self.model (default 'flyai-vl');
+            # left as-is because the name the service expects cannot be confirmed here.
+            "model": "flyai_vl",
+            "messages": messages,
+            "dataset": dataset,
+            "kwargs": generation_kwargs,
+        }
+        request_data_str = json.dumps(self._convert_image_to_base64(request_data))
+
+        headers = {
+            "Content-Type": "application/json; charset=UTF-8",
+            "Accept": "application/json",
+            "Authorization": f"Bearer {self.key}",
+        }
+
+        try:
+            # The context manager closes the session (and its sockets) on every path.
+            with requests.Session() as session:
+                adapter = KeepAliveAdapter()
+                session.mount("https://", adapter)
+                session.mount("http://", adapter)
+
+                start = time.time()
+
+                # Submit the task.
+                submit_url = f"{self.api_base}/api/tpp/python/invoke"
+                submit_resp = session.post(
+                    submit_url, data=request_data_str, headers=headers, timeout=SUBMIT_TIMEOUT_SEC
+                )
+                submit_resp.raise_for_status()
+                task_id = submit_resp.json()['data']['task_id']
+                logger.info(f"task_id: {task_id}")
+
+                # Poll until the task reports a final state or the deadline passes.
+                query_url = f"{self.api_base}/api/tpp/python/query_task"
+                while (time.time() - start) < POLL_DEADLINE_SEC:
+                    time.sleep(POLL_INTERVAL_SEC)
+                    resp = session.get(
+                        query_url,
+                        params={'taskId': task_id},  # let requests URL-encode the id
+                        headers=headers,
+                        timeout=QUERY_TIMEOUT_SEC,
+                    )
+                    resp.raise_for_status()
+                    data = resp.json().get('data', {})
+                    state = data['taskInfo']['taskState']
+                    if state == 'SUCCESS':
+                        answer = data['result']['response']
+                        return 0, answer, 'Succeeded! '
+                    if state == 'FAILED':
+                        raise RuntimeError("Task FAILED")
+
+                raise TimeoutError("轮询超时")
+
+        except Exception as err:
+            # Any failure is reported through the return code, not raised.
+            logger.error(f'{type(err)}: {err}')
+            logger.error(f'The input messages are {inputs}.')
+            return -1, '', ''
+
+
+class FlyAIVLAPI(FlyAIVLWrapper):
+    """Entry point referenced by the ``flyai-vl`` entry in ``vlmeval/config.py``."""
+
+    def generate(self, message, dataset=None):
+        """Log the request at debug level and delegate to the parent ``generate``."""
+        logger.debug(f'FlyAIVLAPI.generate dataset={dataset} message={message}')
+        return super().generate(message, dataset=dataset)
diff --git a/vlmeval/config.py b/vlmeval/config.py index 448771039..8e835716b 100644 --- a/vlmeval/config.py +++ b/vlmeval/config.py @@ -918,6 +918,19 @@ frequency_penalty=0.0, retry=3, ), + "flyai-vl": + partial( + api.FlyAIVLAPI, + model="flyai-vl", + use_custom_prompt=False, + use_vllm=True, + temperature=0.1, + top_p=0.95, + top_k=20, + presence_penalty=1.5, + max_new_tokens=32768, + retry=10, + ) } api_models['gpt-5'] = cp.deepcopy(api_models['gpt-5-2025-08-07'])