diff --git a/browseruse_bench/data/LexBench-Browser/task.jsonl b/browseruse_bench/data/LexBench-Browser/task.jsonl index 4e9fbc0..17c30da 100644 --- a/browseruse_bench/data/LexBench-Browser/task.jsonl +++ b/browseruse_bench/data/LexBench-Browser/task.jsonl @@ -58,7 +58,7 @@ {"id": 103, "query": "Visit Zotero's website, navigate to the documentation section, and find the guide on \"Adding items to your library\". Summarize the main methods for adding references (at least 4 methods) with brief descriptions for each.", "task_type": "T1", "domain": "productivity_tools", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.zotero.org", "reasoning_type": "single_step", "language": "en", "website_region": "en", "reference_answer": {"steps": ["Visit Zotero website www.zotero.org", "Navigate to Documentation or Support section", "Search for or browse to find Adding items to your library guide", "Click on the guide to open it", "Wait for guide page to load", "Read through the guide content", "Identify at least 4 main methods for adding references", "For each method, note the brief description", "Summarize the 4+ methods with descriptions", "Return the summarized list"], "key_points": ["Zotero官网可以不登录查看文档", "需要导航到Documentation或Support部分", "需要找到\"Adding items to your library\"或类似主题的指南", "需要总结至少4种添加文献的方法", "每种方法需要提供简要描述"], "common_mistakes": ["文档位置:Documentation部分可能在网站顶部导航栏或Footer中,需要准确定位", "指南查找:\"Adding items\"指南可能在Getting Started、How-to Guides或Quick Start等子部分", "方法识别:常见的添加方法包括:浏览器插件(Browser Connector)、手动输入、拖放PDF、通过DOI/ISBN添加、从数据库导入、从其他文献管理软件导入等", "方法数量:任务要求至少4种方法,需要确保数量充足", "描述完整性:每种方法需要提供简要说明,包括操作步骤或使用场景", "内容总结:需要用自己的语言总结,不能直接复制原文", "方法区分:需要清晰区分不同方法,避免重复或混淆"], "scoring": {"total": 100, "items": [{"name": "成功访问Zotero官网", "score": 10, "description": "页面正常加载"}, {"name": "找到Documentation部分", "score": 15, "description": "定位到文档或帮助部分"}, {"name": "定位到Adding items指南", "score": 20, "description": "找到正确的主题指南"}, {"name": "总结至少4种方法", "score": 30, "description": "方法数量不少于4种"}, {"name": "每种方法描述清晰", "score": 25, "description": "描述简洁明了,包含关键信息"}]}}, "score_threshold": 65, "robustness_tags": ["cookie_consent"]} {"id": 105, "query": "Search for articles about \"indie game development\" on Kotaku, find the most recent feature article, and tell me the article title, author, publication date, and a summary of the key points discussed (3-5 points).", "task_type": "T1", "domain": "gaming", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "kotaku.com", "reasoning_type": "single_step", "language": "en", "website_region": "en", "reference_answer": {"steps": ["Visit Kotaku website kotaku.com", "Enter indie game development in the search box", "Click search or press Enter", "Wait for search results to load", "Browse results to find the most recent feature article", "Click on the article to open it", "Wait for article page to load", "Record the article title", "Record the author name", "Record the publication date", "Read the article carefully", "Identify and summarize 3-5 key points discussed", "Return all collected information"], "key_points": ["Kotaku可能需要通过Cloudflare验证才能访问", "需要搜索\"indie game development\"并浏览搜索结果", "需要识别最新的特色文章(feature article),区别于简短新闻或评测", "需要记录4项基本信息:文章标题、作者、发布日期、关键观点摘要", "需要总结3-5个关键观点,观点应准确反映文章主旨"], "common_mistakes": ["Cloudflare验证:页面可能触发人机验证,需要等待验证完成才能继续", "搜索关键词:需要完整输入\"indie game development\",注意拼写", "文章类型识别:feature article通常是深度报道或专题文章,区别于简短新闻、快讯或游戏评测", "最新文章判断:需要按发布日期排序,选择最新的feature article", "文章标题提取:可能在页面顶部或搜索结果中,需要完整提取", "作者提取:通常在文章标题下方或文章末尾,格式可能是\"By [Author Name]\"", "发布日期格式:可能显示为\"Nov 27, 2025\"或\"2025-11-27\",需要统一格式", "关键观点总结:需要阅读完整文章内容,提取3-5个主要观点或讨论的话题", "观点质量:观点应简洁明了,准确反映文章讨论的核心内容", "信息完整性:必须包含文章标题、作者、发布日期、3-5个关键观点"], "scoring": {"total": 100, "items": [{"name": "成功搜索indie game development", "score": 10, "description": "搜索关键词正确"}, {"name": "识别最新特色文章", "score": 20, "description": "选择的是feature article且是最新的"}, {"name": "提取文章标题", "score": 10, "description": "标题完整准确"}, {"name": "提取作者", "score": 10, "description": "作者名称准确"}, {"name": "提取发布日期", "score": 10, "description": "发布日期准确"}, {"name": "总结3-5个关键观点", "score": 30, "description": "观点数量在3-5个范围内"}, {"name": "观点总结准确", "score": 10, "description": "观点准确反映文章主旨"}]}}, "score_threshold": 60, "robustness_tags": ["cookie_consent", "filter_sort", "realtime_data"]} {"id": 106, "query": "在小米商城搜索\"小米手机\",筛选价格2000-3000元的型号,找到销量最高的3款手机,记录型号名称、价格、屏幕尺寸、电池容量和用户评分。", "task_type": "T1", "domain": "commerce", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.mi.com", "reasoning_type": "single_step", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问小米商城首页 www.mi.com", "在搜索框输入 小米手机", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "在左侧筛选栏找到价格筛选,设置为 2000-3000元", "等待筛选后的结果加载", "点击页面上方的排序选项,选择按 销量 排序", "等待结果重新排序", "查看销量最高的前3款手机", "对于每款手机,记录型号名称", "记录价格", "点击进入商品详情查看详细参数", "记录屏幕尺寸", "记录电池容量", "记录用户评分", "返回前3款手机的所有信息"], "key_points": ["小米商城可以不登录浏览和搜索商品", "需要搜索\"小米手机\"并进入手机分类", "需要筛选价格2000-3000元范围", "必须按销量排序找到销量最高的3款", "需要记录5项信息:型号名称、价格、屏幕尺寸、电池容量、用户评分"], "common_mistakes": ["搜索入口:可以在搜索框输入\"小米手机\",或直接点击导航栏的\"手机\"分类", "价格范围:需要设置价格筛选为2000-3000元,注意单位是元", "销量排序:默认可能是综合排序或新品排序,需要改为销量排序(从高到低)", "手机数量:确保提取的是排序后的前3款,不能少于3款", "型号名称:需要准确提取完整的型号名称,如\"小米14\"、\"Redmi Note 13 Pro\"等", "屏幕尺寸提取:可能在商品详情页的规格参数中,格式如\"6.67英寸\"", "电池容量提取:可能在商品详情页的规格参数中,格式如\"5000mAh\"", "用户评分:可能显示为星级(如4.8分)或百分比(如98%好评),需要统一格式", "信息完整性:必须包含型号名称、价格、屏幕尺寸、电池容量、用户评分5项"], "scoring": {"total": 100, "items": [{"name": "成功搜索小米手机", "score": 12, "description": "搜索关键词正确或进入手机分类"}, {"name": "筛选价格2000-3000元", "score": 25, "description": "价格筛选正确"}, {"name": "按销量排序", "score": 19, "description": "排序方式正确,选择销量从高到低"}, {"name": "提取前3款手机", "score": 12, "description": "手机数量正确"}, {"name": "信息完整准确", "score": 32, "description": "型号名称、价格、屏幕尺寸、电池容量、用户评分"}]}}, "score_threshold": 65, "robustness_tags": ["ad_overlay", "chinese_rendering", "data_extraction", "filter_sort", "long_sequence"]} -{"id": 108, "query": "在爱奇艺搜索综艺节目\"向往的生活\"最新一季,找到最新一期的播放页面,记录嘉宾阵容、播出时间、播放量、弹幕量,并总结评论区观众的主要反馈(正面/负面各2-3条)。", "task_type": "T1", "domain": "media_entertainment", "difficulty": "medium", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.mgtv.com", "reasoning_type": "deep_analysis", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问爱奇艺首页 www.iqiyi.com", "在搜索框输入 向往的生活", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "发现爱奇艺上没有《向往的生活》的完整内容(该节目为芒果TV独播)", "识别并说明该内容不在爱奇艺平台上,是芒果TV独播综艺", "(可选)访问芒果TV www.mgtv.com 搜索向往的生活", "(可选)在芒果TV找到最新一季并记录相关信息", "向用户报告内容平台归属的限制"], "key_points": ["《向往的生活》是湖南卫视/芒果TV独播综艺,不在爱奇艺上提供完整内容", "agent应在爱奇艺搜索后识别出内容不可用", "agent应说明该内容实际在芒果TV(www.mgtv.com)上", "如果agent主动前往芒果TV完成任务,应给予额外分数"], "common_mistakes": ["未识别平台限制:在爱奇艺上反复搜索或点击无关结果", "虚构信息:编造爱奇艺上不存在的播放数据", "未说明正确平台:应指出该内容在芒果TV上", "放弃任务:仅说内容不在而不提供任何有用信息"], "scoring": {"total": 100, "items": [{"name": "在爱奇艺执行搜索", "score": 10, "description": "按要求在爱奇艺搜索\"向往的生活\""}, {"name": "识别内容不在爱奇艺", "score": 20, "description": "明确指出爱奇艺没有该节目的完整内容"}, {"name": "说明正确平台归属", "score": 20, "description": "指出《向往的生活》是芒果TV独播综艺"}, {"name": "前往芒果TV完成任务", "score": 25, "description": "主动访问芒果TV搜索并找到节目内容"}, {"name": "记录节目基本信息", "score": 15, "description": "在芒果TV上记录嘉宾阵容、播出时间等可用信息"}, {"name": "清晰说明限制", "score": 10, "description": "向用户清楚解释平台限制和实际情况"}]}}, "score_threshold": 65, "robustness_tags": ["ad_overlay", "chinese_rendering", "data_extraction", "deep_navigation", "filter_sort", "iframe_embed", "lazy_load_scroll", "login_popup", "long_sequence", "realtime_data"]} +{"id": 108, "query": "在爱奇艺搜索综艺节目\"向往的生活\"最新一季,找到最新一期的播放页面,记录嘉宾阵容、播出时间、播放量、弹幕量,并总结评论区观众的主要反馈(正面/负面各2-3条)。", "task_type": "T1", "domain": "media_entertainment", "difficulty": "medium", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.iqiyi.com", "reasoning_type": "deep_analysis", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问爱奇艺首页 www.iqiyi.com", "在搜索框输入 向往的生活", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "发现爱奇艺上没有《向往的生活》的完整内容(该节目为芒果TV独播)", "识别并说明该内容不在爱奇艺平台上,是芒果TV独播综艺", "(可选)访问芒果TV www.mgtv.com 搜索向往的生活", "(可选)在芒果TV找到最新一季并记录相关信息", "向用户报告内容平台归属的限制"], "key_points": ["《向往的生活》是湖南卫视/芒果TV独播综艺,不在爱奇艺上提供完整内容", "agent应在爱奇艺搜索后识别出内容不可用", "agent应说明该内容实际在芒果TV(www.mgtv.com)上", "如果agent主动前往芒果TV完成任务,应给予额外分数"], "common_mistakes": ["未识别平台限制:在爱奇艺上反复搜索或点击无关结果", "虚构信息:编造爱奇艺上不存在的播放数据", "未说明正确平台:应指出该内容在芒果TV上", "放弃任务:仅说内容不在而不提供任何有用信息"], "scoring": {"total": 100, "items": [{"name": "在爱奇艺执行搜索", "score": 10, "description": "按要求在爱奇艺搜索\"向往的生活\""}, {"name": "识别内容不在爱奇艺", "score": 20, "description": "明确指出爱奇艺没有该节目的完整内容"}, {"name": "说明正确平台归属", "score": 20, "description": "指出《向往的生活》是芒果TV独播综艺"}, {"name": "前往芒果TV完成任务", "score": 25, "description": "主动访问芒果TV搜索并找到节目内容"}, {"name": "记录节目基本信息", "score": 15, "description": "在芒果TV上记录嘉宾阵容、播出时间等可用信息"}, {"name": "清晰说明限制", "score": 10, "description": "向用户清楚解释平台限制和实际情况"}]}}, "score_threshold": 65, "robustness_tags": ["ad_overlay", "chinese_rendering", "data_extraction", "deep_navigation", "filter_sort", "iframe_embed", "lazy_load_scroll", "login_popup", "long_sequence", "realtime_data"]} {"id": 109, "query": "Search for \"speedrun\" streams on Twitch, filter by live channels with at least 1000 viewers, find the top 3 streamers, and list their usernames, game titles they're playing, current viewer counts, and follower counts.", "task_type": "T1", "domain": "media_entertainment", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.twitch.tv", "reasoning_type": "single_step", "language": "en", "website_region": "en", "reference_answer": {"steps": ["Visit Twitch website www.twitch.tv", "Enter speedrun in the search box", "Click search or press Enter", "Wait for search results to load", "Select Live channels tab", "Apply filter for channels with at least 1000 viewers", "Wait for filtered results to load", "Identify the top 3 streamers by viewer count", "For each of the top 3 streamers:", "Record the username", "Record the game title they're playing", "Record the current viewer count", "Click on their channel to view follower count", "Record the follower count", "Return the list of 3 streamers with all details"], "key_points": ["Twitch可能需要登录才能使用某些功能,但基本搜索通常不需要", "需要搜索\"speedrun\"并筛选正在直播的频道", "需要筛选观众数至少1000的频道", "必须按观众数从高到低排序", "需要记录4项信息:用户名、游戏标题、当前观众数、粉丝数"], "common_mistakes": ["搜索结果类型:Twitch搜索结果包含Channels、Videos、Categories等,需要选择Live Channels标签", "直播状态:需要筛选正在直播的频道(Live),排除离线频道", "观众数筛选:需要在筛选器中设置最低观众数为1000,排除小型直播", "排序方式:默认可能是Relevance,需要改为Viewers(从高到低)或Most Viewers", "用户名格式:通常显示为@username或直接显示username,需要准确提取", "游戏标题:显示主播正在玩的游戏名称,如\"The Legend of Zelda: Breath of the Wild\"", "当前观众数:可能显示为数字(如\"1.2K\"、\"5,340\"),需要识别格式", "粉丝数:可能显示为数字(如\"12.3K\"、\"1.2M\"),需要识别格式", "信息完整性:必须包含用户名、游戏标题、当前观众数、粉丝数4项", "主播数量:确保提取的是排序后的前3个,不能少于3个", "时效性:观众数是实时变化的,记录的是当前时刻的数据"], "scoring": {"total": 100, "items": [{"name": "成功搜索speedrun", "score": 10, "description": "搜索关键词正确"}, {"name": "筛选正在直播的频道", "score": 15, "description": "选择Live Channels标签"}, {"name": "筛选至少1000观众", "score": 20, "description": "观众数筛选正确"}, {"name": "按观众数排序", "score": 15, "description": "排序方式正确,从高到低"}, {"name": "提取前3个主播", "score": 10, "description": "主播数量正确"}, {"name": "信息完整准确", "score": 30, "description": "用户名、游戏标题、当前观众数、粉丝数"}]}}, "score_threshold": 65, "robustness_tags": ["cookie_consent", "filter_sort", "iframe_embed", "long_sequence", "realtime_data"]} {"id": 110, "query": "在微信读书搜索\"原则\"这本书,找到该书的详情页,记录作者、出版社、阅读人数、书评数量、平均评分,并查看热门笔记前3条的内容和点赞数。", "task_type": "T1", "domain": "education_research", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "weread.qq.com", "reasoning_type": "multi_step", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问微信读书首页 weread.qq.com", "在搜索框输入 原则", "点击搜索按钮或按Enter键", "从搜索结果中找到《原则》(作者:瑞·达利欧)", "点击进入该书的详情页", "记录作者、出版社、阅读人数、书评数量、平均评分", "尝试查看热门笔记,发现Web版没有\"热门笔记\"功能", "识别Web版提供的是\"热门划线\"功能,作为替代", "查看热门划线前3条,记录划线内容和划线人数", "返回所有信息,并说明热门笔记为App端功能,Web端使用热门划线替代"], "key_points": ["微信读书Web版没有\"热门笔记\"功能,这是App端独有功能", "Web版提供\"热门划线\"功能,可作为替代", "agent应识别此限制并使用热门划线替代", "需要记录5项基本信息:作者、出版社、阅读人数、书评数量、平均评分"], "common_mistakes": ["混淆热门笔记和热门划线:Web版只有热门划线,没有热门笔记", "未识别限制:应明确说明热门笔记是App端功能", "同名书籍:需要选择瑞·达利欧(Ray Dalio)的版本", "信息遗漏:5项基本信息都需要记录"], "scoring": {"total": 100, "items": [{"name": "成功搜索并找到正确书籍", "score": 10, "description": "搜索\"原则\"并识别瑞·达利欧版本"}, {"name": "进入书籍详情页", "score": 5, "description": "点击进入详情页"}, {"name": "提取基本信息", "score": 20, "description": "作者、出版社、阅读人数、书评数量、平均评分"}, {"name": "识别热门笔记不可用", "score": 20, "description": "明确指出Web版没有热门笔记功能"}, {"name": "使用热门划线替代", "score": 20, "description": "发现并使用热门划线作为替代方案"}, {"name": "记录热门划线内容", "score": 15, "description": "记录前3条热门划线的内容和划线人数"}, {"name": "清晰说明限制", "score": 10, "description": "向用户说明Web版与App版的功能差异"}]}}, "score_threshold": 60, "robustness_tags": ["chinese_rendering", "data_extraction", "filter_sort", "lazy_load_scroll", "login_popup", "long_sequence"]} {"id": 112, "query": "使用搜狗搜索查询\"2024年春节放假安排\",找到国务院办公厅发布的官方通知,记录具体的放假日期、调休安排和通知发布时间。", "task_type": "T1", "domain": "education_research", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.sogou.com", "reasoning_type": "single_step", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问搜狗搜索首页 www.sogou.com", "在搜索框输入 2024年春节放假安排", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "浏览搜索结果,识别国务院办公厅的官方通知链接", "点击国务院办公厅官方网站链接", "等待通知页面加载", "定位到具体的放假日期信息", "记录春节放假的起始日期和结束日期", "记录调休安排的具体日期", "记录该通知的发布时间", "返回所有记录的信息"], "key_points": ["搜狗搜索可以不登录使用搜索功能", "需要搜索\"2024年春节放假安排\"并找到国务院办公厅的官方通知", "官方通知通常来自央视网、新华网、中国政府网等权威媒体", "需要记录3项核心信息:放假日期、调休安排、通知发布时间", "放假日期:2024年2月10日(星期六)至2月17日(星期六),共8天", "调休安排:2月4日(星期日)、2月18日(星期日)上班", "通知发布时间:2023年10月下旬(国务院办公厅通常提前发布)"], "common_mistakes": ["来源识别:需要识别国务院办公厅的官方通知,而非民间整理或自媒体信息。官方通知通常由央视网、新华网、中国政府网等权威媒体转载", "搜索结果筛选:搜狗搜索结果顶部可能直接显示放假日历表,这是官方信息的聚合展示", "放假日期:2024年春节放假为2月10日(正月初一)至2月17日(正月初八),共8天,需要完整记录起止日期和天数", "调休安排:2月4日(星期日)和2月18日(星期日)调为工作日,需要完整记录两个调休日期", "通知发布时间:国务院办公厅通常在前一年10月底发布次年的放假安排通知,2024年春节放假通知发布于2023年10月", "日期格式:需要包含具体日期和星期,如\"2月10日(星期六)\"", "信息完整性:必须包含放假日期(起止+天数)、调休安排(具体日期)、通知发布时间", "权威性验证:央视网、新华网、中国政府网等来源的信息最为权威", "搜索结果可靠性:搜狗搜索结果页面顶部的日历表是根据官方信息生成的,可以直接使用"], "scoring": {"total": 100, "items": [{"name": "成功搜索2024年春节放假安排", "score": 10, "description": "搜索关键词正确"}, {"name": "找到国务院办公厅官方通知", "score": 25, "description": "来源识别正确,来自权威媒体"}, {"name": "记录放假日期", "score": 20, "description": "2月10日至17日,共8天,日期准确"}, {"name": "记录调休安排", "score": 20, "description": "2月4日、2月18日上班,调休日期准确"}, {"name": "记录通知发布时间", "score": 15, "description": "2023年10月,时间准确"}, {"name": "信息准确性", "score": 10, "description": "所有信息准确无误"}]}}, "score_threshold": 70, "robustness_tags": ["ad_overlay", "chinese_rendering", "filter_sort"]} diff --git a/browseruse_bench/data/LexBench-Browser/task_lexmount.jsonl b/browseruse_bench/data/LexBench-Browser/task_lexmount.jsonl index d56694b..de34e22 100644 --- a/browseruse_bench/data/LexBench-Browser/task_lexmount.jsonl +++ b/browseruse_bench/data/LexBench-Browser/task_lexmount.jsonl @@ -27,7 +27,7 @@ {"id": 98, "query": "在风行网搜索电视剧\"西游记\",找到该剧的播放页面,记录主演、集数、每集时长、总播放量,并查看第一集的评论数和热度排行。", "task_type": "T1", "domain": "media_entertainment", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.fun.tv", "reasoning_type": "multi_step", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问风行网首页 www.fun.tv", "在搜索框输入 西游记", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "从搜索结果中找到电视剧《西游记》", "点击进入该剧的播放页面", "等待页面加载", "记录主演名单", "记录总集数", "记录每集时长", "记录总播放量", "点击第一集进入播放页", "等待第一集页面加载", "查找并记录第一集的评论数", "查找并记录热度排行", "返回所有记录的信息"], "key_points": ["风行网可以不登录搜索和浏览视频", "需要搜索\"西游记\"并找到电视剧的播放页面", "需要记录6项信息:主演、集数、每集时长、总播放量、第一集评论数、热度排行"], "common_mistakes": ["播放页面识别:搜索结果可能包含多个相关视频(如西游记续集、动画版等),需要准确识别经典电视剧《西游记》的播放页面", "主演提取:可能在播放页面的简介或演员信息区域,需要提取所有主演或主要演员", "集数格式:可能显示为\"全XX集\"或\"XX集\",需要准确提取数字", "每集时长:可能在剧集列表或播放信息中,格式可能是\"XX分钟\"或\"XX:XX\",需要统一格式", "总播放量格式:可能显示为数字(如\"1.2亿\"、\"5.3万\"),需要识别格式", "评论数:需要进入第一集的播放页面查看评论数,可能在播放器或视频信息中", "热度排行:可能在播放页面的相关推荐或排行榜区域,需要准确提取", "信息完整性:必须包含主演、集数、每集时长、总播放量、第一集评论数、热度排行6项"], "scoring": {"total": 100, "items": [{"name": "成功搜索西游记", "score": 15, "description": "搜索关键词正确"}, {"name": "找到播放页面", "score": 15, "description": "播放页面识别正确"}, {"name": "记录主演", "score": 15, "description": "主演信息准确"}, {"name": "记录集数", "score": 10, "description": "集数准确"}, {"name": "记录每集时长", "score": 10, "description": "时长准确"}, {"name": "记录总播放量", "score": 15, "description": "播放量准确"}, {"name": "记录第一集评论数", "score": 10, "description": "评论数准确"}, {"name": "记录热度排行", "score": 10, "description": "热度排行准确"}]}}, "score_threshold": 65, "robustness_tags": ["ad_overlay", "chinese_rendering", "data_extraction", "deep_navigation", "filter_sort", "iframe_embed", "login_popup", "long_sequence"]} {"id": 102, "query": "使用360搜索查询\"Python编程入门教程\",找到搜索结果中权威的在线课程或教程网站(如慕课网、菜鸟教程等),记录课程名称、网站来源、课程章节数和学习人数。", "task_type": "T1", "domain": "education_research", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.so.com", "reasoning_type": "multi_step", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问360搜索首页 www.so.com", "在搜索框输入 Python编程入门教程", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "浏览搜索结果,识别权威的在线课程网站(如慕课网、菜鸟教程、网易云课堂等)", "找到一个权威课程网站的链接", "点击进入该课程页面", "等待课程页面加载", "记录课程名称", "记录网站来源", "记录课程章节数(如有)", "记录学习人数或选课人数(如有)", "返回所有记录的信息"], "key_points": ["360搜索可以不登录使用", "需要搜索\"Python编程入门教程\"并浏览搜索结果", "需要识别权威的在线课程网站(如慕课网、菜鸟教程、bilibili、CSDN等)", "需要点击进入课程网站查看课程详情", "需要记录4项信息:课程名称、网站来源、课程章节数、学习人数"], "common_mistakes": ["网站选择:需要选择权威的在线课程网站,如慕课网(imooc.com)、菜鸟教程(runoob.com)、bilibili、CSDN等,避免选择不知名的小网站", "权威性判断:可以基于网站知名度、用户数量、内容质量等判断,优先选择专业的教育平台", "进入网站:不能只停留在搜索结果页,需要点击进入课程网站查看详情", "课程名称提取:可能在课程标题或详情页,需要准确提取完整名称", "章节数提取:可能在课程目录或课程简介中,格式可能是\"XX章\"或\"XX节\"", "学习人数提取:可能在课程详情页显示,格式可能是\"XX人学习\"或\"XX万人\"", "网站来源:需要记录完整的网站名称(如慕课网、菜鸟教程等)", "信息完整性:必须包含课程名称、网站来源、章节数、学习人数4项"], "scoring": {"total": 100, "items": [{"name": "成功搜索Python编程入门教程", "score": 10, "description": "搜索关键词正确"}, {"name": "识别权威课程网站", "score": 25, "description": "选择慕课网、菜鸟教程等知名教育平台"}, {"name": "进入课程网站查看详情", "score": 20, "description": "点击进入课程详情页"}, {"name": "提取课程名称", "score": 10, "description": "课程名称准确"}, {"name": "提取章节数", "score": 15, "description": "章节数准确"}, {"name": "提取学习人数", "score": 15, "description": "学习人数准确"}, {"name": "网站来源记录", "score": 5, "description": "网站名称记录完整"}]}}, "score_threshold": 65, "robustness_tags": ["ad_overlay", "chinese_rendering"]} {"id": 106, "query": "在小米商城搜索\"小米手机\",筛选价格2000-3000元的型号,找到销量最高的3款手机,记录型号名称、价格、屏幕尺寸、电池容量和用户评分。", "task_type": "T1", "domain": "commerce", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.mi.com", "reasoning_type": "single_step", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问小米商城首页 www.mi.com", "在搜索框输入 小米手机", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "在左侧筛选栏找到价格筛选,设置为 2000-3000元", "等待筛选后的结果加载", "点击页面上方的排序选项,选择按 销量 排序", "等待结果重新排序", "查看销量最高的前3款手机", "对于每款手机,记录型号名称", "记录价格", "点击进入商品详情查看详细参数", "记录屏幕尺寸", "记录电池容量", "记录用户评分", "返回前3款手机的所有信息"], "key_points": ["小米商城可以不登录浏览和搜索商品", "需要搜索\"小米手机\"并进入手机分类", "需要筛选价格2000-3000元范围", "必须按销量排序找到销量最高的3款", "需要记录5项信息:型号名称、价格、屏幕尺寸、电池容量、用户评分"], "common_mistakes": ["搜索入口:可以在搜索框输入\"小米手机\",或直接点击导航栏的\"手机\"分类", "价格范围:需要设置价格筛选为2000-3000元,注意单位是元", "销量排序:默认可能是综合排序或新品排序,需要改为销量排序(从高到低)", "手机数量:确保提取的是排序后的前3款,不能少于3款", "型号名称:需要准确提取完整的型号名称,如\"小米14\"、\"Redmi Note 13 Pro\"等", "屏幕尺寸提取:可能在商品详情页的规格参数中,格式如\"6.67英寸\"", "电池容量提取:可能在商品详情页的规格参数中,格式如\"5000mAh\"", "用户评分:可能显示为星级(如4.8分)或百分比(如98%好评),需要统一格式", "信息完整性:必须包含型号名称、价格、屏幕尺寸、电池容量、用户评分5项"], "scoring": {"total": 100, "items": [{"name": "成功搜索小米手机", "score": 12, "description": "搜索关键词正确或进入手机分类"}, {"name": "筛选价格2000-3000元", "score": 25, "description": "价格筛选正确"}, {"name": "按销量排序", "score": 19, "description": "排序方式正确,选择销量从高到低"}, {"name": "提取前3款手机", "score": 12, "description": "手机数量正确"}, {"name": "信息完整准确", "score": 32, "description": "型号名称、价格、屏幕尺寸、电池容量、用户评分"}]}}, "score_threshold": 65, "robustness_tags": ["ad_overlay", "chinese_rendering", "data_extraction", "filter_sort", "long_sequence"]} -{"id": 108, "query": "在爱奇艺搜索综艺节目\"向往的生活\"最新一季,找到最新一期的播放页面,记录嘉宾阵容、播出时间、播放量、弹幕量,并总结评论区观众的主要反馈(正面/负面各2-3条)。", "task_type": "T1", "domain": "media_entertainment", "difficulty": "medium", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.mgtv.com", "reasoning_type": "deep_analysis", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问爱奇艺首页 www.iqiyi.com", "在搜索框输入 向往的生活", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "发现爱奇艺上没有《向往的生活》的完整内容(该节目为芒果TV独播)", "识别并说明该内容不在爱奇艺平台上,是芒果TV独播综艺", "(可选)访问芒果TV www.mgtv.com 搜索向往的生活", "(可选)在芒果TV找到最新一季并记录相关信息", "向用户报告内容平台归属的限制"], "key_points": ["《向往的生活》是湖南卫视/芒果TV独播综艺,不在爱奇艺上提供完整内容", "agent应在爱奇艺搜索后识别出内容不可用", "agent应说明该内容实际在芒果TV(www.mgtv.com)上", "如果agent主动前往芒果TV完成任务,应给予额外分数"], "common_mistakes": ["未识别平台限制:在爱奇艺上反复搜索或点击无关结果", "虚构信息:编造爱奇艺上不存在的播放数据", "未说明正确平台:应指出该内容在芒果TV上", "放弃任务:仅说内容不在而不提供任何有用信息"], "scoring": {"total": 100, "items": [{"name": "在爱奇艺执行搜索", "score": 10, "description": "按要求在爱奇艺搜索\"向往的生活\""}, {"name": "识别内容不在爱奇艺", "score": 20, "description": "明确指出爱奇艺没有该节目的完整内容"}, {"name": "说明正确平台归属", "score": 20, "description": "指出《向往的生活》是芒果TV独播综艺"}, {"name": "前往芒果TV完成任务", "score": 25, "description": "主动访问芒果TV搜索并找到节目内容"}, {"name": "记录节目基本信息", "score": 15, "description": "在芒果TV上记录嘉宾阵容、播出时间等可用信息"}, {"name": "清晰说明限制", "score": 10, "description": "向用户清楚解释平台限制和实际情况"}]}}, "score_threshold": 65, "robustness_tags": ["ad_overlay", "chinese_rendering", "data_extraction", "deep_navigation", "filter_sort", "iframe_embed", "lazy_load_scroll", "login_popup", "long_sequence", "realtime_data"]} +{"id": 108, "query": "在爱奇艺搜索综艺节目\"向往的生活\"最新一季,找到最新一期的播放页面,记录嘉宾阵容、播出时间、播放量、弹幕量,并总结评论区观众的主要反馈(正面/负面各2-3条)。", "task_type": "T1", "domain": "media_entertainment", "difficulty": "medium", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.iqiyi.com", "reasoning_type": "deep_analysis", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问爱奇艺首页 www.iqiyi.com", "在搜索框输入 向往的生活", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "发现爱奇艺上没有《向往的生活》的完整内容(该节目为芒果TV独播)", "识别并说明该内容不在爱奇艺平台上,是芒果TV独播综艺", "(可选)访问芒果TV www.mgtv.com 搜索向往的生活", "(可选)在芒果TV找到最新一季并记录相关信息", "向用户报告内容平台归属的限制"], "key_points": ["《向往的生活》是湖南卫视/芒果TV独播综艺,不在爱奇艺上提供完整内容", "agent应在爱奇艺搜索后识别出内容不可用", "agent应说明该内容实际在芒果TV(www.mgtv.com)上", "如果agent主动前往芒果TV完成任务,应给予额外分数"], "common_mistakes": ["未识别平台限制:在爱奇艺上反复搜索或点击无关结果", "虚构信息:编造爱奇艺上不存在的播放数据", "未说明正确平台:应指出该内容在芒果TV上", "放弃任务:仅说内容不在而不提供任何有用信息"], "scoring": {"total": 100, "items": [{"name": "在爱奇艺执行搜索", "score": 10, "description": "按要求在爱奇艺搜索\"向往的生活\""}, {"name": "识别内容不在爱奇艺", "score": 20, "description": "明确指出爱奇艺没有该节目的完整内容"}, {"name": "说明正确平台归属", "score": 20, "description": "指出《向往的生活》是芒果TV独播综艺"}, {"name": "前往芒果TV完成任务", "score": 25, "description": "主动访问芒果TV搜索并找到节目内容"}, {"name": "记录节目基本信息", "score": 15, "description": "在芒果TV上记录嘉宾阵容、播出时间等可用信息"}, {"name": "清晰说明限制", "score": 10, "description": "向用户清楚解释平台限制和实际情况"}]}}, "score_threshold": 65, "robustness_tags": ["ad_overlay", "chinese_rendering", "data_extraction", "deep_navigation", "filter_sort", "iframe_embed", "lazy_load_scroll", "login_popup", "long_sequence", "realtime_data"]} {"id": 110, "query": "在微信读书搜索\"原则\"这本书,找到该书的详情页,记录作者、出版社、阅读人数、书评数量、平均评分,并查看热门笔记前3条的内容和点赞数。", "task_type": "T1", "domain": "education_research", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "weread.qq.com", "reasoning_type": "multi_step", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问微信读书首页 weread.qq.com", "在搜索框输入 原则", "点击搜索按钮或按Enter键", "从搜索结果中找到《原则》(作者:瑞·达利欧)", "点击进入该书的详情页", "记录作者、出版社、阅读人数、书评数量、平均评分", "尝试查看热门笔记,发现Web版没有\"热门笔记\"功能", "识别Web版提供的是\"热门划线\"功能,作为替代", "查看热门划线前3条,记录划线内容和划线人数", "返回所有信息,并说明热门笔记为App端功能,Web端使用热门划线替代"], "key_points": ["微信读书Web版没有\"热门笔记\"功能,这是App端独有功能", "Web版提供\"热门划线\"功能,可作为替代", "agent应识别此限制并使用热门划线替代", "需要记录5项基本信息:作者、出版社、阅读人数、书评数量、平均评分"], "common_mistakes": ["混淆热门笔记和热门划线:Web版只有热门划线,没有热门笔记", "未识别限制:应明确说明热门笔记是App端功能", "同名书籍:需要选择瑞·达利欧(Ray Dalio)的版本", "信息遗漏:5项基本信息都需要记录"], "scoring": {"total": 100, "items": [{"name": "成功搜索并找到正确书籍", "score": 10, "description": "搜索\"原则\"并识别瑞·达利欧版本"}, {"name": "进入书籍详情页", "score": 5, "description": "点击进入详情页"}, {"name": "提取基本信息", "score": 20, "description": "作者、出版社、阅读人数、书评数量、平均评分"}, {"name": "识别热门笔记不可用", "score": 20, "description": "明确指出Web版没有热门笔记功能"}, {"name": "使用热门划线替代", "score": 20, "description": "发现并使用热门划线作为替代方案"}, {"name": "记录热门划线内容", "score": 15, "description": "记录前3条热门划线的内容和划线人数"}, {"name": "清晰说明限制", "score": 10, "description": "向用户说明Web版与App版的功能差异"}]}}, "score_threshold": 60, "robustness_tags": ["chinese_rendering", "data_extraction", "filter_sort", "lazy_load_scroll", "login_popup", "long_sequence"]} {"id": 112, "query": "使用搜狗搜索查询\"2024年春节放假安排\",找到国务院办公厅发布的官方通知,记录具体的放假日期、调休安排和通知发布时间。", "task_type": "T1", "domain": "education_research", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.sogou.com", "reasoning_type": "single_step", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问搜狗搜索首页 www.sogou.com", "在搜索框输入 2024年春节放假安排", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "浏览搜索结果,识别国务院办公厅的官方通知链接", "点击国务院办公厅官方网站链接", "等待通知页面加载", "定位到具体的放假日期信息", "记录春节放假的起始日期和结束日期", "记录调休安排的具体日期", "记录该通知的发布时间", "返回所有记录的信息"], "key_points": ["搜狗搜索可以不登录使用搜索功能", "需要搜索\"2024年春节放假安排\"并找到国务院办公厅的官方通知", "官方通知通常来自央视网、新华网、中国政府网等权威媒体", "需要记录3项核心信息:放假日期、调休安排、通知发布时间", "放假日期:2024年2月10日(星期六)至2月17日(星期六),共8天", "调休安排:2月4日(星期日)、2月18日(星期日)上班", "通知发布时间:2023年10月下旬(国务院办公厅通常提前发布)"], "common_mistakes": ["来源识别:需要识别国务院办公厅的官方通知,而非民间整理或自媒体信息。官方通知通常由央视网、新华网、中国政府网等权威媒体转载", "搜索结果筛选:搜狗搜索结果顶部可能直接显示放假日历表,这是官方信息的聚合展示", "放假日期:2024年春节放假为2月10日(正月初一)至2月17日(正月初八),共8天,需要完整记录起止日期和天数", "调休安排:2月4日(星期日)和2月18日(星期日)调为工作日,需要完整记录两个调休日期", "通知发布时间:国务院办公厅通常在前一年10月底发布次年的放假安排通知,2024年春节放假通知发布于2023年10月", "日期格式:需要包含具体日期和星期,如\"2月10日(星期六)\"", "信息完整性:必须包含放假日期(起止+天数)、调休安排(具体日期)、通知发布时间", "权威性验证:央视网、新华网、中国政府网等来源的信息最为权威", "搜索结果可靠性:搜狗搜索结果页面顶部的日历表是根据官方信息生成的,可以直接使用"], "scoring": {"total": 100, "items": [{"name": "成功搜索2024年春节放假安排", "score": 10, "description": "搜索关键词正确"}, {"name": "找到国务院办公厅官方通知", "score": 25, "description": "来源识别正确,来自权威媒体"}, {"name": "记录放假日期", "score": 20, "description": "2月10日至17日,共8天,日期准确"}, {"name": "记录调休安排", "score": 20, "description": "2月4日、2月18日上班,调休日期准确"}, {"name": "记录通知发布时间", "score": 15, "description": "2023年10月,时间准确"}, {"name": "信息准确性", "score": 10, "description": "所有信息准确无误"}]}}, "score_threshold": 70, "robustness_tags": ["ad_overlay", "chinese_rendering", "filter_sort"]} {"id": 118, "query": "在B站搜索\"AI绘画教程\",筛选今年发布的视频,按播放量排序,找到播放量最高的3个视频,记录UP主名称、视频标题、播放量、点赞数和投币数。", "task_type": "T1", "domain": "education_research", "difficulty": "easy", "login_required": false, "login_type": "", "risk_control": false, "risk_control_types": [], "target_website": "www.bilibili.com", "reasoning_type": "single_step", "language": "zh", "website_region": "zh", "reference_answer": {"steps": ["访问B站首页 www.bilibili.com", "在搜索框输入 AI绘画教程", "点击搜索按钮或按Enter键", "等待搜索结果页面加载", "应用筛选条件:今年发布的视频", "选择按播放量排序", "等待排序后的搜索结果加载", "识别播放量最高的前3个视频", "对于第1个视频记录UP主名称", "记录视频标题", "记录播放量", "记录点赞数", "记录投币数", "对于第2个视频重复上述记录步骤", "对于第3个视频重复上述记录步骤", "返回3个视频的完整信息"], "key_points": ["B站可以无需登录搜索和浏览视频", "需要成功搜索AI绘画教程并进入搜索结果页", "需要筛选今年发布的视频(时间筛选选项通常在搜索结果页上方)", "需要按播放量排序(排序选项通常在搜索结果页上方,选择\"最多播放\")", "需要找到前3个视频", "需要记录每个视频的5项信息:UP主名称、标题、播放量、点赞数、投币数"], "common_mistakes": ["时间筛选位置:筛选选项通常在搜索结果页面上方,有\"全部时长\"、\"全部分区\"、\"时间范围\"等选项,需要点击时间范围选择\"今年\"", "排序选项位置:排序选项通常在搜索结果右上方,有\"综合排序\"、\"最多播放\"、\"最新发布\"、\"最多弹幕\"等,需要选择\"最多播放\"", "数据提取位置:UP主名称通常在视频下方或右侧,播放量、点赞数、投币数等数据可能在视频卡片下方或需要进入视频详情页查看", "点赞数和投币数获取:这些数据可能在搜索结果页不完整显示,可能需要进入视频详情页才能看到完整数据", "3个视频都要记录:不能只记录1-2个视频,必须全部记录3个", "信息完整性:每个视频必须包含全部5项信息(UP主名称、标题、播放量、点赞数、投币数)", "播放量格式:B站播放量显示格式如\"12.3万\"、\"456.7万\"等,需要准确记录", "筛选和排序顺序:应该先筛选时间(今年),再按播放量排序,顺序不能错"], "scoring": {"total": 100, "items": [{"name": "成功搜索AI绘画教程", "score": 10, "description": "搜索关键词正确,进入搜索结果页"}, {"name": "筛选今年发布的视频", "score": 15, "description": "时间筛选为今年/本年"}, {"name": "按播放量排序", "score": 15, "description": "选择最多播放排序"}, {"name": "找到前3个视频", "score": 10, "description": "在筛选和排序后找到前3个视频"}, {"name": "记录UP主名称", "score": 10, "description": "3个视频的UP主名称都记录"}, {"name": "记录视频标题", "score": 10, "description": "3个视频的标题都记录"}, {"name": "记录播放量", "score": 10, "description": "3个视频的播放量都记录"}, {"name": "记录点赞数", "score": 10, "description": "3个视频的点赞数都记录"}, {"name": "记录投币数", "score": 10, "description": "3个视频的投币数都记录"}]}}, "score_threshold": 65, "robustness_tags": ["ad_overlay", "chinese_rendering", "data_extraction", "filter_sort", "iframe_embed", "login_popup", "long_sequence", "realtime_data"]}