Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion backend/packages/harness/deerflow/skills/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,37 @@
logger = logging.getLogger(__name__)


def _format_yaml_error(skill_file: Path, exc: yaml.YAMLError, source: str) -> str:
"""Render a developer-friendly explanation of a YAML front-matter error."""

lines = [f"Invalid YAML front-matter in {skill_file}: {exc}"]

mark = getattr(exc, "problem_mark", None)
source_lines = source.splitlines()
if mark is not None and 0 <= mark.line < len(source_lines):
offending = source_lines[mark.line]

# mark.line is 0-based within the front-matter body; +1 makes it
# 1-based, +1 more accounts for the leading `---` fence that the
# front-matter regex strips before yaml.safe_load sees it. The
# result matches the line number an author sees in their editor.
file_line_number = mark.line + 2
lines.append(f" line {file_line_number}: {offending}")

# Targeted hint for the most common authoring mistake: an unquoted
# scalar value whose body contains ``: ``. We only surface the hint
# when we are confident it applies, to avoid misleading authors who
# hit unrelated YAML errors.
if getattr(exc, "problem", "") == "mapping values are not allowed here" and ":" in offending:
key, _, value = offending.partition(":")
value = value.strip()
if value and value[0] not in {'"', "'", "|", ">", "[", "{"}:
escaped = value.replace("\\", "\\\\").replace('"', '\\"')
lines.append(f' hint: values containing ":" must be quoted, e.g. {key}: "{escaped}"')
Comment on lines +36 to +38

return "\n".join(lines)


def parse_allowed_tools(raw: object, skill_file: Path) -> list[str] | None:
"""Parse the optional allowed-tools frontmatter field.

Expand Down Expand Up @@ -60,7 +91,7 @@ def parse_skill_file(skill_file: Path, category: SkillCategory, relative_path: P
try:
metadata = yaml.safe_load(front_matter_text)
except yaml.YAMLError as exc:
logger.error("Invalid YAML front-matter in %s: %s", skill_file, exc)
logger.error("%s", _format_yaml_error(skill_file, exc, front_matter_text))
return None

if not isinstance(metadata, dict):
Expand Down
154 changes: 154 additions & 0 deletions backend/tests/test_skills_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from __future__ import annotations

import logging
from pathlib import Path

from deerflow.skills.parser import parse_skill_file
Expand Down Expand Up @@ -156,3 +157,156 @@ def test_parse_nonexistent_file_returns_none(tmp_path):
"""Non-existent files are handled gracefully."""
skill = parse_skill_file(tmp_path / "ghost" / "SKILL.md", category="custom")
assert skill is None


# ---------------------------------------------------------------------------
# Friendly YAML error reporting
# ---------------------------------------------------------------------------


def test_parse_unquoted_colon_value_logs_line_and_hint(tmp_path, caplog):
"""Unquoted value with ': ' produces a log that exposes the full offending line
(PyYAML truncates long lines with `...`) and a copy-pasteable quoting hint.

Regression for issue #3333: SKILL.md authored by an LLM frequently
contains ``description: foo: bar`` which PyYAML rejects with
``mapping values are not allowed here``. The skill is correctly skipped
(the file is not silently accepted). Before this change the only
diagnostic was PyYAML's own message, which (a) numbers lines within
the front-matter body rather than the file and (b) truncates long
values with '...'. The new behaviour pins:
* the line number an author sees in their editor (file-line, not
front-matter-line),
* the *full* offending line (no '...' truncation), and
* a copy-pasteable `key: "value"` hint.
"""

# The description value is intentionally long enough to trigger
# PyYAML's own '...' truncation in the rendered str(exc); our hint
# must echo the *full* value regardless.
long_value = "StarRun collector: progress, errors, tables out, plus assorted diagnostic notes"
front_matter = f"name: collect-startrun\ndescription: {long_value}"
skill_file = _write_skill(tmp_path, front_matter)

with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
skill = parse_skill_file(skill_file, category="custom")

assert skill is None
combined = "\n".join(rec.getMessage() for rec in caplog.records)
assert "Invalid YAML front-matter" in combined

# 1. File-line, not front-matter-line. `description` is the 2nd line
# of the front-matter body, which is line 3 of the file (line 1
# is the leading `---` fence). Before this PR the log said
# `line 2`, which sent authors to the wrong row.
assert f"line 3: description: {long_value}" in combined

# 2. The full value is preserved -- PyYAML's own message truncates
# long values with '...', so the presence of the un-truncated tail
# proves we are reading the source line ourselves, not echoing
# PyYAML's snippet.
assert "plus assorted diagnostic notes" in combined
assert "..." not in [line for line in combined.splitlines() if line.startswith(" line ")][0]

# 3. The copy-pasteable quoting hint is the actually-new diagnostic.
assert f'hint: values containing ":" must be quoted, e.g. description: "{long_value}"' in combined


def test_parse_unquoted_colon_value_preserves_nested_key_indent(tmp_path, caplog):
"""Nested keys must keep their leading indentation in the quoting hint.

Regression guard for CR feedback on PR #3335: an earlier version of
the hint called ``key.strip()``, which turned `` author: foo: bar``
into ``author: "foo: bar"``. Pasting that back under a parent mapping
silently moved the field to the top level. The hint must preserve
the original indentation so authors can copy-paste-fix in place.
"""

# A two-space-indented nested key triggers the same scanner error,
# but its hint must keep the indentation.
front_matter = "name: nested-skill\nmetadata:\n author: Jane: Doe"
skill_file = _write_skill(tmp_path, front_matter)

with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
skill = parse_skill_file(skill_file, category="custom")

assert skill is None
combined = "\n".join(rec.getMessage() for rec in caplog.records)
# Two leading spaces in front of `author` are preserved.
assert 'hint: values containing ":" must be quoted, e.g. author: "Jane: Doe"' in combined


def test_parse_unrelated_yaml_error_omits_quoting_hint(tmp_path, caplog):
"""Errors other than 'mapping values are not allowed' must NOT carry the quoting hint."""

# Unclosed flow sequence is a scanner error of a different shape; the
# quoting hint would be misleading and must be suppressed.
skill_file = _write_skill(tmp_path, "name: [unclosed\ndescription: x")

with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
skill = parse_skill_file(skill_file, category="custom")

assert skill is None
combined = "\n".join(rec.getMessage() for rec in caplog.records)
assert "Invalid YAML front-matter" in combined
assert "hint:" not in combined


def test_parse_valid_skill_emits_no_error_log(tmp_path, caplog):
"""Sanity check: a valid SKILL.md must not produce any error logs."""

skill_file = _write_skill(tmp_path, 'name: ok-skill\ndescription: "Foo: bar"')

with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
skill = parse_skill_file(skill_file, category="custom")

assert skill is not None
assert skill.description == "Foo: bar"
assert not caplog.records, "valid SKILL.md must not log errors"


def test_parse_unquoted_colon_value_escapes_backslashes_in_hint(tmp_path, caplog):
"""Backslashes in the offending value must be doubled in the hint.

Regression guard for CR feedback on PR #3335: an earlier version of
the hint only escaped ``"`` but left ``\\`` untouched. Pasting the
suggested ``key: "..."`` back into the file would then be reparsed
as an escape sequence by PyYAML's double-quoted scalar rules and
either fail to load or silently change meaning (e.g. ``C:\\Temp``
becoming ``C:<TAB>emp``). The hint must double the backslash so the
suggested scalar is valid YAML when pasted back.
"""

# The second ``: `` (after ``path``) is what trips PyYAML's
# "mapping values are not allowed here"; the ``C:\Temp`` segment
# carries the backslash that the hint must escape.
front_matter = "name: path-skill\ndescription: Windows path: C:\\Temp"
skill_file = _write_skill(tmp_path, front_matter)

with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
skill = parse_skill_file(skill_file, category="custom")

assert skill is None
combined = "\n".join(rec.getMessage() for rec in caplog.records)
assert r'description: "Windows path: C:\\Temp"' in combined


def test_parse_unquoted_colon_value_escapes_regex_in_hint(tmp_path, caplog):
"""Regex-style ``\\d`` must also be escaped in the hint.

Same root cause as the Windows-path guard above, but with a
regex-style escape that is even more likely to appear in
LLM-authored skills (e.g. a ``description`` that quotes a regex).
PyYAML rejects ``\\d`` in double-quoted scalars, so the hint must
emit ``\\\\d`` to remain valid.
"""

front_matter = "name: regex-skill\ndescription: match: \\d+ digits"
skill_file = _write_skill(tmp_path, front_matter)

with caplog.at_level(logging.ERROR, logger="deerflow.skills.parser"):
skill = parse_skill_file(skill_file, category="custom")

assert skill is None
combined = "\n".join(rec.getMessage() for rec in caplog.records)
assert r'description: "match: \\d+ digits"' in combined
Loading