490 lines
16 KiB
Python
490 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""Use Codex to analyze clangd semantic-highlighting logic from a GitHub blob URL.
|
|
|
|
Example:
|
|
python3 scripts/analyzers/extract_semantic_highlighting_codex.py \
|
|
https://github.com/llvm/llvm-project/blob/d8ba56ce3f98871ae4e5782c4af2df4c98bebde7/clang-tools-extra/clangd/SemanticHighlighting.cpp \
|
|
--output semantic-highlighting.md
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Literal
|
|
from urllib.error import HTTPError, URLError
|
|
from urllib.parse import urlparse
|
|
from urllib.request import ProxyHandler, Request, build_opener, getproxies
|
|
|
|
|
|
PROMPT = """You are analyzing one C++ source file from clangd.
|
|
|
|
Task:
|
|
- Extract the cases in which semantic highlighting is applied.
|
|
- Produce an exhaustive segmentation that covers every source line exactly once.
|
|
|
|
Kinds:
|
|
- nop: this line does not materially participate in deciding or applying a highlight.
|
|
- condition: this line or contiguous range establishes a boolean/branching condition that gates a later highlight resolution.
|
|
- resolution: this line or contiguous range selects or applies a concrete semantic-highlighting outcome, such as a HighlightingKind, modifier, token emission, or an equivalent concrete result.
|
|
|
|
Output rules:
|
|
- Return JSON only.
|
|
- Segments must be in ascending order and non-overlapping.
|
|
- Every line from 1 through the last line must be covered exactly once.
|
|
- Use the smallest practical contiguous ranges.
|
|
- It is fine to compress long nop runs into ranges; the caller may expand them later.
|
|
- For nop segments, use an empty summary string.
|
|
- For condition segments, write one short sentence in plain English.
|
|
- For resolution segments, write one short sentence describing the concrete outcome.
|
|
- For resolution segments, populate depends_on with the exact condition line ranges that directly gate this outcome when present.
|
|
- Use the provided line numbers exactly; never invent lines.
|
|
- Do not use any external tools or local files; analyze only the numbered source text provided in the prompt.
|
|
"""
|
|
|
|
|
|
# Shape of one entry in a segment's "depends_on" list: a 1-based line range.
_DEPENDS_ON_ITEM_SCHEMA = {
    "type": "object",
    "properties": {
        "start_line": {"type": "integer", "minimum": 1},
        "end_line": {"type": "integer", "minimum": 1},
    },
    "required": ["start_line", "end_line"],
    "additionalProperties": False,
}

# Shape of one entry in the top-level "segments" list.
_SEGMENT_ITEM_SCHEMA = {
    "type": "object",
    "properties": {
        "start_line": {"type": "integer", "minimum": 1},
        "end_line": {"type": "integer", "minimum": 1},
        "kind": {
            "type": "string",
            "enum": ["nop", "condition", "resolution"],
        },
        "summary": {"type": "string"},
        "depends_on": {
            "type": "array",
            "items": _DEPENDS_ON_ITEM_SCHEMA,
        },
    },
    "required": [
        "start_line",
        "end_line",
        "kind",
        "summary",
        "depends_on",
    ],
    "additionalProperties": False,
}

# JSON Schema for the model's final message. analyze_with_codex_cli() writes it
# to a temporary file and passes that file to `codex exec --output-schema`.
SCHEMA = {
    "type": "object",
    "properties": {
        "segments": {
            "type": "array",
            "items": _SEGMENT_ITEM_SCHEMA,
        }
    },
    "required": ["segments"],
    "additionalProperties": False,
}
|
|
|
|
|
|
# The closed set of labels a segment may carry; mirrors the "kind" enum in SCHEMA.
SegmentKind = Literal["nop", "condition", "resolution"]

# Lower-case proxy variables that build_codex_env() copies to their upper-case
# spelling when the upper-case variable is not already set.
PROXY_ENV_KEYS = ("http_proxy", "https_proxy", "all_proxy", "no_proxy")
|
|
|
|
|
|
@dataclass(frozen=True)
class GitHubBlobRef:
    """Immutable description of one file at one revision of a GitHub repository.

    Instances are produced by parse_github_blob_url(), which fills the fields
    from the components of a ``/owner/repo/blob/rev/path`` URL.
    """

    owner: str  # first path component of the blob URL
    repo: str  # second path component of the blob URL
    rev: str  # path component that follows "blob"
    path: str  # remaining components joined with "/"
    blob_url: str  # normalized https://github.com/... blob URL
    raw_url: str  # matching https://raw.githubusercontent.com/... URL

    @property
    def title(self) -> str:
        """Return the file's base name with its final suffix removed."""
        file_path = Path(self.path)
        return file_path.stem
|
|
|
|
|
|
@dataclass(frozen=True)
class LineRange:
    """Immutable pair of line numbers naming a span of the analyzed file.

    Callers in this file treat both ends as inclusive: a range whose two
    fields are equal is rendered as a single line.
    """

    start_line: int  # first line of the span
    end_line: int  # last line of the span
|
|
|
|
|
|
@dataclass(frozen=True)
class Segment:
    """Immutable classification of one contiguous span of source lines."""

    start_line: int  # first line of the span
    end_line: int  # last line of the span
    kind: SegmentKind  # "nop", "condition" or "resolution"
    summary: str  # one-sentence description; empty for nop segments
    depends_on: tuple[LineRange, ...]  # ranges cited when rendering a resolution
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the process command line.

    Returns:
        A namespace with ``url``, ``model``, ``reasoning_effort``, ``timeout``,
        ``codex_bin`` and ``output`` attributes.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Ask Codex to analyze a GitHub-hosted semantic-highlighting file.",
    )

    # Registered in order so that --help lists them in this sequence.
    argument_specs = (
        ("url", {"help": "GitHub blob URL pinned to a specific revision."}),
        (
            "--model",
            {"default": None, "help": "Codex model to use via `codex exec`."},
        ),
        (
            "--reasoning-effort",
            {
                "default": None,
                "choices": ["low", "medium", "high", "xhigh"],
                "help": "Reasoning effort override passed to Codex CLI.",
            },
        ),
        (
            "--timeout",
            {
                "type": int,
                "default": 30,
                "help": "HTTP timeout in seconds for fetching the source file.",
            },
        ),
        (
            "--codex-bin",
            {"default": "codex", "help": "Codex CLI executable to invoke."},
        ),
        (
            "--output",
            {
                "type": Path,
                "help": "Write the markdown output to this path instead of stdout.",
            },
        ),
    )
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()
|
|
|
|
|
|
def parse_github_blob_url(url: str) -> GitHubBlobRef:
    """Split a GitHub blob URL into its components.

    The host is matched on the parsed hostname, so differences in letter case,
    an explicit port, or a leading ``www.`` do not cause a valid GitHub URL to
    be rejected. Any query string or fragment on the input is not carried into
    the normalized URLs.

    Args:
        url: A URL of the form ``https://github.com/<owner>/<repo>/blob/<rev>/<path>``.

    Returns:
        A GitHubBlobRef holding the components plus a normalized blob URL and
        the matching raw.githubusercontent.com URL.

    Raises:
        ValueError: If the scheme or host is wrong, or the path does not have
            the ``/owner/repo/blob/rev/path`` shape.
    """
    parsed = urlparse(url)
    # `hostname` is lower-cased and excludes any port or userinfo in the netloc.
    host = parsed.hostname or ""
    if parsed.scheme not in {"http", "https"} or host not in {
        "github.com",
        "www.github.com",
    }:
        raise ValueError("expected a GitHub https://github.com/.../blob/... URL")

    # Dropping empty components tolerates doubled or trailing slashes.
    parts = [part for part in parsed.path.split("/") if part]
    if len(parts) < 5 or parts[2] != "blob":
        raise ValueError("expected a GitHub blob URL with /owner/repo/blob/rev/path")

    owner, repo = parts[0], parts[1]
    # NOTE: the revision is taken to be exactly one path component, so a ref
    # containing "/" is split between `rev` and `path`.
    rev = parts[3]
    # At least one non-empty component remains here, so `path` is never empty.
    path = "/".join(parts[4:])

    return GitHubBlobRef(
        owner=owner,
        repo=repo,
        rev=rev,
        path=path,
        blob_url=f"https://github.com/{owner}/{repo}/blob/{rev}/{path}",
        raw_url=f"https://raw.githubusercontent.com/{owner}/{repo}/{rev}/{path}",
    )
|
|
|
|
|
|
def fetch_text(url: str, timeout: int) -> str:
    """Download ``url`` and return its body as text with LF line endings.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to the opener.

    Returns:
        The response body decoded as UTF-8 with every CRLF replaced by LF.

    Raises:
        RuntimeError: If the opener raises HTTPError or URLError.
    """
    # Proxies are taken from getproxies() and handed to the opener explicitly.
    opener = build_opener(ProxyHandler(getproxies()))
    request = Request(url, headers={"User-Agent": "semantic-highlighting-codex/1.0"})

    try:
        with opener.open(request, timeout=timeout) as response:
            body = response.read()
    except HTTPError as exc:
        # HTTPError is handled before the URLError clause so the status code is reported.
        raise RuntimeError(f"failed to fetch {url}: HTTP {exc.code}") from exc
    except URLError as exc:
        raise RuntimeError(f"failed to fetch {url}: {exc.reason}") from exc

    return body.decode("utf-8").replace("\r\n", "\n")
|
|
|
|
|
|
def number_source(text: str) -> tuple[str, int]:
    """Prefix every line of ``text`` with its 1-based line number.

    Lines are delimited by ``"\\n"`` only. ``str.splitlines()`` is deliberately
    not used: it also breaks on form feed, vertical tab, a lone carriage return
    and several Unicode separators, which would shift every later line number.

    Args:
        text: Source text whose line endings are already normalized to LF.

    Returns:
        A ``(numbered, line_count)`` pair. Each numbered line has the form
        ``"<n> | <line>"`` with ``<n>`` right-aligned to the width of the
        largest line number; lines are joined with ``"\\n"`` and there is no
        trailing newline.
    """
    lines = text.split("\n")
    # A final newline terminates the last line rather than starting a new one;
    # this also turns the [""] produced by empty input into [].
    if lines[-1] == "":
        lines.pop()

    width = len(str(max(len(lines), 1)))
    numbered = "\n".join(
        f"{idx:>{width}} | {line}" for idx, line in enumerate(lines, 1)
    )
    return numbered, len(lines)
|
|
|
|
|
|
def build_codex_prompt(
    blob: GitHubBlobRef,
    source_text: str,
    line_count: int,
) -> str:
    """Assemble the full prompt text sent to Codex.

    Args:
        blob: Reference supplying the URL, revision, path and title lines.
        source_text: Raw source; it is numbered with number_source() here.
        line_count: Value reported on the "Total lines" line.

    Returns:
        PROMPT, a blank line, the metadata lines, a blank line, and the
        numbered source listing, ending with a newline.
    """
    listing = number_source(source_text)[0]
    pieces = [
        PROMPT,
        "",
        f"GitHub blob URL: {blob.blob_url}",
        f"LLVM revision: {blob.rev}",
        f"File path: {blob.path}",
        f"File title: {blob.title}",
        f"Total lines: {line_count}",
        "",
        "Analyze only the following numbered source file:",
        listing,
        "",  # yields the trailing newline after the listing
    ]
    return "\n".join(pieces)
|
|
|
|
|
|
def analyze_with_codex_cli(
    codex_bin: str,
    blob: GitHubBlobRef,
    source_text: str,
    line_count: int,
    model: str | None,
    reasoning_effort: str | None,
) -> list[Segment]:
    """Run ``codex exec`` on the prompt for ``blob`` and return its segments.

    The prompt is supplied on the child's standard input. SCHEMA is written to
    a temporary directory and passed with ``--output-schema``; the final
    message is read back from the file named by ``--output-last-message``.

    Args:
        codex_bin: Name or path of the Codex CLI executable.
        blob: Reference to the file being analyzed.
        source_text: Contents of that file.
        line_count: Number of lines in ``source_text``.
        model: Value for ``--model``; the flag is omitted when None.
        reasoning_effort: Value for the ``model_reasoning_effort`` config
            override; the override is omitted when None.

    Returns:
        The result of normalize_segments() on the payload's "segments" list.

    Raises:
        RuntimeError: If the executable cannot be found or run, the command
            exits non-zero, no final message is produced, or the message is
            empty, is not valid JSON, or does not have the expected top-level
            shape.
        ValueError: Propagated from normalize_segments() for invalid segments.
    """
    if shutil.which(codex_bin) is None:
        raise RuntimeError(f"Codex CLI executable `{codex_bin}` was not found in PATH.")

    prompt = build_codex_prompt(
        blob=blob, source_text=source_text, line_count=line_count
    )
    child_env = build_codex_env()

    with tempfile.TemporaryDirectory(prefix="semantic-highlighting-codex-") as temp_dir:
        temp_path = Path(temp_dir)
        schema_path = temp_path / "schema.json"
        output_path = temp_path / "last-message.json"
        schema_path.write_text(json.dumps(SCHEMA, indent=2), encoding="utf-8")

        command = [
            codex_bin,
            "exec",
            "--skip-git-repo-check",
            "--ephemeral",
            "--color",
            "never",
            "--sandbox",
            "read-only",
        ]
        if model is not None:
            command += ["--model", model]
        if reasoning_effort is not None:
            # json.dumps wraps the value in double quotes for the config override.
            command += [
                "--config",
                f"model_reasoning_effort={json.dumps(reasoning_effort)}",
            ]
        command += [
            "--output-schema",
            str(schema_path),
            "--output-last-message",
            str(output_path),
            "-",  # presumably "read the prompt from stdin" -- confirm against codex docs
        ]

        try:
            completed = subprocess.run(
                command,
                input=prompt,
                text=True,
                capture_output=True,
                check=True,
                env=child_env,
            )
        except FileNotFoundError as exc:
            # The executable can vanish between the which() check and the run.
            raise RuntimeError(
                f"failed to execute `{codex_bin}`: command not found"
            ) from exc
        except subprocess.CalledProcessError as exc:
            detail = exc.stderr.strip() or exc.stdout.strip() or str(exc)
            raise RuntimeError(f"Codex CLI failed: {detail}") from exc

        if not output_path.exists():
            detail = completed.stderr.strip() or completed.stdout.strip()
            raise RuntimeError(
                "Codex CLI did not produce an output message."
                + (f" Details: {detail}" if detail else "")
            )

        # Read before the temporary directory is removed on leaving the block.
        output_text = output_path.read_text(encoding="utf-8").strip()

    if not output_text:
        raise RuntimeError("Codex CLI returned an empty final message.")

    try:
        payload = json.loads(output_text)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"Codex returned invalid JSON:\n{output_text}") from exc

    # Valid JSON can still be a list, string or number; check the shape
    # explicitly instead of failing on `.get` with an AttributeError.
    if not isinstance(payload, dict):
        raise RuntimeError(
            f"Codex returned JSON that is not an object:\n{output_text}"
        )
    raw_segments = payload.get("segments", [])
    if not isinstance(raw_segments, list):
        raise RuntimeError(
            f"Codex returned a `segments` value that is not a list:\n{output_text}"
        )

    return normalize_segments(raw_segments, line_count)
|
|
|
|
|
|
def build_codex_env() -> dict[str, str]:
    """Return a copy of the environment with proxy variables mirrored to upper case.

    For each name in PROXY_ENV_KEYS that has a non-empty value, the upper-case
    spelling is set to the same value unless that spelling is already present.
    ``os.environ`` itself is not modified.
    """
    child_env = os.environ.copy()
    for lower_name in PROXY_ENV_KEYS:
        setting = child_env.get(lower_name)
        if not setting:
            continue
        # setdefault leaves an existing upper-case variable untouched.
        child_env.setdefault(lower_name.upper(), setting)
    return child_env
|
|
|
|
|
|
def _single_line_nop(line_no: int) -> Segment:
    """Return an empty ``nop`` segment covering exactly ``line_no``."""
    return Segment(
        start_line=line_no,
        end_line=line_no,
        kind="nop",
        summary="",
        depends_on=(),
    )


def normalize_segments(raw_segments: list[dict], line_count: int) -> list[Segment]:
    """Validate raw model segments and return a gap-free, ordered segmentation.

    Each raw item is checked and converted to a Segment. The segments are then
    sorted by ``(start_line, end_line)``; overlaps are rejected, multi-line
    ``nop`` segments are expanded through expand_nop_segment(), and every line
    in ``1..line_count`` not covered by any segment is filled with a
    single-line ``nop``.

    Args:
        raw_segments: Decoded JSON objects with "kind", "start_line",
            "end_line" and optionally "summary" and "depends_on".
        line_count: Number of lines in the analyzed file.

    Returns:
        Segments in ascending order that together cover lines 1 through
        ``line_count`` with no gaps or overlaps.

    Raises:
        ValueError: If an item is not an object, lacks a required field, has
            an unknown kind, has a range (its own or in ``depends_on``) outside
            ``1..line_count`` or with its end before its start, or overlaps
            another segment.
    """
    normalized: list[Segment] = []
    for index, item in enumerate(raw_segments):
        # Report malformed items as ValueError with context rather than
        # letting a bare KeyError/TypeError surface to the user.
        if not isinstance(item, dict):
            raise ValueError(f"segment #{index} is not a JSON object: {item!r}")
        missing = [key for key in ("kind", "start_line", "end_line") if key not in item]
        if missing:
            raise ValueError(
                f"segment #{index} is missing required field(s): {', '.join(missing)}"
            )

        kind = item["kind"]
        if kind not in {"nop", "condition", "resolution"}:
            raise ValueError(f"unknown segment kind: {kind}")

        start_line = int(item["start_line"])
        end_line = int(item["end_line"])
        if start_line < 1 or end_line < start_line or end_line > line_count:
            raise ValueError(
                f"invalid segment range {start_line}-{end_line} for file with {line_count} lines"
            )

        # depends_on ranges are rendered as #L anchors, so they get the same
        # bounds check as the segment itself.
        dep_bounds: list[tuple[int, int]] = []
        for dep in item.get("depends_on") or []:
            try:
                dep_start = int(dep["start_line"])
                dep_end = int(dep["end_line"])
            except (KeyError, TypeError) as exc:
                raise ValueError(
                    f"malformed depends_on entry {dep!r} in segment {start_line}-{end_line}"
                ) from exc
            if dep_start < 1 or dep_end < dep_start or dep_end > line_count:
                raise ValueError(
                    f"invalid depends_on range {dep_start}-{dep_end} in segment "
                    f"{start_line}-{end_line} for file with {line_count} lines"
                )
            dep_bounds.append((dep_start, dep_end))

        normalized.append(
            Segment(
                start_line=start_line,
                end_line=end_line,
                kind=kind,
                # A null summary is treated the same as a missing one.
                summary=str(item.get("summary") or "").strip(),
                depends_on=tuple(
                    LineRange(start_line=first, end_line=last)
                    for first, last in dep_bounds
                ),
            )
        )

    normalized.sort(key=lambda segment: (segment.start_line, segment.end_line))

    stitched: list[Segment] = []
    next_line = 1  # first line not yet covered by `stitched`
    for segment in normalized:
        if segment.start_line < next_line:
            raise ValueError(
                f"overlapping segments around line {segment.start_line}: model output is invalid"
            )
        # Fill any gap before this segment with single-line nops.
        stitched.extend(
            _single_line_nop(line_no)
            for line_no in range(next_line, segment.start_line)
        )
        stitched.extend(expand_nop_segment(segment))
        next_line = segment.end_line + 1

    # Fill whatever remains after the last segment.
    stitched.extend(
        _single_line_nop(line_no) for line_no in range(next_line, line_count + 1)
    )
    return stitched
|
|
|
|
|
|
def expand_nop_segment(segment: Segment) -> list[Segment]:
    """Split a multi-line ``nop`` segment into one segment per line.

    Any other segment (a non-``nop`` kind, or a ``nop`` covering a single
    line) is returned unchanged as the only element of the list.
    """
    is_multiline_nop = (
        segment.kind == "nop" and segment.start_line != segment.end_line
    )
    if not is_multiline_nop:
        return [segment]

    per_line: list[Segment] = []
    for line_no in range(segment.start_line, segment.end_line + 1):
        per_line.append(
            Segment(
                start_line=line_no,
                end_line=line_no,
                kind="nop",
                summary="",
                depends_on=(),
            )
        )
    return per_line
|
|
|
|
|
|
def blob_anchor_url(blob: GitHubBlobRef, line_range: LineRange) -> str:
    """Return ``blob.blob_url`` with a line-anchor fragment for ``line_range``.

    The fragment is ``#L<n>`` when the range is a single line and
    ``#L<start>-L<end>`` otherwise.
    """
    first, last = line_range.start_line, line_range.end_line
    fragment = f"L{first}" if first == last else f"L{first}-L{last}"
    return f"{blob.blob_url}#{fragment}"
|
|
|
|
|
|
def format_line_range(start_line: int, end_line: int) -> str:
    """Return ``"Line <n>"`` for a single line, else ``"Line <start>-<end>"``."""
    span = f"{start_line}" if start_line == end_line else f"{start_line}-{end_line}"
    return f"Line {span}"
|
|
|
|
|
|
def format_loc_reference(blob: GitHubBlobRef, line_range: LineRange) -> str:
    """Return a markdown link labelled ``LoC <range>`` pointing at the blob lines.

    The link target comes from blob_anchor_url(); the label shows one number
    for a single-line range and ``<start>-<end>`` otherwise.
    """
    first, last = line_range.start_line, line_range.end_line
    if first == last:
        label = f"LoC {first}"
    else:
        label = f"LoC {first}-{last}"
    target = blob_anchor_url(blob, line_range)
    return f"[{label}]({target})"
|
|
|
|
|
|
def render_segment_summary(blob: GitHubBlobRef, segment: Segment) -> str:
    """Return the body text to print under a segment heading.

    ``nop`` segments and segments with a blank summary yield ``""``. A
    ``resolution`` with dependencies is prefixed with
    ``"By condition(s) at <links>, "`` followed by the decapitalized summary.
    Every other segment yields its stripped summary unchanged.
    """
    text = segment.summary.strip()
    if not text or segment.kind == "nop":
        return ""
    if segment.kind != "resolution" or not segment.depends_on:
        return text

    links = [format_loc_reference(blob, dep) for dep in segment.depends_on]
    noun = "condition" if len(links) == 1 else "conditions"
    return f"By {noun} at {', '.join(links)}, " + decapitalize_summary(text)
|
|
|
|
|
|
def decapitalize_summary(summary: str) -> str:
    """Lower-case the first letter of ``summary`` when it starts an ordinary word.

    The first character is changed only if it is upper case and the second
    character is lower case, so text that opens with consecutive capitals is
    returned as-is, as are strings shorter than two characters.
    """
    starts_ordinary_word = (
        len(summary) >= 2 and summary[0].isupper() and summary[1].islower()
    )
    if not starts_ordinary_word:
        return summary
    return summary[0].lower() + summary[1:]
|
|
|
|
|
|
def render_markdown(blob: GitHubBlobRef, segments: list[Segment]) -> str:
    """Render the analysis as a markdown document.

    The document opens with a revision bullet and a ``#`` title, then has one
    ``##`` heading per segment giving its line range and kind, followed by the
    segment's summary when there is one. The result ends with exactly one
    newline.
    """
    out = [f"- LLVMRevHash: `{blob.rev}`", f"# {blob.title}", ""]
    for segment in segments:
        heading = (
            f"## {format_line_range(segment.start_line, segment.end_line)}"
            f" (kind: {segment.kind})"
        )
        body = render_segment_summary(blob, segment)
        # Each section ends with a blank separator line.
        out.extend([heading, body, ""] if body else [heading, ""])
    return "\n".join(out).rstrip() + "\n"
|
|
|
|
|
|
def main() -> int:
    """Run the analyzer end to end.

    Parses the command line, fetches the source file, asks Codex for a
    segmentation, renders it as markdown, and writes the result to
    ``--output`` when given or to standard output otherwise.

    Returns:
        0 on success; 1 if any step fails, after printing ``Error: <message>``
        to standard error.
    """
    args = parse_args()

    try:
        blob = parse_github_blob_url(args.url)
        source_text = fetch_text(blob.raw_url, timeout=args.timeout)
        _, line_count = number_source(source_text)
        segments = analyze_with_codex_cli(
            codex_bin=args.codex_bin,
            blob=blob,
            source_text=source_text,
            line_count=line_count,
            model=args.model,
            reasoning_effort=args.reasoning_effort,
        )
        markdown = render_markdown(blob, segments)
    except Exception as exc:
        # Top-level boundary: report any failure as one line, not a traceback.
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    try:
        if args.output:
            args.output.write_text(markdown, encoding="utf-8")
        else:
            sys.stdout.write(markdown)
    except OSError as exc:
        # An unwritable destination gets the same reporting as other failures.
        print(f"Error: {exc}", file=sys.stderr)
        return 1
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
raise SystemExit(main())
|