[CI] Remove buildkite from metrics container (#143049)

Now that BuildKite has been sunset, remove BuildKite tracking from
the metrics container, since it no longer does anything.
This commit is contained in:
Aiden Grossman
2025-06-06 19:58:57 +00:00
committed by GitHub
parent 16dda4d3f4
commit 34e5d8ef16

View File

@@ -1,12 +1,9 @@
import collections
import datetime
import dateutil
import github
import json
import logging
import os
import requests
import sys
import time
from dataclasses import dataclass
@@ -55,18 +52,6 @@ GITHUB_WORKFLOW_MAX_CREATED_AGE_HOURS = 8
# by trial and error).
GRAFANA_METRIC_MAX_AGE_MN = 120
# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
# the metric name in Grafana. Keeping the metric name separate from the job
# name ensures the metrics history is not lost if the workflow name changes.
BUILDKITE_WORKFLOW_TO_TRACK = {
":linux: Linux x64": "buildkite_linux",
":windows: Windows x64": "buildkite_windows",
}
# Number of builds to fetch per page. Since we scrape regularly, this can
# remain small.
BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
@dataclass
class JobMetrics:
@@ -86,181 +71,6 @@ class GaugeMetric:
time_ns: int
def buildkite_fetch_page_build_list(
    buildkite_token: str, after_cursor: str = None
) -> list[dict[str, str]]:
    """Fetches a page of the build list using the GraphQL BuildKite API.

    Requests up to BUILDKITE_GRAPHQL_BUILDS_PER_PAGE builds of the
    "Github pull requests" pipeline that are in the CANCELING, CREATING,
    FAILING or RUNNING state. If |after_cursor| is provided, the page starts
    after the build pointed to by that cursor. The |after_cursor| value is
    taken from the previous page returned by the API.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      after_cursor: cursor after which to start the page fetch, or None to
        fetch the first page.

    Returns:
      The builds after the cursor (if set) with the following format:
      [
        {
          "cursor": <value>,
          "number": <build-number>,
        }
      ]
      An empty list is returned when the API reports errors or when the
      pipeline search yields no result.

    Raises:
      requests.exceptions.Timeout: if the API does not answer in time.
    """
    BUILDKITE_GRAPHQL_QUERY = """
query OrganizationShowQuery {{
organization(slug: "llvm-project") {{
pipelines(search: "Github pull requests", first: 1) {{
edges {{
node {{
builds (state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
edges {{
cursor
node {{
number
}}
}}
}}
}}
}}
}}
}}
}}
"""
    query = BUILDKITE_GRAPHQL_QUERY.format(
        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
        # GraphQL expects either the literal null or a quoted string.
        AFTER="null" if after_cursor is None else f'"{after_cursor}"',
    )
    payload = json.dumps({"query": query})
    url = "https://graphql.buildkite.com/v1"
    headers = {
        "Authorization": "Bearer " + buildkite_token,
        "Content-Type": "application/json",
    }
    # Without a timeout, requests waits forever on a stalled connection,
    # which would freeze the whole scraping loop.
    data = requests.post(url, data=payload, headers=headers, timeout=30).json()

    if "errors" in data:
        logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
        return []

    # De-nest the build list. The pipeline search may match nothing, in which
    # case there is no first edge to read the builds from.
    pipelines = data["data"]["organization"]["pipelines"]["edges"]
    if not pipelines:
        logging.info("Failed to fetch BuildKite jobs: no matching pipeline")
        return []
    builds = pipelines[0]["node"]["builds"]["edges"]

    # Fold cursor info into the node dictionary.
    return [{**x["node"], "cursor": x["cursor"]} for x in builds]
def buildkite_get_build_info(build_number: str) -> dict:
    """Returns all the info associated with the provided build number.

    Note: for unknown reasons, graphql returns no jobs for a given build,
    while this endpoint does, hence why this uses this API instead of graphql.

    Args:
      build_number: which build number to fetch info for. The value is
        interpolated into the URL as-is.

    Returns:
      The info for the target build, a JSON dictionary.

    Raises:
      requests.exceptions.Timeout: if the server does not answer in time.
    """
    URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
    # Without a timeout, requests waits forever on a stalled connection,
    # which would freeze the whole scraping loop.
    response = requests.get(URL.format(build_number), timeout=30)
    return response.json()
def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
    """Returns all the running/pending BuildKite builds.

    Pages through buildkite_fetch_page_build_list, feeding the cursor of the
    last build of each page into the next request, until an empty page is
    returned.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.

    Returns:
      The concatenation of every fetched page, in fetch order. Each entry is
      a dictionary as returned by buildkite_fetch_page_build_list.
    """
    output = []
    cursor = None
    # An empty page marks the end of the list (it is also what the page
    # fetcher returns when the API reports an error).
    while page := buildkite_fetch_page_build_list(buildkite_token, cursor):
        output.extend(page)
        cursor = page[-1]["cursor"]
    return output
def buildkite_get_metrics(
    buildkite_token: str, previously_incomplete: set[int]
) -> tuple[list[JobMetrics], set[int]]:
    """Collects metrics for the BuildKite builds that completed since last call.

    A build is considered newly completed when it was in
    |previously_incomplete| but is no longer reported by
    buildkite_get_incomplete_tasks.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      previously_incomplete: the set of running workflows the last time this
        function was called.

    Returns:
      A tuple with:
        - the metrics recorded for newly completed workflow jobs.
        - the set of workflows still running now.
    """
    running_builds = buildkite_get_incomplete_tasks(buildkite_token)
    incomplete_now = {x["number"] for x in running_builds}
    output = []

    for build_id in previously_incomplete:
        # Still running: it will be looked at again on a later call.
        if build_id in incomplete_now:
            continue

        info = buildkite_get_build_info(build_id)
        # NOTE(review): assumes "finished_at" is always set for a build that
        # left the incomplete list -- confirm against the BuildKite API.
        metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
        metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
        # All the jobs of a build share the build's timestamp, so the age is
        # computed once per build.
        metric_age_mn = (
            datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
        ).total_seconds() / 60

        for job in info["jobs"]:
            # This workflow is not interesting to us.
            if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
                continue

            # Don't count canceled jobs.
            if job["canceled_at"]:
                continue

            scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
            started_at = dateutil.parser.isoparse(job["started_at"])
            finished_at = dateutil.parser.isoparse(job["finished_at"])

            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
            # timedelta.seconds only holds the sub-day remainder, so a
            # duration of 24h or more would wrap around; total_seconds()
            # covers the whole interval.
            queue_time = int((started_at - scheduled_at).total_seconds())
            run_time = int((finished_at - started_at).total_seconds())
            status = bool(job["passed"])

            # Grafana will refuse to ingest metrics older than ~2 hours, so we
            # should avoid sending historical data.
            if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
                logging.warning(
                    f"Job {job['name']} from workflow {build_id} dropped due"
                    + f" to staleness: {metric_age_mn}mn old."
                )
                continue

            workflow_id = build_id
            workflow_name = "Github pull requests"
            output.append(
                JobMetrics(
                    job_name,
                    queue_time,
                    run_time,
                    status,
                    metric_timestamp_ns,
                    workflow_id,
                    workflow_name,
                )
            )

    return output, incomplete_now
def github_get_metrics(
github_repo: github.Repository, last_workflows_seen_as_completed: set[int]
) -> tuple[list[JobMetrics], int]:
@@ -478,7 +288,6 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
def main():
# Authenticate with Github
github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
buildkite_token = os.environ["BUILDKITE_TOKEN"]
grafana_api_key = os.environ["GRAFANA_API_KEY"]
grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
@@ -486,9 +295,6 @@ def main():
# Because the Github queries are broken, we'll simply log a 'processed'
# bit for the last COUNT_TO_PROCESS workflows.
gh_last_workflows_seen_as_completed = set()
# Stores the list of pending/running builds in BuildKite we need to check
# at the next iteration.
bk_incomplete = set()
# Enter the main loop. Every five minutes we wake up and dump metrics for
# the relevant jobs.
@@ -500,13 +306,8 @@ def main():
github_repo, gh_last_workflows_seen_as_completed
)
bk_metrics, bk_incomplete = buildkite_get_metrics(
buildkite_token, bk_incomplete
)
metrics = gh_metrics + bk_metrics
upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
logging.info(f"Uploaded {len(metrics)} metrics")
upload_metrics(gh_metrics, grafana_metrics_userid, grafana_api_key)
logging.info(f"Uploaded {len(gh_metrics)} metrics")
time.sleep(SCRAPE_INTERVAL_SECONDS)