[CI] Remove buildkite from metrics container (#143049)

Now that buildkite has been sunsetted, remove buildkite tracking from
the metrics container as it does not do anything.
This commit is contained in:
Aiden Grossman
2025-06-06 19:58:57 +00:00
committed by GitHub
parent 16dda4d3f4
commit 34e5d8ef16

View File

@@ -1,12 +1,9 @@
import collections import collections
import datetime import datetime
import dateutil
import github import github
import json
import logging import logging
import os import os
import requests import requests
import sys
import time import time
from dataclasses import dataclass from dataclasses import dataclass
@@ -55,18 +52,6 @@ GITHUB_WORKFLOW_MAX_CREATED_AGE_HOURS = 8
# by trial and error). # by trial and error).
GRAFANA_METRIC_MAX_AGE_MN = 120 GRAFANA_METRIC_MAX_AGE_MN = 120
# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
# the metric name in Grafana. Keeping the metric name fixed is important so
# that metrics history is not lost if the workflow name changes.
BUILDKITE_WORKFLOW_TO_TRACK = {
":linux: Linux x64": "buildkite_linux",
":windows: Windows x64": "buildkite_windows",
}
# Number of builds to fetch per page. Since we scrape regularly, this can
# remain small.
BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
@dataclass @dataclass
class JobMetrics: class JobMetrics:
@@ -86,181 +71,6 @@ class GaugeMetric:
time_ns: int time_ns: int
def buildkite_fetch_page_build_list(
    buildkite_token: str, after_cursor: str = None
) -> list[dict[str, str]]:
    """Fetches a page of the build list using the GraphQL BuildKite API.

    Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
    or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds
    older than the one pointed to by |after_cursor| if provided.
    The |after_cursor| value is taken from the previous page returned by the
    API.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      after_cursor: cursor after which to start the page fetch. None fetches
        the first page.

    Returns:
      The most recent builds after cursor (if set) with the following format:
      [
        {
            "cursor": <value>,
            "number": <build-number>,
        }
      ]
      An empty list is returned if the API reports errors or if the pipeline
      search matches nothing.

    Raises:
      requests.exceptions.RequestException: if the HTTP request fails or
        times out.
    """
    # Doubled braces are literal braces; single-brace fields are filled in by
    # str.format() below.
    BUILDKITE_GRAPHQL_QUERY = """
    query OrganizationShowQuery {{
      organization(slug: "llvm-project") {{
        pipelines(search: "Github pull requests", first: 1) {{
          edges {{
            node {{
              builds (state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
                edges {{
                  cursor
                  node {{
                    number
                  }}
                }}
              }}
            }}
          }}
        }}
      }}
    }}
    """
    # Bound the request so that a stalled connection cannot block the scrape
    # loop forever.
    REQUEST_TIMEOUT_SECONDS = 60

    query = BUILDKITE_GRAPHQL_QUERY.format(
        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
        # json.dumps() quotes *and* escapes the cursor, so an unexpected
        # character cannot break out of the string literal in the query.
        AFTER="null" if after_cursor is None else json.dumps(after_cursor),
    )
    query = json.dumps({"query": query})
    url = "https://graphql.buildkite.com/v1"
    headers = {
        "Authorization": "Bearer " + buildkite_token,
        "Content-Type": "application/json",
    }
    data = requests.post(
        url, data=query, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
    ).json()

    if "errors" in data:
        logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
        return []

    # De-nest the build list. The pipeline search may match nothing, in which
    # case there is no first edge to index into.
    pipelines = data["data"]["organization"]["pipelines"]["edges"]
    if not pipelines:
        logging.info("Failed to fetch BuildKite jobs: pipeline not found")
        return []
    builds = pipelines[0]["node"]["builds"]["edges"]

    # Fold cursor info into the node dictionary.
    return [{**x["node"], "cursor": x["cursor"]} for x in builds]
def buildkite_get_build_info(build_number: str) -> dict:
    """Fetches the JSON description of a single BuildKite build.

    Note: the per-build JSON endpoint is queried instead of GraphQL because,
    for unknown reasons, GraphQL returns no jobs for a given build while this
    endpoint does.

    Args:
      build_number: which build number to fetch info for.

    Returns:
      The info for the target build, as a decoded JSON dictionary.
    """
    url_template = (
        "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
    )
    response = requests.get(url_template.format(build_number))
    return response.json()
def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
    """Collects all the running/pending BuildKite builds, page by page.

    Pages are requested one after another, each one starting after the cursor
    of the last build of the previous page, until an empty page is returned.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.

    Returns:
      The builds of every fetched page, concatenated in fetch order.
    """
    builds = []
    next_cursor = None
    while page := buildkite_fetch_page_build_list(buildkite_token, next_cursor):
        builds.extend(page)
        # Resume the next fetch right after the last build seen so far.
        next_cursor = page[-1]["cursor"]
    return builds
def buildkite_get_metrics(
    buildkite_token: str, previously_incomplete: set[int]
) -> tuple[list[JobMetrics], set[int]]:
    """Returns a tuple with:

    - the metrics recorded for newly completed workflow jobs.
    - the set of workflow still running now.

    A build is considered newly completed when it was in
    |previously_incomplete| but is no longer reported as running/pending.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      previously_incomplete: the set of running workflows the last time this
        function was called.

    Returns:
      A (metrics, incomplete_now) tuple. |metrics| holds one JobMetrics per
      tracked, non-canceled job of each newly completed build; |incomplete_now|
      is the set of build numbers to pass as |previously_incomplete| on the
      next call.
    """
    running_builds = buildkite_get_incomplete_tasks(buildkite_token)
    incomplete_now = {x["number"] for x in running_builds}
    output = []

    for build_id in previously_incomplete:
        if build_id in incomplete_now:
            continue

        info = buildkite_get_build_info(build_id)
        # A build can drop out of the running/pending list without carrying a
        # finish timestamp; there is nothing to parse in that case.
        if not info.get("finished_at"):
            logging.warning(
                f"Workflow {build_id} has no finish timestamp, skipping."
            )
            continue
        metric_timestamp = dateutil.parser.isoparse(info["finished_at"])

        for job in info["jobs"]:
            # This workflow is not interesting to us.
            if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
                continue

            # Don't count canceled jobs.
            if job["canceled_at"]:
                continue

            # Jobs with a missing timestamp cannot yield queue/run durations.
            if not (
                job["scheduled_at"] and job["started_at"] and job["finished_at"]
            ):
                continue

            scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
            started_at = dateutil.parser.isoparse(job["started_at"])
            finished_at = dateutil.parser.isoparse(job["finished_at"])

            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
            # total_seconds() covers the whole duration; timedelta.seconds
            # only holds the sub-day remainder and wraps after 24 hours.
            queue_time = int((started_at - scheduled_at).total_seconds())
            run_time = int((finished_at - started_at).total_seconds())
            status = bool(job["passed"])

            # Grafana will refuse to ingest metrics older than ~2 hours, so we
            # should avoid sending historical data.
            metric_age_mn = (
                datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
            ).total_seconds() / 60
            if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
                logging.warning(
                    f"Job {job['name']} from workflow {build_id} dropped due"
                    + f" to staleness: {metric_age_mn}mn old."
                )
                continue

            metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
            workflow_id = build_id
            workflow_name = "Github pull requests"
            output.append(
                JobMetrics(
                    job_name,
                    queue_time,
                    run_time,
                    status,
                    metric_timestamp_ns,
                    workflow_id,
                    workflow_name,
                )
            )

    return output, incomplete_now
def github_get_metrics( def github_get_metrics(
github_repo: github.Repository, last_workflows_seen_as_completed: set[int] github_repo: github.Repository, last_workflows_seen_as_completed: set[int]
) -> tuple[list[JobMetrics], int]: ) -> tuple[list[JobMetrics], int]:
@@ -478,7 +288,6 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
def main(): def main():
# Authenticate with Github # Authenticate with Github
github_auth = Auth.Token(os.environ["GITHUB_TOKEN"]) github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
buildkite_token = os.environ["BUILDKITE_TOKEN"]
grafana_api_key = os.environ["GRAFANA_API_KEY"] grafana_api_key = os.environ["GRAFANA_API_KEY"]
grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"] grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
@@ -486,9 +295,6 @@ def main():
# Because the Github queries are broken, we'll simply log a 'processed' # Because the Github queries are broken, we'll simply log a 'processed'
# bit for the last COUNT_TO_PROCESS workflows. # bit for the last COUNT_TO_PROCESS workflows.
gh_last_workflows_seen_as_completed = set() gh_last_workflows_seen_as_completed = set()
# Stores the list of pending/running builds in BuildKite we need to check
# at the next iteration.
bk_incomplete = set()
# Enter the main loop. Every five minutes we wake up and dump metrics for # Enter the main loop. Every five minutes we wake up and dump metrics for
# the relevant jobs. # the relevant jobs.
@@ -500,13 +306,8 @@ def main():
github_repo, gh_last_workflows_seen_as_completed github_repo, gh_last_workflows_seen_as_completed
) )
bk_metrics, bk_incomplete = buildkite_get_metrics( upload_metrics(gh_metrics, grafana_metrics_userid, grafana_api_key)
buildkite_token, bk_incomplete logging.info(f"Uploaded {len(gh_metrics)} metrics")
)
metrics = gh_metrics + bk_metrics
upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
logging.info(f"Uploaded {len(metrics)} metrics")
time.sleep(SCRAPE_INTERVAL_SECONDS) time.sleep(SCRAPE_INTERVAL_SECONDS)