Despite the error message for preempted jobs containing the word
"cancelled", these are considered workflow "failures" by GitHub.
This is important, because if we fail to distinguish between "failed"
and "cancelled" jobs, the restarter will fight to restart jobs a user
intentionally cancelled (either by pressing the "cancel" button, or by
pushing an update to a PR).
This reverts commit 3ea7fc7339. This also
reverts earlier attempts to solve this problem by matching the messages
to detect manual cancellations.
This change also removes ldionne's test workflow, as it's hard to
keep correctly in sync.
This change does not attempt to address the maintainability or
testability of this script, which continues to be an issue. If asked to
address these issues, my plan is to write the script in Python (which
most people are more familiar with), and turn this action into a "docker
action" using a container with the python action and dependencies built
into it. Let me know if that's a direction we're interested in heading.
159 lines
7.0 KiB
YAML
159 lines
7.0 KiB
YAML
name: Restart Preempted Libc++ Workflow
|
|
|
|
# The libc++ builders run on preemptable VMs, which can be shut down at any time.
|
|
# This workflow identifies when a workflow run was canceled due to the VM being preempted,
|
|
# and restarts the workflow run.
|
|
|
|
# We identify a canceled workflow run by checking the annotations of the check runs in the check suite,
|
|
# which should contain the message "The runner has received a shutdown signal."
|
|
|
|
# Note: If a failed job reports a non-preemption failure and no preemption message has been seen so far, we do not restart the workflow.
|
|
|
|
on:
|
|
workflow_run:
|
|
workflows: [Build and Test libc\+\+]
|
|
types:
|
|
- completed
|
|
|
|
permissions:
|
|
contents: read
|
|
|
|
jobs:
|
|
restart:
|
|
if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure')
|
|
name: "Restart Job"
|
|
permissions:
|
|
statuses: read
|
|
checks: write
|
|
actions: write
|
|
runs-on: ubuntu-24.04
|
|
steps:
|
|
- name: "Restart Job"
|
|
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
|
|
with:
|
|
script: |
|
|
// Annotation messages used to classify a failed check run:
//  - failure_regex matches the "Process completed with exit code 1" message
//    (followed by any one character), which indicates a genuine failure.
//  - preemption_regex matches the messages reported when a runner is
//    preempted: "The runner has received a shutdown signal" or
//    "The operation was canceled".
// GitHub reports preempted jobs as failures rather than cancellations, so
// only failed check runs need to be classified with these patterns.
const failure_regex = /(Process completed with exit code 1.)/
const preemption_regex = /(The runner has received a shutdown signal)|(The operation was canceled)/

// The workflow run whose completion triggered this script.
const wf_run = context.payload.workflow_run

// Record which run is being examined so the notice links back to it.
core.notice([
  `Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})`,
  `Workflow run URL: ${wf_run.html_url}`
].join('\n'))
|
|
|
|
|
|
/**
 * Publish a completed check run on the head commit of the triggering
 * workflow run, recording whether this script restarted the workflow.
 *
 * If `conclusion` is not one of the supported values the step is marked as
 * failed through `core.setFailed` and no check run is created.
 *
 * @param {string} conclusion - One of 'success', 'skipped' or 'neutral'.
 * @param {string} message - Text used as the summary of the check run output.
 * @returns {Promise<void>} Resolves once the check run has been created.
 */
async function create_check_run(conclusion, message) {
  const allowed_conclusions = ['success', 'skipped', 'neutral']
  if (!allowed_conclusions.includes(conclusion)) {
    // Stop here: publishing a check run with a conclusion this script never
    // intends to report would contradict the failure just recorded.
    core.setFailed('Invalid conclusion: ' + conclusion)
    return
  }

  await github.rest.checks.create({
    owner: context.repo.owner,
    repo: context.repo.repo,
    name: 'Restart Preempted Job',
    head_sha: wf_run.head_sha,
    status: 'completed',
    conclusion: conclusion,
    output: {
      title: 'Restarted Preempted Job',
      summary: message
    }
  })
}
|
|
|
|
console.log('Listing check runs for suite')
|
|
const check_suites = await github.rest.checks.listForSuite({
|
|
owner: context.repo.owner,
|
|
repo: context.repo.repo,
|
|
check_suite_id: context.payload.workflow_run.check_suite_id,
|
|
per_page: 100 // FIXME: We don't have 100 check runs yet, but we should handle this better.
|
|
})
|
|
|
|
check_run_ids = [];
|
|
for (check_run of check_suites.data.check_runs) {
|
|
console.log('Checking check run: ' + check_run.id);
|
|
if (check_run.status != 'completed') {
|
|
console.log('Check run was not completed. Skipping.');
|
|
continue;
|
|
}
|
|
if (check_run.conclusion != 'failure') {
|
|
console.log('Check run had conclusion: ' + check_run.conclusion + '. Skipping.');
|
|
continue;
|
|
}
|
|
check_run_ids.push(check_run.id);
|
|
}
|
|
|
|
has_preempted_job = false;
|
|
|
|
for (check_run_id of check_run_ids) {
|
|
console.log('Listing annotations for check run: ' + check_run_id);
|
|
|
|
annotations = await github.rest.checks.listAnnotations({
|
|
owner: context.repo.owner,
|
|
repo: context.repo.repo,
|
|
check_run_id: check_run_id
|
|
})
|
|
|
|
// For temporary debugging purposes to see the structure of the annotations.
|
|
console.log(annotations);
|
|
|
|
has_failed_job = false;
|
|
saved_failure_message = null;
|
|
|
|
for (annotation of annotations.data) {
|
|
if (annotation.annotation_level != 'failure') {
|
|
continue;
|
|
}
|
|
|
|
const preemption_match = annotation.message.match(preemption_regex);
|
|
|
|
if (preemption_match != null) {
|
|
console.log('Found preemption message: ' + annotation.message);
|
|
has_preempted_job = true;
|
|
}
|
|
|
|
const failure_match = annotation.message.match(failure_regex);
|
|
if (failure_match != null) {
|
|
has_failed_job = true;
|
|
saved_failure_message = annotation.message;
|
|
}
|
|
}
|
|
if (has_failed_job && (! has_preempted_job)) {
|
|
// We only want to restart the workflow if all of the failures were due to preemption.
|
|
// We don't want to restart the workflow if there were other failures.
|
|
//
|
|
// However, libcxx runners running inside docker containers produce both a preemption message and failure message.
|
|
//
|
|
// The desired approach is to ignore failure messages which appear on the same job as a preemption message
|
|
// (An job is a single run with a specific configuration, ex generic-gcc, gcc-14).
|
|
//
|
|
// However, it's unclear that this code achieves the desired approach, and it may ignore all failures
|
|
// if a preemption message is found at all on any run.
|
|
//
|
|
// For now, it's more important to restart preempted workflows than to avoid restarting workflows with
|
|
// non-preemption failures.
|
|
//
|
|
// TODO Figure this out.
|
|
core.notice('Choosing not to rerun workflow because we found a non-preemption failure' +
|
|
'Failure message: "' + saved_failure_message + '"');
|
|
await create_check_run('skipped', 'Choosing not to rerun workflow because we found a non-preemption failure\n'
|
|
+ 'Failure message: ' + saved_failure_message)
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (!has_preempted_job) {
|
|
core.notice('No preempted jobs found. Not restarting workflow.');
|
|
await create_check_run('neutral', 'No preempted jobs found. Not restarting workflow.')
|
|
return;
|
|
}
|
|
|
|
core.notice("Restarted workflow: " + context.payload.workflow_run.id);
|
|
await github.rest.actions.reRunWorkflowFailedJobs({
|
|
owner: context.repo.owner,
|
|
repo: context.repo.repo,
|
|
run_id: context.payload.workflow_run.id
|
|
})
|
|
await create_check_run('success', 'Restarted workflow run due to preempted job')
|