blob: c9035d2401f83368f909b10b58871ac86ab0b736 [file] [log] [blame]
#!/bin/bash
# Trace every command and stop at the first unhandled failure.
set -o xtrace -o errexit

# Non-zero routes job submission through SQUAD (see submit_via_lava_or_squad).
USE_SQUAD=0
# Non-zero sends FVP jobs to TuxSuite; honours a value preset in the
# environment and defaults to 0 otherwise.
: "${USE_TUXSUITE_FVP:=0}"
# Print the LAVA device type declared in a job definition file.
#
# Arguments: $1 - path to the LAVA job definition
# Outputs:   for every line starting with "device_type:", its second
#            whitespace-separated field, on stdout
# Returns:   the exit status of awk
get_lava_device_type() {
    local job_file=$1

    # The path is quoted so that spaces or glob characters in it are passed to
    # awk verbatim instead of being split or expanded by the shell.
    awk '/^device_type:/ {print $2}' "${job_file}"
}
# Create the TuxSuite client configuration of the current user.
#
# Globals: TUXSUITE_TOKEN, TUXSUITE_GROUP, TUXSUITE_PROJECT (read)
# Outputs: (re)writes ~/.config/tuxsuite/config.ini with a [default] section
#          holding the three values above
setup_tuxsuite() {
    local config_dir
    config_dir=~/.config/tuxsuite

    mkdir -p "${config_dir}"
    cat <<EOF > "${config_dir}/config.ini"
[default]
token=$TUXSUITE_TOKEN
group=$TUXSUITE_GROUP
project=$TUXSUITE_PROJECT
EOF
}
# Wait for a LAVA job to finish, polling its state at a fixed interval.
#
# Globals:   WORKSPACE (read) - the latest "lavacli jobs show" output is kept
#            in ${WORKSPACE}/lava-progress.show
# Arguments: $1 - LAVA job ID
#            $2 - timeout in seconds (default 5400, i.e. 1.5 hours)
#            $3 - polling interval in seconds (default 60)
# Outputs:   the job state line on every poll and the last full job
#            description before returning, on stdout
# Returns:   0 once the job state is "Finished", 1 on timeout
# Note:      xtrace is switched off while polling and is left switched on when
#            the function returns. Relies on resilient_cmd from this file.
wait_lava_job() {
    set +x
    local id=$1
    local timeout="${2:-5400}"
    local interval="${3:-60}"
    local progress_file="${WORKSPACE}/lava-progress.show"
    # The countdown is local so that it cannot clobber a caller's variable.
    local t=$(( timeout ))

    while (( t > 0 )); do
        sleep "${interval}"
        # Only the state line is shown as progress; a missing state line is
        # not an error by itself, the check below decides.
        resilient_cmd lavacli jobs show "${id}" | tee "${progress_file}" | grep 'state *:' || true
        if grep 'state.*: Finished' "${progress_file}"; then
            set -x
            cat "${progress_file}"
            # finished
            return 0
        fi
        # Plain assignment: unlike (( t -= interval )) it never yields a
        # non-zero status when the counter reaches 0.
        t=$(( t - interval ))
    done

    set -x
    cat "${progress_file}"
    echo "Timeout waiting for job to finish"
    # timeout
    return 1
}
# Run the command given as arguments; if it fails, retry it with an
# exponential backoff until it succeeds or the attempts are used up.
#
# Arguments: $@ - command and its arguments
# Outputs:   warnings and the final error message on stderr
# Returns:   0 as soon as one attempt succeeds, 1 after 10 failed attempts
# Note:      the pause between attempts starts at 2 seconds, doubles after
#            every failure and is capped at 60 seconds. xtrace is switched off
#            while running and is left switched on at return.
resilient_cmd() {
    set +x
    local max_retries=10
    local sleep_body=2
    local iter=0

    until "$@"; do
        iter=$(( iter + 1 ))
        # Give up right after the last failed attempt: pausing first would only
        # delay the error report.
        if [ "${iter}" -ge "${max_retries}" ]; then
            echo "ERROR: Command '$*' failed ${iter} times in row" 1>&2
            set -x
            return 1
        fi
        sleep "${sleep_body}"
        # Exponential backoff
        sleep_body=$(( sleep_body * 2 ))
        if [ "${sleep_body}" -ge 60 ]; then
            sleep_body=60
            echo "WARNING: Command '$*' still not successful on retry #$(( iter - 1 )), exp backoff already limited" 1>&2
        fi
    done

    set -x
    return 0
}
# Show the workspace contents in the build log.
ls -l "${WORKSPACE}"

DEVICE=$(get_lava_device_type artefacts-lava/job.yaml)

# FVP jobs go to TuxSuite when USE_TUXSUITE_FVP is non-zero. This branch always
# ends the script with the TuxSuite result and never falls through to LAVA.
if [ "${DEVICE}" == "fvp" ] && [ "${USE_TUXSUITE_FVP}" -ne 0 ]; then
    setup_tuxsuite
    # With pipefail a failing "tuxsuite test submit" makes the whole pipeline
    # through tee fail, so the retry loop below can react to it.
    set -o pipefail
    for i in $(seq 1 "${LAVA_RETRIES:-3}"); do
        echo "# TuxSuite submission iteration #$i"
        if python3 -u -m tuxsuite test submit --device fvp-lava --job-definition artefacts-lava/job.yaml | tee tuxsuite-submit.out; then
            status=0
            break
        else
            status=$?
            echo "TuxSuite test failed, status: ${status}"
        fi
    done

    TUXID=$(awk '/^uid:/ {print $2}' tuxsuite-submit.out)
    echo "TuxSuite test ID: ${TUXID}"
    echo "${TUXID}" > "${WORKSPACE}/tux.id"
    tuxsuite test logs --raw "${TUXID}" > "${WORKSPACE}/lava-raw.log"

    # Look for failed testcases, ignoring "lava.http-download" entries.
    # A single awk that reads its whole input is used instead of
    # "grep -v ... | grep -q": under pipefail, grep -q quitting at the first
    # match can kill the producers with SIGPIPE and turn a positive match into
    # a failed pipeline, hiding the failing testcases.
    if tuxsuite test results "${TUXID}" |
        awk '!/lava.http-download/ && /fail/ { found = 1 } END { exit !found }'; then
        echo "tuxsuite test submit status was: ${status}, failing testcases found, setting as 1 (failed)"
        status=1
    fi

    echo "TuxSuite test result: ${status}"
    exit "${status}"
fi
# Submit artefacts-lava/job.yaml, either directly to LAVA or through SQUAD,
# then wait for the resulting LAVA job and collect its log and results.
#
# Globals:   reads LAVA_USER, LAVA_TOKEN, LAVA_SERVER, USE_SQUAD, WORKSPACE
#            and, on the SQUAD path, QA_SERVER, QA_SERVER_TEAM,
#            QA_SERVER_PROJECT, QA_SERVER_VERSION, QA_REPORTS_TOKEN,
#            DEVICE_TYPE, GERRIT_CHANGE_NUMBER and GERRIT_PATCHSET_NUMBER;
#            writes LAVAJOB_ID (always) and TESTJOB_ID (SQUAD path)
# Outputs:   ${WORKSPACE}/lava-raw.log and ${WORKSPACE}/lava.results once the
#            job has finished, ${WORKSPACE}/lava.log when monitoring times out
# Returns:   0 when the results contain no "[fail]" entry, 1 otherwise
# Exits:     the whole script with status 1 on a bad SQUAD test definition,
#            when no LAVA job ID could be obtained, or when waiting for the
#            job times out
# Note:      xtrace is switched off around commands carrying tokens and is
#            left switched on afterwards, like the other helpers in this file.
submit_via_lava_or_squad() {
    local renumber='^[0-9]+$'
    local iter=0
    local max_tries=120 # run retries for an hour
    local curl_status=0
    local job_info

    # Start every attempt without a job ID: this function is called in a retry
    # loop and a value left over from the previous attempt must not be
    # monitored again when this submission yields no ID.
    LAVAJOB_ID=''

    # Keep the LAVA token out of the xtrace log.
    set +x
    lavacli identities add --username "${LAVA_USER}" --token "${LAVA_TOKEN}" --uri "https://${LAVA_SERVER}/RPC2" default
    set -x

    if [ "${USE_SQUAD}" -ne 0 ] && [ -n "${QA_SERVER_VERSION}" ]; then
        # Submit via SQUAD
        if [ -n "${GERRIT_CHANGE_NUMBER}" ] && [ -n "${GERRIT_PATCHSET_NUMBER}" ]; then
            # Keep the SQUAD token out of the xtrace log.
            set +x
            curl \
                --fail \
                --retry 4 \
                -X POST \
                --header "Auth-Token: ${QA_REPORTS_TOKEN}" \
                "${QA_SERVER}/api/createbuild/${QA_SERVER_TEAM}/${QA_SERVER_PROJECT}/${QA_SERVER_VERSION}"
            set -x
        fi

        # NOTE(review): DEVICE_TYPE is not assigned anywhere in this script
        # (only DEVICE is); presumably it comes from the environment - confirm.
        set +x
        TESTJOB_ID=$(curl \
            --fail \
            --retry 4 \
            -X POST \
            --header "Auth-Token: ${QA_REPORTS_TOKEN}" \
            --form "backend=${LAVA_SERVER}" \
            --form definition=@artefacts-lava/job.yaml \
            "${QA_SERVER}/api/submitjob/${QA_SERVER_TEAM}/${QA_SERVER_PROJECT}/${QA_SERVER_VERSION}/${DEVICE_TYPE}") || curl_status=$?
        set -x

        # SQUAD will send 400, curl error code 22, on bad test definition
        if [ "${curl_status}" -eq 22 ]; then
            echo "Bad test definition!!"
            exit 1
        fi

        if [ -n "${TESTJOB_ID}" ]; then
            echo "TEST JOB URL: ${QA_SERVER}/testjob/${TESTJOB_ID} TEST JOB ID: ${TESTJOB_ID}"

            # The below loop with a sleep is intentional: LAVA could be under heavy load so previous job creation can
            # take 'some' time to get the right numeric LAVA JOB ID
            LAVAJOB_ID="null"
            while ! [[ ${LAVAJOB_ID} =~ ${renumber} ]]; do
                if [ "${iter}" -eq "${max_tries}" ]; then
                    LAVAJOB_ID=''
                    break
                fi
                sleep 30
                # The URL is quoted: it contains '?', which is a glob character.
                job_info=$(curl --fail --retry 4 "${QA_SERVER}/api/testjobs/${TESTJOB_ID}/?fields=job_id")
                # Get the job_id value (whatever it is)
                LAVAJOB_ID=$(printf '%s\n' "${job_info}" | jq '.job_id')
                LAVAJOB_ID="${LAVAJOB_ID//\"/}"
                iter=$(( iter + 1 ))
            done
        fi
    else
        # Submit directly to LAVA
        LAVAJOB_ID=$(resilient_cmd lavacli jobs submit artefacts-lava/job.yaml)
    fi

    # check that rest query at least get non-empty value
    if [ -z "${LAVAJOB_ID}" ]; then
        echo "LAVA Job ID could not be obtained"
        exit 1
    fi

    echo "LAVA URL: https://${LAVA_SERVER}/scheduler/job/${LAVAJOB_ID} LAVA JOB ID: ${LAVAJOB_ID}"

    # if timeout on waiting for LAVA to complete, create an 'artificial' lava.log indicating
    # job ID and timeout seconds
    if ! wait_lava_job "${LAVAJOB_ID}"; then
        echo "Stopped monitoring LAVA JOB ${LAVAJOB_ID}, likely stuck or timeout too short?" | tee "${WORKSPACE}/lava.log"
        exit 1
    fi

    # Retrieve the test job plain log which is a yaml format file from LAVA.
    # The job ID and the target path are handed to sh as positional parameters
    # so they are never re-parsed as shell code.
    resilient_cmd sh -c 'lavacli jobs logs --raw "$1" > "$2"' sh "${LAVAJOB_ID}" "${WORKSPACE}/lava-raw.log"

    # Fetch and store LAVA job result (1 failure, 0 success)
    resilient_cmd lavacli results "${LAVAJOB_ID}" | tee "${WORKSPACE}/lava.results"
    if grep -q '\[fail\]' "${WORKSPACE}/lava.results"; then
        return 1
    fi
    return 0
}
# FIXME: Juno and FVP jobs may fail for reasons unrelated to the change under
# test, so CI resubmits a failed job, three attempts in total unless
# LAVA_RETRIES says otherwise:
#   - Juno jobs may fail due to LAVA lab infrastructure issues (see
#     https://projects.linaro.org/browse/LSS-2128)
#   - FVP jobs may hang at some particular TFTF test (see
#     https://linaro.atlassian.net/browse/TFC-176)
# UPDATE: Retrying is kept for LAVA for historical reasons; for TuxSuite the
# intent is to start from a clean page without retries and see how it goes.
# NOTE(review): the TuxSuite branch earlier in this file also loops over
# LAVA_RETRIES - confirm which of the two reflects the intended policy.
status=1
for attempt in $(seq 1 "${LAVA_RETRIES:-3}"); do
    echo "# LAVA submission iteration #${attempt}"
    if ! submit_via_lava_or_squad; then
        continue
    fi
    # First successful submission ends the retries.
    status=0
    break
done
exit "${status}"