tf-a-builder/submit-test-job.sh - ci/tf-a-job-configs.git - TrustedFirmware Git Browser

 #!/bin/bash

 # Trace every command and abort on the first unhandled failure.
 set -e -x

 # Submission through SQUAD is switched off in this script.
 USE_SQUAD=0
 # TuxSuite for FVP jobs is opt-in through the environment (0 = disabled).
 : "${USE_TUXSUITE_FVP:=0}"

 # Print the LAVA device type declared in a job definition file.
 # Arguments: $1 - path to the LAVA job definition
 # Outputs:   the second whitespace-separated field of every line that
 #            starts with "device_type:", written to stdout
 get_lava_device_type() {
     local job_file=$1
     # Quote the path so that names containing spaces or glob characters
     # reach awk as one single argument.
     awk '/^device_type:/ {print $2}' "${job_file}"
 }

 # Write the TuxSuite client configuration for the current user.
 # Globals:   TUXSUITE_TOKEN, TUXSUITE_GROUP, TUXSUITE_PROJECT (read)
 # Outputs:   creates or overwrites ~/.config/tuxsuite/config.ini
 setup_tuxsuite() {
     mkdir -p ~/.config/tuxsuite/
     # The here-document body and its terminator must start in the first
     # column: "<<EOF" only ends on a line that is exactly "EOF", and an
     # indented terminator would swallow the rest of the script. A
     # here-document is kept (rather than printf arguments) so the token is
     # not echoed by "set -x".
     cat > ~/.config/tuxsuite/config.ini <<EOF
[default]
token=${TUXSUITE_TOKEN}
group=${TUXSUITE_GROUP}
project=${TUXSUITE_PROJECT}
EOF
     # The file holds a credential: keep it private to the current user.
     chmod 600 ~/.config/tuxsuite/config.ini
 }

 # Poll LAVA until the given job reaches the "Finished" state.
 # Globals:   WORKSPACE (read) - directory that receives lava-progress.show
 # Arguments: $1 - LAVA job id
 #            $2 - timeout in seconds (default 5400, i.e. 1.5 hours)
 #            $3 - polling interval in seconds (default 60)
 # Outputs:   the job "state" line on every poll, and the last
 #            "lavacli jobs show" output once polling stops
 # Returns:   0 when the job finished, 1 on timeout
 # Note:      command tracing is switched off while polling and is always
 #            switched on again before returning. Only the sleep intervals
 #            are counted against the timeout.
 wait_lava_job() {
     set +x
     local id=$1
     local timeout="${2:-5400}"
     local interval="${3:-60}"
     local progress_file="${WORKSPACE}/lava-progress.show"
     # Keep the countdown local so it does not leak a global variable.
     local remaining=$(( timeout ))

     while (( remaining > 0 )); do
         sleep "${interval}"
         resilient_cmd lavacli jobs show "${id}" | tee "${progress_file}" | grep 'state *:'
         # Keep tracing off for the rest of the iteration.
         set +x
         if grep 'state.*: Finished' "${progress_file}"; then
             set -x
             cat "${progress_file}"
             # finished
             return 0
         fi
         remaining=$(( remaining - interval ))
     done
     set -x
     cat "${progress_file}"
     echo "Timeout waiting for job to finish"
     # timeout
     return 1
 }

 # Run a command, retrying with exponential back-off until it succeeds.
 # The command is attempted at most 10 times. The pause between attempts
 # starts at 2 seconds, doubles after every failure and is capped at 60.
 # Arguments: $@ - the command and its arguments
 # Outputs:   WARNING/ERROR diagnostics on stderr
 # Returns:   0 as soon as one attempt succeeds, 1 when every attempt failed
 # Note:      command tracing is switched off while retrying and is always
 #            switched on again before returning.
 resilient_cmd() {
     set +x
     local max_retries=10
     local sleep_body=2
     local iter=0

     until "$@"; do
         iter=$(( iter + 1 ))
         # Give up right after the last failed attempt instead of sleeping
         # once more before reporting the error.
         if [ "${iter}" -ge "${max_retries}" ]; then
             echo "ERROR: Command '$*' failed ${iter} times in row" 1>&2
             set -x
             return 1
         fi

         sleep "${sleep_body}"
         # Exponential backoff
         sleep_body=$(( sleep_body * 2 ))
         if [ "${sleep_body}" -ge 60 ]; then
             sleep_body=60
             echo "WARNING: Command '$*' still not successful on retry #$(( iter - 1 )), exp backoff already limited" 1>&2
         fi
     done
     set -x
     return 0
 }

 ls -l "${WORKSPACE}"

 DEVICE=$(get_lava_device_type artefacts-lava/job.yaml)

 # FVP jobs are run through TuxSuite when USE_TUXSUITE_FVP is non-zero.
 # This branch always ends the script with the TuxSuite result.
 if [ "${DEVICE}" = "fvp" ] && [ "${USE_TUXSUITE_FVP}" -ne 0 ]; then
     setup_tuxsuite
     # Let a failing "tuxsuite test submit" be seen through the "| tee".
     set -o pipefail
     # Defined even if the loop below runs zero times.
     status=1
     for i in $(seq 1 "${LAVA_RETRIES:-3}"); do
         echo "# TuxSuite submission iteration #$i"
         if python3 -u -m tuxsuite test submit --device fvp-lava --job-definition artefacts-lava/job.yaml | tee tuxsuite-submit.out; then
             status=0
             break
         else
             # In the else branch $? still holds the pipeline status.
             status=$?
             echo "TuxSuite test failed, status: ${status}"
         fi
     done
     TUXID=$(awk '/^uid:/ {print $2}' tuxsuite-submit.out)
     echo "TuxSuite test ID: ${TUXID}"
     echo "${TUXID}" > "${WORKSPACE}/tux.id"
     tuxsuite test logs --raw "${TUXID}" > "${WORKSPACE}/lava-raw.log"

     # Look for failing test cases, ignoring "lava.http-download" entries.
     # awk reads its whole input before exiting. With pipefail active, a
     # "grep -q" that quits on its first match can kill the upstream commands
     # with SIGPIPE, which makes the pipeline report failure and hides the
     # failing test cases.
     if tuxsuite test results "${TUXID}" \
         | awk '!/lava.http-download/ && /fail/ { found = 1 } END { exit !found }'; then
         echo "tuxsuite test submit status was: ${status}, failing testcases found, setting as 1 (failed)"
         status=1
     fi

     echo "TuxSuite test result: ${status}"

     exit "${status}"
 fi

 # Submit artefacts-lava/job.yaml, wait for the job to finish and collect
 # its log and results. The job goes through SQUAD when USE_SQUAD is non-zero
 # and QA_SERVER_VERSION is set, otherwise directly to LAVA.
 # Globals:   read:    USE_SQUAD, LAVA_USER, LAVA_TOKEN, LAVA_SERVER, WORKSPACE,
 #                     QA_SERVER, QA_SERVER_TEAM, QA_SERVER_PROJECT,
 #                     QA_SERVER_VERSION, QA_REPORTS_TOKEN, DEVICE_TYPE,
 #                     GERRIT_CHANGE_NUMBER, GERRIT_PATCHSET_NUMBER
 #            written: TESTJOB_ID, LAVAJOB_ID
 # Outputs:   ${WORKSPACE}/lava-raw.log and ${WORKSPACE}/lava.results;
 #            ${WORKSPACE}/lava.log when waiting for the job timed out
 # Returns:   0 when the results contain no "[fail]" entry, 1 otherwise
 # Exits:     the whole script with status 1 on a bad test definition, when
 #            no LAVA job id could be obtained, or when waiting timed out
 submit_via_lava_or_squad() {
     local renumber='^[0-9]+$'
     local iter=0
     local max_tries=120 # polls are 30 seconds apart: run retries for an hour
     local curl_status=0

     # Start every attempt without ids. Otherwise a retry whose submission
     # fails would silently wait on, and report, the job of the previous
     # attempt.
     TESTJOB_ID=''
     LAVAJOB_ID=''

     # NOTE(review): with "set -x" active the token appears in the command
     # trace; confirm that the CI masks it.
     lavacli identities add --username "${LAVA_USER}" --token "${LAVA_TOKEN}" --uri "https://${LAVA_SERVER}/RPC2" default

     if [ "${USE_SQUAD}" -ne 0 ] && [ -n "${QA_SERVER_VERSION}" ]; then
         # Submit via SQUAD

         if [ -n "${GERRIT_CHANGE_NUMBER}" ] && [ -n "${GERRIT_PATCHSET_NUMBER}" ]; then
             curl \
                 --fail \
                 --retry 4 \
                 -X POST \
                 --header "Auth-Token: ${QA_REPORTS_TOKEN}" \
                 "${QA_SERVER}/api/createbuild/${QA_SERVER_TEAM}/${QA_SERVER_PROJECT}/${QA_SERVER_VERSION}"
         fi

         # NOTE(review): DEVICE_TYPE is not set anywhere in this script (the
         # detected type is stored in DEVICE); presumably it comes from the
         # environment - confirm.
         TESTJOB_ID=$(curl \
             --fail \
             --retry 4 \
             -X POST \
             --header "Auth-Token: ${QA_REPORTS_TOKEN}" \
             --form "backend=${LAVA_SERVER}" \
             --form "definition=@artefacts-lava/job.yaml" \
             "${QA_SERVER}/api/submitjob/${QA_SERVER_TEAM}/${QA_SERVER_PROJECT}/${QA_SERVER_VERSION}/${DEVICE_TYPE}") || curl_status=$?

         # SQUAD will send 400, curl error code 22, on bad test definition
         if [ "${curl_status}" -eq 22 ]; then
             echo "Bad test definition!!"
             exit 1
         fi

         if [ -n "${TESTJOB_ID}" ]; then
             echo "TEST JOB URL: ${QA_SERVER}/testjob/${TESTJOB_ID} TEST JOB ID: ${TESTJOB_ID}"

             # The below loop with a sleep is intentional: LAVA could be under heavy load so previous job creation can
             # take 'some' time to get the right numeric LAVA JOB ID
             LAVAJOB_ID="null"
             while ! [[ ${LAVAJOB_ID} =~ ${renumber} ]]; do
                 if [ "${iter}" -eq "${max_tries}" ]; then
                     LAVAJOB_ID=''
                     break
                 fi
                 sleep 30
                 # Quoted: the URL contains "?", which is a glob character.
                 LAVAJOB_ID=$(curl --fail --retry 4 "${QA_SERVER}/api/testjobs/${TESTJOB_ID}/?fields=job_id")

                 # Get the job_id value (whatever it is)
                 LAVAJOB_ID=$(printf '%s\n' "${LAVAJOB_ID}" | jq '.job_id')
                 LAVAJOB_ID="${LAVAJOB_ID//\"/}"

                 iter=$(( iter + 1 ))
             done
         fi
     else
         # Submit directly to LAVA
         LAVAJOB_ID=$(resilient_cmd lavacli jobs submit artefacts-lava/job.yaml)
     fi

     # check that rest query at least get non-empty value
     if [ -z "${LAVAJOB_ID}" ]; then
         echo "LAVA Job ID could not be obtained"
         exit 1
     fi

     echo "LAVA URL: https://${LAVA_SERVER}/scheduler/job/${LAVAJOB_ID} LAVA JOB ID: ${LAVAJOB_ID}"

     # if timeout on waiting for LAVA to complete, create an 'artificial' lava.log indicating
     # job ID and timeout seconds
     if ! wait_lava_job "${LAVAJOB_ID}"; then
         echo "Stopped monitoring LAVA JOB ${LAVAJOB_ID}, likely stuck or timeout too short?" | tee "${WORKSPACE}/lava.log"
         exit 1
     fi

     # Retrieve the test job plain log which is a yaml format file from LAVA.
     # The redirection has to be retried together with the command, hence the
     # "sh -c"; the id and path are passed as positional parameters so a
     # workspace path with spaces survives.
     resilient_cmd sh -c 'lavacli jobs logs --raw "$1" > "$2"' sh "${LAVAJOB_ID}" "${WORKSPACE}/lava-raw.log"

     # Fetch and store LAVA job result (1 failure, 0 success)
     resilient_cmd lavacli results "${LAVAJOB_ID}" | tee "${WORKSPACE}/lava.results"
     if grep -q '\[fail\]' "${WORKSPACE}/lava.results"; then
         return 1
     fi
     return 0
 }

 # FIXME: Juno and FVP jobs may fail for reasons unrelated to the user's
 # change, so CI needs to resubmit the job, at most three times:
 # Juno jobs may fail due to LAVA lab infrastructure issues (see
 # https://projects.linaro.org/browse/LSS-2128)
 # FVP jobs may hang at some particular TFTF test (see
 # https://linaro.atlassian.net/browse/TFC-176)

 # UPDATE: We want to keep retrying for LAVA for historical reasons,
 # but we want to start from clean page with TuxSuite, so don't
 # retry for it for now, and see how it goes.

 # Attempt the LAVA/SQUAD submission up to LAVA_RETRIES times (3 when the
 # variable is unset) and stop at the first attempt that succeeds. The
 # script exits 0 in that case and 1 when every attempt failed.
 status=1
 for i in $(seq 1 ${LAVA_RETRIES:-3}); do
     echo "# LAVA submission iteration #$i"
     # A failed attempt simply moves on to the next iteration.
     submit_via_lava_or_squad || continue
     status=0
     break
 done

 exit ${status}
	#!/bin/bash

	# Trace every command and abort on the first unhandled failure.
	set -e -x

	# Submission through SQUAD is switched off in this script.
	USE_SQUAD=0
	# TuxSuite for FVP jobs is opt-in through the environment (0 = disabled).
	: "${USE_TUXSUITE_FVP:=0}"

	# Print the LAVA device type declared in a job definition file.
	# Arguments: $1 - path to the LAVA job definition
	# Outputs:   the second whitespace-separated field of every line that
	#            starts with "device_type:", written to stdout
	get_lava_device_type() {
	    local job_file=$1
	    # Quote the path so that names containing spaces or glob characters
	    # reach awk as one single argument.
	    awk '/^device_type:/ {print $2}' "${job_file}"
	}

	# Write the TuxSuite client configuration for the current user.
	# Globals:   TUXSUITE_TOKEN, TUXSUITE_GROUP, TUXSUITE_PROJECT (read)
	# Outputs:   creates or overwrites ~/.config/tuxsuite/config.ini
	setup_tuxsuite() {
	    mkdir -p ~/.config/tuxsuite/
	    # The here-document body and its terminator must start in the first
	    # column: "<<EOF" only ends on a line that is exactly "EOF", and an
	    # indented terminator would swallow the rest of the script. A
	    # here-document is kept (rather than printf arguments) so the token is
	    # not echoed by "set -x".
	    cat > ~/.config/tuxsuite/config.ini <<EOF
[default]
token=${TUXSUITE_TOKEN}
group=${TUXSUITE_GROUP}
project=${TUXSUITE_PROJECT}
EOF
	    # The file holds a credential: keep it private to the current user.
	    chmod 600 ~/.config/tuxsuite/config.ini
	}

	# Poll LAVA until the given job reaches the "Finished" state.
	# Globals:   WORKSPACE (read) - directory that receives lava-progress.show
	# Arguments: $1 - LAVA job id
	#            $2 - timeout in seconds (default 5400, i.e. 1.5 hours)
	#            $3 - polling interval in seconds (default 60)
	# Outputs:   the job "state" line on every poll, and the last
	#            "lavacli jobs show" output once polling stops
	# Returns:   0 when the job finished, 1 on timeout
	# Note:      command tracing is switched off while polling and is always
	#            switched on again before returning. Only the sleep intervals
	#            are counted against the timeout.
	wait_lava_job() {
	    set +x
	    local id=$1
	    local timeout="${2:-5400}"
	    local interval="${3:-60}"
	    local progress_file="${WORKSPACE}/lava-progress.show"
	    # Keep the countdown local so it does not leak a global variable.
	    local remaining=$(( timeout ))

	    while (( remaining > 0 )); do
	        sleep "${interval}"
	        # Real pipes here: an escaped "\|" would be handed to lavacli as a
	        # literal argument and the progress file would never be written.
	        resilient_cmd lavacli jobs show "${id}" | tee "${progress_file}" | grep 'state *:'
	        # Keep tracing off for the rest of the iteration.
	        set +x
	        if grep 'state.*: Finished' "${progress_file}"; then
	            set -x
	            cat "${progress_file}"
	            # finished
	            return 0
	        fi
	        remaining=$(( remaining - interval ))
	    done
	    set -x
	    cat "${progress_file}"
	    echo "Timeout waiting for job to finish"
	    # timeout
	    return 1
	}

	# Run a command, retrying with exponential back-off until it succeeds.
	# The command is attempted at most 10 times. The pause between attempts
	# starts at 2 seconds, doubles after every failure and is capped at 60.
	# Arguments: $@ - the command and its arguments
	# Outputs:   WARNING/ERROR diagnostics on stderr
	# Returns:   0 as soon as one attempt succeeds, 1 when every attempt failed
	# Note:      command tracing is switched off while retrying and is always
	#            switched on again before returning.
	resilient_cmd() {
	    set +x
	    local max_retries=10
	    local sleep_body=2
	    local iter=0

	    until "$@"; do
	        iter=$(( iter + 1 ))
	        # Give up right after the last failed attempt instead of sleeping
	        # once more before reporting the error.
	        if [ "${iter}" -ge "${max_retries}" ]; then
	            echo "ERROR: Command '$*' failed ${iter} times in row" 1>&2
	            set -x
	            return 1
	        fi

	        sleep "${sleep_body}"
	        # Exponential backoff
	        sleep_body=$(( sleep_body * 2 ))
	        if [ "${sleep_body}" -ge 60 ]; then
	            sleep_body=60
	            echo "WARNING: Command '$*' still not successful on retry #$(( iter - 1 )), exp backoff already limited" 1>&2
	        fi
	    done
	    set -x
	    return 0
	}

	ls -l "${WORKSPACE}"

	DEVICE=$(get_lava_device_type artefacts-lava/job.yaml)

	# FVP jobs are run through TuxSuite when USE_TUXSUITE_FVP is non-zero.
	# This branch always ends the script with the TuxSuite result.
	if [ "${DEVICE}" = "fvp" ] && [ "${USE_TUXSUITE_FVP}" -ne 0 ]; then
	    setup_tuxsuite
	    # Let a failing "tuxsuite test submit" be seen through the "| tee".
	    set -o pipefail
	    # Defined even if the loop below runs zero times.
	    status=1
	    for i in $(seq 1 "${LAVA_RETRIES:-3}"); do
	        echo "# TuxSuite submission iteration #$i"
	        # A real pipe: an escaped "\|" would be passed to tuxsuite as a
	        # literal argument and tuxsuite-submit.out would never be written.
	        if python3 -u -m tuxsuite test submit --device fvp-lava --job-definition artefacts-lava/job.yaml | tee tuxsuite-submit.out; then
	            status=0
	            break
	        else
	            # In the else branch $? still holds the pipeline status.
	            status=$?
	            echo "TuxSuite test failed, status: ${status}"
	        fi
	    done
	    TUXID=$(awk '/^uid:/ {print $2}' tuxsuite-submit.out)
	    echo "TuxSuite test ID: ${TUXID}"
	    echo "${TUXID}" > "${WORKSPACE}/tux.id"
	    tuxsuite test logs --raw "${TUXID}" > "${WORKSPACE}/lava-raw.log"

	    # Look for failing test cases, ignoring "lava.http-download" entries.
	    # awk reads its whole input before exiting. With pipefail active, a
	    # "grep -q" that quits on its first match can kill the upstream commands
	    # with SIGPIPE, which makes the pipeline report failure and hides the
	    # failing test cases.
	    if tuxsuite test results "${TUXID}" \
	        | awk '!/lava.http-download/ && /fail/ { found = 1 } END { exit !found }'; then
	        echo "tuxsuite test submit status was: ${status}, failing testcases found, setting as 1 (failed)"
	        status=1
	    fi

	    echo "TuxSuite test result: ${status}"

	    exit "${status}"
	fi

	# Submit artefacts-lava/job.yaml, wait for the job to finish and collect
	# its log and results. The job goes through SQUAD when USE_SQUAD is non-zero
	# and QA_SERVER_VERSION is set, otherwise directly to LAVA.
	# Globals:   read:    USE_SQUAD, LAVA_USER, LAVA_TOKEN, LAVA_SERVER, WORKSPACE,
	#                     QA_SERVER, QA_SERVER_TEAM, QA_SERVER_PROJECT,
	#                     QA_SERVER_VERSION, QA_REPORTS_TOKEN, DEVICE_TYPE,
	#                     GERRIT_CHANGE_NUMBER, GERRIT_PATCHSET_NUMBER
	#            written: TESTJOB_ID, LAVAJOB_ID
	# Outputs:   ${WORKSPACE}/lava-raw.log and ${WORKSPACE}/lava.results;
	#            ${WORKSPACE}/lava.log when waiting for the job timed out
	# Returns:   0 when the results contain no "[fail]" entry, 1 otherwise
	# Exits:     the whole script with status 1 on a bad test definition, when
	#            no LAVA job id could be obtained, or when waiting timed out
	submit_via_lava_or_squad() {
	    local renumber='^[0-9]+$'
	    local iter=0
	    local max_tries=120 # polls are 30 seconds apart: run retries for an hour
	    local curl_status=0

	    # Start every attempt without ids. Otherwise a retry whose submission
	    # fails would silently wait on, and report, the job of the previous
	    # attempt.
	    TESTJOB_ID=''
	    LAVAJOB_ID=''

	    # NOTE(review): with "set -x" active the token appears in the command
	    # trace; confirm that the CI masks it.
	    lavacli identities add --username "${LAVA_USER}" --token "${LAVA_TOKEN}" --uri "https://${LAVA_SERVER}/RPC2" default

	    if [ "${USE_SQUAD}" -ne 0 ] && [ -n "${QA_SERVER_VERSION}" ]; then
	        # Submit via SQUAD

	        if [ -n "${GERRIT_CHANGE_NUMBER}" ] && [ -n "${GERRIT_PATCHSET_NUMBER}" ]; then
	            curl \
	                --fail \
	                --retry 4 \
	                -X POST \
	                --header "Auth-Token: ${QA_REPORTS_TOKEN}" \
	                "${QA_SERVER}/api/createbuild/${QA_SERVER_TEAM}/${QA_SERVER_PROJECT}/${QA_SERVER_VERSION}"
	        fi

	        # NOTE(review): DEVICE_TYPE is not set anywhere in this script (the
	        # detected type is stored in DEVICE); presumably it comes from the
	        # environment - confirm.
	        TESTJOB_ID=$(curl \
	            --fail \
	            --retry 4 \
	            -X POST \
	            --header "Auth-Token: ${QA_REPORTS_TOKEN}" \
	            --form "backend=${LAVA_SERVER}" \
	            --form "definition=@artefacts-lava/job.yaml" \
	            "${QA_SERVER}/api/submitjob/${QA_SERVER_TEAM}/${QA_SERVER_PROJECT}/${QA_SERVER_VERSION}/${DEVICE_TYPE}") || curl_status=$?

	        # SQUAD will send 400, curl error code 22, on bad test definition
	        if [ "${curl_status}" -eq 22 ]; then
	            echo "Bad test definition!!"
	            exit 1
	        fi

	        if [ -n "${TESTJOB_ID}" ]; then
	            echo "TEST JOB URL: ${QA_SERVER}/testjob/${TESTJOB_ID} TEST JOB ID: ${TESTJOB_ID}"

	            # The below loop with a sleep is intentional: LAVA could be under heavy load so previous job creation can
	            # take 'some' time to get the right numeric LAVA JOB ID
	            LAVAJOB_ID="null"
	            while ! [[ ${LAVAJOB_ID} =~ ${renumber} ]]; do
	                if [ "${iter}" -eq "${max_tries}" ]; then
	                    LAVAJOB_ID=''
	                    break
	                fi
	                sleep 30
	                # Quoted: the URL contains "?", which is a glob character.
	                LAVAJOB_ID=$(curl --fail --retry 4 "${QA_SERVER}/api/testjobs/${TESTJOB_ID}/?fields=job_id")

	                # Get the job_id value (whatever it is). This needs a real
	                # pipe into jq, not an escaped "\|".
	                LAVAJOB_ID=$(printf '%s\n' "${LAVAJOB_ID}" | jq '.job_id')
	                LAVAJOB_ID="${LAVAJOB_ID//\"/}"

	                iter=$(( iter + 1 ))
	            done
	        fi
	    else
	        # Submit directly to LAVA
	        LAVAJOB_ID=$(resilient_cmd lavacli jobs submit artefacts-lava/job.yaml)
	    fi

	    # check that rest query at least get non-empty value
	    if [ -z "${LAVAJOB_ID}" ]; then
	        echo "LAVA Job ID could not be obtained"
	        exit 1
	    fi

	    echo "LAVA URL: https://${LAVA_SERVER}/scheduler/job/${LAVAJOB_ID} LAVA JOB ID: ${LAVAJOB_ID}"

	    # if timeout on waiting for LAVA to complete, create an 'artificial' lava.log indicating
	    # job ID and timeout seconds
	    if ! wait_lava_job "${LAVAJOB_ID}"; then
	        echo "Stopped monitoring LAVA JOB ${LAVAJOB_ID}, likely stuck or timeout too short?" | tee "${WORKSPACE}/lava.log"
	        exit 1
	    fi

	    # Retrieve the test job plain log which is a yaml format file from LAVA.
	    # The redirection has to be retried together with the command, hence the
	    # "sh -c"; the id and path are passed as positional parameters so a
	    # workspace path with spaces survives.
	    resilient_cmd sh -c 'lavacli jobs logs --raw "$1" > "$2"' sh "${LAVAJOB_ID}" "${WORKSPACE}/lava-raw.log"

	    # Fetch and store LAVA job result (1 failure, 0 success)
	    resilient_cmd lavacli results "${LAVAJOB_ID}" | tee "${WORKSPACE}/lava.results"
	    if grep -q '\[fail\]' "${WORKSPACE}/lava.results"; then
	        return 1
	    fi
	    return 0
	}

	# FIXME: Juno and FVP jobs may fail for reasons unrelated to the user's
	# change, so CI needs to resubmit the job, at most three times:
	# Juno jobs may fail due to LAVA lab infrastructure issues (see
	# https://projects.linaro.org/browse/LSS-2128)
	# FVP jobs may hang at some particular TFTF test (see
	# https://linaro.atlassian.net/browse/TFC-176)

	# UPDATE: We want to keep retrying for LAVA for historical reasons,
	# but we want to start from clean page with TuxSuite, so don't
	# retry for it for now, and see how it goes.

	# Attempt the LAVA/SQUAD submission up to LAVA_RETRIES times (3 when the
	# variable is unset) and stop at the first attempt that succeeds. The
	# script exits 0 in that case and 1 when every attempt failed.
	status=1
	for i in $(seq 1 ${LAVA_RETRIES:-3}); do
	    echo "# LAVA submission iteration #$i"
	    # A failed attempt simply moves on to the next iteration.
	    submit_via_lava_or_squad || continue
	    status=0
	    break
	done

	exit ${status}