benchmark: change ts buffer allocation flow

1. Change timestamp buffer allocation/mapping flow
2. Add misc cosmetic fixes

If the timestamp buffer is allocated in userspace and the new register
user memory API is used to register it in OP-TEE (introduced in
optee_client commit 27888d73d156 ("tee_client_api: register user memory")),
it is not possible to keep this mapping permanent across different
TEEC_InvokeCommand invocations: all SHM are automatically unmapped from
the OP-TEE VA space after TEEC_InvokeCommand is handled by OP-TEE.

Fixes: https://github.com/OP-TEE/optee_os/issues/1979
Acked-by: Joakim Bech <joakim.bech@linaro.org>
Signed-off-by: Igor Opaniuk <igor.opaniuk@linaro.org>
diff --git a/benchmark_aux.c b/benchmark_aux.c
index 7995c18..222b179 100644
--- a/benchmark_aux.c
+++ b/benchmark_aux.c
@@ -24,12 +24,15 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
+#include <fcntl.h>
 #include <libgen.h>
 #include <linux/limits.h>
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
 #include <unistd.h>
 
 #include "benchmark_aux.h"
@@ -121,3 +124,41 @@
 {
 	return sysconf(_SC_NPROCESSORS_ONLN);
 }
+
+/*
+ * Map a physical memory range into this process through /dev/mem.
+ *
+ * paddr: physical address of the first byte to map; it does not have to be
+ *        page aligned
+ * size:  number of bytes to map, starting at paddr
+ *
+ * Returns a pointer to the byte that corresponds to paddr, or NULL when
+ * /dev/mem cannot be opened or mmap() fails.
+ */
+void *mmap_paddr(intptr_t paddr, uint64_t size)
+{
+	uintptr_t addr = (uintptr_t)paddr;
+	uintptr_t offset;
+	char *vaddr;
+	int devmem;
+
+	/* open() signals failure with -1; 0 is a valid descriptor */
+	devmem = open("/dev/mem", O_RDWR);
+	if (devmem < 0)
+		return NULL;
+
+	/* mmap() wants a page-aligned file offset: keep the remainder aside */
+	offset = addr % (uintptr_t)getpagesize();
+
+	/* grow the length by offset so the end of the range stays mapped */
+	vaddr = mmap(NULL, size + offset, PROT_READ | PROT_WRITE,
+			MAP_SHARED, devmem, (off_t)(addr - offset));
+
+	/* the descriptor is not needed once mmap() has returned */
+	close(devmem);
+	if (vaddr == MAP_FAILED)
+		return NULL;
+
+	/* byte-wise pointer arithmetic: offset is a byte count */
+	return vaddr + offset;
+}
diff --git a/benchmark_aux.h b/benchmark_aux.h
index 865c8dd..fc612f4 100644
--- a/benchmark_aux.h
+++ b/benchmark_aux.h
@@ -43,6 +43,8 @@
 void alloc_argv(int argc, char *argv[], char **new_argv[]);
 void dealloc_argv(int new_argc, char **new_argv);
 
+void *mmap_paddr(intptr_t paddr, uint64_t size);
+
 /* get amount of cores */
 uint32_t get_cores(void);
 #endif /* BENCHMARK_AUX_H */
diff --git a/common.h b/common.h
index 645ba1a..a4f3ee9 100644
--- a/common.h
+++ b/common.h
@@ -36,7 +36,7 @@
 
 #ifdef DEBUG
 #define DBG(fmt, args...) printf("[" OUTPUT_APP_PREFIX \
-		"] DEBUG: %s:%d:%s(): " fmt "\n", __FILE__, __LINE__, __func__, ##args)
+	"] DEBUG: %s:%d:%s(): " fmt "\n", __FILE__, __LINE__, __func__, ##args)
 #else
 #define DBG(fmt, args...)
 #endif
diff --git a/main.c b/main.c
index f6c0dde..ffbb4a5 100644
--- a/main.c
+++ b/main.c
@@ -49,13 +49,10 @@
 static struct tee_ts_global *bench_ts_global;
 
 static const TEEC_UUID pta_benchmark_uuid = PTA_BENCHMARK_UUID;
-static TEEC_SharedMemory ts_buf_shm = {
-		.flags = TEEC_MEM_INPUT | TEEC_MEM_OUTPUT
-};
 static TEEC_Context ctx;
 static TEEC_Session sess;
 
-static volatile sig_atomic_t is_running;
+static sig_atomic_t is_running;
 static yaml_emitter_t emitter;
 
 
@@ -82,52 +79,42 @@
 static void close_bench_pta(void)
 {
 	/* release benchmark timestamp shm */
-	TEEC_ReleaseSharedMemory(&ts_buf_shm);
 	TEEC_CloseSession(&sess);
 	TEEC_FinalizeContext(&ctx);
 }
 
-static void init_ts_global(void *ts_global, uint32_t cores)
-{
-	unsigned int i;
-	struct tee_ts_cpu_buf *cpu_buf;
-
-	/* init global timestamp buffer */
-	bench_ts_global = (struct tee_ts_global *)ts_global;
-	bench_ts_global->cores = cores;
-
-	/* init per-cpu timestamp buffers */
-	for (i = 0; i < cores; i++) {
-		cpu_buf = &bench_ts_global->cpu_buf[i];
-		memset(cpu_buf, 0, sizeof(struct tee_ts_cpu_buf));
-	}
-}
-
-static void register_bench_buf(uint32_t cores)
+static void alloc_bench_buf(uint32_t cores)
 {
 	TEEC_Result res;
 	TEEC_Operation op = { 0 };
 	uint32_t ret_orig;
+	intptr_t paddr_ts_buf = 0;
+	size_t size;
 
-	ts_buf_shm.size = sizeof(struct tee_ts_global) +
-			sizeof(struct tee_ts_cpu_buf) * cores;
+	op.paramTypes = TEEC_PARAM_TYPES(TEEC_VALUE_INOUT,
+			TEEC_VALUE_INPUT, TEEC_NONE, TEEC_NONE);
 
-	/* allocate global timestamp buffer */
-	res = TEEC_AllocateSharedMemory(&ctx, &ts_buf_shm);
-	tee_check_res(res, "TEEC_AllocateSharedMemory");
+	op.params[1].value.a = cores;
 
-	init_ts_global(ts_buf_shm.buffer, cores);
-
-	op.paramTypes = TEEC_PARAM_TYPES(TEEC_MEMREF_PARTIAL_INOUT,
-			TEEC_NONE, TEEC_NONE, TEEC_NONE);
-	op.params[0].memref.parent = &ts_buf_shm;
-
-	TEEC_InvokeCommand(&sess, BENCHMARK_CMD_REGISTER_MEMREF,
+	res = TEEC_InvokeCommand(&sess, BENCHMARK_CMD_REGISTER_MEMREF,
 					&op, &ret_orig);
 	tee_check_res(res, "TEEC_InvokeCommand");
+
+	/* param 0 (INOUT) is read back as buffer address (a) and size (b) */
+	paddr_ts_buf = op.params[0].value.a;
+	size = op.params[0].value.b;
+	INFO("ts buffer paddr = %" PRIxPTR ", size = %zu\n",
+					(uintptr_t)paddr_ts_buf, size);
+	if (paddr_ts_buf) {
+		bench_ts_global = mmap_paddr(paddr_ts_buf, size);
+		if (!bench_ts_global)
+			ERROR_EXIT("Failed to allocate timestamp buffer");
+	} else {
+		ERROR_EXIT("Failed to allocate timestamp buffer");
+	}
 }
 
-static void unregister_bench(void)
+static void free_bench_buf(void)
 {
 	TEEC_Result res;
 	TEEC_Operation op = { 0 };
@@ -154,7 +141,7 @@
 }
 
 static int timestamp_pop(struct tee_ts_cpu_buf *cpu_buf,
-						struct tee_time_st *ts)
+			 struct tee_time_st *ts)
 {
 	uint64_t ts_tail;
 
@@ -358,13 +345,15 @@
 						&ts_data);
 			if (!ret) {
 				ts_received = true;
-				DBG("Timestamp: core = %u; tick = %lld; pc = 0x%"
-						PRIx64 ";system = %s",
-						i, ts_data.cnt, ts_data.addr,
-						bench_str_src(ts_data.src));
+				DBG("Timestamp: core = %u; tick = %lld; "
+					"pc = 0x%" PRIx64 "; system = %s",
+					i, ts_data.cnt, ts_data.addr,
+					bench_str_src(ts_data.src));
 				if (!fill_timestamp(i, ts_data.cnt,
-						ts_data.addr, bench_str_src(ts_data.src)))
-					ERROR_GOTO(deinit_yaml, "Adding timestamp failed");
+					ts_data.addr,
+					bench_str_src(ts_data.src)))
+					ERROR_GOTO(deinit_yaml,
+					"Adding timestamp failed");
 
 			}
 		}
@@ -375,7 +364,8 @@
 				sched_yield();
 			} else {
 				ERROR_GOTO(deinit_yaml,
-					"No new data in the per-cpu ringbuffers, closing ts file");
+					"No new data in the per-cpu ringbuffers"
+					);
 			}
 		}
 	}
@@ -424,7 +414,7 @@
 
 	INFO("2. Allocating per-core buffers, cores detected = %d",
 					cores);
-	register_bench_buf(cores);
+	alloc_bench_buf(cores);
 
 	res = realpath(argv[1], testapp_path);
 	if (!res)
@@ -479,7 +469,7 @@
 	INFO("4. Done benchmark");
 
 	dealloc_argv(argc-1, testapp_argv);
-	unregister_bench();
+	free_bench_buf();
 	close_bench_pta();
 	return 0;
 }