diff --git a/.gitignore b/.gitignore
index 36ba3cd..d7af72c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,16 @@
+# Git comparison files
+*.diff
+*.patch
+*.orig
+*.rej
 # Editor backup files
 *~
+# Generated logs and results files
+*.log
+*.res
+*.csv
+# Generated graphics
+*.png
+bm-graph-all-*/
+# Dump files
+*.dump
diff --git a/build-all.sh b/build-all.sh
index c2afecf..3ddaf3e 100755
--- a/build-all.sh
+++ b/build-all.sh
@@ -15,9 +15,16 @@ usage () {
     cat <<EOF
 Usage ./build-all.sh                      : Build riscv64-unknown-linux-gnu
                                             tool chain and QEMU (default)
-                     [--only-qemu]        : Build just QEMU
+                     [--build-qemu]       : Build qemu-riscv32 and qemu-riscv64
                      [--build-clang]      : Build Clang/LLVM
                      [--build-gdbserver]  : Build gdbserver
+                     [--qemu-only]        : Only build QEMU
+                     [--qemu-configs]     : Additional QEMU config options
+                     [--qemu-cflags]      : CFLAGS for building QEMU (default
+                                            "-Wno-error")
+                     [--profile-qemu]     : Enable profiling by gprof
+                     [--prefix <path>]    : Install path of the tool chain.
+                                            Default path is ../install
                      [--arch <arch>]      : Target architecture. Default
                                             architecture is rv64gc
                      [--abi <abi>]        : Target ABI. Default ABI is lp64d
@@ -33,6 +40,7 @@ Usage ./build-all.sh                      : Build riscv64-unknown-linux-gnu
                      [--clean]            : Delete build directories in
                                             riscv-gnu-toolchain and the install
                                             directory before building
+                     [--clean-qemu]       : Clean just the QEMU build
                      [--help]             : Print this message and exit
 EOF
 }
@@ -51,9 +59,13 @@ DEFAULTTRIPLE=riscv64-unknown-elf
 
 build_linux=true
 qemu_only=false
+qemu_configs=""
+qemu_cflags="-Wno-error"  # documented default, formerly hard-coded in configure
+profile_qemu=""
 build_gdbserver=false
 build_clang=false
 clean_build=false
+clean_qemu_build=false
 enable_multilib=true
 print_help=false
 print_hashes=false
@@ -110,6 +122,17 @@ until
       --qemu-only)
 	  qemu_only=true
 	  ;;
+      --qemu-configs)
+	  shift
+	  qemu_configs="$1"
+	  ;;
+      --qemu-cflags)
+	  shift
+	  qemu_cflags="$1"
+	  ;;
+      --profile-qemu)
+	  profile_qemu="--enable-gprof"
+	  ;;
       --build-gdbserver)
 	  build_gdbserver=true
 	  ;;
@@ -156,6 +179,10 @@ until
 	  ;;
       --clean)
 	  clean_build=true
+	  clean_qemu_build=true
+	  ;;
+      --clean-qemu)
+	  clean_qemu_build=true
 	  ;;
       --help)
 	  print_help=true
@@ -267,6 +294,15 @@ else
   EXTRA_OPTS="${EXTRA_OPTS} --disable-multilib"
 fi
 echo "  build qemu: yes"
+echo "   qemu_configs: ${qemu_configs}"
+echo "   qemu_cflags: ${qemu_cflags}"
+if ${clean_qemu_build}
+then
+   echo "   qemu_clean: yes"
+else
+   echo "   qemu_clean: no"
+fi
+
 if ${build_gdbserver}
 then
   echo "  build gdbserver: yes"
@@ -283,7 +319,7 @@ fi
 cd $TOPDIR/riscv-gnu-toolchain
 
 log_file="${LOGDIR}/clean-toolchain.log"
-if ${clean_build}
+if ${clean_build} && ! ${qemu_only}
 then
   echo
   echo "Cleaning...                            logging to ${log_file}"
@@ -362,8 +398,14 @@ echo "Building QEMU...                 logging to ${log_file}"
   $TOPDIR/qemu/configure --prefix=$INSTALLDIR \
 	  --target-list=riscv64-linux-user,riscv32-linux-user \
 	  --interp-prefix=$INSTALLDIR/sysroot \
-	  --python=python3 \
-	  --extra-cflags="-Wno-error"
+	  --python=python3 ${profile_qemu} \
+	  ${qemu_configs} \
+	  --extra-cflags="${qemu_cflags}"
+  if ${clean_build} || ${clean_qemu_build}
+  then
+      rm -f ${INSTALLDIR}/bin/qemu-riscv??
+      make clean
+  fi
   make -j $(nproc)
   make install
 ) > ${log_file} 2>&1
diff --git a/memcpy-benchmarks/.gitignore b/memcpy-benchmarks/.gitignore
index b8f3d14..99b66b3 100644
--- a/memcpy-benchmarks/.gitignore
+++ b/memcpy-benchmarks/.gitignore
@@ -3,3 +3,12 @@
 *.exe
 *.icount
 *.check
+# Generated data
+*.csv
+*.res
+perf.data
+perf.data.old
+gmon.out
+# Standard directories for generated data
+res-baseline
+res-development
diff --git a/memcpy-benchmarks/README.md b/memcpy-benchmarks/README.md
index 118bf27..6874eca 100644
--- a/memcpy-benchmarks/README.md
+++ b/memcpy-benchmarks/README.md
@@ -30,3 +30,143 @@ option to see arguments and the comments in the script.
 The `run-sequence.sh` script will run a large number of benchmarks for
 different values of VLEN and LMUL and for a range of data sizes.  Again use
 the `--help` option to see arguments and look at the comments in the script.
+
+## Scripts to help with Linux _perf_
+
+### Prerequisites
+
+The scripts are intended to run under Linux.  Prerequisites are Linux _perf_ and
+_csvtool_, both of which should be available with standard distributions.
+
+### `run-perf.sh`
+
+```
+./run-perf.sh [--bytes <num>] [--resdir <dir>] [--sizes <list>]
+```
+
+Uses Linux _perf_ to profile different variants of the `memcpy` benchmark.
+Arguments are as follows.
+
+- `--bytes` _num_ : Total bytes to copy (optional).  Default 1,000,000,000.
+
+- `--resdir` _dir_ : Directory in which to place the results (optional).
+  Default is `res-baseline` in the directory holding this script.
+
+- `--sizes` _list_ : Space separated list of the data sizes to use when
+  creating results (optional).  Default list is all the powers of 2, 3, 5 and
+  7 up to 5<sup>6</sup>.
+
+The results will be three sets of files of the form
+`prof-`_type_`-`_size_`.res`, where _type_ is one of `scalar`, `vector-small`
+or `vector-large`, and _size_ is the size of the data block copied on each
+iteration.
+
+The total number of iterations for each test is determined by the number given
+in the `--bytes` argument divided by the size of the data block being used for
+the run.
+
+`perf record` is run using DWARF to determine the call graph.  This gives
+accurate results, but is slow.  Expect each iteration to take of the order of
+20 minutes on a decent server.
+
+### `extract-top-level-funcs.sh`
+
+```
+./extract-top-level-funcs.sh --resfile <file> [--cutoff <num>] \
+    [--total|--self] [--omit-empty] [--md | --csv]
+```
+
+Extract the main results from a file generated by `run-perf.sh`.  Arguments
+are as follows.
+
+- `--resfile` _file_: Target file to extract results from (mandatory)
+- `--cutoff` _num_: Percentage below which to stop showing results
+  (optional). Default value 1
+- `--total`: Cutoff and sorting based on total time (self + children)
+  (optional). Set by default.
+- `--self`: Cutoff and sorting just based on self time (no children)
+  (optional). Opposite to `--total`, so not set by default.
+- `--omit-empty`: Do not show results if self is 0.00 (optional).  Only has
+  any effect in combination with `--total`.
+- `--md`: Output results in MarkDown format (optional). Set by default
+- `--csv`: Output results in CSV format.
+
+**Note.** Only one of `--total` or `--self` may be specified.  Only one of
+`--md` or `--csv` may be specified.
+
+This is the central file for extracting data from the Linux _perf_ results.
+In general using `--self` gives the most useful data for targeting
+optimizations. Using `--total` will flag up these functions, but also
+functions which are just wrappers for other functions.  The `--omit-empty`
+option can be helpful when using `--total` to skip functions which are purely
+wrapping other functions.
+
+### `count-top-funcs.sh`
+
+```
+./count-top-funcs.sh [--resdir <dir>] [--total|--self] [--md | --csv]
+```
+
+Find the frequency of the most used functions in a set of data.  This is a
+wrapper for `extract-top-level-funcs`.  Arguments are as follows.
+
+- `--resdir` _dir_: Directory with the results to be analysed (optional).
+  Default `res-baseline`
+- `--total`: Cutoff and sorting based on total time (self + children)
+  (optional). Set by default.
+- `--self`: Cutoff and sorting just based on self time (no children)
+  (optional). Opposite to `--total`, so not set by default.
+- `--md`: Output results in MarkDown format (optional). Set by default
+- `--csv`: Output results in CSV format.
+
+**Note.** Only one of `--total` or `--self` may be specified.  Only one of
+`--md` or `--csv` may be specified.
+
+The results to be analysed will be in files of the form
+`prof-`_type_`-`_size_`.res`, where _type_ is one of `scalar`, `vector-small`
+or `vector-large`, and _size_ is the size of the data block in bytes copied
+on each iteration.
+
+### `profile-all-funcs.sh`
+
+```
+./profile-all-funcs.sh [--resdir <dir>] [--type <str>] [--total|--self] \
+    [--funclist <list>]
+```
+
+Extract data on function usage for different data sizes in a form suitable for
+graphical analysis.  Arguments are as follows.
+
+- `--resdir` _dir_: Directory with the results to be analysed (optional).
+  Default `res-baseline`.
+- `--type` _str_: What type of result to look at (optional).  Permitted values
+  are `scalar` (default), `vector-small` or `vector-large`.
+- `--total`: Cutoff and sorting based on total time (self + children)
+  (optional). Set by default.
+- `--self`: Cutoff and sorting just based on self time (no children)
+  (optional). Opposite to `--total`, so not set by default.
+- `--funclist` _list_: Space separated list of functions to profile. Default
+  value `helper_lookup_tb_ptr cpu_get_tb_cpu_state`
+
+**Note.** Only one of `--total` or `--self` may be specified.
+
+This script typically takes a set of functions identified by
+`count-top-funcs.sh`.  The output is always CSV format.
+
+### `run-spec-pop2.sh`
+
+```
+./run-spec-pop2.sh [--reportfile <file>] [--specdir <dir>]
+```
+
+**Note.** Because this is not specific to the `memcpy` benchmarks it lives in
+the main `tooling` repository.  Arguments are as follows.
+
+- `--reportfile` _file_: Put the results in this file. Default
+  `prof-628.pop2_s.res` in the `tooling` repository
+
+- `--specdir` _dir_: The directory holding the SPEC installation to be used.
+
+This script runs the SPEC CPU 2017 `628.pop2_s` benchmark under QEMU with Linux
+_perf_.  The script runs a previously built benchmark.  If necessary use
+`runspec-qemu.sh` to create the benchmark binary.
diff --git a/memcpy-benchmarks/count-top-funcs.sh b/memcpy-benchmarks/count-top-funcs.sh
new file mode 100755
index 0000000..0e5d9a5
--- /dev/null
+++ b/memcpy-benchmarks/count-top-funcs.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Embecosm Limited <www.embecosm.com>
+# Contributor Jeremy Bennett <jeremy.bennett@embecosm.com>
+
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# A script to count how often functions are used in profiled memcpy benchmarks
+
+set -u
+
+usage () {  # Print the command-line help text on stdout
+    cat <<EOF
+Usage ./count-top-funcs.sh : Count frequency of most used functions
+          [--resdir <dir>] : Directory with the results.  Default
+                             "res-baseline"
+          [--total|--self] : Select results based on total (self + children)
+                             or just self.  Default "total"
+          [--md | --csv]   : Output results in Markdown (default) or CSV
+
+The results to be analysed will be in files of the form
+"prof-<type>-<size>.res", where type is one of "scalar", "vector-small" or
+"vector-large", and size is the size of the data block in bytes copied on
+each iteration.
+EOF
+}
+
+topdir="$(cd $(dirname $(dirname $(dirname $(readlink -f $0)))) ; pwd)"
+memcpydir="${topdir}/tooling/memcpy-benchmarks"
+resdir="${memcpydir}/res-baseline"
+dototal="--total"
+format="--md"
+
+set +u
+until
+  opt="$1"
+  case "${opt}"
+  in
+      --resdir)
+	  shift
+	  resdir="$(cd $(readlink -f $1) ; pwd)"
+	  ;;
+      --total|--self)
+	  dototal=$1
+	  ;;
+      --md|--csv)
+	  format="$1"
+	  ;;
+      --help)
+	  usage
+	  exit 0
+	  ;;
+      ?*)
+	  usage
+	  exit 1
+	  ;;
+      *)
+	  ;;
+  esac
+[ "x${opt}" = "x" ]
+do
+  shift
+done
+set -u
+
+# Many temporaries: keep them under an absolute path that survives the "cd"
+tmpdir="$(mktemp -d --tmpdir count-top-funcs-XXXXXX)"
+
+# Find out the sizes
+dlens="$(ls -1 ${resdir}/prof-scalar-*.res | \
+	    sed -e 's/^.*prof-scalar-//' -e 's/\.res$//' | sort -n)"
+
+cd ${memcpydir}
+for tp in "scalar" "vector-small" "vector-large"
+do
+    echo
+    echo "${tp}"
+    echo
+    tmpf1="${tmpdir}/all-${tp}.res"
+    tmpf2="${tmpdir}/table-${tp}.res"
+    rm -f "${tmpf1}"
+    touch "${tmpf1}"
+    for l in ${dlens}
+    do
+	./extract-top-level-funcs.sh --resfile ${resdir}/prof-${tp}-${l}.res \
+				     ${dototal} --omit-empty >> ${tmpf1}
+    done
+    sed -n < ${tmpf1} -e 's/`//gp' | \
+	sed -e 's/^|[^|]*|[^|]*| //' -e 's/[[:space:]]*|$//' | \
+	sort | uniq -c | sort -nr > ${tmpf2}
+
+    case "${format}"
+    in
+	--md)
+	    printf "| %5s | %-45s |\n" "Count" "Function/address"
+	    printf "| %5s | %-45s |\n" "----:" \
+		   ":------------------------------------------"
+	    ;;
+	--csv)
+	    printf '"%s","%s"\n' "Count" "Function/address"
+    esac
+
+    while IFS='' read -r line
+    do
+	cnt=$(echo "${line}" | sed -e 's/^[[:space:]]\+//' -e 's/ .*$//')
+	func=$(echo "${line}" | \
+		   sed -e 's/^[[:space:]]\+[[:digit:]]\+[[:space:]]\+//')
+
+	case "${format}"
+	in
+	    --md)
+		func=$(echo "\`${func}\`")
+		printf "| %5s | %-45s |\n" "${cnt}" "${func}"
+		;;
+	    --csv)
+		printf '"%s","%s"\n' "${cnt}" "${func}"
+	esac
+    done < ${tmpf2}
+done
+
+rm -r ${tmpdir}
diff --git a/memcpy-benchmarks/extract-top-level-funcs.sh b/memcpy-benchmarks/extract-top-level-funcs.sh
new file mode 100755
index 0000000..42cfb21
--- /dev/null
+++ b/memcpy-benchmarks/extract-top-level-funcs.sh
@@ -0,0 +1,163 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Embecosm Limited <www.embecosm.com>
+# Contributor Jeremy Bennett <jeremy.bennett@embecosm.com>
+
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# A script to extract performance data from memcpy perf runs
+
+set -u
+
+usage () {
+    cat <<EOF
+Usage ./extract-top-level-funcs.sh : Extract list of top level functions
+          --resfile <file>         : Target file to extract
+          [--cutoff <num>]         : Percentage at which to stop showing
+                                     results (default 1)
+          [--total|--self]         : Select based on total (self + children)
+                                     or just self.  Default "total"
+          [--omit-empty]           : Do not show results if self is 0.00
+          [--md | --csv]           : Output results in Markdown (default) or
+                                     CSV
+EOF
+}
+
+topdir="$(cd $(dirname $(dirname $(dirname $(readlink -f $0)))) ; pwd)"
+memcpydir=${topdir}/tooling/memcpy-benchmarks
+
+# Default values
+resfile=
+cutoff=1
+dototal=true
+format="--md"
+omit_empty=false
+
+set +u
+until
+  opt="$1"
+  case "${opt}"
+  in
+      --resfile)
+	  shift
+	  resfile="$(readlink -f $1)"
+	  ;;
+      --cutoff)
+	  shift
+	  cutoff="$1"
+	  ;;
+      --total)
+	  dototal=true
+	  ;;
+      --self)
+	  dototal=false
+	  ;;
+      --omit-empty)
+	  omit_empty=true
+	  ;;
+      --md|--csv)
+	  format="$1"
+	  ;;
+      --help)
+	  usage
+	  exit 0
+	  ;;
+      ?*)
+	  usage
+	  exit 1
+	  ;;
+      *)
+	  ;;
+  esac
+[ "x${opt}" = "x" ]
+do
+  shift
+done
+set -u
+
+if [[ "x${resfile}" == "x" ]]
+then
+    echo "ERROR: --resfile required."
+    usage
+    exit 1
+fi
+
+# Temporary file, so we can sort the results
+tmpf=$(mktemp extract-top-level-funcs-XXXXXX)
+
+case "${format}"
+in
+    --md)
+	printf "| %8s | %8s | %-45s |\n" "Children" "Self" "Function/address"
+	printf "| %8s | %8s | %-45s |\n" "-------:" "-------:" \
+	       ":--------------------------------------------"
+	;;
+    --csv)
+	printf '"%s","%s","%s"\n' "Children" "Self" "Function/address"
+esac
+
+while IFS='' read -r line
+do
+    if (echo "${line}" | grep -q '\[\.\] [^[:space:]]\+$')
+    then
+	# Extract the three fields of interest
+	pctot=$(echo "${line}" | \
+		  sed -e 's/^[[:space:]]\+\([[:digit:]]\+\...\)%.*$/\1/')
+	pcself=$(echo "${line}" | \
+		  sed -e 's/^[[:space:]]\+[[:digit:]]\+\...%[[:space:]]\+\([[:digit:]]\+\...\)%.*$/\1/')
+	func=$(echo "${line}" | sed -e 's/^.*\[\.\] \([^[:space:]]\+\)$/\1/')
+
+	# Print fields of interest
+	if ${dototal}
+	then
+	    selector="${pctot}"
+	else
+	    selector="${pcself}"
+	fi
+
+	if [[ "$(echo "${selector}" | sed -e 's/\...$//')" -ge ${cutoff} ]]
+	then
+	    if ! ${omit_empty} || [[ "${pcself}" != "0.00" ]]
+	    then
+		case "${format}"
+		in
+		    --md)
+			func=$(echo "\`${func}\`")
+			printf "| %8s | %8s | %-45s |\n" "${pctot}" \
+			       "${pcself}" "${func}" >> ${tmpf}
+			;;
+		    --csv)
+			printf '"%s","%s","%s"\n' "${pctot}" \
+			       "${pcself}" "${func}" >> ${tmpf}
+			;;
+		esac
+	    fi
+	fi
+
+	# If pctot is less than the cutoff, then we definitely cannot have any
+	# more useful data (since we are ordered on pctot, and pcself can be no
+	# greater than pctot)
+	if [[ "$(echo "${pctot}" | sed -e 's/\...$//')" -lt ${cutoff} ]]
+	then
+	    # Sort the results if necessary
+	    if ${dototal}
+	    then
+		cat < ${tmpf}
+	    else
+		# Need to sort
+		case "${format}"
+		in
+		    --md)
+			sort -nr -t'|' -k3  < ${tmpf}
+			;;
+		    --csv)
+			sort -nr -t'"' -k4  < ${tmpf}
+			;;
+		esac
+	    fi
+
+	    rm ${tmpf}
+	    exit 0
+	fi
+    fi
+done < ${resfile}
diff --git a/memcpy-benchmarks/profile-all-funcs.sh b/memcpy-benchmarks/profile-all-funcs.sh
new file mode 100755
index 0000000..2fe1a35
--- /dev/null
+++ b/memcpy-benchmarks/profile-all-funcs.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Embecosm Limited <www.embecosm.com>
+# Contributor Jeremy Bennett <jeremy.bennett@embecosm.com>
+
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# A script to extract function performance data from memcpy perf runs
+
+set -u
+
+usage () {
+    cat <<EOF
+Usage ./profile-all-funcs.sh  : Extract function performance data
+          [--resdir <dir>]    : Directory with the results.  Default
+                                "res-baseline"
+	  [--type <str>]      : What type of result to look at:
+                                scalar (default), vector-small or vector-large.
+          [--total|--self]    : Select based on total (self + children) or just
+                                self.  Default "total"
+          [--funclist <list>] : Space separated list of functions to profile,
+                                default
+                                "helper_lookup_tb_ptr cpu_get_tb_cpu_state"
+EOF
+}
+
+topdir="$(cd $(dirname $(dirname $(dirname $(readlink -f $0)))) ; pwd)"
+memcpydir=${topdir}/tooling/memcpy-benchmarks
+
+# Default values
+resdir="${memcpydir}/res-baseline"
+restype="scalar"
+dototal=true
+funclist="helper_lookup_tb_ptr cpu_get_tb_cpu_state"
+
+set +u
+until
+  opt="$1"
+  case "${opt}"
+  in
+      --resdir)
+	  shift
+	  resdir="$(cd $(readlink -f $1) ; pwd)"
+	  ;;
+      --type)
+	  shift
+	  case "$1"
+	  in
+	      scalar|vector-small|vector-large)
+		  restype="$1"
+		  ;;
+	      *)
+		  echo "ERROR: Unknown results type: \"$1\""
+		  usage
+		  exit 1
+		  ;;
+	  esac
+	  ;;
+      --total)
+	  dototal=true
+	  ;;
+      --self)
+	  dototal=false
+	  ;;
+      --funclist)
+	  shift
+	  funclist="$1"
+	  ;;
+      --help)
+	  usage
+	  exit 0
+	  ;;
+      ?*)
+	  usage
+	  exit 1
+	  ;;
+      *)
+	  ;;
+  esac
+[ "x${opt}" = "x" ]
+do
+  shift
+done
+set -u
+
+# Temporary files for intermediaries (absolute paths, since we "cd" below)
+tmpf="$(mktemp --tmpdir profile-all-funcs-XXXXXX)"
+tmpcsv="$(mktemp --tmpdir profile-all-funcs-XXXXXX.csv)"
+
+# Find out the sizes
+cd ${resdir}
+dlens=$(ls -1 prof-${restype}-*.res | \
+	    sed -e "s/^prof-${restype}-//" -e 's/\.res$//' | sort -n)
+
+# Build up the results in a list
+declare -A reslist
+for f in ${funclist}
+do
+    reslist[${f}]="%${f}"
+done
+res_title="%Size"
+
+# Extract the data
+cd ${memcpydir}
+for l in ${dlens}
+do
+    res_title="${res_title}#${l}"
+    ./extract-top-level-funcs.sh --md \
+	--resfile ${resdir}/prof-${restype}-${l}.res > ${tmpf}
+
+    for f in ${funclist}
+    do
+	# Pick the column to report: 1st is total ("Children"), 2nd is "Self"
+	if ${dototal}
+	then
+	    pc=$(grep ${f} < ${tmpf} | \
+		 sed -n -e 's/|[[:space:]]\+\([^[:space:]]\+\).*$/\1/p')
+	else
+	    pc=$(grep ${f} < ${tmpf} | \
+		 sed -n -e 's/|[^|]\+|[[:space:]]\+\([^[:space:]]\+\).*$/\1/p')
+	fi
+
+	if [[ "x${pc}" == "x" ]]
+	then
+	    res="0.0000"
+	elif [[ ${pc} == "100.00" ]]
+	then
+	    res="1.0000"
+	else
+	    intpart=$(echo "${pc}" | \
+			  sed -e 's/\([[:digit:]]\+\)\.[[:digit:]]\+$/\1/')
+	    fracpart=$(echo "${pc}" | \
+			   sed -e 's/[[:digit:]]\+\.\([[:digit:]]\+\)$/\1/')
+	    res=$(printf "0.%02d%2s" "${intpart}" "${fracpart}")
+
+	fi
+	reslist[${f}]="${reslist[${f}]}#${res}"
+    done
+done
+
+# Print it all out
+res_title="${res_title}%"
+echo "${res_title}" | sed -e 's/%/"/g' -e 's/#/","/g' > ${tmpcsv}
+for f in ${funclist}
+do
+    reslist[${f}]="${reslist[${f}]}%"
+    echo "${reslist[${f}]}" | sed -e 's/%/"/g' -e 's/#/","/g' >> ${tmpcsv}
+done
+
+csvtool transpose ${tmpcsv}
+
+rm -f ${tmpf}
+rm -f ${tmpcsv}
diff --git a/memcpy-benchmarks/run-perf.sh b/memcpy-benchmarks/run-perf.sh
new file mode 100755
index 0000000..5200821
--- /dev/null
+++ b/memcpy-benchmarks/run-perf.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Embecosm Limited <www.embecosm.com>
+# Contributor Jeremy Bennett <jeremy.bennett@embecosm.com>
+
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# A script to run Linux perf on lots of memcpy benchmarks
+
+set -u
+
+usage () {
+    cat <<EOF
+Usage: ./run-perf.sh       : Run Linux perf on memcpy benchmarks
+          [--bytes <num>]  : Total bytes to copy.  Default 1,000,000,000
+          [--resdir <dir>] : Directory in which to place the results.  Default
+                             is "res-baseline" in the directory holding this
+			     script.
+          [--sizes <list>] : Space separated list of the data sizes to use when
+                             creating results.  Default list is all the powers
+                             of 2, 3, 5 and 7 up to 5^6
+
+The results will be three sets of files of the form "prof-<type>-<size>.res",
+where type is one of "scalar", "vector-small" or "vector-large", and size, is
+the size of the data block copied on each iteration.
+
+The total number of iterations for each test is determined by the number given
+in the "--bytes" argument divided by the size of the data block being used for
+the run.
+
+"perf record" is run using DWARF to determine the call graph.  This gives
+accurate results, but is slow.  Expect each iteration to take of the order of
+20 minutes on a decent server.
+EOF
+}
+
+memcpydir="$(cd $(dirname $(readlink -f $0)) ; pwd)"
+
+# Default values
+bytes="1000000000"
+resdir="${memcpydir}/res-baseline"
+data_lens="  1 \
+             2 \
+             3 \
+             4 \
+             5 \
+             7 \
+             8 \
+             9 \
+            16 \
+            25 \
+            27 \
+            32 \
+            49 \
+            64 \
+            81 \
+           125 \
+           128 \
+           243 \
+           256 \
+           343 \
+           512 \
+           625 \
+           729 \
+          1024 \
+          2048 \
+          2401 \
+          3125 \
+          4096 \
+          6561 \
+          8192 \
+         15625"
+
+set +u
+until
+  opt="$1"
+  case "${opt}"
+  in
+      --bytes)
+	  shift
+	  bytes="$1"
+	  ;;
+      --resdir)
+	  shift
+	  resdir="$(cd $(readlink -f $1) ; pwd)"
+	  ;;
+      --sizes)
+	  shift
+	  data_lens="$1"
+	  ;;
+      --help)
+	  usage
+	  exit 0
+	  ;;
+      ?*)
+	  usage
+	  exit 1
+	  ;;
+      *)
+	  ;;
+  esac
+[ "x${opt}" = "x" ]
+do
+  shift
+done
+set -u
+
+for dlen in ${data_lens}
+do
+    iters=$((bytes / dlen))
+    echo "=== Data len ${dlen}, iterations ${iters}"
+    echo " - scalar record"
+    time perf record -g -m 16M --call-graph dwarf,4096 -- \
+	 qemu-riscv64 -cpu "rv64,v=true,vlen=128" \
+	 smemcpy.exe ${dlen} ${iters} > /dev/null
+    echo " - scalar report"
+    time perf report --stdio --call-graph "graph,0.1,caller,function" \
+	 -k /tmp/vmlinux | \
+	sed -e 's/[[:space:]]*$//' > ${resdir}/prof-scalar-${dlen}.res
+    echo " - small vector record"
+    time perf record -g -m 16M --call-graph dwarf -- \
+	 qemu-riscv64 -cpu "rv64,v=true,vlen=128" \
+	 vmemcpy1.exe ${dlen} ${iters} > /dev/null
+    echo " - small vector report"
+    time perf report --stdio --call-graph "graph,0.1,caller,function" \
+	 -k /tmp/vmlinux | \
+	sed -e 's/[[:space:]]*$//' > ${resdir}/prof-vector-small-${dlen}.res
+    echo " - large vector record"
+    time perf record -g -m 16M --call-graph dwarf,4096 -- \
+	 qemu-riscv64 -cpu "rv64,v=true,vlen=1024" \
+	 vmemcpy8.exe ${dlen} ${iters} > /dev/null
+    echo " - large vector report"
+    time perf report --stdio --call-graph "graph,0.1,caller,function" \
+	 -k /tmp/vmlinux | \
+	sed -e 's/[[:space:]]*$//' > ${resdir}/prof-vector-large-${dlen}.res
+done
diff --git a/run-spec-pop2.sh b/run-spec-pop2.sh
new file mode 100755
index 0000000..1e7e72b
--- /dev/null
+++ b/run-spec-pop2.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Embecosm Limited <www.embecosm.com>
+# Contributor Jeremy Bennett <jeremy.bennett@embecosm.com>
+
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# A script to run Linux perf on the SPEC 628.pop2_s benchmark.
+
+set -u
+
+usage () {
+    cat <<EOF
+Usage ./run-spec-pop2.sh        : Run the SPEC 2017 628.pop2_s benchmark under
+                                  perf
+          [--reportfile <file>] : Name of the report file (relative to tooling
+                                  directory).  Default: prof-628.pop2_s.res
+          [--specdir <dir>]     : Use this as the directory with the SPEC CPU
+                                  2017 installation.  Default
+                                  ${topdir}/install/spec-2024-08-14-08-41-03
+EOF
+}
+
+topdir="$(cd $(dirname $(dirname $(readlink -f $0))) ; pwd)"
+tooldir="${topdir}/tooling"
+specdir="${topdir}/install/spec-2024-08-14-08-41-03"
+reportfile=prof-628.pop2_s.res
+
+set +u
+until
+  opt="$1"
+  case "${opt}" in
+      --reportfile)
+	  shift
+	  reportfile="$1"  # kept relative: joined to the tooling dir on output
+	  ;;
+      --specdir)
+	  shift
+	  specdir="$(cd $(readlink -f $1) ; pwd)"
+	  ;;
+      --help)
+	  usage
+	  exit 0
+	  ;;
+      ?*)
+	  usage
+	  exit 1
+	  ;;
+      *)
+	  ;;
+  esac
+[ "x${opt}" = "x" ]
+do
+  shift
+done
+set -u
+
+export PATH="${topdir}/install/bin:${PATH}"
+
+speccpudir="${specdir}/benchspec/CPU"
+cd ${speccpudir}/628.pop2_s/run/run_base_test_riscv64-qemu-default.0000 || exit 1
+
+echo "Recording..."
+time perf record -g -m 16M --call-graph dwarf,4096 -- qemu-riscv64 \
+     -cpu "rv64,zicsr=true,v=true,vext_spec=v1.0,zfh=true,zvfh=true" \
+     ./speed_pop2_base.riscv64-qemu-default \
+     > pop2_s-perf.out 2>> pop2_s-perf.err
+echo "Generating report..."
+time perf report --stdio --call-graph "graph,0.1,caller,function" \
+     -k /tmp/vmlinux | \
+    sed -e 's/[[:space:]]*$//' > ${topdir}/tooling/${reportfile}