diff --git a/debian/changelog b/debian/changelog
index fe12014e..4bc42967 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,6 +1,7 @@
 llvm-toolchain-19 (1:19.1.0~++rc3-1~exp2) UNRELEASED; urgency=medium
 
   * Add a .gitignore file with all toplevel upstream files and directories.
+  * Add some memory tracking, getting some information on memory hogs.
 
  -- Matthias Klose <doko@ubuntu.com>  Tue, 27 Aug 2024 11:47:25 +0200
 
diff --git a/debian/control b/debian/control
index 216285dd..046d46a3 100644
--- a/debian/control
+++ b/debian/control
@@ -18,7 +18,7 @@ Build-Depends: debhelper (>= 10.0), cmake, ninja-build,
     zlib1g-dev, libzstd-dev,
     g++-multilib [amd64 i386 kfreebsd-amd64 mips64 mips64el mipsel powerpc ppc64 s390 s390x sparc sparc64 x32],
     libjs-mathjax, python3-myst-parser | python3-recommonmark, python3-pexpect,
-    doxygen,
+    doxygen, python3:any, python3-psutil, python3-matplotlib,
     ocaml-base [amd64 arm64 armhf ppc64el riscv64 s390x] | ocaml-nox [amd64 arm64 armhf ppc64el riscv64 s390x],
     ocaml-findlib [amd64 arm64 armhf ppc64el riscv64 s390x],
     libctypes-ocaml-dev [amd64 arm64 armhf ppc64el riscv64 s390x],
diff --git a/debian/control.in b/debian/control.in
index 1635ad33..87bac8fd 100644
--- a/debian/control.in
+++ b/debian/control.in
@@ -18,7 +18,7 @@ Build-Depends: debhelper (>= 10.0), cmake, ninja-build,
     zlib1g-dev, libzstd-dev,
     g++-multilib [amd64 i386 kfreebsd-amd64 mips64 mips64el mipsel powerpc ppc64 s390 s390x sparc sparc64 x32],
     libjs-mathjax, python3-myst-parser | python3-recommonmark, python3-pexpect,
-    doxygen,
+    doxygen,@USAGE_BUILD_DEP@
     ocaml-base [amd64 arm64 armhf ppc64el riscv64 s390x] | ocaml-nox [amd64 arm64 armhf ppc64el riscv64 s390x],
     ocaml-findlib [amd64 arm64 armhf ppc64el riscv64 s390x],
     libctypes-ocaml-dev [amd64 arm64 armhf ppc64el riscv64 s390x],
diff --git a/debian/rules b/debian/rules
index fd19a3c6..ce9d77ca 100755
--- a/debian/rules
+++ b/debian/rules
@@ -71,7 +71,7 @@ packages := $(shell dh_listpackages)
 # flang has some memory hogs, of up to 6.2gb per process. Limit the parallel jobs
 # based on the available memory
 ifneq (,$(filter flang-$(LLVM_VERSION), $(packages)))
-  MEM_PER_CPU=4500
+  MEM_PER_CPU=6000
 else
   MEM_PER_CPU=2100
 endif
@@ -79,6 +79,12 @@ NJOBS := $(shell mt=`awk '/^(MemAvail|SwapFree)/ { mt += $$2 } END {print mt}' /
 		awk -vn=$(NCPUS) -vmt=$$mt -vm=$(MEM_PER_CPU) \
 		  'END { mt/=1024; n2 = int(mt/m); print n==1 ? 1 : n2<n+1 ? n2 : n+1}' < /dev/null)
 
+# Recursive "=" so that $@ expands per target; lighter alternative: TIME_COMMAND = /bin/time -v
+TIME_COMMAND = $(CURDIR)/debian/usage-wrapper.py -j $(NJOBS) --base-memory -m 4.0 -o usage-$(notdir $@)-$(DEB_HOST_ARCH).svg
+ifneq (,$(findstring usage-wrapper, $(TIME_COMMAND)))
+  USAGE_BUILD_DEP = $(EMPTY) python3:any, python3-psutil, python3-matplotlib,
+endif
+
 DH_VERSION := $(shell dpkg -s debhelper | grep '^Version' | awk '{print $$2}')
 
 include /usr/share/dpkg/architecture.mk
@@ -646,6 +652,7 @@ stamps/preconfigure:
 		f2=$$(echo $$f | sed 's/\.in$$//;s/X\.Y/$(LLVM_VERSION)/'); \
 		echo "$$f => $$f2"; \
 		sed -e 's|@DEB_HOST_MULTIARCH@|$(DEB_HOST_MULTIARCH)|g' \
+			-e "s|@USAGE_BUILD_DEP@|$(USAGE_BUILD_DEP)|g" \
 			-e "s|@BRANCH_NAME@|$(BRANCH_NAME)|g" \
 			-e "s|@OCAML_STDLIB_DIR@|$(OCAML_STDLIB_DIR)|g" \
 			-e "s|@LLVM_EPOCH@|$(LLVM_EPOCH)|g" \
@@ -873,7 +880,7 @@ stamps/debian-full-build:
 	echo "Using cmake: $(CMAKE_BIN)"
 # linker hack so stage2 can link against stage1 libs at runtime
 	LD_LIBRARY_PATH=$(STAGE_1_LIB_DIR):$$LD_LIBRARY_PATH \
-	VERBOSE=1 $(PRE_PROCESS) $(CMAKE_BIN) --build $(TARGET_BUILD) -j $(NJOBS) --target stage2 || cat build-llvm/tools/clang/stage2-bins/CMakeFiles/CMakeOutput.log
+	VERBOSE=1 $(TIME_COMMAND) $(PRE_PROCESS) $(CMAKE_BIN) --build $(TARGET_BUILD) -j $(NJOBS) --target stage2 || cat build-llvm/tools/clang/stage2-bins/CMakeFiles/CMakeOutput.log
 
 # Check the stage 2 build worked
 	if ! readelf --string-dump .comment  $(TARGET_BUILD_STAGE2)/bin/clang 2>&1|grep -q "clang version"; then \
@@ -927,7 +934,7 @@ stamps/debian-libclc-build:
 	-DLLVM_CMAKE_DIR=$(STAGE_2_BIN_DIR)/../ \
 	$(LIBCLC_LLVM_SPIRV) \
 	-DLIBCLC_TARGETS_TO_BUILD="$(LIBCLC_TARGETS_TO_BUILD)"; \
-	LD_LIBRARY_PATH=$$LD_LIBRARY_PATH:$(STAGE_2_LIB_DIR) ninja -j $(NJOBS) $(VERBOSE)
+	LD_LIBRARY_PATH=$$LD_LIBRARY_PATH:$(STAGE_2_LIB_DIR) $(TIME_COMMAND) ninja -j $(NJOBS) $(VERBOSE)
 ifndef LLVM_SPIRV
 	echo "libclc built without SPIRV (.spv) outputs because llvm-spirv wasn't found"
 endif
@@ -967,7 +974,7 @@ build-wasm/compiler-rt-%:
 		-DCOMPILER_RT_USE_BUILTINS_LIBRARY=ON \
 		-DCOMPILER_RT_DEFAULT_TARGET_TRIPLE=$(cpu)-unknown-unknown \
 		-DCOMPILER_RT_OS_DIR=wasi
-	ninja -C "$@" -j $(NJOBS) $(VERBOSE)
+	$(TIME_COMMAND) ninja -C "$@" -j $(NJOBS) $(VERBOSE)
 
 ifeq ($(LIBCXX_WASM_ENABLE), no)
 build-wasm/libcxx-%-wasi: build-wasm/compiler-rt-%
@@ -1050,7 +1057,7 @@ build-wasm/libcxx-%-wasi: build-wasm/compiler-rt-%
 		-DLIBCXXABI_HAS_EXTERNAL_THREAD_API:BOOL=OFF \
 		-DLIBCXXABI_BUILD_EXTERNAL_THREAD_LIBRARY:BOOL=OFF \
 		-DLIBCXXABI_USE_LLVM_UNWINDER:BOOL=OFF
-	ninja -C "$@" -j $(NJOBS) $(VERBOSE)
+	$(TIME_COMMAND) ninja -C "$@" -j $(NJOBS) $(VERBOSE)
 endif
 
 # Build compiler-rt for wasm32 and wasm64. Build libcxx only for wasm32, as
@@ -1452,6 +1459,12 @@ endif
 # So, we remove this directory from the package
 	rm -fr $(CURDIR)/debian/libclang-rt-$(LLVM_VERSION)-dev/usr/lib/llvm-$(LLVM_VERSION)/lib/clang/$(LLVM_VERSION)/lib/wasi/
 
+	for svg in usage-*.svg; do \
+	  [ -f $$svg ] || continue; \
+	  xz -9v $$svg; \
+	  cp -p $$svg.xz debian/llvm-$(LLVM_VERSION)/usr/share/doc/llvm-$(LLVM_VERSION)/.; \
+	done
+
 stamps/repack_a_llvm_ir:
 ifeq (${LTO_ENABLE},yes)
 # with LTO, .a contains llvm ir instead of native code. So, recompile them
@@ -1603,5 +1616,6 @@ override_dh_auto_clean:
 	: # remove extra stamps
 	rm -f override_dh_auto_install
 	rm -rf stamps
+	rm -f usage-*.svg*
 
 .PHONY: override_dh_strip preconfigure
diff --git a/debian/usage-wrapper.py b/debian/usage-wrapper.py
new file mode 100755
index 00000000..ed474e80
--- /dev/null
+++ b/debian/usage-wrapper.py
@@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+
+import argparse
+import math
+import os
+import subprocess
+import sys
+import threading
+import time
+
+try:
+    import psutil
+except ImportError:
+    print(f'{sys.argv[0]}: the psutil module is required.', file=sys.stderr)
+    sys.exit(1)
+
+try:
+    import matplotlib.pyplot as plt
+    from matplotlib.lines import Line2D
+except ImportError:
+    plt = None
+
+
+def to_gigabyte(value):
+    return value / (1 << 30)  # bytes -> GiB (2**30 bytes)
+
+
+INTERVAL = 0.33  # default sampling interval in seconds (-f/--frequency)
+LW = 0.5  # line width (lw=) used for the plotted lines
+
+global_n = 0  # number of samples taken by record()
+global_cpu_data_sum = 0
+global_memory_data_sum = 0
+global_cpu_data_max = 0
+global_memory_data_min = to_gigabyte(psutil.virtual_memory().total)  # upper bound; record() lowers it
+global_memory_data_max = 0
+global_swap_data_min = to_gigabyte(psutil.swap_memory().total)  # upper bound; record() lowers it
+global_swap_data_max = 0
+global_disk_data_total = to_gigabyte(psutil.disk_usage('.').total)
+global_disk_data_start = to_gigabyte(psutil.disk_usage('.').used)  # disk usage at script start
+
+global_timestamps = []  # this and the next three lists are filled only without --summary-only
+global_cpu_data = []
+global_memory_data = []
+global_process_usage = []
+global_process_hogs = {}  # joined command line -> (peak RSS in GB, timestamp)
+
+process_name_map = {}  # process category name -> index of its stack in the plots
+lock = threading.Lock()  # NOTE(review): never acquired in the code visible here
+
+done = False  # set to True to make the record() loop exit
+start_ts = time.monotonic()
+cpu_count = psutil.cpu_count()
+
+special_processes = {'ld': 'gold',  # category -> fixed plot colour
+                     'WPA': 'deepskyblue',
+                     'WPA-stream-out': 'lightblue',  # as get_process_name()
+                     'ltrans': 'forestgreen',
+                     'as': 'coral',
+                     'GCC': 'gray',
+                     'clang': 'darkgray',
+                     'rust': 'brown',
+                     'go': 'hotpink',
+                     'dwz': 'limegreen',
+                     'rpm/dpkg': 'plum'}
+for i, k in enumerate(special_processes.keys()):
+    process_name_map[k] = i  # reserve the first stack slots, in dict order
+
+
+descr = 'Run command and measure memory and CPU utilization'
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument('command', metavar='command',
+                    help='Command', nargs=argparse.REMAINDER)  # rest of argv
+parser.add_argument('-c', '--command', dest='command1',
+                    help='command as a single argument')
+parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
+parser.add_argument('--summary-only', dest='summary_only',
+                    action='store_true',
+                    help='No plot, just a summary at the end')
+parser.add_argument('--base-memory', action='store_true',
+                    help='Adjust memory to include the system load')
+parser.add_argument('-s', '--separate-ltrans', action='store_true',
+                    help='Separate LTRANS processes in graph')
+parser.add_argument('-o', '--output', default='usage.svg',
+                    help='Path to output image (default: usage.svg)')
+parser.add_argument('-r', '--ranges',
+                    help='Plot only the selected time ranges '
+                    '(e.g. 20-30, 0-1000)')
+parser.add_argument('-t', '--title', help='Graph title')
+parser.add_argument('-m', '--memory-hog-threshold', type=float,
+                    help='Report about processes that consume the amount of '
+                    'memory (in GB)')
+parser.add_argument('-f', '--frequency', type=float,
+                    default=INTERVAL,
+                    help='Frequency of measuring (in seconds)')
+parser.add_argument('-j', '--jobs', type=int,
+                    default=cpu_count, dest='used_cpus',
+                    help='Scale up CPU data to used CPUs '
+                    'instead of available CPUs')
+args = parser.parse_args()
+
+if args.command1 and args.command:  # sys.exit(str): stderr + exit status 1
+    sys.exit(f'{sys.argv[0]}: either use -c "<shell command>", '
+             'or append the command')
+
+if not args.summary_only and plt is None:
+    sys.exit(f'{sys.argv[0]}: use --summary-only, '
+             'or install the matplotlib module')
+
+if args.used_cpus < 1:  # e.g. "-j 0": avoid dividing by zero below
+    args.used_cpus = cpu_count
+cpu_scale = cpu_count / args.used_cpus
+
+
+def get_process_name(proc):
+    # Map a process to its graph category; None means "not tracked".
+    name = proc.name()
+    cmdline = proc.cmdline()
+    if name in ('ld', 'ld.gold'):
+        return 'ld'
+    if name == 'lto1-wpa':
+        return 'WPA'
+    if name == 'lto1-wpa-stream':
+        return 'WPA-stream-out'
+    if name in ('cc1', 'cc1plus', 'cc1objc', 'f951', 'd21', 'go1', 'gnat1'):
+        return 'GCC'
+    for prefix in ('clang', 'rust'):
+        if name.startswith(prefix):
+            return prefix
+    if name in ('as', 'dwz', 'go'):
+        return name
+    if name == 'rpmbuild' or name.startswith('dpkg'):
+        return 'rpm/dpkg'
+    # anything else is only tracked when it runs with -fltrans
+    if '-fltrans' not in cmdline:
+        return None
+    if args.separate_ltrans:
+        return 'ltrans-%d' % proc.pid
+    return 'ltrans'
+
+
+def record_process_memory_hog(proc, memory, timestamp):
+    # Keep, per joined command line, the largest RSS seen and its timestamp.
+    threshold = args.memory_hog_threshold
+    if not threshold or memory < threshold:
+        return
+    key = ' '.join(proc.cmdline())
+    previous = global_process_hogs.get(key)
+    if previous is None or memory > previous[0]:
+        global_process_hogs[key] = (memory, timestamp)
+
+
+def record():  # sampling loop; started as a thread at module level
+    global global_n, global_cpu_data_sum, global_cpu_data_max
+    global global_memory_data_sum, global_memory_data_min
+    global global_memory_data_max
+    global global_swap_data_min, global_swap_data_max
+
+    active_pids = {}  # pid -> Process object kept from earlier samples
+    while not done:  # `done` is set by the main script after the command ended
+        timestamp = time.monotonic() - start_ts
+        used_cpu = psutil.cpu_percent(interval=args.frequency) * cpu_scale  # per psutil docs this blocks for the interval
+        used_memory = to_gigabyte(psutil.virtual_memory().used)  # system-wide
+        used_swap = to_gigabyte(psutil.swap_memory().used)  # system-wide
+        if not args.summary_only:
+            global_timestamps.append(timestamp)
+            global_memory_data.append(used_memory)
+            global_cpu_data.append(used_cpu)
+
+        global_n += 1
+        global_cpu_data_sum += used_cpu
+        global_memory_data_sum += used_memory
+        global_cpu_data_max = max(global_cpu_data_max, used_cpu)
+        global_memory_data_min = min(global_memory_data_min, used_memory)
+        global_memory_data_max = max(global_memory_data_max, used_memory)
+        global_swap_data_min = min(global_swap_data_min, used_swap)
+        global_swap_data_max = max(global_swap_data_max, used_swap)
+
+        entry = {}  # category name -> {'memory': GB, 'cpu': percent} for this sample
+        seen_pids = set()
+        for proc in psutil.Process().children(recursive=True):  # descendants of this script
+            try:
+                memory = to_gigabyte(proc.memory_info().rss)
+                record_process_memory_hog(proc, memory, timestamp)
+                name = get_process_name(proc)
+                if name:
+                    seen_pids.add(proc.pid)
+                    if proc.pid not in active_pids:
+                        active_pids[proc.pid] = proc
+                    else:
+                        proc = active_pids[proc.pid]  # NOTE(review): presumably so cpu_percent() has a previous call to compare with - confirm
+                    cpu = proc.cpu_percent() / args.used_cpus
+                    if name not in process_name_map:
+                        length = len(process_name_map)
+                        process_name_map[name] = length  # new category gets the next free stack index
+                    if name not in entry:
+                        entry[name] = {'memory': 0, 'cpu': 0}
+                    entry[name]['cpu'] += cpu
+                    # FIXME: ignore WPA streaming memory - COW makes it bogus
+                    if name != 'WPA-stream-out':
+                        entry[name]['memory'] += memory
+            except Exception:
+                # the process can be gone
+                pass
+        for pid in list(active_pids.keys()):  # forget processes not seen in this sample
+            if pid not in seen_pids:
+                del active_pids[pid]
+        if args.verbose:
+            print(entry, flush=True)
+        if not args.summary_only:
+            global_process_usage.append(entry)
+
+
+def stack_values(process_usage, key):
+    """Turn per-sample usage dicts into one series per process category.
+
+    The result is indexed by the slots of process_name_map; each series
+    holds sample[name][key] for every sample, or 0 where name is absent.
+    """
+    stacks = [[] for _ in process_name_map]
+    for sample in process_usage:
+        for name, slot in process_name_map.items():
+            stacks[slot].append(sample[name][key] if name in sample else 0)
+    return stacks
+
+
+def get_footnote():
+    # First summary line: host name, CPU load and memory figures.
+    hostname = os.uname()[1].split('.')[0]
+    # global_n is still 0 when record() never completed a sample; report 0
+    # then instead of raising ZeroDivisionError from within summary().
+    cpu_average = global_cpu_data_sum / global_n if global_n else 0
+    total_mem = to_gigabyte(psutil.virtual_memory().total)
+    return (f'host: {hostname}; CPUs: {args.used_cpus}/{cpu_count};'
+            f' CPU avg: {cpu_average:.0f}%;'
+            f' CPU max: {global_cpu_data_max:.0f}%;'
+            f' base memory: {global_memory_data_min:.1f} GB;'
+            f' peak memory: {global_memory_data_max:.1f} GB;'
+            f' total memory: {total_mem:.1f} GB')
+
+
+def get_footnote2():
+    # Second summary line: swap and disk figures (disk = filesystem of '.').
+    swap_total = to_gigabyte(psutil.swap_memory().total)
+    disk_now = to_gigabyte(psutil.disk_usage('.').used)
+    fields = (
+        f'swap peak/total: {global_swap_data_max:.1f}/{swap_total:.1f} GB',
+        f'disk start/end/total: {global_disk_data_start:.1f}/{disk_now:.1f}/{global_disk_data_total:.1f} GB',
+        f'disk delta: {disk_now - global_disk_data_start:.1f} GB',
+    )
+    return '; '.join(fields)
+
+
+def generate_graph(time_range):
+    """Save an SVG plot of CPU and memory usage.
+
+    time_range is None (whole run) or a [start, end] pair in seconds.
+    """
+    timestamps, cpu_data, memory_data, process_usage = [], [], [], []
+
+    # filter data by timestamp
+    for i, ts in enumerate(global_timestamps):
+        if not time_range or time_range[0] <= ts <= time_range[1]:
+            timestamps.append(ts)
+            cpu_data.append(global_cpu_data[i])
+            memory_data.append(global_memory_data[i])
+            process_usage.append(global_process_usage[i])
+
+    if not timestamps:
+        if args.verbose:
+            print('No data for range: %s' % str(time_range))
+        return
+
+    peak_memory = max(memory_data)
+
+    fig, (cpu_subplot, mem_subplot) = plt.subplots(2, sharex=True)
+    title = args.title if args.title else ''
+    if time_range:
+        title += ' (%d-%d s)' % (time_range[0], time_range[1])
+    fig.suptitle(title, fontsize=17)
+    fig.set_figheight(5)
+    fig.set_figwidth(10)
+    # scale cpu axis, capped at 200%
+    local_peak_cpu = max(cpu_data)
+    cpu_ylimit = min((local_peak_cpu // 10) * 11 + 5, 200)
+    cpu_subplot.set_title('CPU usage')
+    cpu_subplot.set_ylabel('%')
+    cpu_subplot.plot(timestamps, cpu_data, c='blue', lw=LW, label='total')
+    cpu_subplot.set_ylim([0, cpu_ylimit])
+    cpu_subplot.axhline(color='r', alpha=0.5, y=100.0 / args.used_cpus, lw=LW,
+                        linestyle='dotted', label='single core')
+    cpu_subplot.set_xlim(left=time_range[0] if time_range else 0)
+    cpu_subplot.grid(True)
+
+    mem_subplot.plot(timestamps, memory_data, c='blue', lw=LW, label='total')
+    mem_subplot.set_title('Memory usage')
+    mem_subplot.set_ylabel('GB')
+    mem_subplot.set_xlabel('time')
+
+    # scale it to a reasonable limit
+    limit = 1
+    while peak_memory > limit:
+        limit *= 2
+    if limit > 2 and limit * 0.75 >= peak_memory:
+        limit = int(limit * 0.75)
+    mem_subplot.set_ylim([0, 1.1 * limit])
+    mem_subplot.set_yticks(range(0, limit + 1, math.ceil(limit / 8)))
+    mem_subplot.grid(True)
+
+    # pyplot.get_cmap() rather than plt.cm.get_cmap(): the latter was
+    # deprecated in matplotlib 3.7 and removed in 3.9
+    colors = list(plt.get_cmap('tab20c').colors * 100)
+    for name, color in special_processes.items():
+        if name in process_name_map:
+            colors[process_name_map[name]] = color
+
+    mem_stacks = stack_values(process_usage, 'memory')
+    cpu_stacks = stack_values(process_usage, 'cpu')
+    if mem_stacks:
+        mem_subplot.stackplot(timestamps, mem_stacks, colors=colors)
+        cpu_subplot.stackplot(timestamps, cpu_stacks, colors=colors)
+
+        # generate custom legend
+        colors = special_processes.values()
+        custom_lines = [Line2D([0], [0], color=x, lw=5) for x in colors]
+        custom_lines.insert(0, Line2D([0], [0], color='b', lw=LW))
+        custom_lines.insert(0, Line2D([0], [0], color='r', alpha=0.5,
+                                      linestyle='dotted', lw=LW))
+        names = ['single core', 'total'] + list(special_processes.keys())
+        fig.legend(custom_lines, names, loc='right', prop={'size': 6})
+
+    filename = args.output
+    if time_range:
+        tr = '-%d-%d' % (time_range[0], time_range[1])
+        filename = os.path.splitext(args.output)[0] + tr + '.svg'
+    plt.subplots_adjust(bottom=0.15)
+    plt.figtext(0.1, 0.04, get_footnote(), fontsize='small')
+    plt.figtext(0.1, 0.01, get_footnote2(), fontsize='small')
+    plt.savefig(filename)
+    plt.close(fig)  # a figure is created per call; do not keep them open
+    if args.verbose:
+        print('Saving plot to %s' % filename)
+
+
+def summary():
+    # Print both footnotes, then any recorded memory hogs, largest first.
+    for footnote in (get_footnote, get_footnote2):
+        print(f'SUMMARY: {footnote()}')
+    if global_process_hogs:
+        print(f'PROCESS MEMORY HOGS (>={args.memory_hog_threshold:.1f} GB):')
+        hogs = sorted(global_process_hogs.items(), key=lambda kv: -kv[1][0])
+        for cmdline, (memory, ts) in hogs:
+            print(f'  {memory:.1f} GB: {ts:.1f} s: {cmdline}')
+
+
+# Parse the ranges before starting the recorder: the thread is not a
+# daemon, so an error raised after its start would leave it running.
+ranges = []
+if args.ranges:
+    for r in args.ranges.split(','):
+        parts = r.split('-')
+        if len(parts) != 2:
+            sys.exit(f'{sys.argv[0]}: invalid range {r!r}, expected START-END')
+        ranges.append([int(x) for x in parts])
+
+if args.verbose:
+    print('Ranges are %s' % str(ranges))
+    print('Running command', flush=True)
+
+thread = threading.Thread(target=record, args=())
+thread.start()
+
+cp = None
+try:
+    cp = subprocess.run(args.command1 or args.command,  # -c runs via the shell
+                        shell=bool(args.command1))
+except KeyboardInterrupt:
+    rv = 2
+finally:
+    done = True  # makes the record() loop exit after its current sample
+    thread.join()
+    summary()
+    if global_memory_data:
+        min_memory = min(global_memory_data)
+        if not args.base_memory:  # plot memory relative to the lowest sample
+            global_memory_data = [x - min_memory for x in global_memory_data]
+        if plt:
+            generate_graph(None)
+            for r in ranges:
+                generate_graph(r)
+    if cp:
+        rv = cp.returncode
+
+sys.exit(rv)