NOTE: As of relay-apps-0.90, the 'relay-app' pseudo-framework has been removed and all examples have been made standalone. i.e. for the time being, ignore the 'relay_app_XXX' parts of the code on this page and look at the examples directly. The website will be updated to reflect these changes soon...
kleak example - using relayfs to track kmalloc/kfree
Using relayfs directly
Logging text using klog
Logging binary data using klog
Logging data using kprobes
tprintk example - using relayfs and kprobes to intercept and log printk output
Logging data using kprobes
Total kmallocs: 631 Total kfrees: 596 Total bytes kmalloced: 731242 [1252960 with slack] Total bytes kfreed: 1273920 Total kmallocs by caller: c01a3a94 [stat_open+0x0]: 1 c017bff8 [sys_poll+0x54]: 2 c0180b12 [d_alloc+0xf8]: 6 c01cb138 [__jbd_kmalloc+0x192]: 1 c028f727 [alloc_skb+0x28]: 327 c01948d0 [load_elf_binary+0x47]: 2 c011e015 [do_fork+0x100]: 40 c029019f [pskb_expand_head+0x55]: 2 c0194327 [load_elf_interp+0x5f]: 2 c018464b [alloc_fdset+0xb7]: 4 c01841d2 [alloc_fd_array+0x2b]: 2 c017b8cd [select_bits_alloc+0x22]: 240 c019536e [load_elf_binary+0x1d]: 2 Total kfrees by caller: c018a582 [single_release+0xb9e]: 1 c01c604d [journal_commit_transaction+0x32]: 1 c0184413 [free_fdtable_rcu+0xa0d]: 2 c0194878 [load_elf_binary+0xd3]: 2 c019435f [load_elf_interp+0xa8]: 2 c0184429 [free_fdtable_rcu+0xef]: 2 c0172d78 [do_execve+0xe9]: 2 c018a253 [seq_release+0x1b8]: 2 c013141c [rcu_do_batch+0x23]: 2 c018a24b [seq_release+0x2c]: 2 c028fa33 [kfree_skbmem+0x1b]: 329 c0194cf4 [load_elf_binary+0x13]: 2 c0194ce8 [load_elf_binary+0x524]: 2 c011c166 [free_task+0x518]: 2 c024fb58 [put_io_context+0x16]: 1 c018441e [free_fdtable_rcu+0x68]: 2 c017bb24 [sys_select+0xde]: 240 Total kmalloced bytes by caller: c01a3a94 [stat_open+0x0]: 4096 [4096] c017bff8 [sys_poll+0x54]: 32 [64] c0180b12 [d_alloc+0xf8]: 256 [384] c01cb138 [__jbd_kmalloc+0x192]: 92 [128] c028f727 [alloc_skb+0x28]: 696160 [1213952] c01948d0 [load_elf_binary+0x47]: 480 [512] c011e015 [do_fork+0x100]: 4000 [5120] c029019f [pskb_expand_head+0x55]: 832 [1024] c0194327 [load_elf_interp+0x5f]: 320 [512] c018464b [alloc_fdset+0xb7]: 16384 [16384] c01841d2 [alloc_fd_array+0x2b]: 2048 [2048] c017b8cd [select_bits_alloc+0x22]: 6504 [8672] c019536e [load_elf_binary+0x1d]: 38 [64] Total kfreed bytes by caller: c018a582 [single_release+0xb9e]: 32 c01c604d [journal_commit_transaction+0x32]: 4096 c0184413 [free_fdtable_rcu+0xa0d]: 8192 c0194878 [load_elf_binary+0xd3]: 512 c019435f [load_elf_interp+0xa8]: 512 c0184429 [free_fdtable_rcu+0xef]: 2048 c0172d78 [do_execve+0xe9]: 1024 c018a253 [seq_release+0x1b8]: 256 c013141c [rcu_do_batch+0x23]: 128 c018a24b [seq_release+0x2c]: 8192 c028fa33 [kfree_skbmem+0x1b]: 1214976 c0194cf4 [load_elf_binary+0x13]: 512 c0194ce8 [load_elf_binary+0x524]: 64 c011c166 [free_task+0x518]: 16384 c024fb58 [put_io_context+0x16]: 128 c018441e [free_fdtable_rcu+0x68]: 8192 c017bb24 [sys_select+0xde]: 8672 Unfreed kmallocs: sys_poll+0x54: 2 alloc_skb+0x28: 1 d_alloc+0xf8: 6 select_bits_alloc+0x22: 1 do_fork+0x100: 40 __jbd_kmalloc+0x192: 1 Unmalloced kfrees: put_io_context+0x16: 1 free_task+0x518: 1 kfree_skbmem+0x1b: 5 load_elf_binary+0xd3: 1 rcu_do_batch+0x23: 2 seq_release+0x1b8: 1 sys_select+0xde: 1
The lines highlighted red show a noticeable memory leak in do_fork()
, which is the result of some code deliberately introduced there for this example, and not a real bug in do_fork()
.
To try out the example on your own system:
klog.patch
from the relay-apps tarball has been applied
examples/kleak/kleak.patch
to the kernel (this adds the "instrumentation" to kmalloc/kfree in mm/slab.c)
KDIR
in Makefile
to point to your compiled kernel tree if necessary):
# cd relay-apps/examples/kleak
# make
# make -f Makefile-app
# mount -t relayfs relayfs /mnt/relay
# insmod ./kleak.ko
# ./kleak
# Ctrl-C
# ./kleak.pl
NOTE: the kleak.pl script isn't exactly the paragon of efficiency, so it may take a little while to generate the output if the files are large...
The following sections show a few different ways this example could be implemented using relayfs, some more useful in reality than others...
Setting up a relayfs channel and writing to it is easy. Here's all the code needed to gather the necessary information to track kmalloc/kfree allocations in the kernel. The added relayfs-related code is shown in blue:
In init/main.c:
#include <linux/relayfs_fs.h> #define SUBBUF_SIZE 262144 #define N_SUBBUFS 4 struct rchan *kleak_chan = NULL; static void __init do_basic_setup(void) { /* drivers will send hotplug events */ init_workqueues(); usermodehelper_init(); driver_init(); #ifdef CONFIG_SYSCTL sysctl_init(); #endif /* Networking initialization needs a process context */ sock_init(); do_initcalls(); kleak_chan = relay_open("kleak", NULL, SUBBUF_SIZE, N_SUBBUFS, 1, NULL); if (!kleak_chan) printk("kleak channel creation failed\n"); else printk("kleak channel ready\n"); }
In mm/slab.c:
#include <linux/relayfs_fs.h> extern struct rchan *kleak_chan; void *__kmalloc(size_t size, unsigned int __nocast flags) { kmem_cache_t *cachep; void *a; cachep = __find_general_cachep(size, flags); if (unlikely(cachep == NULL)) return NULL; a = __cache_alloc(cachep, flags); if (kleak_chan) { char buf[64]; int size = sprintf(buf, "kmalloc: addr %p caller %p, size %d\n", a, __builtin_return_address(0), size); relay_write(kleak_chan, buf, size); } return a; }
void kfree(const void *objp) { kmem_cache_t *c; unsigned long flags; if (unlikely(!objp)) return; if (kleak_chan) { char buf[64]; int size = sprintf(buf, "kfree: addr %p caller %p, size %d\n", objp, __builtin_return_address(0), ksize(objp)); relay_write(kleak_chan, buf, size); } local_irq_save(flags); kfree_debugcheck(objp); c = GET_PAGE_CACHE(virt_to_page(objp)); __cache_free(c, (void*)objp); local_irq_restore(flags); }That's all there is to it - we've created a set of buffers along with the corresponding files in the filesystem, and are logging data into them.
For lack of a better place, we created the relayfs channel at the end
of do_basic_setup()
, when we know relayfs has been
initialized, and added a few lines of code to each of
__kmalloc()
and kfree()
which log some data
for each kmalloc/kfree.
So far, so good, but we still need to get the data to user space and do something useful with it. To do that, we need code that will:
Real applications probably will want to micromanage all of this on their own; casual users however would like to be able to ignore all the details and simply log data on demand with as little setup and application-specific code as possible.
The 'klog' facility makes it as easy as possible to quickly get a kernel logging application up and running. It basically provides a couple of logging functions that will send logged data to a 'klog handler' if one is registered, or discard it if not. This makes it easy to insert simple logging functions anywhere in the kernel which can be called unconditionally regardless of whether there's a channel ready for logging or not.
To use klog, you'll need to apply a small patch, klog.patch
, found in the relay-apps tarball and say 'y' to 'klog debugging functions' in the 'kernel hacking' config section. This will make two functions available for logging from anywhere within the kernel or kernel modules:
klog(const void *data, int len)
klog_printk(const char *fmt, ...)
In addition to putting the logging calls where you want them in the kernel, you need to:
To log data from the kernel or a kernel module using klog, first include linux/klog.h
and insert klog_printk()
(for writing formatted data using the same format strings as printk()
or klog()
(for writing binary data) calls wherever you want to log data:
#include <linux/klog.h> void *__kmalloc(size_t size, unsigned int __nocast flags) { kmem_cache_t *cachep; void *a; /* If you want to save a few bytes .text space: replace * __ with kmem_. * Then kmalloc uses the uninlined functions instead of the inline * functions. */ cachep = __find_general_cachep(size, flags); if (unlikely(cachep == NULL)) return NULL; a = __cache_alloc(cachep, flags); klog_printk("kmalloc: addr %p caller %p, size %d\n", a, __builtin_return_address(0), size); return a; }
void kfree(const void *objp) { kmem_cache_t *c; unsigned long flags; if (unlikely(!objp)) return; klog_printk("kfree: addr %p caller %p, size %d\n", objp, __builtin_return_address(0), ksize(objp)); local_irq_save(flags); kfree_debugcheck(objp); c = GET_PAGE_CACHE(virt_to_page(objp)); __cache_free(c, (void*)objp); local_irq_restore(flags); }The next step is to write a kernel module to interface with the klog logging functions. Here we include
"relay-app.h"
, which encapsulates the channel management and communication with userspace, in addition to providing a few functions to actually write into the relayfs channel.
In the module init function, we call init_relay_app()
with the name of the relayfs directory to create and the base filename of the relayfs files to create within it. This will cause the files /kleak/cpu0../kleak/cpuX to be created in the filesystem. Also in the module init function, we register the 'klog handler' function, here kleak_log()
, which will be called whenever the klog_printk()
function is invoked. kleak_log()
doesn't do anything but send the data to the relayfs channel using relayapp_write()
, a function defined in relay-app.h
. See relay-app.h
for more details on the available relay-app functions.
Finally, in the module cleanup code, close_relay_app()
is called to close the channel and the klog handler is unregistered.
Here's the module in its entirety (the highlighted portions show the only code that would need to be changed to create a different application):
#include <linux/module.h> #include <linux/relayfs_fs.h> #include <linux/klog.h> #include "relay-app.h" static void kleak_log(const void *data, int len) { relayapp_write(data, len); } static struct klog_operations kleak_ops = { .klog = kleak_log, }; static int init_kleak(void) { int err; if (init_relay_app("kleak", "cpu", NULL)) { printk("kleak: couldn't init relay app\n"); return -1; } err = register_klog_handler(&kleak_ops); return err; } static void cleanup_kleak(void) { close_relay_app(); unregister_klog_handler(); } module_init(init_kleak); module_exit(cleanup_kleak); MODULE_LICENSE("GPL");Now that we've completed the kernel side of the code, we need to write the user space application code. Most of this code is for dealing with command-line options.
Here we include "librelay.h"
, which like relay_apps.h
for the kernel side, hides the details of channel management and communication and makes a few other functions available.
The call to init_relay_app()
sets up the parameters for the channel that will be created. kleak_filebase
is set to the base filename of the relayfs files created in the kernel side, and should match the directory/filebase specified in the kernel module, with the mount point of the mounted relayfs filesystem prepened e.g. if we mount the filesystem at "/mnt/relay", the relayfs files will be found as /mnt/relay/kleak/cpu0../mnt/relay/kleak/cpuX. kleak_outfilebase is the base name of the per-cpu output files that will contain the corresponding channel data e.g. kleak0..kleakX.
The subbuf_size
and n_subbufs
params determine the size of each of the per-cpu buffers i.e. each per-cpu buffer will be of size subbuf_size * n_subbufs
.
Notice that the channel isn't created by the module until the init_relay_app()
call is made. This means that the buffer sizes can be specified as different sizes for each run of the user space application, which is very useful for experimenting with different buffer sizes and determining the most reasonable buffer size for the application (the relay-app code will print out a message if the buffers overflow, and suggest larger buffer sizes be used).
Finally, relay_app_main_loop()
is called, which handles the communication between the kernel, and writes the buffer data to disk as it arrives. Pressing 'Control-C' to break out of the loop will write the last buffers and stop the application.
Here's the user space application code in its entirety (the highlighted portions show the only code that would need to be changed to create a different application):
#include <stdio.h> #include "librelay.h" /* relayfs base file name */ static char *kleak_filebase = "/mnt/relay/kleak/cpu"; /* logging output written here, filebase0...N */ static char *kleak_outfilebase = "kleak"; /* internal variables */ #define DEFAULT_SUBBUF_SIZE (262144) #define DEFAULT_N_SUBBUFS (4) static unsigned subbuf_size = DEFAULT_SUBBUF_SIZE; static unsigned n_subbufs = DEFAULT_N_SUBBUFS; static void usage(void) { fprintf(stderr, "kleak [-b subbuf_size -n n_subbufs]\n"); exit(1); } int main(int argc, char **argv) { extern char *optarg; extern int optopt; int c; unsigned opt_subbuf_size = 0; unsigned opt_n_subbufs = 0; while ((c = getopt(argc, argv, "b:n:")) != -1) { switch (c) { case 'b': opt_subbuf_size = (unsigned)atoi(optarg); if (!opt_subbuf_size) usage(); break; case 'n': opt_n_subbufs = (unsigned)atoi(optarg); if (!opt_n_subbufs) usage(); break; case '?': printf("Unknown option -%c\n", optopt); usage(); break; default: break; } } if ((opt_n_subbufs && !opt_subbuf_size) || (!opt_n_subbufs && opt_subbuf_size)) usage(); if (opt_n_subbufs && opt_n_subbufs) { subbuf_size = opt_subbuf_size; n_subbufs = opt_n_subbufs; } /* use _init... function because we want a different netlink 'unit' */ if (init_relay_app(kleak_filebase, kleak_outfilebase, subbuf_size, n_subbufs, 1)) { printf("Couldn't initialize relay app. Exiting.\n"); exit(1); } printf("Creating channel with %u sub-buffers of size %u.\n", n_subbufs, subbuf_size); printf("Logging... Press Control-C to stop.\n"); /* use _init... function because we want a different netlink 'unit' */ if (relay_app_main_loop()) { printf("Couldn't enter main loop. Exiting.\n"); exit(1); } }
klog()
to log binary data instead of using klog_printk()
to log text data. Something like this is what you should probably use for higher-volume logging (aside from the fact that the equivalent text data is larger, the kernel formatting functions used for klog_printk()
(and printk()
itself) are very inefficient).
The same exact klog module and klog user space application can be used with this example; please see the Logging text using klog section for that code.
#include <linux/klog.h> void *__kmalloc(size_t size, unsigned int __nocast flags) { kmem_cache_t *cachep; void *a; /* If you want to save a few bytes .text space: replace * __ with kmem_. * Then kmalloc uses the uninlined functions instead of the inline * functions. */ cachep = __find_general_cachep(size, flags); if (unlikely(cachep == NULL)) return NULL; a = __cache_alloc(cachep, flags); struct { unsigned event_id; void *a; void *c; size_t size; } data = { 0x1234, a, __builtin_return_address(0), size }; klog(&data, sizeof(data)); return a; }
void kfree(const void *objp) { kmem_cache_t *c; unsigned long flags; if (unlikely(!objp)) return; struct { unsigned event_id; const void *a; void *c; size_t size; } data = { 0xabcd, objp, __builtin_return_address(0), ksize(objp) }; klog(&data, sizeof(data)); local_irq_save(flags); kfree_debugcheck(objp); c = GET_PAGE_CACHE(virt_to_page(objp)); __cache_free(c, (void*)objp); local_irq_restore(flags); }
# iptables -A input -p TCP -j LOG
This prints lots of packet tracing data to the system log. It would be nice if it were possible to get the same output but not clutter up the system log with lots of debugging text. Even better would be if we could accomplish this with changing any kernel code at all on a live production system.
relay-app.h
to temporarily intercept printk()
output and log it to a relayfs channel instead (actually it doesn't intercept it, but rather 'tees' it - for this to be of much use you'd want to also temporarily disable output to the systemlog e.g. by killing klogd).
In the init function, we set up the channel as usual using init_relay_app()
, and register the printk()
jprobe. In the jprobe handler, the data is massaged slightly and logged to the relayfs channel via relayapp_write()
. Finally, in the module exit function, the jprobe is unregistered and the relayfs channel is closed. The corresponding userspace application is pretty much the same as in the previous examples.
Note that all the channel management as well as all the data logging is done from within this module, in contrast to previous examples where these were separated.
Here's how you'd use this module (the code can be found in the relay-apps tarball):
# mount -t relayfs relayfs /mnt/relay
# insmod tprintk.ko
# pkill klogd
# iptables -A input -p TCP -j LOG
# tprintk
As with the other relay-app examples, logging doesn't start until the tprintk user application is run, and stops on control-c, at which point the data can be found in the per-cpu files. Each record has a timestamp prepended to it, so these files can be merged and sorted to get them all in order in a single file.
#include <linux/module.h> #include <linux/kprobes.h> #include <linux/kallsyms.h> #include <linux/relayfs_fs.h> #include "relay-app.h" /* protection for the formatting temporary buffer */ static DEFINE_SPINLOCK(tmpbuf_lock); static struct jprobe tprintk_jp; /* * printk() jprobe handler. */ static int tprintk(const char *fmt, ...) { static char printk_buf[1024]; int len = 0; unsigned long flags; va_list args; static int log_level_unknown = 1; char *p, *start; spin_lock_irqsave(&tmpbuf_lock, flags); va_start(args, fmt); len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); /* * Write the output to the relayfs channel. We remove log level * tags because they're not used and we write a timestamp (as * [secs.usecs] so things can be sorted out between per-cpu buffers. */ for (p = start = printk_buf; *p; p++) { if (log_level_unknown) { . . . } if (*p == '\n') { len = (p + 1) - start; relayapp_write(start, len); start = p + 1; log_level_unknown = 1; } } len = p - start; if (len && *start) relayapp_write(start, len); va_end(args); spin_unlock_irqrestore(&tmpbuf_lock, flags); jprobe_return(); return 0; } /* * init_tprintk - initialize relay app and insert printk probe * * relay files will be named /mnt/relay/tprintk/cpuXXX */ static int init_tprintk(void) { if (init_relay_app("tprintk", "cpu", &tprintk_callbacks)) { printk("tprintk: couldn't init relay app\n"); return -1; } tprintk_jp.entry = (kprobe_opcode_t *)tprintk; tprintk_jp.kp.addr = (kprobe_opcode_t *)kallsyms_lookup_name("printk"); if (!tprintk_jp.kp.addr) { printk("Couldn't find printk function to instrument.\n"); return -1; } register_jprobe(&tprintk_jp); printk("tprintk probe inserted.\n"); return 0; } /* * cleanup_tprintk - destroy channels and remove printk probe */ static void cleanup_tprintk(void) { unregister_jprobe(&tprintk_jp); printk("tprintk probe removed.\n"); close_relay_app(); } module_init(init_tprintk); module_exit(cleanup_tprintk); MODULE_LICENSE("GPL");