This is the mail archive of the systemtap@sources.redhat.com mailing list for the systemtap project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] djprobe: Low overhead probe method


Hello, all

It has been months since I last wrote to this mailing list.
Today, I'm pleased to announce the release of a new lightweight probe
program and a new version of the Linux Kernel State Tracer (LKSTv2.3).

This lightweight probe is called "djprobe" (direct jump probe).
Currently, it works on the i386 architecture. This probe uses a 'jmp' opcode
instead of 'int3' to reduce overhead. In addition, it does not take any
locks while it runs, so it does not block other processors.
I have attached a djprobe patch for linux-2.6.13-rc2-mm3 to this mail.

Djprobe's overhead is far smaller than that of kprobes. I have already
evaluated the overheads of djprobe and other probes in several
situations. A report of the results is published at the following URL:

http://lkst.sourceforge.net/docs/probes-eval-report.pdf

The results indicate that djprobe is at least 10 times as fast as
kprobe and jprobe. Djprobe runs in roughly 100 nanoseconds or less
(depending on the processor on which it runs).

You can download the full source package of djprobe (which includes
documents, patches for 2.6.12 and for 2.6.13-rc2-mm3, examples, a
benchmark program, and a kernel module version of djprobe) and LKST from
the following LKST project page:

LKST project page: http://sourceforge.net/projects/lkst/

I hope to make djprobe a member of the kprobes family. In my honest opinion,
djprobe is the best probe method from the viewpoint of performance. What do
you think about this?

I also plan to use this lightweight probe when I port LKST to
SystemTAP, because LKST, which is meant to run all the time, should
have as small an impact on system performance as possible.
If you have any comments, please let me know.


By the way, I will attend OLS2005 and join the SystemTAP BOF. If I have a chance, I would like to introduce LKST and djprobe at the BOF.

Best regards,

--
Masami HIRAMATSU
2nd Research Dept.
Hitachi, Ltd., Systems Development Laboratory
 E-mail: hiramatu@sdl.hitachi.co.jp

---

 arch/i386/Kconfig.debug         |    8 ++
 arch/i386/kernel/Makefile       |    1
 arch/i386/kernel/kprobes.c      |  138 ++++++++++++++++++++++++++++++++++++++++
 arch/i386/kernel/stub_djprobe.S |   78 ++++++++++++++++++++++
 include/asm-i386/kprobes.h      |   25 +++++++
 include/linux/kprobes.h         |   52 +++++++++++++++
 kernel/kprobes.c                |  126 ++++++++++++++++++++++++++++++++++++
 7 files changed, 428 insertions(+)

diff -Narup linux-2.6.13-rc2-mm3/arch/i386/Kconfig.debug linux-2.6.13-rc2-mm3djp/arch/i386/Kconfig.debug
--- linux-2.6.13-rc2-mm3/arch/i386/Kconfig.debug	2005-07-12 23:37:27.000000000 +0900
+++ linux-2.6.13-rc2-mm3djp/arch/i386/Kconfig.debug	2005-07-13 15:35:39.000000000 +0900
@@ -29,6 +29,14 @@ config KPROBES
 	  for kernel debugging, non-intrusive instrumentation and testing.
 	  If in doubt, say "N".

+config DJPROBE
+	bool "Direct Jump Probe"
+	depends on KPROBES
+	help
+	  Djprobe is an ultra-light probing system. It uses a jmp opcode
+	  instead of an int3 trap opcode. Djprobe is useful for probing
+	  tight-timing problems in the kernel.
+
 config DEBUG_STACK_USAGE
 	bool "Stack utilization instrumentation"
 	depends on DEBUG_KERNEL
diff -Narup linux-2.6.13-rc2-mm3/arch/i386/kernel/Makefile linux-2.6.13-rc2-mm3djp/arch/i386/kernel/Makefile
--- linux-2.6.13-rc2-mm3/arch/i386/kernel/Makefile	2005-07-12 23:37:27.000000000 +0900
+++ linux-2.6.13-rc2-mm3djp/arch/i386/kernel/Makefile	2005-07-13 15:35:39.000000000 +0900
@@ -29,6 +29,7 @@ obj-$(CONFIG_KEXEC)		+= machine_kexec.o
 obj-$(CONFIG_X86_NUMAQ)		+= numaq.o
 obj-$(CONFIG_X86_SUMMIT_NUMA)	+= summit.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_DJPROBE)		+= stub_djprobe.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-y				+= sysenter.o vsyscall.o
 obj-$(CONFIG_ACPI_SRAT) 	+= srat.o
diff -Narup linux-2.6.13-rc2-mm3/arch/i386/kernel/kprobes.c linux-2.6.13-rc2-mm3djp/arch/i386/kernel/kprobes.c
--- linux-2.6.13-rc2-mm3/arch/i386/kernel/kprobes.c	2005-07-12 23:36:47.000000000 +0900
+++ linux-2.6.13-rc2-mm3djp/arch/i386/kernel/kprobes.c	2005-07-13 15:35:39.000000000 +0900
@@ -541,3 +541,141 @@ int __init arch_init_kprobes(void)
 {
 	return register_kprobe(&trampoline_p);
 }
+
+#ifdef CONFIG_DJPROBE
+/*
+ * DJProbe (Direct Jump Probe) has 3 phases.
+ * The 1st phase is the inserting phase. In this phase, a probe is driven by
+ * the kprobe. The kprobe's pre_handler checks safety of the cpu on which it
+ * works. If it finished checking all cpus, it goes to the 2nd phase. If not,
+ * it changes execution point to the head of stub code that the probe has.
+ * The 2nd phase is the djprobe working phase. In this phase, the probe is
+ * driven by the djprobe. The kernel execution path is changed to jump into
+ * the stub code directly.
+ * The 3rd phase is the removing phase. In this phase, the probe is driven by
+ * the kprobe again. The kprobe's pre_handler checks safety of the cpu on
+ * which it works. If it finished checking all cpus, it removes the kprobe
+ * from the code. If not, it resumes execution.
+ */
+
+/* jmp code manipulators: packed overlay of a 1-byte opcode + 'long' operand */
+struct __arch_jmp_op {
+	char op;	/* opcode byte (written as jmp or int3 by the helpers below) */
+	long raddr;	/* jump displacement, relative to the end of the instruction */
+} __attribute__((packed));
+/* insert jmp code at 'from' that lands on 'to'; the opcode byte is written last */
+static inline void __set_jmp_op(void *from, void *to)
+{
+	struct __arch_jmp_op *jop;
+	jop = (struct __arch_jmp_op *)from;
+	jop->raddr=(long)(to) - ((long)(from) + 5); /* relative to from + 5 */
+	smp_mb();	/* barrier between the operand store and the opcode store */
+	jop->op = RELATIVEJUMP_INSTRUCTION;
+}
+/* switch back to the kprobe: int3 at 'dest', then operand bytes from 'orig' */
+static inline void __set_breakpoint_op(void *dest, void *orig)
+{
+	struct __arch_jmp_op *jop = (struct __arch_jmp_op *)dest,
+		*jop2 = (struct __arch_jmp_op *)orig;
+
+	jop->op = BREAKPOINT_INSTRUCTION;	/* opcode byte first */
+	smp_mb();	/* barrier between the opcode store and the operand store */
+	jop->raddr = jop2->raddr;	/* copy the bytes after the opcode from 'orig' */
+}
+/*
+ * djprobe callback, called from the stub code; runs the attached handler, if any.
+ */
+static void asmlinkage djprobe_callback(struct djprobe_instance * djpi,
+					struct pt_regs *regs)
+{
+	/* TODO: use a list; an instance holds a single djprobe pointer for now */
+	if (djpi->djp && djpi->djp->handler)
+		djpi->djp->handler(djpi->djp, regs);
+}
+
+/*
+ * Build the instance's stub: template code plus a copy of the probed code.
+ * Target instructions MUST be relocatable. Always returns 0.
+ */
+int arch_prepare_djprobe_instance(struct djprobe_instance *djpi,
+				  unsigned long size)
+{
+	kprobe_opcode_t *stub;
+	stub = djpi->stub.insn;
+
+	/* copy arch-dep-instance from template */
+	memcpy((void*)stub, (void*)&arch_tmpl_stub_entry, ARCH_STUB_SIZE);
+
+	/* set probe information: patch the instance pointer into the stub */
+	*((long*)(stub + ARCH_STUB_VAL_IDX)) = (long)djpi;
+	/* set probe function: patch the callback address into the stub */
+	*((long*)(stub + ARCH_STUB_CALL_IDX)) = (long)djprobe_callback;
+
+	/* copy 'size' bytes of instructions into the middle of the stub */
+	memcpy((void*)(stub + ARCH_STUB_INST_IDX),
+	       (void*)djpi->kp.addr, size);
+	djpi->stub.size = size;
+
+	/* set returning jmp at the tail of the stub, back to addr + size */
+	__set_jmp_op(stub + ARCH_STUB_END_IDX,
+		     (void*)((long)djpi->kp.addr + size));
+
+	return 0;
+}
+
+/* Insert a "jmp" to the head of this instance's stub at the probing point. */
+static void arch_install_djprobe_instance(struct djprobe_instance *djpi)
+{
+	kprobe_opcode_t *stub;
+	stub = djpi->stub.insn;
+	__set_jmp_op((void*)djpi->kp.addr, (void*)stub);
+}
+/* Write back the kprobe's int3 and the original operand bytes kept in the stub */
+void arch_uninstall_djprobe_instance(struct djprobe_instance *djpi)
+{
+	kprobe_opcode_t *stub;
+	stub = &djpi->stub.insn[ARCH_STUB_INST_IDX]; /* copy of the probed code */
+	__set_breakpoint_op((void*)djpi->kp.addr, (void*)stub);
+}
+
+/*
+ * Safety check handler (installed as the kprobe pre_handler of an instance).
+ */
+int djprobe_bypass_handler(struct kprobe * kp, struct pt_regs * regs)
+{
+	struct djprobe_instance *djpi =
+		container_of(kp,struct djprobe_instance, kp);
+	kprobe_opcode_t *stub = djpi->stub.insn;
+	int cpu = smp_processor_id();
+
+	if (!DJPI_CHECKED(djpi)) {
+		cpu_set(cpu, djpi->checked_cpus); /* mark this cpu as checked */
+		
+		if (DJPI_CHECKED(djpi)) { /* checked_cpus now equals cpu_online_map */
+			if (DJPI_EMPTY(djpi)) {
+				/* kick the deferred releasing process */
+				schdule_release_djprobe_instance();
+			} else {
+				/* write djprobe over kprobe */
+				arch_install_djprobe_instance(djpi);
+			}
+		}
+	}
+	if (DJPI_EMPTY(djpi)) {
+		/* fix up the dummy instruction: put back the first original byte */
+		kp->ainsn.insn[0] = djpi->stub.insn[ARCH_STUB_INST_IDX];
+		return 0;
+	} else {
+		regs->eip = (unsigned long)stub; /* continue at the head of the stub */
+		regs->eflags |= TF_MASK;
+		regs->eflags &= ~IF_MASK;
+		/*
+		 * dummy return code:
+		 * This keeps resume_execute() of kprobes from changing
+		 * the eip value.
+		 */
+		kp->ainsn.insn[0] = RETURN_INSTRUCTION;
+		return 1; /* already prepared */
+	}
+}
+#endif /*DJPROBE*/
diff -Narup linux-2.6.13-rc2-mm3/arch/i386/kernel/stub_djprobe.S linux-2.6.13-rc2-mm3djp/arch/i386/kernel/stub_djprobe.S
--- linux-2.6.13-rc2-mm3/arch/i386/kernel/stub_djprobe.S	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.13-rc2-mm3djp/arch/i386/kernel/stub_djprobe.S	2005-07-13 15:35:39.000000000 +0900
@@ -0,0 +1,78 @@
+/*
+ *  linux/arch/i386/kernel/stub_djprobe.S
+ *
+ *  Copyright (C) HITACHI,LTD. 2005
+ *  Created by Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp>
+ */
+
+#include <linux/config.h>
+
+# Stub template: copied per probe instance; probed code jmps into the copy.
+.global arch_tmpl_stub_entry
+arch_tmpl_stub_entry:
+	nop
+	subl $8, %esp	#reserve 2 slots (presumably pt_regs esp/xss -- confirm)
+	pushf
+	subl $20, %esp	#reserve 5 slots (presumably xcs..xds of pt_regs -- confirm)
+	pushl %eax
+	pushl %ebp
+	pushl %edi
+	pushl %esi
+	pushl %edx
+	pushl %ecx
+	pushl %ebx
+
+	movl %esp, %eax
+	pushl %eax	# 2nd callback argument: pointer to the saved registers
+	addl $60, %eax	# 60 = 8 + 4 + 20 + 7*4 bytes pushed: %esp at stub entry
+	movl %eax, 56(%esp)	# store it at offset 52 of the saved-register area
+.global arch_tmpl_stub_val
+arch_tmpl_stub_val:
+	movl $0xffffffff, %eax	# immediate is patched with the instance pointer
+	pushl %eax	# 1st callback argument
+.global arch_tmpl_stub_call
+arch_tmpl_stub_call:
+	movl $0xffffffff, %eax	# immediate is patched with the callback address
+	call *%eax
+	addl $8, %esp	# drop the two callback arguments
+
+	popl %ebx
+	popl %ecx
+	popl %edx
+	popl %esi
+	popl %edi
+	popl %ebp
+	popl %eax
+	addl $20, %esp	# undo the 20-byte reservation
+	popf
+	addl $8, %esp	# undo the 8-byte reservation
+.global arch_tmpl_stub_inst
+arch_tmpl_stub_inst:	# 20-byte slot that receives the copied probed code
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+
+.global arch_tmpl_stub_end
+arch_tmpl_stub_end:	# 5-byte slot that receives the returning jmp
+	nop
+	nop
+	nop
+	nop
+	nop
diff -Narup linux-2.6.13-rc2-mm3/include/asm-i386/kprobes.h linux-2.6.13-rc2-mm3djp/include/asm-i386/kprobes.h
--- linux-2.6.13-rc2-mm3/include/asm-i386/kprobes.h	2005-07-12 23:37:10.000000000 +0900
+++ linux-2.6.13-rc2-mm3djp/include/asm-i386/kprobes.h	2005-07-13 15:35:39.000000000 +0900
@@ -31,6 +31,8 @@ struct pt_regs;

 typedef u8 kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION	0xcc
+#define RELATIVEJUMP_INSTRUCTION 0xe9
+#define RETURN_INSTRUCTION 0xc3
 #define MAX_INSN_SIZE 16
 #define MAX_STACK_SIZE 64
 #define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \
@@ -49,6 +51,29 @@ struct arch_specific_insn {
 	kprobe_opcode_t insn[MAX_INSN_SIZE];
 };

+#ifdef CONFIG_DJPROBE
+/* stub template code: labels from stub_djprobe.S; only their addresses are used */
+extern long arch_tmpl_stub_entry;
+extern long arch_tmpl_stub_val;
+extern long arch_tmpl_stub_call;
+extern long arch_tmpl_stub_inst;
+extern long arch_tmpl_stub_end;
+/* byte offsets into the stub; +1 skips a movl opcode byte, +5 covers the tail jmp */
+#define ARCH_STUB_VAL_IDX ((long)&arch_tmpl_stub_val - (long)&arch_tmpl_stub_entry + 1)
+#define ARCH_STUB_CALL_IDX ((long)&arch_tmpl_stub_call - (long)&arch_tmpl_stub_entry + 1)
+#define ARCH_STUB_INST_IDX ((long)&arch_tmpl_stub_inst - (long)&arch_tmpl_stub_entry)
+#define ARCH_STUB_END_IDX ((long)&arch_tmpl_stub_end - (long)&arch_tmpl_stub_entry)
+#define ARCH_STUB_SIZE ((long)&arch_tmpl_stub_end - (long)&arch_tmpl_stub_entry + 5)
+#endif
+
+#define ARCH_STUB_INSN_SIZE 80	/* bytes in one stub buffer */
+#define ARCH_STUB_INSN_MAX 20	/* largest djprobe size accepted by register_djprobe() */
+#define ARCH_STUB_INSN_MIN 5	/* smallest size accepted; the jmp written is 5 bytes */
+
+struct arch_djprobe_stub {
+	kprobe_opcode_t insn[ARCH_STUB_INSN_SIZE];	/* per-instance stub code */
+	int size;	/* number of probed-code bytes copied into the stub */
+};

 /* trap3/1 are intr gates for kprobes.  So, restore the status of IF,
  * if necessary, before executing the original int3/1 (trap) handler.
diff -Narup linux-2.6.13-rc2-mm3/include/linux/kprobes.h linux-2.6.13-rc2-mm3djp/include/linux/kprobes.h
--- linux-2.6.13-rc2-mm3/include/linux/kprobes.h	2005-07-12 23:37:12.000000000 +0900
+++ linux-2.6.13-rc2-mm3djp/include/linux/kprobes.h	2005-07-13 15:35:39.000000000 +0900
@@ -28,6 +28,8 @@
  * 2005-May	Hien Nguyen <hien@us.ibm.com> and Jim Keniston
  *		<jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
  *		<prasanna@in.ibm.com> added function-return probes.
+ * 2005-June	Masami HIRAMATSU <hiramatu@sdl.hitachi.co.jp> added direct
+ * 		jump probe (djprobe) interface to reduce overhead.
  */
 #include <linux/config.h>
 #include <linux/list.h>
@@ -138,6 +140,37 @@ struct kretprobe_instance {
 	struct task_struct *task;
 };

+/* djprobe's instance (internal use): looked up by kp.addr in register_djprobe() */
+struct djprobe_instance {
+	struct djprobe *djp;	/* attached djprobe, or NULL when detached */
+	struct arch_djprobe_stub stub;	/* per-instance copy of the stub code */
+
+	struct kprobe kp;	/* kprobe used while inserting and removing */
+	struct list_head list; /* list of djprobe_instances */
+	cpumask_t checked_cpus;	/* cpus that have run the safety check handler */
+};
+#define DJPI_EMPTY(djpi)  ((djpi)->djp == NULL)
+#define DJPI_CHECKED(djpi) (cpus_equal((djpi)->checked_cpus, cpu_online_map))
+
+struct djprobe;
+typedef void (*djprobe_handler_t)(struct djprobe *, struct pt_regs *);
+/*
+ * Direct Jump probe interface structure (filled in by the user of djprobe)
+ */
+struct djprobe {
+	/* location of the probe point; must not be NULL */
+	void * addr;
+	
+	/* total length in bytes of the code to be replaced (MIN..MAX checked) */
+	int size;
+	
+	/* probing handler (pre-executed); may be NULL */
+	djprobe_handler_t handler;
+	
+	/* pointer to the instance; must be NULL when calling register_djprobe() */
+	struct djprobe_instance * inst;
+};
+
 #ifdef CONFIG_KPROBES
 /* Locks kprobe: irq must be disabled */
 void lock_kprobes(void);
@@ -212,4 +245,23 @@ static inline void kprobe_flush_task(str
 {
 }
 #endif				/* CONFIG_KPROBES */
+
+#ifdef CONFIG_DJPROBE
+extern int arch_prepare_djprobe_instance(struct djprobe_instance *djpi,
+					 unsigned long size);
+extern int djprobe_bypass_handler(struct kprobe * kp, struct pt_regs * regs);
+extern void arch_uninstall_djprobe_instance(struct djprobe_instance *djpi);
+extern void schdule_release_djprobe_instance(void);
+
+int register_djprobe(struct djprobe *djp);
+void unregister_djprobe(struct djprobe *djp);
+#else	/* !CONFIG_DJPROBE: inline stubs */
+static inline int register_djprobe(struct djprobe *djp)
+{
+	return -ENOSYS;	/* djprobe support is not configured */
+}
+static inline void unregister_djprobe(struct djprobe *djp)
+{
+}
+#endif	/* CONFIG_DJPROBE */
 #endif				/* _LINUX_KPROBES_H */
diff -Narup linux-2.6.13-rc2-mm3/kernel/kprobes.c linux-2.6.13-rc2-mm3djp/kernel/kprobes.c
--- linux-2.6.13-rc2-mm3/kernel/kprobes.c	2005-07-12 23:37:13.000000000 +0900
+++ linux-2.6.13-rc2-mm3djp/kernel/kprobes.c	2005-07-13 15:35:39.000000000 +0900
@@ -30,6 +30,8 @@
  * 2005-May	Hien Nguyen <hien@us.ibm.com>, Jim Keniston
  *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
  *		<prasanna@in.ibm.com> added function-return probes.
+ * 2005-June	Masami HIRAMATSU <hiramatu@sdl.hitachi.co.jp> added direct
+ * 		jump probe (djprobe) interface to reduce overhead.
  */
 #include <linux/kprobes.h>
 #include <linux/spinlock.h>
@@ -563,6 +565,126 @@ void unregister_kretprobe(struct kretpro
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 }

+#ifdef CONFIG_DJPROBE
+/*
+ * Djprobe does not refer to its list when a probe function is called.
+ * The list is used only when registering and unregistering a djprobe,
+ * so processing speed is not required and a plain 'list' is sufficient.
+ */
+static DEFINE_SPINLOCK(djprobe_lock);
+static LIST_HEAD(djprobe_list);
+static int nr_instances = 0;
+
+static void work_free_djprobe_instances(void *data) /* work handler; 'data' is unused */
+{
+	struct list_head *pos;
+	struct djprobe_instance *djpi;
+	unsigned long flags;
+
+	spin_lock_irqsave(&djprobe_lock, flags);
+	list_for_each(pos, &djprobe_list) {
+		djpi = container_of(pos, struct djprobe_instance, list);
+		if (DJPI_EMPTY(djpi) && DJPI_CHECKED(djpi)) { /* detached, all cpus checked */
+			pos = pos->prev; /* step back: *pos is about to be unlinked */
+			list_del(&djpi->list);
+			nr_instances -- ;
+			unregister_kprobe(&(djpi->kp));
+			kfree(djpi);
+		}
+	}
+	spin_unlock_irqrestore(&djprobe_lock, flags);
+}
+
+/* deferred free worker: runs work_free_djprobe_instances() from a workqueue */
+static DECLARE_WORK(djprobe_release_work,
+		    work_free_djprobe_instances, NULL);
+
+void schdule_release_djprobe_instance(void) /* called by djprobe_bypass_handler() */
+{
+	schedule_work(&djprobe_release_work);
+}
+
+/*
+ * Register djp at djp->addr, reusing a detached instance for that address.
+ * Returns 0, -EINVAL, -EBUSY, -ENOMEM, or the error from register_kprobe().
+ */
+int register_djprobe(struct djprobe * djp)
+{
+	struct djprobe_instance *djpi;
+	int ret = 0;
+
+	if (djp == NULL || djp->addr == NULL ||
+	    djp->size > ARCH_STUB_INSN_MAX ||
+	    djp->size < ARCH_STUB_INSN_MIN ||
+	    djp->inst != NULL)
+		return -EINVAL;
+
+	spin_lock(&djprobe_lock);
+	list_for_each_entry(djpi, &djprobe_list, list) {
+		if (djpi->kp.addr == djp->addr) {
+			if (!DJPI_EMPTY(djpi)) {
+				ret = -EBUSY; /* already used ... */
+				goto out;
+			}
+			djp->inst = djpi;
+			djpi->djp = djp; /*TODO: use list*/
+			cpus_clear(djpi->checked_cpus);
+			goto out;
+		}
+	}
+	/* not found: djprobe_lock is held, so the allocation must not sleep */
+	djpi = kmalloc(sizeof(struct djprobe_instance), GFP_ATOMIC);
+	if (djpi == NULL) {
+		ret = -ENOMEM; /* memory allocation error */
+		goto out;
+	}
+
+	memset(djpi, 0, sizeof(struct djprobe_instance)); /* initialize */
+	INIT_LIST_HEAD(&djpi->list);
+	djpi->kp.addr = djp->addr;
+	cpus_clear(djpi->checked_cpus);
+	arch_prepare_djprobe_instance(djpi, djp->size); /*TODO : remove size*/
+	nr_instances ++ ;
+	list_add(&djpi->list, &djprobe_list);
+	djp->inst = djpi;
+	djpi->djp = djp; /*TODO: use list*/
+	/* first arming -- must register */
+	djpi->kp.pre_handler = djprobe_bypass_handler;
+	ret = register_kprobe(&(djpi->kp));
+	if (ret < 0) { /* failed to attach */
+		djp->inst = NULL;
+		list_del(&djpi->list);
+		nr_instances --;
+		kfree(djpi);
+	}
+out:
+	spin_unlock(&djprobe_lock);
+	return ret;
+}
+
+void unregister_djprobe(struct djprobe * djp)
+{
+	struct djprobe_instance *djpi;
+	if (djp == NULL || djp->inst == NULL)
+		return ;	/* nothing is registered */
+
+	djpi = djp->inst;
+	spin_lock(&djprobe_lock);
+	lock_kprobes();
+	djp->inst = NULL;
+	djpi->djp = NULL; /*TODO: use list*/
+	if (DJPI_EMPTY(djpi)) {	/* true here: djpi->djp was just cleared */
+		/* deferred releasing: restart the cpu check and put the int3 back */
+		cpus_clear(djpi->checked_cpus);
+		arch_uninstall_djprobe_instance(djpi);
+		/* the rest is done from djprobe_bypass_handler() via kprobes */
+	}
+	unlock_kprobes();
+	spin_unlock(&djprobe_lock);
+}
+
+#endif /*DJPROBE*/
+
 static int __init init_kprobes(void)
 {
 	int i, err = 0;
@@ -590,4 +712,8 @@ EXPORT_SYMBOL_GPL(unregister_jprobe);
 EXPORT_SYMBOL_GPL(jprobe_return);
 EXPORT_SYMBOL_GPL(register_kretprobe);
 EXPORT_SYMBOL_GPL(unregister_kretprobe);
+#ifdef CONFIG_DJPROBE
+EXPORT_SYMBOL_GPL(register_djprobe);
+EXPORT_SYMBOL_GPL(unregister_djprobe);
+#endif



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]