This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.



GNU C Library master sources branch ldmitrie/intel_mpx created. glibc-2.18-87-g1aef5b2


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, ldmitrie/intel_mpx has been created
        at  1aef5b2564676933dbc4eafa0c35c00792e47595 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1aef5b2564676933dbc4eafa0c35c00792e47595

commit 1aef5b2564676933dbc4eafa0c35c00792e47595
Author: Liubov Dmitrieva <ldmitrie@sourceware.org>
Date:   Fri Aug 30 18:37:28 2013 +0400

    Implemented bound check support for string/memory routines for x86_64.
    TODO: Fix bound check support in strcmp-sse2 and implement in strspn, strstr and strcspn.
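
The checks added throughout this patch follow one pattern: on entry, bndcl
validates the lowest address the routine will touch and bndcu the highest,
against the bounds the caller passed in %bnd0 (and %bnd1 for two-pointer
routines); further bndcu checks precede each block read inside the loops.
A rough C model of the entry check, illustrative only, not part of the
patch, all names hypothetical:

	#include <stddef.h>
	#include <stdlib.h>

	struct bounds { const char *lb; const char *ub; };  /* models %bnd0 */

	/* Equivalent of "bndcl (%rdi), %bnd0; bndcu -1(%rdi,%rdx), %bnd0".
	   Like the asm, assumes n > 0 (the patch tests for n == 0 first).  */
	void check_range (struct bounds b, const void *p, size_t n)
	{
	  const char *c = p;
	  if (c < b.lb || c + n - 1 > b.ub)
	    abort ();		/* the hardware raises #BR instead */
	}

	int main (void)
	{
	  char buf[16];
	  struct bounds b = { buf, buf + 15 };
	  check_range (b, buf, 16);	/* in bounds: returns */
	  check_range (b, buf + 1, 16);	/* one byte past: aborts */
	  return 0;
	}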

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 08db331..db6838d 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -18,6 +18,9 @@ endif
 ifeq ($(subdir),string)
 sysdep_routines += cacheinfo strcasecmp_l-nonascii strncase_l-nonascii
 gen-as-const-headers += locale-defines.sym
+ifeq ($(enable-mpx), yes)
+sysdep_routines += strcpy_chk-c stpcpy_chk-c
+endif
 endif
 
 ifeq ($(subdir),elf)
diff --git a/sysdeps/x86_64/Versions b/sysdeps/x86_64/Versions
index a437f85..083770a 100644
--- a/sysdeps/x86_64/Versions
+++ b/sysdeps/x86_64/Versions
@@ -2,6 +2,13 @@ libc {
   GLIBC_2.14 {
     memcpy;
   }
+%ifdef __CHKP__
+  GLIBC_2.17 {
+    mpx_memset_nobnd;
+    mpx_memset_nochk;
+    mpx_memset_nobnd_nochk;
+  }
+%endif
 }
 libm {
   GLIBC_2.1 {
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index 891ee70..205345b 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -20,8 +20,17 @@
 
 /* fast SSE2 version with using pmaxub and 64 byte loop */
 
+#  ifdef __CHKP__
+#   define RETURN \
+	bndcu  (%rax), %bnd0; \
+	ret
+#  else
+#   define RETURN ret
+#  endif
+
 	.text
 ENTRY(memchr)
+
 	movd	%rsi, %xmm1
 	mov	%rdi, %rcx
 
@@ -33,6 +42,10 @@ ENTRY(memchr)
 	and	$63, %rcx
 	pshufd	$0, %xmm1, %xmm1
 
+#ifdef __CHKP__
+        bndcl  	(%rdi), %bnd0
+        bndcu  	(%rdi), %bnd0
+#endif
 	cmp	$48, %rcx
 	ja	L(crosscache)
 
@@ -72,7 +85,7 @@ L(crosscache):
 	jbe	L(return_null)
 	add	%rdi, %rax
 	add	%rcx, %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(unaligned_no_match):
@@ -85,24 +98,36 @@ L(unaligned_no_match):
 
 	.p2align 4
 L(loop_prolog):
+#ifdef __CHKP__
+        bndcu  	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches)
 
+#ifdef __CHKP__
+        bndcu  	16(%rdi), %bnd0
+#endif
 	movdqa	16(%rdi), %xmm2
 	pcmpeqb	%xmm1, %xmm2
 	pmovmskb %xmm2, %eax
 	test	%eax, %eax
 	jnz	L(matches16)
 
+#ifdef __CHKP__
+        bndcu  	32(%rdi), %bnd0
+#endif
 	movdqa	32(%rdi), %xmm3
 	pcmpeqb	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32)
 
+#ifdef __CHKP__
+        bndcu  	48(%rdi), %bnd0
+#endif
 	movdqa	48(%rdi), %xmm4
 	pcmpeqb	%xmm1, %xmm4
 	add	$64, %rdi
@@ -116,24 +141,36 @@ L(loop_prolog):
 	sub	$64, %rdx
 	jbe	L(exit_loop)
 
+#ifdef __CHKP__
+        bndcu  	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches)
 
+#ifdef __CHKP__
+        bndcu  	16(%rdi), %bnd0
+#endif
 	movdqa	16(%rdi), %xmm2
 	pcmpeqb	%xmm1, %xmm2
 	pmovmskb %xmm2, %eax
 	test	%eax, %eax
 	jnz	L(matches16)
 
+#ifdef __CHKP__
+        bndcu  	32(%rdi), %bnd0
+#endif
 	movdqa	32(%rdi), %xmm3
 	pcmpeqb	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32)
 
+#ifdef __CHKP__
+        bndcu  	48(%rdi), %bnd0
+#endif
 	movdqa	48(%rdi), %xmm3
 	pcmpeqb	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
@@ -151,6 +188,9 @@ L(loop_prolog):
 L(align64_loop):
 	sub	$64, %rdx
 	jbe	L(exit_loop)
+#ifdef __CHKP__
+        bndcu  	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	movdqa	16(%rdi), %xmm2
 	movdqa	32(%rdi), %xmm3
@@ -192,25 +232,34 @@ L(align64_loop):
 	pmovmskb %xmm1, %eax
 	bsf	%eax, %eax
 	lea	48(%rdi, %rax), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(exit_loop):
 	add	$32, %rdx
 	jle	L(exit_loop_32)
 
+#ifdef __CHKP__
+        bndcu  	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches)
 
+#ifdef __CHKP__
+        bndcu   16(%rdi), %bnd0
+#endif
 	movdqa	16(%rdi), %xmm2
 	pcmpeqb	%xmm1, %xmm2
 	pmovmskb %xmm2, %eax
 	test	%eax, %eax
 	jnz	L(matches16)
 
+#ifdef __CHKP__
+        bndcu   32(%rdi), %bnd0
+#endif
 	movdqa	32(%rdi), %xmm3
 	pcmpeqb	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
@@ -219,6 +268,9 @@ L(exit_loop):
 	sub	$16, %rdx
 	jle	L(return_null)
 
+#ifdef __CHKP__
+        bndcu   48(%rdi), %bnd0
+#endif
 	pcmpeqb	48(%rdi), %xmm1
 	pmovmskb %xmm1, %eax
 	test	%eax, %eax
@@ -229,6 +281,9 @@ L(exit_loop):
 	.p2align 4
 L(exit_loop_32):
 	add	$32, %rdx
+#ifdef __CHKP__
+        bndcu   (%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
@@ -237,6 +292,9 @@ L(exit_loop_32):
 	sub	$16, %rdx
 	jbe	L(return_null)
 
+#ifdef __CHKP__
+        bndcu   16(%rdi), %bnd0
+#endif
 	pcmpeqb	16(%rdi), %xmm1
 	pmovmskb %xmm1, %eax
 	test	%eax, %eax
@@ -248,25 +306,25 @@ L(exit_loop_32):
 L(matches0):
 	bsf	%eax, %eax
 	lea	-16(%rax, %rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches):
 	bsf	%eax, %eax
 	add	%rdi, %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches16):
 	bsf	%eax, %eax
 	lea	16(%rax, %rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches32):
 	bsf	%eax, %eax
 	lea	32(%rax, %rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches_1):
@@ -274,7 +332,7 @@ L(matches_1):
 	sub	%rax, %rdx
 	jbe	L(return_null)
 	add	%rdi, %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches16_1):
@@ -282,7 +340,7 @@ L(matches16_1):
 	sub	%rax, %rdx
 	jbe	L(return_null)
 	lea	16(%rdi, %rax), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches32_1):
@@ -290,7 +348,7 @@ L(matches32_1):
 	sub	%rax, %rdx
 	jbe	L(return_null)
 	lea	32(%rdi, %rax), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches48_1):
@@ -298,7 +356,7 @@ L(matches48_1):
 	sub	%rax, %rdx
 	jbe	L(return_null)
 	lea	48(%rdi, %rax), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(return_null):
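
The RETURN macro defined at the top of this file is the other half of the
pattern: every exit that produces a match pointer re-checks it with
"bndcu (%rax), %bnd0" before returning, so a match found past the object's
upper bound still faults.  A minimal C sketch of the same idea (hypothetical
helper, not part of the patch):

	#include <stdlib.h>

	/* Validate the computed result pointer against the upper bound
	   before handing it back, as RETURN does.  */
	char *checked_result (const char *ub, char *res)
	{
	  if (res > ub)		/* bndcu (%rax), %bnd0 */
	    abort ();		/* hardware raises #BR */
	  return res;
	}
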
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index d5c072c..77a7bca 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -23,6 +23,11 @@
 ENTRY (memcmp)
 	test	%rdx, %rdx
 	jz	L(finz)
+#ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcl	(%rsi), %bnd1
+#endif
+	pxor	%xmm0, %xmm0
 	cmpq	$1, %rdx
 	jle	L(finr1b)
 	subq	%rdi, %rsi
@@ -86,6 +91,10 @@ L(s16b):
 
 	.p2align 4,, 4
 L(finr1b):
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+#endif
 	movzbl	(%rdi), %eax
 	movzbl  (%rsi), %edx
 L(finz1):
@@ -132,6 +141,10 @@ L(gt32):
 	andq	$15, %r8
 	jz	L(16am)
 	/* Both pointers may be misaligned.  */
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu	(%rdi),	%xmm1
 	movdqu	(%rdi, %rsi), %xmm0
 	pcmpeqb   %xmm0, %xmm1
@@ -146,6 +159,10 @@ L(16am):
 	jz      L(ATR)
 	testq	$16, %rdi
 	jz	L(A32)
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu	(%rdi, %rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -160,6 +177,10 @@ L(A32):
 	/* Pre-unroll to be ready for unrolled 64B loop.  */
 	testq	$32, %rdi
 	jz	L(A64)
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -167,6 +188,10 @@ L(A32):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu    (%rdi,%rsi), %xmm0
 	pcmpeqb  (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -181,6 +206,10 @@ L(A64):
         jge	L(mt32)
 
 L(A64main):
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -188,6 +217,10 @@ L(A64main):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -195,6 +228,10 @@ L(A64main):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -202,6 +239,10 @@ L(A64main):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu    (%rdi,%rsi), %xmm0
 	pcmpeqb  (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -219,6 +260,10 @@ L(mt32):
         jge	L(mt16)
 
 L(A32main):
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -226,6 +271,10 @@ L(A32main):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqu    (%rdi,%rsi), %xmm0
 	pcmpeqb  (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -258,6 +307,10 @@ L(ATR):
 	testq	$16, %rdi
 	jz	L(ATR32)
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqa    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -273,6 +326,10 @@ L(ATR32):
 	testq	$32, %rdi
 	jz	L(ATR64)
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqa    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -280,6 +337,10 @@ L(ATR32):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqa    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -292,6 +353,10 @@ L(ATR64):
 	je	   L(mt32)
 
 L(ATR64main):
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqa    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -299,6 +364,10 @@ L(ATR64main):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqa    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -306,6 +375,10 @@ L(ATR64main):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqa    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -313,6 +386,10 @@ L(ATR64main):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqa    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -328,6 +405,10 @@ L(ATR64main):
         jge	L(mt16)
 
 L(ATR32res):
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqa    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
@@ -335,6 +416,10 @@ L(ATR32res):
 	jnz       L(neq)
 	addq       $16, %rdi
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rdi, %rsi), %bnd1
+#endif
 	movdqa    (%rdi,%rsi), %xmm0
 	pcmpeqb   (%rdi), %xmm0
 	pmovmskb  %xmm0, %edx
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index 5a659fe..3afa97c 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -27,6 +27,11 @@ ENTRY (memrchr)
 	sub	$16, %rdx
 	jbe	L(length_less16)
 
+#ifdef __CHKP__
+        bndcl  (%rdi), %bnd0
+        bndcu  -1(%rdi, %rdx), %bnd0
+#endif
+
 	punpcklbw	%xmm1, %xmm1
 	punpcklbw	%xmm1, %xmm1
 
@@ -284,6 +289,10 @@ L(length_less16_offset0):
 	test	%edx, %edx
 	jz	L(return_null)
 
+#ifdef __CHKP__
+        bndcl  (%rdi), %bnd0
+        bndcu  -1(%rdi, %rdx), %bnd0
+#endif
 	mov	%dl, %cl
 	pcmpeqb	(%rdi), %xmm1
 
@@ -314,6 +323,10 @@ L(length_less16):
 	and	$15, %rcx
 	jz	L(length_less16_offset0)
 
+#ifdef __CHKP__
+        bndcl  (%rdi), %bnd0
+        bndcu  -1(%rdi, %rdx), %bnd0
+#endif
 	mov	%rdi, %rcx
 	and	$15, %rcx
 	mov	%cl, %dh
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 6c69f4b..4e1bb84 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -26,6 +26,14 @@
 	.text
 #if !defined NOT_IN_libc
 ENTRY(__bzero)
+	testq	%rsi, %rsi
+	jz	L(only_return)
+
+# ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcu	-1(%rdi, %rsi), %bnd0
+# endif
+
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
 	pxor	%xmm8, %xmm8
@@ -53,7 +62,20 @@ ENTRY_CHK (__memset_chk)
 END_CHK (__memset_chk)
 #endif
 
+#ifdef __CHKP__
+ENTRY (mpx_memset_nochk)
+	jmp	L(entry_from_mpx_memset_nochk)
+END (mpx_memset_nochk)
+#endif
+
 ENTRY (memset)
+	testq	%rdx, %rdx
+	jz	L(only_return)
+#ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcu	-1(%rdi, %rdx), %bnd0
+L(entry_from_mpx_memset_nochk):
+#endif
 	movd	%esi, %xmm8
 	movq	%rdi, %rax
 	punpcklbw	%xmm8, %xmm8
@@ -71,6 +93,9 @@ L(entry_from_bzero):
 L(return):
 	rep
 	ret
+L(only_return):
+	movq	%rdi, %rax
+	ret
 	ALIGN (4)
 L(between_32_64_bytes):
 	movdqu	%xmm8, 16(%rdi)
@@ -129,6 +154,11 @@ L(between8_16bytes):
 END (memset)
 libc_hidden_builtin_def (memset)
 
+#ifdef __CHKP__
+weak_alias (memset, mpx_memset_nobnd)
+weak_alias (mpx_memset_nochk, mpx_memset_nobnd_nochk)
+#endif
+
 #if defined PIC && !defined NOT_IN_libc && !defined USE_MULTIARCH
 strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
 	.section .gnu.warning.__memset_zero_constant_len_parameter
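
The three exported names follow the convention used across this branch:
_nochk entry points skip the bndcl/bndcu entry checks, _nobnd variants skip
copying bounds metadata, and _nobnd_nochk skips both.  memset stores no
pointers, so there is no metadata to copy and the _nobnd names are plain
aliases here (the weak_alias lines above); they only become separate
implementations for the copy routines later in this patch.  A hedged sketch
of how a caller that has already proven its range in bounds might use the
_nochk entry (the selection logic is hypothetical; the symbol names are the
ones this patch exports):

	#include <stddef.h>

	void *mpx_memset_nochk (void *s, int c, size_t n);
	void *memset (void *s, int c, size_t n);

	void *set_bytes (void *s, int c, size_t n, int bounds_proven)
	{
	  /* If s .. s+n-1 was already checked, the entry bndcl/bndcu
	     in memset would be redundant work.  */
	  return bounds_proven ? mpx_memset_nochk (s, c, n)
			       : memset (s, c, n);
	}
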
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16e..490950e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -26,6 +26,15 @@ CFLAGS-strstr.c += -msse4
 CFLAGS-strcasestr.c += -msse4
 CFLAGS-strcasestr-nonascii.c += -msse4
 endif
+
+ifeq ($(enable-mpx), yes)
+sysdep_routines += memcpy-ssse3-back-1 mempcpy-ssse3-back-1 memmove-ssse3-back-1 \
+						 memcpy-c memmove-c mempcpy-c memcpy_chk-c mempcpy_chk-c memmove_chk-c
+ASFLAGS-memcpy-ssse3-back-1.S += -fno-mpx
+ASFLAGS-mempcpy-ssse3-back-1.S += -fno-mpx
+ASFLAGS-memmove-ssse3-back-1.S += -fno-mpx
+endif
+
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/multiarch/Versions b/sysdeps/x86_64/multiarch/Versions
index 59b185a..fa1cf0a 100644
--- a/sysdeps/x86_64/multiarch/Versions
+++ b/sysdeps/x86_64/multiarch/Versions
@@ -2,4 +2,17 @@ libc {
   GLIBC_PRIVATE {
     __get_cpu_features;
   }
+%ifdef __CHKP__
+  GLIBC_2.17 {
+   mpx_memcpy_nobnd;
+   mpx_memmove_nobnd;
+   mpx_mempcpy_nobnd;
+   mpx_memcpy_nobnd_nochk;
+   mpx_memmove_nobnd_nochk;
+   mpx_mempcpy_nobnd_nochk;
+   mpx_memcpy_nochk;
+   mpx_memmove_nochk;
+   mpx_mempcpy_nochk;
+  }
+%endif
 }
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
index 639f02b..9809d47 100644
--- a/sysdeps/x86_64/multiarch/bcopy.S
+++ b/sysdeps/x86_64/multiarch/bcopy.S
@@ -3,5 +3,10 @@
 	.text
 ENTRY(bcopy)
 	xchg	%rdi, %rsi
+#ifdef __CHKP__
+	bndmov %bnd0, %bnd2
+	bndmov %bnd1, %bnd0
+	bndmov %bnd2, %bnd1
+#endif
 	jmp	__libc_memmove	/* Branch to IFUNC memmove.  */
 END(bcopy)
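
bcopy takes (src, dst, n) while memmove takes (dst, src, n); the existing
xchg swaps the pointer registers, so under MPX their bounds must travel with
them, hence the three-instruction rotate through the scratch register %bnd2.
The same shape in C (illustrative only):

	struct bounds { const char *lb, *ub; };

	void swap_bounds (struct bounds *b0, struct bounds *b1)
	{
	  struct bounds b2 = *b0;	/* bndmov %bnd0, %bnd2 */
	  *b0 = *b1;			/* bndmov %bnd1, %bnd0 */
	  *b1 = b2;			/* bndmov %bnd2, %bnd1 */
	}
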
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d0992e1..e3a4163 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -44,6 +44,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
 
+#ifndef __CHKP__
+  /* We use a specific version for MPX glibc.  */
   /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
   IFUNC_IMPL (i, name, __memmove_chk,
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
@@ -60,6 +62,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
 			      __memmove_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
+#endif
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.S.  */
   IFUNC_IMPL (i, name, stpncpy,
@@ -207,6 +210,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 
 #ifdef SHARED
+#ifndef __CHKP__
+  /* We use specific versions of memcpy, memcpy_chk and mempcpy if Intel MPX is enabled.  */
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
@@ -240,6 +245,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
 			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
+#endif
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 1ed4200..b5c6675 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -48,6 +48,13 @@ ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
 	shl	$2, %rdx
 # endif
+# ifdef __CHKP__
+	testq	%rdx, %rdx
+	jz	L(NoEntryCheck)
+	bndcl	(%rdi), %bnd0
+	bndcl	(%rsi), %bnd1
+L(NoEntryCheck):
+# endif
 	pxor	%xmm0, %xmm0
 	cmp	$79, %rdx
 	ja	L(79bytesormore)
@@ -70,6 +77,10 @@ L(firstbyte):
 
 	ALIGN (4)
 L(79bytesormore):
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqu	(%rsi), %xmm1
 	movdqu	(%rdi), %xmm2
 	pxor	%xmm1, %xmm2
@@ -90,21 +101,37 @@ L(79bytesormore):
 L(less128bytes):
 	sub	$64, %rdx
 
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytesin256)
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqu	16(%rdi), %xmm2
 	pxor	16(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(32bytesin256)
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqu	32(%rdi), %xmm2
 	pxor	32(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(48bytesin256)
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqu	48(%rdi), %xmm2
 	pxor	48(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -112,11 +139,19 @@ L(less128bytes):
 	cmp	$32, %rdx
 	jb	L(less32bytesin64)
 
+# ifdef __CHKP__
+	bndcu	64(%rdi), %bnd0
+	bndcu	64(%rsi), %bnd1
+# endif
 	movdqu	64(%rdi), %xmm2
 	pxor	64(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(80bytesin256)
 
+# ifdef __CHKP__
+	bndcu	80(%rdi), %bnd0
+	bndcu	80(%rsi), %bnd1
+# endif
 	movdqu	80(%rdi), %xmm2
 	pxor	80(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -139,41 +174,73 @@ L(128bytesormore):
 L(less256bytes):
 	sub	$128, %rdx
 
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytesin256)
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqu	16(%rdi), %xmm2
 	pxor	16(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(32bytesin256)
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqu	32(%rdi), %xmm2
 	pxor	32(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(48bytesin256)
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqu	48(%rdi), %xmm2
 	pxor	48(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(64bytesin256)
 
+# ifdef __CHKP__
+	bndcu	64(%rdi), %bnd0
+	bndcu	64(%rsi), %bnd1
+# endif
 	movdqu	64(%rdi), %xmm2
 	pxor	64(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(80bytesin256)
 
+# ifdef __CHKP__
+	bndcu	80(%rdi), %bnd0
+	bndcu	80(%rsi), %bnd1
+# endif
 	movdqu	80(%rdi), %xmm2
 	pxor	80(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(96bytesin256)
 
+# ifdef __CHKP__
+	bndcu	96(%rdi), %bnd0
+	bndcu	96(%rsi), %bnd1
+# endif
 	movdqu	96(%rdi), %xmm2
 	pxor	96(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(112bytesin256)
 
+# ifdef __CHKP__
+	bndcu	112(%rdi), %bnd0
+	bndcu	112(%rsi), %bnd1
+# endif
 	movdqu	112(%rdi), %xmm2
 	pxor	112(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -188,11 +255,19 @@ L(less256bytes):
 	cmp	$32, %rdx
 	jb	L(less32bytesin128)
 
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytesin256)
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqu	16(%rdi), %xmm2
 	pxor	16(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -207,81 +282,145 @@ L(less32bytesin128):
 
 L(less512bytes):
 	sub	$256, %rdx
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytesin256)
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqu	16(%rdi), %xmm2
 	pxor	16(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(32bytesin256)
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqu	32(%rdi), %xmm2
 	pxor	32(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(48bytesin256)
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqu	48(%rdi), %xmm2
 	pxor	48(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(64bytesin256)
 
+# ifdef __CHKP__
+	bndcu	64(%rdi), %bnd0
+	bndcu	64(%rsi), %bnd1
+# endif
 	movdqu	64(%rdi), %xmm2
 	pxor	64(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(80bytesin256)
 
+# ifdef __CHKP__
+	bndcu	80(%rdi), %bnd0
+	bndcu	80(%rsi), %bnd1
+# endif
 	movdqu	80(%rdi), %xmm2
 	pxor	80(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(96bytesin256)
 
+# ifdef __CHKP__
+	bndcu	96(%rdi), %bnd0
+	bndcu	96(%rsi), %bnd1
+# endif
 	movdqu	96(%rdi), %xmm2
 	pxor	96(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(112bytesin256)
 
+# ifdef __CHKP__
+	bndcu	112(%rdi), %bnd0
+	bndcu	112(%rsi), %bnd1
+# endif
 	movdqu	112(%rdi), %xmm2
 	pxor	112(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(128bytesin256)
 
+# ifdef __CHKP__
+	bndcu	128(%rdi), %bnd0
+	bndcu	128(%rsi), %bnd1
+# endif
 	movdqu	128(%rdi), %xmm2
 	pxor	128(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(144bytesin256)
 
+# ifdef __CHKP__
+	bndcu	144(%rdi), %bnd0
+	bndcu	144(%rsi), %bnd1
+# endif
 	movdqu	144(%rdi), %xmm2
 	pxor	144(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(160bytesin256)
 
+# ifdef __CHKP__
+	bndcu	160(%rdi), %bnd0
+	bndcu	160(%rsi), %bnd1
+# endif
 	movdqu	160(%rdi), %xmm2
 	pxor	160(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(176bytesin256)
 
+# ifdef __CHKP__
+	bndcu	176(%rdi), %bnd0
+	bndcu	176(%rsi), %bnd1
+# endif
 	movdqu	176(%rdi), %xmm2
 	pxor	176(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(192bytesin256)
 
+# ifdef __CHKP__
+	bndcu	192(%rdi), %bnd0
+	bndcu	192(%rsi), %bnd1
+# endif
 	movdqu	192(%rdi), %xmm2
 	pxor	192(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(208bytesin256)
 
+# ifdef __CHKP__
+	bndcu	208(%rdi), %bnd0
+	bndcu	208(%rsi), %bnd1
+# endif
 	movdqu	208(%rdi), %xmm2
 	pxor	208(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(224bytesin256)
 
+# ifdef __CHKP__
+	bndcu	224(%rdi), %bnd0
+	bndcu	224(%rsi), %bnd1
+# endif
 	movdqu	224(%rdi), %xmm2
 	pxor	224(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(240bytesin256)
 
+# ifdef __CHKP__
+	bndcu	240(%rdi), %bnd0
+	bndcu	240(%rsi), %bnd1
+# endif
 	movdqu	240(%rdi), %xmm2
 	pxor	240(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -299,11 +438,19 @@ L(less512bytes):
 	cmp	$32, %rdx
 	jb	L(less32bytesin256)
 
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytesin256)
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqu	16(%rdi), %xmm2
 	pxor	16(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -331,18 +478,34 @@ L(512bytesormore):
 	sub	$64, %rdx
 	ALIGN (4)
 L(64bytesormore_loop):
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	movdqa	%xmm2, %xmm1
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqu	16(%rdi), %xmm3
 	pxor	16(%rsi), %xmm3
 	por	%xmm3, %xmm1
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqu	32(%rdi), %xmm4
 	pxor	32(%rsi), %xmm4
 	por	%xmm4, %xmm1
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqu	48(%rdi), %xmm5
 	pxor	48(%rsi), %xmm5
 	por	%xmm5, %xmm1
@@ -365,18 +528,34 @@ L(L2_L3_cache_unaglined):
 L(L2_L3_unaligned_128bytes_loop):
 	prefetchnta 0x1c0(%rdi)
 	prefetchnta 0x1c0(%rsi)
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	movdqa	%xmm2, %xmm1
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqu	16(%rdi), %xmm3
 	pxor	16(%rsi), %xmm3
 	por	%xmm3, %xmm1
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqu	32(%rdi), %xmm4
 	pxor	32(%rsi), %xmm4
 	por	%xmm4, %xmm1
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqu	48(%rdi), %xmm5
 	pxor	48(%rsi), %xmm5
 	por	%xmm5, %xmm1
@@ -403,21 +582,37 @@ L(2aligned):
 L(less128bytesin2aligned):
 	sub	$64, %rdx
 
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqa	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytesin256)
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqa	16(%rdi), %xmm2
 	pxor	16(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(32bytesin256)
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqa	32(%rdi), %xmm2
 	pxor	32(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(48bytesin256)
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqa	48(%rdi), %xmm2
 	pxor	48(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -425,11 +620,19 @@ L(less128bytesin2aligned):
 	cmp	$32, %rdx
 	jb	L(less32bytesin64in2alinged)
 
+# ifdef __CHKP__
+	bndcu	64(%rdi), %bnd0
+	bndcu	64(%rsi), %bnd1
+# endif
 	movdqa	64(%rdi), %xmm2
 	pxor	64(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(80bytesin256)
 
+# ifdef __CHKP__
+	bndcu	80(%rdi), %bnd0
+	bndcu	80(%rsi), %bnd1
+# endif
 	movdqa	80(%rdi), %xmm2
 	pxor	80(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -453,41 +656,73 @@ L(128bytesormorein2aligned):
 L(less256bytesin2alinged):
 	sub	$128, %rdx
 
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqa	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytesin256)
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqa	16(%rdi), %xmm2
 	pxor	16(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(32bytesin256)
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqa	32(%rdi), %xmm2
 	pxor	32(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(48bytesin256)
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqa	48(%rdi), %xmm2
 	pxor	48(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(64bytesin256)
 
+# ifdef __CHKP__
+	bndcu	64(%rdi), %bnd0
+	bndcu	64(%rsi), %bnd1
+# endif
 	movdqa	64(%rdi), %xmm2
 	pxor	64(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(80bytesin256)
 
+# ifdef __CHKP__
+	bndcu	80(%rdi), %bnd0
+	bndcu	80(%rsi), %bnd1
+# endif
 	movdqa	80(%rdi), %xmm2
 	pxor	80(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(96bytesin256)
 
+# ifdef __CHKP__
+	bndcu	96(%rdi), %bnd0
+	bndcu	96(%rsi), %bnd1
+# endif
 	movdqa	96(%rdi), %xmm2
 	pxor	96(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(112bytesin256)
 
+# ifdef __CHKP__
+	bndcu	112(%rdi), %bnd0
+	bndcu	112(%rsi), %bnd1
+# endif
 	movdqa	112(%rdi), %xmm2
 	pxor	112(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -502,11 +737,19 @@ L(less256bytesin2alinged):
 	cmp	$32, %rdx
 	jb	L(less32bytesin128in2aligned)
 
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytesin256)
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqu	16(%rdi), %xmm2
 	pxor	16(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -523,81 +766,145 @@ L(less32bytesin128in2aligned):
 L(256bytesormorein2aligned):
 
 	sub	$256, %rdx
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqa	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytesin256)
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqa	16(%rdi), %xmm2
 	pxor	16(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(32bytesin256)
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqa	32(%rdi), %xmm2
 	pxor	32(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(48bytesin256)
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqa	48(%rdi), %xmm2
 	pxor	48(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(64bytesin256)
 
+# ifdef __CHKP__
+	bndcu	64(%rdi), %bnd0
+	bndcu	64(%rsi), %bnd1
+# endif
 	movdqa	64(%rdi), %xmm2
 	pxor	64(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(80bytesin256)
 
+# ifdef __CHKP__
+	bndcu	80(%rdi), %bnd0
+	bndcu	80(%rsi), %bnd1
+# endif
 	movdqa	80(%rdi), %xmm2
 	pxor	80(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(96bytesin256)
 
+# ifdef __CHKP__
+	bndcu	96(%rdi), %bnd0
+	bndcu	96(%rsi), %bnd1
+# endif
 	movdqa	96(%rdi), %xmm2
 	pxor	96(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(112bytesin256)
 
+# ifdef __CHKP__
+	bndcu	112(%rdi), %bnd0
+	bndcu	112(%rsi), %bnd1
+# endif
 	movdqa	112(%rdi), %xmm2
 	pxor	112(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(128bytesin256)
 
+# ifdef __CHKP__
+	bndcu	128(%rdi), %bnd0
+	bndcu	128(%rsi), %bnd1
+# endif
 	movdqa	128(%rdi), %xmm2
 	pxor	128(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(144bytesin256)
 
+# ifdef __CHKP__
+	bndcu	144(%rdi), %bnd0
+	bndcu	144(%rsi), %bnd1
+# endif
 	movdqa	144(%rdi), %xmm2
 	pxor	144(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(160bytesin256)
 
+# ifdef __CHKP__
+	bndcu	160(%rdi), %bnd0
+	bndcu	160(%rsi), %bnd1
+# endif
 	movdqa	160(%rdi), %xmm2
 	pxor	160(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(176bytesin256)
 
+# ifdef __CHKP__
+	bndcu	176(%rdi), %bnd0
+	bndcu	176(%rsi), %bnd1
+# endif
 	movdqa	176(%rdi), %xmm2
 	pxor	176(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(192bytesin256)
 
+# ifdef __CHKP__
+	bndcu	192(%rdi), %bnd0
+	bndcu	192(%rsi), %bnd1
+# endif
 	movdqa	192(%rdi), %xmm2
 	pxor	192(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(208bytesin256)
 
+# ifdef __CHKP__
+	bndcu	208(%rdi), %bnd0
+	bndcu	208(%rsi), %bnd1
+# endif
 	movdqa	208(%rdi), %xmm2
 	pxor	208(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(224bytesin256)
 
+# ifdef __CHKP__
+	bndcu	224(%rdi), %bnd0
+	bndcu	224(%rsi), %bnd1
+# endif
 	movdqa	224(%rdi), %xmm2
 	pxor	224(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(240bytesin256)
 
+# ifdef __CHKP__
+	bndcu	240(%rdi), %bnd0
+	bndcu	240(%rsi), %bnd1
+# endif
 	movdqa	240(%rdi), %xmm2
 	pxor	240(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -648,18 +955,34 @@ L(512bytesormorein2aligned):
 	sub	$64, %rdx
 	ALIGN (4)
 L(64bytesormore_loopin2aligned):
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqa	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	movdqa	%xmm2, %xmm1
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqa	16(%rdi), %xmm3
 	pxor	16(%rsi), %xmm3
 	por	%xmm3, %xmm1
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqa	32(%rdi), %xmm4
 	pxor	32(%rsi), %xmm4
 	por	%xmm4, %xmm1
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqa	48(%rdi), %xmm5
 	pxor	48(%rsi), %xmm5
 	por	%xmm5, %xmm1
@@ -682,18 +1005,34 @@ L(L2_L3_cache_aglined):
 L(L2_L3_aligned_128bytes_loop):
 	prefetchnta 0x1c0(%rdi)
 	prefetchnta 0x1c0(%rsi)
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+# endif
 	movdqa	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	movdqa	%xmm2, %xmm1
 
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+	bndcu	16(%rsi), %bnd1
+# endif
 	movdqa	16(%rdi), %xmm3
 	pxor	16(%rsi), %xmm3
 	por	%xmm3, %xmm1
 
+# ifdef __CHKP__
+	bndcu	32(%rdi), %bnd0
+	bndcu	32(%rsi), %bnd1
+# endif
 	movdqa	32(%rdi), %xmm4
 	pxor	32(%rsi), %xmm4
 	por	%xmm4, %xmm1
 
+# ifdef __CHKP__
+	bndcu	48(%rdi), %bnd0
+	bndcu	48(%rsi), %bnd1
+# endif
 	movdqa	48(%rdi), %xmm5
 	pxor	48(%rsi), %xmm5
 	por	%xmm5, %xmm1
diff --git a/sysdeps/x86_64/multiarch/memcpy-c.c b/sysdeps/x86_64/multiarch/memcpy-c.c
new file mode 100644
index 0000000..7076d4a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-c.c
@@ -0,0 +1,70 @@
+/* C version of memcpy for use when Intel MPX is enabled,
+   in order to process a buffer of pointers correctly.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef __CHKP__
+
+# include <stddef.h>
+
+void *
+__memcpy (void *dst, const void *src, size_t n)
+{
+  const char *s = src;
+  char *d = dst;
+  void *ret = dst;
+  size_t offset_src = ((size_t) s) & (sizeof(size_t) - 1);
+  size_t offset_dst = ((size_t) d) & (sizeof(size_t) - 1);
+
+  if (offset_src != offset_dst)
+  {
+    while (n--)
+      *d++ = *s++;
+  }
+  else
+  {
+    if (offset_src) offset_src = sizeof(size_t) - offset_src;
+    while (n-- && offset_src--)
+      *d++ = *s++;
+    n++;
+    if (!n) return ret;
+    void **d1 = (void **)d;
+    void **s1 = (void **)s;
+    while (n >= sizeof(void *))
+    {
+      n -= sizeof(void *);
+      *d1++ = *s1++;
+    }
+    s = (char *)s1;
+    d = (char *)d1;
+    while (n--)
+      *d++ = *s++;
+  }
+  return ret;
+}
+
+weak_alias (__memcpy, __GI_memcpy)
+
+# if defined SHARED && !defined NOT_IN_libc && !defined IA32
+#  include <shlib-compat.h>
+versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
+# else
+weak_alias (__memcpy, memcpy)
+# endif
+
+weak_alias (__memcpy, mpx_memcpy_nochk)
+#endif
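
Why a plain C memcpy under MPX at all: bounds metadata lives in the bounds
tables keyed by the address a pointer is stored at, and it survives a copy
only when pointers are moved as whole, aligned words by instrumented code.
The SSE variants would strip the metadata from every pointer in the buffer,
so this file copies the bulk as void * words and falls back to byte loops
for the unaligned head and tail, or entirely when source and destination
are misaligned relative to each other.  A small, hypothetical sanity check
of that split (not part of the patch):

	#include <assert.h>
	#include <string.h>

	int main (void)
	{
	  char src[32] = "bound-check me", dst[32];

	  /* Offsets likely differ here, exercising the byte loop.  */
	  memcpy (dst + 1, src, 15);
	  assert (memcmp (dst + 1, src, 15) == 0);

	  /* Matching offsets take the word-at-a-time path.  */
	  memcpy (dst, src, 15);
	  assert (memcmp (dst, src, 15) == 0);
	  return 0;
	}
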
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back-1.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back-1.S
new file mode 100644
index 0000000..e0c179a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back-1.S
@@ -0,0 +1,4 @@
+#ifdef __CHKP__
+# define MEMCPY mpx_memcpy_nobnd_nochk
+# include "memcpy-ssse3-back.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index fc9fcef..5731b9d 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -27,7 +27,11 @@
 #include "asm-syntax.h"
 
 #ifndef MEMCPY
-# define MEMCPY		__memcpy_ssse3_back
+# ifdef __CHKP__
+#  define MEMCPY		mpx_memcpy_nobnd
+# else
+#  define MEMCPY		__memcpy_ssse3_back
+# endif
 # define MEMCPY_CHK	__memcpy_chk_ssse3_back
 #endif
 
@@ -48,7 +52,7 @@
   ud2
 
 	.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_BCOPY
+#if !defined USE_AS_BCOPY && defined MEMCPY_CHK
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
@@ -56,6 +60,15 @@ END (MEMCPY_CHK)
 #endif
 
 ENTRY (MEMCPY)
+#ifdef __CHKP__
+	testq	%rdx, %rdx
+	jz	L(NoEntryCheck)
+	bndcl	(%rdi), %bnd0
+	bndcu	-1(%rdi, %rdx), %bnd0
+	bndcl	(%rsi), %bnd1
+	bndcu	-1(%rsi, %rdx), %bnd1
+#endif
+
 	mov	%rdi, %rax
 #ifdef USE_AS_MEMPCPY
 	add	%rdx, %rax
@@ -87,6 +100,15 @@ L(bk_write):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
 #endif
 
+#ifdef __CHKP__
+L(NoEntryCheck):
+	mov	%rdi, %rax
+# ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+# endif
+	ret
+#endif
+
 	ALIGN (4)
 L(144bytesormore):
 
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index a1e5031..34987b8 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -18,14 +18,15 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <shlib-compat.h>
-#include <init-arch.h>
+#ifndef __CHKP__
+# include <sysdep.h>
+# include <shlib-compat.h>
+# include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib and for
    DSO.  In static binaries we need memcpy before the initialization
    happened.  */
-#if defined SHARED && !defined NOT_IN_libc
+# if defined SHARED && !defined NOT_IN_libc
 	.text
 ENTRY(__new_memcpy)
 	.type	__new_memcpy, @gnu_indirect_function
@@ -43,37 +44,39 @@ ENTRY(__new_memcpy)
 3:	ret
 END(__new_memcpy)
 
-# undef ENTRY
-# define ENTRY(name) \
+#  undef ENTRY
+#  define ENTRY(name) \
 	.type __memcpy_sse2, @function; \
 	.globl __memcpy_sse2; \
 	.hidden __memcpy_sse2; \
 	.p2align 4; \
 	__memcpy_sse2: cfi_startproc; \
 	CALL_MCOUNT
-# undef END
-# define END(name) \
+#  undef END
+#  define END(name) \
 	cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2
 
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
+#  undef ENTRY_CHK
+#  define ENTRY_CHK(name) \
 	.type __memcpy_chk_sse2, @function; \
 	.globl __memcpy_chk_sse2; \
 	.p2align 4; \
 	__memcpy_chk_sse2: cfi_startproc; \
 	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
+#  undef END_CHK
+#  define END_CHK(name) \
 	cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
 
-# undef libc_hidden_builtin_def
+#  undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memcpy calls through a PLT.
    The speedup we get from using SSSE3 instruction is likely eaten away
    by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
+#  define libc_hidden_builtin_def(name) \
 	.globl __GI_memcpy; __GI_memcpy = __memcpy_sse2
 
 versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14);
-#endif
+# endif
+
+# include "../memcpy.S"
 
-#include "../memcpy.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk-c.c b/sysdeps/x86_64/multiarch/memcpy_chk-c.c
new file mode 100644
index 0000000..3bca281
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy_chk-c.c
@@ -0,0 +1,3 @@
+#ifdef __CHKP__
+# include <debug/memcpy_chk.c>
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index ad01d8c..5b03f20 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -18,14 +18,15 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
+#ifndef __CHKP__
+# include <sysdep.h>
+# include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib and for
    DSO.  There are no multiarch memcpy functions for static binaries.
  */
-#ifndef NOT_IN_libc
-# ifdef SHARED
+# ifndef NOT_IN_libc
+#  ifdef SHARED
 	.text
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
@@ -41,7 +42,8 @@ ENTRY(__memcpy_chk)
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
 2:	ret
 END(__memcpy_chk)
-# else
-#  include "../memcpy_chk.S"
+#  else
+#   include "../memcpy_chk.S"
+#  endif
 # endif
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove-c.c b/sysdeps/x86_64/multiarch/memmove-c.c
new file mode 100644
index 0000000..63d779e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-c.c
@@ -0,0 +1,108 @@
+/* C version of memmove for use when Intel MPX is enabled,
+   in order to process a buffer of pointers correctly.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef __CHKP__
+
+# include <stddef.h>
+
+void *
+__memmove (void *dst, const void *src, size_t n)
+{
+  const char *s = src;
+  char *d = dst;
+  void *ret = dst;
+  size_t offset_src = ((size_t) s) & (sizeof(size_t) - 1);
+  size_t offset_dst = ((size_t) d) & (sizeof(size_t) - 1);
+
+  if (offset_src != offset_dst)
+  {
+    if (s < d)
+    {
+      /* backward copying */
+      d += n;
+      s += n;
+      while (n--)
+        *--d = *--s;
+    }
+    else
+      /* forward copying */
+      while (n--)
+        *d++ = *s++;
+  }
+  else
+  {
+    if (s < d)
+    {
+      offset_src = (offset_src + n) & (sizeof(size_t) - 1); /* misalignment of src + n */
+      /* backward copying */
+      d += n;
+      s += n;
+      while (n-- && offset_src--)
+        *--d = *--s;
+      n++;
+      if (!n) return ret;
+      void **d1 = (void **)d;
+      void **s1 = (void **)s;
+      while (n >= sizeof(void *))
+      {
+        n -= sizeof(void *);
+        *--d1 = *--s1;
+      }
+      s = (char *)s1;
+      d = (char *)d1;
+      while (n--)
+        *--d = *--s;
+    }
+    else
+    {
+      if (offset_src) offset_src = sizeof(size_t) - offset_src;
+      /* forward copying */
+      while (n-- && offset_src--)
+        *d++ = *s++;
+      n++;
+      if (!n) return ret;
+      void **d1 = (void **)d;
+      void **s1 = (void **)s;
+      while (n >= sizeof(void *))
+      {
+        n -= sizeof(void *);
+        *d1++ = *s1++;
+      }
+      s = (char *)s1;
+      d = (char *)d1;
+      while (n--)
+        *d++ = *s++;
+    }
+  }
+  return ret;
+}
+
+weak_alias (__memmove, __libc_memmove)
+weak_alias (__memmove, __GI_memmove)
+weak_alias (__memmove, memmove)
+
+# if defined SHARED && !defined NOT_IN_libc
+#  include <shlib-compat.h>
+#  if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
+#  endif
+# endif
+
+weak_alias (__memmove, mpx_memmove_nochk)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back-1.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back-1.S
new file mode 100644
index 0000000..45a8209
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3-back-1.S
@@ -0,0 +1,5 @@
+#ifdef __CHKP__
+# define USE_AS_MEMMOVE
+# define MEMCPY mpx_memmove_nobnd_nochk
+# include "memcpy-ssse3-back.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
index f9a4e9a..53e90e7 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
@@ -1,4 +1,10 @@
 #define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3_back
+#ifdef __CHKP__
+/* Version of memmove without support for copying bounds
+   when there are pointers in the source buffer.  */
+# define MEMCPY		mpx_memmove_nobnd
+#else
+# define MEMCPY		__memmove_ssse3_back
+#endif
 #define MEMCPY_CHK	__memmove_chk_ssse3_back
 #include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index 8149c48..0d2c6f0 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -17,31 +17,32 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef NOT_IN_libc
-# define MEMMOVE __memmove_sse2
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-#  define libc_hidden_builtin_def(name) \
+#ifndef __CHKP__
+# ifndef NOT_IN_libc
+#  define MEMMOVE __memmove_sse2
+#  ifdef SHARED
+#   undef libc_hidden_builtin_def
+#   define libc_hidden_builtin_def(name) \
   __hidden_ver1 (__memmove_sse2, __GI_memmove, __memmove_sse2);
-# endif
+#  endif
 
 /* Redefine memmove so that the compiler won't complain about the type
    mismatch with the IFUNC selector in strong_alias, below.  */
-# undef memmove
-# define memmove __redirect_memmove
-# include <string.h>
-# undef memmove
+#  undef memmove
+#  define memmove __redirect_memmove
+#  include <string.h>
+#  undef memmove
 
 extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
-#endif
+# endif
 
-#include "string/memmove.c"
+# include "string/memmove.c"
 
-#ifndef NOT_IN_libc
-# include <shlib-compat.h>
-# include "init-arch.h"
+# ifndef NOT_IN_libc
+#  include <shlib-compat.h>
+#  include "init-arch.h"
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
@@ -54,7 +55,8 @@ libc_ifunc (__libc_memmove,
 
 strong_alias (__libc_memmove, memmove)
 
-# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+#  if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
 compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
+#  endif
 # endif
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove_chk-c.c b/sysdeps/x86_64/multiarch/memmove_chk-c.c
new file mode 100644
index 0000000..bbf53d0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove_chk-c.c
@@ -0,0 +1,3 @@
+#ifdef __CHKP__
+# include <debug/memmove_chk.c>
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 17ed460..c1b0b93 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -17,19 +17,21 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <string.h>
-#include "init-arch.h"
+#ifndef __CHKP__
+# include <string.h>
+# include "init-arch.h"
 
-#define MEMMOVE_CHK __memmove_chk_sse2
+# define MEMMOVE_CHK __memmove_chk_sse2
 
 extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
 
-#include "debug/memmove_chk.c"
+# include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
 	    HAS_SSSE3
 	    ? (HAS_FAST_COPY_BACKWARD
 	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
 	    : __memmove_chk_sse2);
+#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy-c.c b/sysdeps/x86_64/multiarch/mempcpy-c.c
new file mode 100644
index 0000000..b9fcb11
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-c.c
@@ -0,0 +1,64 @@
+/* C version of mempcpy for use when Intel MPX is enabled,
+   in order to process an array of pointers correctly.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef __CHKP__
+
+# include <stddef.h>
+
+void *
+mempcpy (void *dst, const void *src, size_t n)
+{
+  const char *s = src;
+  char *d = dst;
+  void *ret = dst + n;
+  size_t offset_src = ((size_t) s) & (sizeof(size_t) - 1);
+  size_t offset_dst = ((size_t) d) & (sizeof(size_t) - 1);
+
+  if (offset_src != offset_dst)
+  {
+    while (n--)
+      *d++ = *s++;
+  }
+  else
+  {
+    if (offset_src) offset_src = sizeof(size_t) - offset_src;
+    while (n-- && offset_src--)
+      *d++ = *s++;
+    n++;
+    if (!n) return ret;
+    void **d1 = (void **)d;
+    void **s1 = (void **)s;
+    while (n >= sizeof(void *))
+    {
+      n -= sizeof(void *);
+      *d1++ = *s1++;
+    }
+    s = (char *)s1;
+    d = (char *)d1;
+    while (n--)
+      *d++ = *s++;
+  }
+  return ret;
+}
+
+weak_alias (mempcpy, __GI_mempcpy)
+weak_alias (mempcpy, __GI___mempcpy)
+weak_alias (mempcpy, __mempcpy)
+weak_alias (mempcpy, mpx_mempcpy_nochk)
+#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back-1.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back-1.S
new file mode 100644
index 0000000..8fa99b5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back-1.S
@@ -0,0 +1,6 @@
+#ifdef __CHKP__
+# define USE_AS_MEMPCPY
+/* The version of mempcpy without any checks or copying of bounds.  */
+# define MEMCPY mpx_mempcpy_nobnd_nochk
+# include "memcpy-ssse3-back.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
index 82ffacb..2aa5313 100644
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
@@ -1,4 +1,12 @@
 #define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_ssse3_back
-#define MEMCPY_CHK	__mempcpy_chk_ssse3_back
+
+#ifdef __CHKP__
+/* Version of mempcpy without support for copying bounds
+   when there are pointers in the source buffer.  */
+# define MEMCPY  mpx_mempcpy_nobnd
+#else
+# define MEMCPY	__mempcpy_ssse3_back
+#endif
+
+#define MEMCPY_CHK __mempcpy_chk_ssse3_back
 #include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b8b7fcd..b4bfbdc 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -18,13 +18,14 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
+#ifndef __CHKP__
+# include <sysdep.h>
+# include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib and for
    DSO.  In static binaries we need mempcpy before the initialization
    happened.  */
-#if defined SHARED && !defined NOT_IN_libc
+# if defined SHARED && !defined NOT_IN_libc
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
@@ -40,38 +41,40 @@ ENTRY(__mempcpy)
 2:	ret
 END(__mempcpy)
 
-# undef ENTRY
-# define ENTRY(name) \
+#  undef ENTRY
+#  define ENTRY(name) \
 	.type __mempcpy_sse2, @function; \
 	.p2align 4; \
 	.globl __mempcpy_sse2; \
 	.hidden __mempcpy_sse2; \
 	__mempcpy_sse2: cfi_startproc; \
 	CALL_MCOUNT
-# undef END
-# define END(name) \
+#  undef END
+#  define END(name) \
 	cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2
 
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
+#  undef ENTRY_CHK
+#  define ENTRY_CHK(name) \
 	.type __mempcpy_chk_sse2, @function; \
 	.globl __mempcpy_chk_sse2; \
 	.p2align 4; \
 	__mempcpy_chk_sse2: cfi_startproc; \
 	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
+#  undef END_CHK
+#  define END_CHK(name) \
 	cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
 
-# undef libc_hidden_def
-# undef libc_hidden_builtin_def
+#  undef libc_hidden_def
+#  undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
    The speedup we get from using SSSE3 instruction is likely eaten away
    by the indirect call in the PLT.  */
-# define libc_hidden_def(name) \
+#  define libc_hidden_def(name) \
 	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
-# define libc_hidden_builtin_def(name) \
+#  define libc_hidden_builtin_def(name) \
 	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
-#endif
+# endif
+
+# include "../mempcpy.S"
 
-#include "../mempcpy.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk-c.c b/sysdeps/x86_64/multiarch/mempcpy_chk-c.c
new file mode 100644
index 0000000..40ae725
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk-c.c
@@ -0,0 +1,3 @@
+#ifdef __CHKP__
+# include <debug/mempcpy_chk.c>
+#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 3801db3..10653c5 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -18,14 +18,15 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
+#ifndef __CHKP__
+# include <sysdep.h>
+# include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib and for
    DSO.  There are no multiarch mempcpy functions for static binaries.
  */
-#ifndef NOT_IN_libc
-# ifdef SHARED
+# ifndef NOT_IN_libc
+#  ifdef SHARED
 	.text
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
@@ -41,7 +42,8 @@ ENTRY(__mempcpy_chk)
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
 2:	ret
 END(__mempcpy_chk)
-# else
-#  include "../mempcpy_chk.S"
+#  else
+#   include "../mempcpy_chk.S"
+#  endif
 # endif
 #endif
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 028c6d3..a3535ad 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -25,6 +25,14 @@
 #  define STRCAT  __strcat_sse2_unaligned
 # endif
 
+# ifdef __CHKP__
+#  define RETURN \
+	bndcu	-1(%rdi, %rax), %bnd0; \
+	ret
+# else
+#  define RETURN ret
+# endif
+
 # define USE_AS_STRCAT
 
 .text
@@ -37,6 +45,10 @@ ENTRY (STRCAT)
 /* Inline corresponding strlen file, temporary until new strcpy
    implementation gets merged.  */
 
+# ifdef __CHKP__
+	bndcl  (%rdi), %bnd0
+	bndcu	(%rdi), %bnd0
+# endif
 	xor	%rax, %rax
 	mov	%edi, %ecx
 	and	$0x3f, %ecx
@@ -67,84 +79,132 @@ L(align16_start):
 	pxor	%xmm1, %xmm1
 	pxor	%xmm2, %xmm2
 	pxor	%xmm3, %xmm3
+# ifdef __CHKP__
+	bndcu	16(%rax), %bnd0
+# endif
 	pcmpeqb	16(%rax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	jnz	L(exit16)
 
+# ifdef __CHKP__
+	bndcu	32(%rax), %bnd0
+# endif
 	pcmpeqb	32(%rax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	jnz	L(exit32)
 
+# ifdef __CHKP__
+	bndcu	48(%rax), %bnd0
+# endif
 	pcmpeqb	48(%rax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	jnz	L(exit48)
 
+# ifdef __CHKP__
+	bndcu	64(%rax), %bnd0
+# endif
 	pcmpeqb	64(%rax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
 	jnz	L(exit64)
 
+# ifdef __CHKP__
+	bndcu	80(%rax), %bnd0
+# endif
 	pcmpeqb	80(%rax), %xmm0
 	add	$64, %rax
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	jnz	L(exit16)
 
+# ifdef __CHKP__
+	bndcu	32(%rax), %bnd0
+# endif
 	pcmpeqb	32(%rax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	jnz	L(exit32)
 
+# ifdef __CHKP__
+	bndcu	48(%rax), %bnd0
+# endif
 	pcmpeqb	48(%rax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	jnz	L(exit48)
 
+# ifdef __CHKP__
+	bndcu	64(%rax), %bnd0
+# endif
 	pcmpeqb	64(%rax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
 	jnz	L(exit64)
 
+# ifdef __CHKP__
+	bndcu	80(%rax), %bnd0
+# endif
 	pcmpeqb	80(%rax), %xmm0
 	add	$64, %rax
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	jnz	L(exit16)
 
+# ifdef __CHKP__
+	bndcu	32(%rax), %bnd0
+# endif
 	pcmpeqb	32(%rax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	jnz	L(exit32)
 
+# ifdef __CHKP__
+	bndcu	48(%rax), %bnd0
+# endif
 	pcmpeqb	48(%rax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	jnz	L(exit48)
 
+# ifdef __CHKP__
+	bndcu	64(%rax), %bnd0
+# endif
 	pcmpeqb	64(%rax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
 	jnz	L(exit64)
 
+# ifdef __CHKP__
+	bndcu	80(%rax), %bnd0
+# endif
 	pcmpeqb	80(%rax), %xmm0
 	add	$64, %rax
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	jnz	L(exit16)
 
+# ifdef __CHKP__
+	bndcu	32(%rax), %bnd0
+# endif
 	pcmpeqb	32(%rax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	jnz	L(exit32)
 
+# ifdef __CHKP__
+	bndcu	48(%rax), %bnd0
+# endif
 	pcmpeqb	48(%rax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	jnz	L(exit48)
 
+# ifdef __CHKP__
+	bndcu	64(%rax), %bnd0
+# endif
 	pcmpeqb	64(%rax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
@@ -153,6 +213,9 @@ L(align16_start):
 	test	$0x3f, %rax
 	jz	L(align64_loop)
 
+# ifdef __CHKP__
+	bndcu	80(%rax), %bnd0
+# endif
 	pcmpeqb	80(%rax), %xmm0
 	add	$80, %rax
 	pmovmskb %xmm0, %edx
@@ -162,6 +225,9 @@ L(align16_start):
 	test	$0x3f, %rax
 	jz	L(align64_loop)
 
+# ifdef __CHKP__
+	bndcu	16(%rax), %bnd0
+# endif
 	pcmpeqb	16(%rax), %xmm1
 	add	$16, %rax
 	pmovmskb %xmm1, %edx
@@ -171,6 +237,9 @@ L(align16_start):
 	test	$0x3f, %rax
 	jz	L(align64_loop)
 
+# ifdef __CHKP__
+	bndcu	16(%rax), %bnd0
+# endif
 	pcmpeqb	16(%rax), %xmm2
 	add	$16, %rax
 	pmovmskb %xmm2, %edx
@@ -180,6 +249,9 @@ L(align16_start):
 	test	$0x3f, %rax
 	jz	L(align64_loop)
 
+# ifdef __CHKP__
+	bndcu	16(%rax), %bnd0
+# endif
 	pcmpeqb	16(%rax), %xmm3
 	add	$16, %rax
 	pmovmskb %xmm3, %edx
@@ -187,8 +259,12 @@ L(align16_start):
 	jnz	L(exit)
 
 	add	$16, %rax
+
 	.p2align 4
 	L(align64_loop):
+# ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+# endif
 	movaps	(%rax),	%xmm4
 	pminub	16(%rax),	%xmm4
 	movaps	32(%rax),	%xmm5
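
The RETURN macro above is this file's exit-path bounds check: every return first runs bndcu on -1(%rdi,%rax), presumably the last byte stored (the terminating NUL), so a concatenation that ran past the destination's bounds faults before the pointer escapes. A minimal C sketch of the same shape, assuming GCC's pointer-bounds-checker builtins (-fcheck-pointer-bounds -mmpx); the helper name is illustrative, not part of the patch:

#include <stddef.h>

/* Sketch: validate the last byte of an n-byte store against the
   destination's upper bound before returning it, mirroring
   "bndcu -1(%rdi,%rax), %bnd0; ret".  */
static char *
return_checked (char *dst, size_t n)
{
  __builtin___bnd_chk_ptr_ubounds (dst + n - 1);
  return dst;
}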
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index f170238..4311e86 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -91,6 +91,10 @@ __strchr_sse42:
 	CALL_MCOUNT
 	testb	%sil, %sil
 	je	__strend_sse4
+# ifdef __CHKP__
+	bndcl  (%rdi), %bnd0
+	bndcu  (%rdi), %bnd0
+# endif
 	pxor	%xmm2, %xmm2
 	movd	%esi, %xmm1
 	movl	%edi, %ecx
@@ -124,6 +128,9 @@ __strchr_sse42:
 	ja	L(return_null)
 L(unaligned_match):
 	addq	%rdi, %rax
+# ifdef __CHKP__
+	bndcu 	(%rax), %bnd0
+# endif
 	ret
 
 	.p2align 4
@@ -135,15 +142,27 @@ L(unaligned_no_match):
 L(loop):
 	addq	$16, %r8
 L(aligned_start):
+# ifdef __CHKP__
+	bndcu 	(%r8), %bnd0
+# endif
 	pcmpistri	$0x2, (%r8), %xmm1
 	jbe	L(wrap)
 	addq	$16, %r8
+# ifdef __CHKP__
+	bndcu 	(%r8), %bnd0
+# endif
 	pcmpistri	$0x2, (%r8), %xmm1
 	jbe	L(wrap)
 	addq	$16, %r8
+# ifdef __CHKP__
+	bndcu 	(%r8), %bnd0
+# endif
 	pcmpistri       $0x2, (%r8), %xmm1
 	jbe     L(wrap)
 	addq	$16, %r8
+# ifdef __CHKP__
+	bndcu 	(%r8), %bnd0
+# endif
 	pcmpistri	$0x2, (%r8), %xmm1
 	jbe	L(wrap)
 	jmp	L(loop)
@@ -159,6 +178,9 @@ L(return_null):
 	.p2align 4
 L(loop_exit):
 	leaq	(%r8,%rcx), %rax
+# ifdef __CHKP__
+	bndcu 	(%rax), %bnd0
+# endif
 	ret
 	cfi_endproc
 	.size	__strchr_sse42, .-__strchr_sse42
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index c84f1c2..edfa915 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -127,6 +127,14 @@ STRCMP_SSE42:
 	je	LABEL(Byte0)
 	mov	%rdx, %r11
 #endif
+
+#ifdef __CHKP__
+	bndcl 	(%rdi), %bnd0
+	bndcu 	(%rdi), %bnd0
+	bndcl 	(%rsi), %bnd1
+	bndcu 	(%rsi), %bnd1
+#endif
+
 	mov	%esi, %ecx
 	mov	%edi, %eax
 /* Use 64bit AND here to avoid long NOP padding.  */
@@ -210,6 +218,10 @@ LABEL(touppermask):
 #endif
 	add	$16, %rsi		/* prepare to search next 16 bytes */
 	add	$16, %rdi		/* prepare to search next 16 bytes */
+#ifdef __CHKP__
+	bndcu 	(%rdi), %bnd0
+	bndcu 	(%rsi), %bnd1
+#endif
 
 	/*
 	 * Determine source and destination string offsets from 16-byte
@@ -231,6 +243,11 @@ LABEL(crosscache):
 	mov	%edx, %r8d		/* r8d is offset flag for exit tail */
 	xchg	%ecx, %eax
 	xchg	%rsi, %rdi
+#ifdef __CHKP__
+	bndmov	%bnd0, %bnd2
+	bndmov	%bnd1, %bnd0
+	bndmov	%bnd2, %bnd1
+#endif
 LABEL(bigger):
 	movdqa	(%rdi), %xmm2
 	movdqa	(%rsi), %xmm1
@@ -280,6 +297,10 @@ LABEL(ashr_0):
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
 	.p2align 4
 LABEL(ashr_0_use):
+#ifdef __CHKP__
+	bndcu	-1(%rdi, %rdx), %bnd0
+	bndcu	-1(%rsi, %rdx), %bnd1
+#endif
 	movdqa	(%rdi,%rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
@@ -295,6 +316,10 @@ LABEL(ashr_0_use):
 	jbe	LABEL(strcmp_exitz)
 #endif
 
+#ifdef __CHKP__
+	bndcu	-1(%rdi, %rdx), %bnd0
+	bndcu	-1(%rsi, %rdx), %bnd1
+#endif
 	movdqa	(%rdi,%rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri      $0x1a,(%rsi,%rdx), %xmm0
@@ -320,6 +345,10 @@ LABEL(ashr_0_exit_use):
 	jbe	LABEL(strcmp_exitz)
 #endif
 	lea	-16(%rdx, %rcx), %rcx
+#ifdef __CHKP__
+	bndcu	-1(%rdi, %rcx), %bnd0
+	bndcu	-1(%rsi, %rcx), %bnd1
+#endif
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %edx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
@@ -362,6 +391,15 @@ LABEL(ashr_1):
 	and	$0xfff, %r10		/* offset into 4K page */
 	sub	$0x1000, %r10		/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_1_use)
+LABEL(ashr_1_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_1_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_1_use):
@@ -416,7 +454,11 @@ LABEL(nibble_ashr_1_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$14, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_1_check)
+#else
 	ja	LABEL(nibble_ashr_1_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -450,6 +492,15 @@ LABEL(ashr_2):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_2_use)
+LABEL(ashr_2_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_2_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_2_use):
@@ -504,7 +555,11 @@ LABEL(nibble_ashr_2_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$13, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_2_check)
+#else
 	ja	LABEL(nibble_ashr_2_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -539,6 +594,15 @@ LABEL(ashr_3):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_3_use)
+LABEL(ashr_3_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_3_restart_use)
+#endif
 
 LABEL(loop_ashr_3_use):
 	add	$16, %r10
@@ -592,7 +656,11 @@ LABEL(nibble_ashr_3_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$12, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_3_check)
+#else
 	ja	LABEL(nibble_ashr_3_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -627,6 +695,15 @@ LABEL(ashr_4):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_4_use)
+LABEL(ashr_4_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_4_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_4_use):
@@ -681,7 +758,11 @@ LABEL(nibble_ashr_4_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$11, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_4_check)
+#else
 	ja	LABEL(nibble_ashr_4_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -716,6 +797,15 @@ LABEL(ashr_5):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_5_use)
+LABEL(ashr_5_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_5_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_5_use):
@@ -771,7 +861,11 @@ LABEL(nibble_ashr_5_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$10, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_5_check)
+#else
 	ja	LABEL(nibble_ashr_5_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -806,6 +900,15 @@ LABEL(ashr_6):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_6_use)
+LABEL(ashr_6_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_6_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_6_use):
@@ -860,7 +963,11 @@ LABEL(nibble_ashr_6_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$9, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_6_check)
+#else
 	ja	LABEL(nibble_ashr_6_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -895,6 +1002,15 @@ LABEL(ashr_7):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_7_use)
+LABEL(ashr_7_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_7_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_7_use):
@@ -949,7 +1065,11 @@ LABEL(nibble_ashr_7_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$8, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_7_check)
+#else
 	ja	LABEL(nibble_ashr_7_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -984,6 +1104,15 @@ LABEL(ashr_8):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_8_use)
+LABEL(ashr_8_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_8_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_8_use):
@@ -1038,7 +1167,11 @@ LABEL(nibble_ashr_8_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$7, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_8_check)
+#else
 	ja	LABEL(nibble_ashr_8_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -1073,6 +1206,15 @@ LABEL(ashr_9):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_9_use)
+LABEL(ashr_9_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_9_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_9_use):
@@ -1128,7 +1270,11 @@ LABEL(nibble_ashr_9_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$6, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_9_check)
+#else
 	ja	LABEL(nibble_ashr_9_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -1163,6 +1309,15 @@ LABEL(ashr_10):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_10_use)
+LABEL(ashr_10_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_10_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_10_use):
@@ -1217,7 +1372,11 @@ LABEL(nibble_ashr_10_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$5, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_10_check)
+#else
 	ja	LABEL(nibble_ashr_10_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -1252,6 +1411,15 @@ LABEL(ashr_11):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_11_use)
+LABEL(ashr_11_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_11_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_11_use):
@@ -1306,7 +1474,11 @@ LABEL(nibble_ashr_11_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$4, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_11_check)
+#else
 	ja	LABEL(nibble_ashr_11_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -1341,6 +1513,15 @@ LABEL(ashr_12):
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_12_use)
+LABEL(ashr_12_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_12_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_12_use):
@@ -1395,7 +1576,11 @@ LABEL(nibble_ashr_12_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$3, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_12_check)
+#else
 	ja	LABEL(nibble_ashr_12_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -1431,6 +1616,15 @@ LABEL(ashr_13):
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_13_use)
+LABEL(ashr_13_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_13_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_13_use):
@@ -1485,7 +1679,11 @@ LABEL(nibble_ashr_13_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$2, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_13_check)
+#else
 	ja	LABEL(nibble_ashr_13_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -1521,6 +1719,15 @@ LABEL(ashr_14):
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_14_use)
+LABEL(ashr_14_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_14_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_14_use):
@@ -1575,7 +1782,11 @@ LABEL(nibble_ashr_14_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$1, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_14_check)
+#else
 	ja	LABEL(nibble_ashr_14_restart_use)
+#endif
 
 	jmp	LABEL(nibble_ashr_exit_use)
 
@@ -1613,6 +1824,15 @@ LABEL(ashr_15):
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
 
 	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+	bndcu	-16(%rdi, %rdx), %bnd0
+	bndcu	-16(%rsi, %rdx), %bnd1
+	jmp	LABEL(loop_ashr_15_use)
+LABEL(ashr_15_check):
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+	jmp	LABEL(nibble_ashr_15_restart_use)
+#endif
 
 	.p2align 4
 LABEL(loop_ashr_15_use):
@@ -1667,7 +1887,11 @@ LABEL(nibble_ashr_15_use):
 	jae	LABEL(nibble_ashr_exit_use)
 #endif
 	cmp	$0, %ecx
+#ifdef __CHKP__
+	ja	LABEL(ashr_15_check)
+#else
 	ja	LABEL(nibble_ashr_15_restart_use)
+#endif
 
 LABEL(nibble_ashr_exit_use):
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
@@ -1691,6 +1915,11 @@ LABEL(exit_use):
 	test	%r8d, %r8d
 	jz	LABEL(ret_use)
 	xchg	%eax, %edx
+#ifdef __CHKP__
+	bndmov	%bnd0, %bnd2
+	bndmov	%bnd1, %bnd0
+	bndmov	%bnd2, %bnd1
+#endif
 LABEL(ret_use):
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
@@ -1707,6 +1936,11 @@ LABEL(less32bytes):
 	test	%r8d, %r8d
 	jz	LABEL(ret)
 	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
+#ifdef __CHKP__
+	bndmov	%bnd0, %bnd2
+	bndmov	%bnd1, %bnd0
+	bndmov	%bnd2, %bnd1
+#endif
 
 	.p2align 4
 LABEL(ret):
@@ -1717,6 +1951,10 @@ LABEL(less16bytes):
 	sub	%rdx, %r11
 	jbe	LABEL(strcmp_exitz)
 #endif
+#ifdef __CHKP__
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+#endif
 	movzbl	(%rsi, %rdx), %ecx
 	movzbl	(%rdi, %rdx), %eax
 
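The ashr_N_check stubs above shift where bounds are enforced in the comparison loops: the bndcu pair at -16(%rdi,%rdx) before each loop covers the bytes about to be compared, and afterwards the bounds are re-validated only when the 4K page-cross bookkeeping forces a restart, not on every 16-byte step. A hedged C rendering of that idea (helper name and shape are assumptions, using GCC's bounds-checker builtins):

#include <stddef.h>

/* Sketch: re-check both strings at the current offset only when the
   loop is about to run past a 4 KiB page boundary.  */
static void
recheck_on_page_cross (const char *s1, const char *s2, size_t offset)
{
  __builtin___bnd_chk_ptr_ubounds (s1 + offset);  /* bndcu (%rdi,%rdx), %bnd0 */
  __builtin___bnd_chk_ptr_ubounds (s2 + offset);  /* bndcu (%rsi,%rdx), %bnd1 */
}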
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 7710173..e6baee9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -33,7 +33,7 @@
 	lea	TABLE(%rip), %r11;                              \
 	movslq	(%r11, INDEX, SCALE), %rcx;                     \
 	lea	(%r11, %rcx), %rcx;                             \
-	jmp	*%rcx
+	jmp *%rcx
 
 # ifndef USE_AS_STRCAT
 
@@ -51,6 +51,16 @@ ENTRY (STRCPY)
 
 # endif
 
+# ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcu	(%rdi), %bnd0
+	bndcl	(%rsi), %bnd1
+	bndcu	(%rsi), %bnd1
+#  if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	bndcu	-1(%rdi, %rdx), %bnd0
+#  endif
+# endif
+
 	and	$63, %rcx
 	cmp	$32, %rcx
 	jbe	L(SourceStringAlignmentLess32)
@@ -79,6 +89,9 @@ ENTRY (STRCPY)
 	test	%rdx, %rdx
 	jnz	L(CopyFrom1To16BytesTail)
 
+# ifdef __CHKP__
+	bndcu	16(%rsi), %bnd1
+# endif
 	pcmpeqb	16(%rsi), %xmm0
 	pmovmskb %xmm0, %rdx
 
@@ -91,6 +104,9 @@ ENTRY (STRCPY)
 	jnz	L(CopyFrom1To32Bytes)
 
 	movdqu	(%rsi, %rcx), %xmm1   /* copy 16 bytes */
+# ifdef __CHKP__
+	bndcu	15(%rdi), %bnd0
+# endif
 	movdqu	%xmm1, (%rdi)
 
 /* If source address alignment != destination address alignment */
@@ -101,6 +117,10 @@ L(Unalign16Both):
 	add	%rcx, %r8
 # endif
 	mov	$16, %rcx
+# ifdef __CHKP__
+	bndcu	16(%rsi, %rcx), %bnd1
+	bndcu	15(%rdi, %rcx), %bnd0
+# endif
 	movdqa	(%rsi, %rcx), %xmm1
 	movaps	16(%rsi, %rcx), %xmm2
 	movdqu	%xmm1, (%rdi, %rcx)
@@ -118,6 +138,10 @@ L(Unalign16Both):
 	jnz	L(CopyFrom1To16Bytes)
 # endif
 
+# ifdef __CHKP__
+	bndcu	16(%rsi, %rcx), %bnd1
+	bndcu	15(%rdi, %rcx), %bnd0
+# endif
 	movaps	16(%rsi, %rcx), %xmm3
 	movdqu	%xmm2, (%rdi, %rcx)
 	pcmpeqb	%xmm3, %xmm0
@@ -134,6 +158,10 @@ L(Unalign16Both):
 	jnz	L(CopyFrom1To16Bytes)
 # endif
 
+# ifdef __CHKP__
+	bndcu	16(%rsi, %rcx), %bnd1
+	bndcu	15(%rdi, %rcx), %bnd0
+# endif
 	movaps	16(%rsi, %rcx), %xmm4
 	movdqu	%xmm3, (%rdi, %rcx)
 	pcmpeqb	%xmm4, %xmm0
@@ -150,6 +178,10 @@ L(Unalign16Both):
 	jnz	L(CopyFrom1To16Bytes)
 # endif
 
+# ifdef __CHKP__
+	bndcu	16(%rsi, %rcx), %bnd1
+	bndcu	15(%rdi, %rcx), %bnd0
+# endif
 	movaps	16(%rsi, %rcx), %xmm1
 	movdqu	%xmm4, (%rdi, %rcx)
 	pcmpeqb	%xmm1, %xmm0
@@ -166,6 +198,10 @@ L(Unalign16Both):
 	jnz	L(CopyFrom1To16Bytes)
 # endif
 
+# ifdef __CHKP__
+	bndcu	16(%rsi, %rcx), %bnd1
+	bndcu	15(%rdi, %rcx), %bnd0
+# endif
 	movaps	16(%rsi, %rcx), %xmm2
 	movdqu	%xmm1, (%rdi, %rcx)
 	pcmpeqb	%xmm2, %xmm0
@@ -182,6 +218,10 @@ L(Unalign16Both):
 	jnz	L(CopyFrom1To16Bytes)
 # endif
 
+# ifdef __CHKP__
+	bndcu	16(%rsi, %rcx), %bnd1
+	bndcu	15(%rdi, %rcx), %bnd0
+# endif
 	movaps	16(%rsi, %rcx), %xmm3
 	movdqu	%xmm2, (%rdi, %rcx)
 	pcmpeqb	%xmm3, %xmm0
@@ -198,6 +238,10 @@ L(Unalign16Both):
 	jnz	L(CopyFrom1To16Bytes)
 # endif
 
+# ifdef __CHKP__
+	bndcu	16(%rsi, %rcx), %bnd1
+	bndcu	15(%rdi, %rcx), %bnd0
+# endif
 	movdqu	%xmm3, (%rdi, %rcx)
 	mov	%rsi, %rdx
 	lea	16(%rsi, %rcx), %rsi
@@ -208,6 +252,9 @@ L(Unalign16Both):
 	lea	128(%r8, %rdx), %r8
 # endif
 L(Unaligned64Loop):
+# ifdef __CHKP__
+	bndcu	48(%rsi), %bnd1
+# endif
 	movaps	(%rsi), %xmm2
 	movaps	%xmm2, %xmm4
 	movaps	16(%rsi), %xmm5
@@ -229,6 +276,10 @@ L(Unaligned64Loop):
 L(Unaligned64Loop_start):
 	add	$64, %rdi
 	add	$64, %rsi
+# ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+	bndcu	(%rdi), %bnd0
+# endif
 	movdqu	%xmm4, -64(%rdi)
 	movaps	(%rsi), %xmm2
 	movdqa	%xmm2, %xmm4
@@ -271,16 +322,28 @@ L(Unaligned64Leave):
 	jnz	L(CopyFrom1To16BytesUnaligned_32)
 
 	bsf	%rcx, %rdx
+# ifdef __CHKP__
+	bndcu	47(%rdi), %bnd0
+# endif
 	movdqu	%xmm4, (%rdi)
 	movdqu	%xmm5, 16(%rdi)
 	movdqu	%xmm6, 32(%rdi)
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 # ifdef USE_AS_STPCPY
+#  ifdef __CHKP__
+	bndcu	48(%rdi, %rdx), %bnd0
+#  endif
 	lea	48(%rdi, %rdx), %rax
 # endif
+#  ifdef __CHKP__
+	bndcu	63(%rdi), %bnd0
+#  endif
 	movdqu	%xmm7, 48(%rdi)
 	add	$15, %r8
 	sub	%rdx, %r8
+#  ifdef __CHKP__
+	bndcu	49(%rdi, %rdx), %bnd0
+#  endif
 	lea	49(%rdi, %rdx), %rdi
 	jmp	L(StrncpyFillTailWithZero)
 # else
@@ -309,6 +372,10 @@ L(SourceStringAlignmentLess32):
 	test	%rdx, %rdx
 	jnz	L(CopyFrom1To16BytesTail1)
 
+# ifdef __CHKP__
+	bndcu	16(%rsi), %bnd1
+	bndcu	15(%rdi), %bnd0
+# endif
 	pcmpeqb	%xmm2, %xmm0
 	movdqu	%xmm1, (%rdi)
 	pmovmskb %xmm0, %rdx
@@ -372,6 +439,9 @@ L(CopyFrom1To16BytesUnaligned_0):
 # ifdef USE_AS_STPCPY
 	lea	(%rdi, %rdx), %rax
 # endif
+#  ifdef __CHKP__
+	bndcu	15(%rdi), %bnd0
+#  endif
 	movdqu	%xmm4, (%rdi)
 	add	$63, %r8
 	sub	%rdx, %r8
@@ -384,6 +454,9 @@ L(CopyFrom1To16BytesUnaligned_0):
 	.p2align 4
 L(CopyFrom1To16BytesUnaligned_16):
 	bsf	%rcx, %rdx
+#  ifdef __CHKP__
+	bndcu	31(%rdi), %bnd0
+#  endif
 	movdqu	%xmm4, (%rdi)
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 # ifdef USE_AS_STPCPY
@@ -403,6 +476,9 @@ L(CopyFrom1To16BytesUnaligned_16):
 	.p2align 4
 L(CopyFrom1To16BytesUnaligned_32):
 	bsf	%rdx, %rdx
+#  ifdef __CHKP__
+	bndcu	47(%rdi), %bnd0
+#  endif
 	movdqu	%xmm4, (%rdi)
 	movdqu	%xmm5, 16(%rdi)
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
@@ -529,6 +605,9 @@ L(CopyFrom1To16BytesTail1Case2OrCase3):
 
 	.p2align 4
 L(Exit1):
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+# endif
 	mov	%dh, (%rdi)
 # ifdef USE_AS_STPCPY
 	lea	(%rdi), %rax
@@ -543,6 +622,9 @@ L(Exit1):
 	.p2align 4
 L(Exit2):
 	mov	(%rsi), %dx
+# ifdef __CHKP__
+	bndcu	1(%rdi), %bnd0
+# endif
 	mov	%dx, (%rdi)
 # ifdef USE_AS_STPCPY
 	lea	1(%rdi), %rax
@@ -557,6 +639,9 @@ L(Exit2):
 	.p2align 4
 L(Exit3):
 	mov	(%rsi), %cx
+# ifdef __CHKP__
+	bndcu	2(%rdi), %bnd0
+# endif
 	mov	%cx, (%rdi)
 	mov	%dh, 2(%rdi)
 # ifdef USE_AS_STPCPY
@@ -572,6 +657,9 @@ L(Exit3):
 	.p2align 4
 L(Exit4):
 	mov	(%rsi), %edx
+# ifdef __CHKP__
+	bndcu	3(%rdi), %bnd0
+# endif
 	mov	%edx, (%rdi)
 # ifdef USE_AS_STPCPY
 	lea	3(%rdi), %rax
@@ -586,6 +674,9 @@ L(Exit4):
 	.p2align 4
 L(Exit5):
 	mov	(%rsi), %ecx
+# ifdef __CHKP__
+	bndcu	4(%rdi), %bnd0
+# endif
 	mov	%dh, 4(%rdi)
 	mov	%ecx, (%rdi)
 # ifdef USE_AS_STPCPY
@@ -602,6 +693,9 @@ L(Exit5):
 L(Exit6):
 	mov	(%rsi), %ecx
 	mov	4(%rsi), %dx
+# ifdef __CHKP__
+	bndcu	5(%rdi), %bnd0
+# endif
 	mov	%ecx, (%rdi)
 	mov	%dx, 4(%rdi)
 # ifdef USE_AS_STPCPY
@@ -618,6 +712,9 @@ L(Exit6):
 L(Exit7):
 	mov	(%rsi), %ecx
 	mov	3(%rsi), %edx
+# ifdef __CHKP__
+	bndcu	6(%rdi), %bnd0
+# endif
 	mov	%ecx, (%rdi)
 	mov	%edx, 3(%rdi)
 # ifdef USE_AS_STPCPY
@@ -633,6 +730,9 @@ L(Exit7):
 	.p2align 4
 L(Exit8):
 	mov	(%rsi), %rdx
+# ifdef __CHKP__
+	bndcu	7(%rdi), %bnd0
+# endif
 	mov	%rdx, (%rdi)
 # ifdef USE_AS_STPCPY
 	lea	7(%rdi), %rax
@@ -647,6 +747,9 @@ L(Exit8):
 	.p2align 4
 L(Exit9):
 	mov	(%rsi), %rcx
+# ifdef __CHKP__
+	bndcu	8(%rdi), %bnd0
+# endif
 	mov	%dh, 8(%rdi)
 	mov	%rcx, (%rdi)
 # ifdef USE_AS_STPCPY
@@ -663,6 +766,9 @@ L(Exit9):
 L(Exit10):
 	mov	(%rsi), %rcx
 	mov	8(%rsi), %dx
+# ifdef __CHKP__
+	bndcu	9(%rdi), %bnd0
+# endif
 	mov	%rcx, (%rdi)
 	mov	%dx, 8(%rdi)
 # ifdef USE_AS_STPCPY
@@ -679,6 +785,9 @@ L(Exit10):
 L(Exit11):
 	mov	(%rsi), %rcx
 	mov	7(%rsi), %edx
+# ifdef __CHKP__
+	bndcu	10(%rdi), %bnd0
+# endif
 	mov	%rcx, (%rdi)
 	mov	%edx, 7(%rdi)
 # ifdef USE_AS_STPCPY
@@ -695,6 +804,9 @@ L(Exit11):
 L(Exit12):
 	mov	(%rsi), %rcx
 	mov	8(%rsi), %edx
+# ifdef __CHKP__
+	bndcu	11(%rdi), %bnd0
+# endif
 	mov	%rcx, (%rdi)
 	mov	%edx, 8(%rdi)
 # ifdef USE_AS_STPCPY
@@ -711,6 +823,9 @@ L(Exit12):
 L(Exit13):
 	mov	(%rsi), %rcx
 	mov	5(%rsi), %rdx
+# ifdef __CHKP__
+	bndcu	12(%rdi), %bnd0
+# endif
 	mov	%rcx, (%rdi)
 	mov	%rdx, 5(%rdi)
 # ifdef USE_AS_STPCPY
@@ -727,6 +842,9 @@ L(Exit13):
 L(Exit14):
 	mov	(%rsi), %rcx
 	mov	6(%rsi), %rdx
+# ifdef __CHKP__
+	bndcu	13(%rdi), %bnd0
+# endif
 	mov	%rcx, (%rdi)
 	mov	%rdx, 6(%rdi)
 # ifdef USE_AS_STPCPY
@@ -743,6 +861,9 @@ L(Exit14):
 L(Exit15):
 	mov	(%rsi), %rcx
 	mov	7(%rsi), %rdx
+# ifdef __CHKP__
+	bndcu	14(%rdi), %bnd0
+# endif
 	mov	%rcx, (%rdi)
 	mov	%rdx, 7(%rdi)
 # ifdef USE_AS_STPCPY
@@ -758,6 +879,9 @@ L(Exit15):
 	.p2align 4
 L(Exit16):
 	movdqu	(%rsi), %xmm0
+# ifdef __CHKP__
+	bndcu	15(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 # ifdef USE_AS_STPCPY
 	lea	15(%rdi), %rax
@@ -772,6 +896,9 @@ L(Exit16):
 	.p2align 4
 L(Exit17):
 	movdqu	(%rsi), %xmm0
+# ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%dh, 16(%rdi)
 # ifdef USE_AS_STPCPY
@@ -788,6 +915,9 @@ L(Exit17):
 L(Exit18):
 	movdqu	(%rsi), %xmm0
 	mov	16(%rsi), %cx
+# ifdef __CHKP__
+	bndcu	17(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%cx, 16(%rdi)
 # ifdef USE_AS_STPCPY
@@ -804,6 +934,9 @@ L(Exit18):
 L(Exit19):
 	movdqu	(%rsi), %xmm0
 	mov	15(%rsi), %ecx
+# ifdef __CHKP__
+	bndcu	18(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%ecx, 15(%rdi)
 # ifdef USE_AS_STPCPY
@@ -820,6 +953,9 @@ L(Exit19):
 L(Exit20):
 	movdqu	(%rsi), %xmm0
 	mov	16(%rsi), %ecx
+# ifdef __CHKP__
+	bndcu	19(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
 # ifdef USE_AS_STPCPY
@@ -836,6 +972,9 @@ L(Exit20):
 L(Exit21):
 	movdqu	(%rsi), %xmm0
 	mov	16(%rsi), %ecx
+# ifdef __CHKP__
+	bndcu	20(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
 	mov	%dh, 20(%rdi)
@@ -853,6 +992,9 @@ L(Exit21):
 L(Exit22):
 	movdqu	(%rsi), %xmm0
 	mov	14(%rsi), %rcx
+# ifdef __CHKP__
+	bndcu	21(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 14(%rdi)
 # ifdef USE_AS_STPCPY
@@ -869,6 +1011,9 @@ L(Exit22):
 L(Exit23):
 	movdqu	(%rsi), %xmm0
 	mov	15(%rsi), %rcx
+# ifdef __CHKP__
+	bndcu	22(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 15(%rdi)
 # ifdef USE_AS_STPCPY
@@ -885,6 +1030,9 @@ L(Exit23):
 L(Exit24):
 	movdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rcx
+# ifdef __CHKP__
+	bndcu	23(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 16(%rdi)
 # ifdef USE_AS_STPCPY
@@ -901,6 +1049,9 @@ L(Exit24):
 L(Exit25):
 	movdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rcx
+# ifdef __CHKP__
+	bndcu	24(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 16(%rdi)
 	mov	%dh, 24(%rdi)
@@ -919,6 +1070,9 @@ L(Exit26):
 	movdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	24(%rsi), %cx
+# ifdef __CHKP__
+	bndcu	25(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%cx, 24(%rdi)
@@ -937,6 +1091,9 @@ L(Exit27):
 	movdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	23(%rsi), %ecx
+# ifdef __CHKP__
+	bndcu	26(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 23(%rdi)
@@ -955,6 +1112,9 @@ L(Exit28):
 	movdqu	(%rsi), %xmm0
 	mov	16(%rsi), %rdx
 	mov	24(%rsi), %ecx
+# ifdef __CHKP__
+	bndcu	27(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 24(%rdi)
@@ -972,6 +1132,9 @@ L(Exit28):
 L(Exit29):
 	movdqu	(%rsi), %xmm0
 	movdqu	13(%rsi), %xmm2
+# ifdef __CHKP__
+	bndcu	28(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 13(%rdi)
 # ifdef USE_AS_STPCPY
@@ -988,6 +1151,9 @@ L(Exit29):
 L(Exit30):
 	movdqu	(%rsi), %xmm0
 	movdqu	14(%rsi), %xmm2
+# ifdef __CHKP__
+	bndcu	29(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 14(%rdi)
 # ifdef USE_AS_STPCPY
@@ -1004,6 +1170,9 @@ L(Exit30):
 L(Exit31):
 	movdqu	(%rsi), %xmm0
 	movdqu	15(%rsi), %xmm2
+# ifdef __CHKP__
+	bndcu	30(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 15(%rdi)
 # ifdef USE_AS_STPCPY
@@ -1020,6 +1189,9 @@ L(Exit31):
 L(Exit32):
 	movdqu	(%rsi), %xmm0
 	movdqu	16(%rsi), %xmm2
+# ifdef __CHKP__
+	bndcu	31(%rdi), %bnd0
+# endif
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 16(%rdi)
 # ifdef USE_AS_STPCPY
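
Each L(ExitN) tail above gets a single bndcu on N-1(%rdi), the last destination byte of the N-byte tail copy, placed before the stores. With the bounds-checker builtins the whole-span check collapses to one call; a hedged sketch, with memcpy standing in for the mov/movdqu sequence:

#include <string.h>

/* Sketch: check the destination span [dst, dst+n-1] once, then copy,
   as the ExitN paths do with "bndcu n-1(%rdi), %bnd0".  */
static void
tail_copy_checked (char *dst, const char *src, size_t n)
{
  __builtin___bnd_chk_ptr_bounds (dst, n);
  memcpy (dst, src, n);
}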
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index 9c0dcf0..dfdde27 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -20,6 +20,8 @@
 #include <nmmintrin.h>
 #include <string.h>
 #include "varshift.h"
+#ifdef __CHKP__
+#endif
 
 /* We use 0x2:
 	_SIDD_SBYTE_OPS
@@ -84,6 +86,12 @@ STRCSPN_SSE42 (const char *s, const char *a)
   if (*a == 0)
     RETURN (NULL, strlen (s));
 
+#ifdef __CHKP__
+  /* TODO: Implement MPX support for these vectorized versions manually using MPX intrinsics.  */
+  a = __bnd_init_ptr_bounds (a);
+  s = __bnd_init_ptr_bounds (s);
+#endif
+
   const char *aligned;
   __m128i mask;
   int offset = (int) ((size_t) a & 15);
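
The __bnd_init_ptr_bounds calls above are a stop-gap, as the TODO says: they hand back the same pointer carrying the always-valid INIT bounds, so the SSE4.2 code's deliberate 16-byte reads past the terminator cannot raise #BR. The cost is that bounds checking is effectively disabled for these routines until proper MPX checks are written. In GCC builtin form (a sketch, assuming -fcheck-pointer-bounds):

/* Sketch: return the same pointer with [0, ~0] bounds, so later
   bndcu checks on it can never fault.  */
static const char *
widen_bounds (const char *p)
{
  return __builtin___bnd_init_ptr_bounds (p);
}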
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
index 3f92a41..1fed105 100644
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ b/sysdeps/x86_64/multiarch/strrchr.S
@@ -97,6 +97,10 @@ __strrchr_sse42:
 	CALL_MCOUNT
 	testb	%sil, %sil
 	je	__strend_sse4
+# ifdef __CHKP__
+	bndcl  (%rdi), %bnd0
+	bndcu  (%rdi), %bnd0
+# endif
 	xor	%eax,%eax	/* RAX has the last occurrence of s.  */
 	movd	%esi, %xmm1
 	punpcklbw	%xmm1, %xmm1
@@ -135,6 +139,9 @@ L(unaligned_no_byte):
 	   contain the NULL terminator.  */
 	jg	L(exit)
 	addq	$16, %r8
+# ifdef __CHKP__
+	bndcu 	(%r8), %bnd0
+# endif
 
 /* Loop start on aligned string.  */
 	.p2align 4
@@ -142,6 +149,9 @@ L(loop):
 	pcmpistri	$0x4a, (%r8), %xmm1
 	jbe	L(match_or_eos)
 	addq	$16, %r8
+# ifdef __CHKP__
+	bndcu 	(%r8), %bnd0
+# endif
 	jmp	L(loop)
 	.p2align 4
 L(match_or_eos):
@@ -149,11 +159,17 @@ L(match_or_eos):
 L(match_no_eos):
 	leaq	(%r8,%rcx), %rax
 	addq	$16, %r8
+# ifdef __CHKP__
+	bndcu 	(%r8), %bnd0
+# endif
 	jmp     L(loop)
 	.p2align 4
 L(had_eos):
 	jnc     L(exit)
 	leaq	(%r8,%rcx), %rax
+# ifdef __CHKP__
+	bndcu  	(%rax), %bnd0
+# endif
 	.p2align 4
 L(exit):
 	ret
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 8128cb9..ecc3a3a 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -18,6 +18,8 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <nmmintrin.h>
+#ifdef __CHKP__
+#endif
 #include <string.h>
 #include "varshift.h"
 
@@ -62,6 +64,12 @@ __strspn_sse42 (const char *s, const char *a)
   if (*a == 0)
     return 0;
 
+#ifdef __CHKP__
+  /* TODO: Implement Intel MPX manual checks for these vectorized versions using new intrinsics.  */
+  s = __bnd_init_ptr_bounds (s);
+  a = __bnd_init_ptr_bounds (a);
+#endif
+
   const char *aligned;
   __m128i mask;
   int offset = (int) ((size_t) a & 15);
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index cd63b68..577744b 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -165,8 +165,14 @@ char *
 __attribute__ ((section (".text.sse4.2")))
 STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
 {
-#define p1 s1
+#ifdef __CHKP__
+/* TODO: Implement Intel MPX manual checks for these vectorized versions using new intrinsics.  */
+  const unsigned char *p1 = __bnd_init_ptr_bounds (s1);
+  const unsigned char *p2 = __bnd_init_ptr_bounds (s2);
+#else
+# define p1 s1
   const unsigned char *p2 = s2;
+#endif
 
 #ifndef STRCASESTR_NONASCII
   if (__builtin_expect (p2[0] == '\0', 0))
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index b7de092..77889dd 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -25,13 +25,27 @@ ENTRY (__wcscpy_ssse3)
 
 	mov	%rsi, %rcx
 	mov	%rdi, %rdx
+# ifdef __CHKP__
+	bndcl  	(%rdi), %bnd0
+	bndcl  	(%rsi), %bnd1
+	bndcu  	(%rsi), %bnd1
+# endif
 
 	cmpl	$0, (%rcx)
 	jz	L(Exit4)
+# ifdef __CHKP__
+	bndcu  	4(%rcx), %bnd1
+# endif
 	cmpl	$0, 4(%rcx)
 	jz	L(Exit8)
+# ifdef __CHKP__
+	bndcu  	8(%rcx), %bnd1
+# endif
 	cmpl	$0, 8(%rcx)
 	jz	L(Exit12)
+# ifdef __CHKP__
+	bndcu  	12(%rcx), %bnd1
+# endif
 	cmpl	$0, 12(%rcx)
 	jz	L(Exit16)
 
@@ -40,10 +54,19 @@ ENTRY (__wcscpy_ssse3)
 
 	pxor	%xmm0, %xmm0
 	mov	(%rcx), %r9
+# ifdef __CHKP__
+	bndcu  	7(%rdx), %bnd0
+# endif
 	mov	%r9, (%rdx)
 
+# ifdef __CHKP__
+	bndcu  	(%rsi), %bnd1
+# endif
 	pcmpeqd	(%rsi), %xmm0
 	mov	8(%rcx), %r9
+# ifdef __CHKP__
+	bndcu  	15(%rdx), %bnd0
+# endif
 	mov	%r9, 8(%rdx)
 
 	pmovmskb %xmm0, %rax
@@ -72,6 +95,10 @@ ENTRY (__wcscpy_ssse3)
 	jmp	L(Shl12)
 
 L(Align16Both):
+# ifdef __CHKP__
+	bndcu  	16(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	(%rcx), %xmm1
 	movaps	16(%rcx), %xmm2
 	movaps	%xmm1, (%rdx)
@@ -82,6 +109,10 @@ L(Align16Both):
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%rcx, %rsi), %bnd1
+	bndcu  	15(%rdx, %rsi), %bnd0
+# endif
 	movaps	16(%rcx, %rsi), %xmm3
 	movaps	%xmm2, (%rdx, %rsi)
 	pcmpeqd	%xmm3, %xmm0
@@ -91,6 +122,10 @@ L(Align16Both):
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%rcx, %rsi), %bnd1
+	bndcu  	15(%rdx, %rsi), %bnd0
+# endif
 	movaps	16(%rcx, %rsi), %xmm4
 	movaps	%xmm3, (%rdx, %rsi)
 	pcmpeqd	%xmm4, %xmm0
@@ -100,6 +135,10 @@ L(Align16Both):
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%rcx, %rsi), %bnd1
+	bndcu  	15(%rdx, %rsi), %bnd0
+# endif
 	movaps	16(%rcx, %rsi), %xmm1
 	movaps	%xmm4, (%rdx, %rsi)
 	pcmpeqd	%xmm1, %xmm0
@@ -109,6 +148,10 @@ L(Align16Both):
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%rcx, %rsi), %bnd1
+	bndcu  	15(%rdx, %rsi), %bnd0
+# endif
 	movaps	16(%rcx, %rsi), %xmm2
 	movaps	%xmm1, (%rdx, %rsi)
 	pcmpeqd	%xmm2, %xmm0
@@ -118,6 +161,10 @@ L(Align16Both):
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%rcx, %rsi), %bnd1
+	bndcu  	15(%rdx, %rsi), %bnd0
+# endif
 	movaps	16(%rcx, %rsi), %xmm3
 	movaps	%xmm2, (%rdx, %rsi)
 	pcmpeqd	%xmm3, %xmm0
@@ -127,6 +174,10 @@ L(Align16Both):
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%rcx, %rsi), %bnd1
+	bndcu  	15(%rdx, %rsi), %bnd0
+# endif
 	movaps	%xmm3, (%rdx, %rsi)
 	mov	%rcx, %rax
 	lea	16(%rcx, %rsi), %rcx
@@ -138,6 +189,10 @@ L(Align16Both):
 
 	.p2align 4
 L(Aligned64Loop):
+# ifdef __CHKP__
+	bndcu  	(%rcx), %bnd1
+	bndcu  	63(%rdx), %bnd0
+# endif
 	movaps	(%rcx), %xmm2
 	movaps	%xmm2, %xmm4
 	movaps	16(%rcx), %xmm5
@@ -168,6 +223,9 @@ L(Aligned64Leave):
 	pcmpeqd	%xmm5, %xmm0
 
 	pmovmskb %xmm0, %rax
+# ifdef __CHKP__
+	bndcu  	-49(%rdx), %bnd0
+# endif
 	movaps	%xmm4, -64(%rdx)
 	test	%rax, %rax
 	lea	16(%rsi), %rsi
@@ -176,11 +234,17 @@ L(Aligned64Leave):
 	pcmpeqd	%xmm6, %xmm0
 
 	pmovmskb %xmm0, %rax
+# ifdef __CHKP__
+	bndcu  	-33(%rdx), %bnd0
+# endif
 	movaps	%xmm5, -48(%rdx)
 	test	%rax, %rax
 	lea	16(%rsi), %rsi
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	-17(%rdx), %bnd0
+# endif
 	movaps	%xmm6, -32(%rdx)
 	pcmpeqd	%xmm7, %xmm0
 
@@ -190,11 +254,17 @@ L(Aligned64Leave):
 	jnz	L(CopyFrom1To16Bytes)
 
 	mov	$-0x40, %rsi
+# ifdef __CHKP__
+	bndcu  	-1(%rdx), %bnd0
+# endif
 	movaps	%xmm7, -16(%rdx)
 	jmp	L(Aligned64Loop)
 
 	.p2align 4
 L(Shl4):
+# ifdef __CHKP__
+	bndcu  	12(%rcx), %bnd1
+# endif
 	movaps	-4(%rcx), %xmm1
 	movaps	12(%rcx), %xmm2
 L(Shl4Start):
@@ -206,6 +276,10 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	28(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -219,6 +293,10 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	28(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -232,6 +310,10 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	28(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -244,6 +326,9 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	lea	28(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -258,6 +343,9 @@ L(Shl4Start):
 
 	.p2align 4
 L(Shl4LoopStart):
+# ifdef __CHKP__
+	bndcu  	12(%rcx), %bnd1
+# endif
 	movaps	12(%rcx), %xmm2
 	movaps	28(%rcx), %xmm3
 	movaps	%xmm3, %xmm6
@@ -279,6 +367,9 @@ L(Shl4LoopStart):
 	lea	64(%rcx), %rcx
 	palignr	$4, %xmm1, %xmm2
 	movaps	%xmm7, %xmm1
+# ifdef __CHKP__
+	bndcu  	63(%rdx), %bnd0
+# endif
 	movaps	%xmm5, 48(%rdx)
 	movaps	%xmm4, 32(%rdx)
 	movaps	%xmm3, 16(%rdx)
@@ -287,6 +378,10 @@ L(Shl4LoopStart):
 	jmp	L(Shl4LoopStart)
 
 L(Shl4LoopExit):
+# ifdef __CHKP__
+	bndcu  	-4(%rcx), %bnd1
+	bndcu  	11(%rdx), %bnd0
+# endif
 	movdqu	-4(%rcx), %xmm1
 	mov	$12, %rsi
 	movdqu	%xmm1, -4(%rdx)
@@ -294,6 +389,9 @@ L(Shl4LoopExit):
 
 	.p2align 4
 L(Shl8):
+# ifdef __CHKP__
+	bndcu  	8(%rcx), %bnd1
+# endif
 	movaps	-8(%rcx), %xmm1
 	movaps	8(%rcx), %xmm2
 L(Shl8Start):
@@ -305,6 +403,10 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	24(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -318,6 +420,10 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	24(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -331,6 +437,10 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	24(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -343,6 +453,10 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	24(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	lea	24(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -357,6 +471,9 @@ L(Shl8Start):
 
 	.p2align 4
 L(Shl8LoopStart):
+# ifdef __CHKP__
+	bndcu  	8(%rcx), %bnd1
+# endif
 	movaps	8(%rcx), %xmm2
 	movaps	24(%rcx), %xmm3
 	movaps	%xmm3, %xmm6
@@ -378,6 +495,9 @@ L(Shl8LoopStart):
 	lea	64(%rcx), %rcx
 	palignr	$8, %xmm1, %xmm2
 	movaps	%xmm7, %xmm1
+# ifdef __CHKP__
+	bndcu  	63(%rdx), %bnd0
+# endif
 	movaps	%xmm5, 48(%rdx)
 	movaps	%xmm4, 32(%rdx)
 	movaps	%xmm3, 16(%rdx)
@@ -386,6 +506,10 @@ L(Shl8LoopStart):
 	jmp	L(Shl8LoopStart)
 
 L(Shl8LoopExit):
+# ifdef __CHKP__
+	bndcu  	(%rcx), %bnd1
+	bndcu  	7(%rdx), %bnd0
+# endif
 	mov	(%rcx), %r9
 	mov	$8, %rsi
 	mov	%r9, (%rdx)
@@ -393,6 +517,9 @@ L(Shl8LoopExit):
 
 	.p2align 4
 L(Shl12):
+# ifdef __CHKP__
+	bndcu  	4(%rcx), %bnd1
+# endif
 	movaps	-12(%rcx), %xmm1
 	movaps	4(%rcx), %xmm2
 L(Shl12Start):
@@ -404,6 +531,10 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	20(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -417,6 +548,10 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	20(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -430,6 +565,10 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	20(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -442,6 +581,10 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	20(%rcx), %bnd1
+	bndcu  	15(%rdx), %bnd0
+# endif
 	movaps	%xmm2, (%rdx)
 	lea	20(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -456,6 +599,9 @@ L(Shl12Start):
 
 	.p2align 4
 L(Shl12LoopStart):
+# ifdef __CHKP__
+	bndcu  	4(%rcx), %bnd1
+# endif
 	movaps	4(%rcx), %xmm2
 	movaps	20(%rcx), %xmm3
 	movaps	%xmm3, %xmm6
@@ -476,6 +622,9 @@ L(Shl12LoopStart):
 	lea	64(%rcx), %rcx
 	palignr	$12, %xmm1, %xmm2
 	movaps	%xmm7, %xmm1
+# ifdef __CHKP__
+	bndcu  	63(%rdx), %bnd0
+# endif
 	movaps	%xmm5, 48(%rdx)
 	movaps	%xmm4, 32(%rdx)
 	movaps	%xmm3, 16(%rdx)
@@ -484,6 +633,10 @@ L(Shl12LoopStart):
 	jmp	L(Shl12LoopStart)
 
 L(Shl12LoopExit):
+# ifdef __CHKP__
+	bndcu  	(%rcx), %bnd1
+	bndcu  	3(%rdx), %bnd0
+# endif
 	mov	(%rcx), %r9d
 	mov	$4, %rsi
 	mov	%r9d, (%rdx)
@@ -500,6 +653,9 @@ L(CopyFrom1To16Bytes):
 	jnz	L(Exit4)
 
 	mov	(%rcx), %rax
+# ifdef __CHKP__
+	bndcu  	7(%rdx), %bnd0
+# endif
 	mov	%rax, (%rdx)
 	mov	%rdi, %rax
 	ret
@@ -510,6 +666,9 @@ L(ExitHigh):
 	jnz	L(Exit12)
 
 	mov	(%rcx), %rax
+# ifdef __CHKP__
+	bndcu  	15(%rdx), %bnd0
+# endif
 	mov	%rax, (%rdx)
 	mov	8(%rcx), %rax
 	mov	%rax, 8(%rdx)
@@ -519,6 +678,9 @@ L(ExitHigh):
 	.p2align 4
 L(Exit4):
 	movl	(%rcx), %eax
+# ifdef __CHKP__
+	bndcu  	3(%rdx), %bnd0
+# endif
 	movl	%eax, (%rdx)
 	mov	%rdi, %rax
 	ret
@@ -526,6 +688,9 @@ L(Exit4):
 	.p2align 4
 L(Exit8):
 	mov	(%rcx), %rax
+# ifdef __CHKP__
+	bndcu  	7(%rdx), %bnd0
+# endif
 	mov	%rax, (%rdx)
 	mov	%rdi, %rax
 	ret
@@ -533,6 +698,9 @@ L(Exit8):
 	.p2align 4
 L(Exit12):
 	mov	(%rcx), %rax
+# ifdef __CHKP__
+	bndcu  	11(%rdx), %bnd0
+# endif
 	mov	%rax, (%rdx)
 	mov	8(%rcx), %eax
 	mov	%eax, 8(%rdx)
@@ -542,6 +710,9 @@ L(Exit12):
 	.p2align 4
 L(Exit16):
 	mov	(%rcx), %rax
+# ifdef __CHKP__
+	bndcu  	15(%rdx), %bnd0
+# endif
 	mov	%rax, (%rdx)
 	mov	8(%rcx), %rax
 	mov	%rax, 8(%rdx)
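
In the wide-character copy above, source checks advance in wchar_t units (bndcu 4(%rcx), 8(%rcx), 12(%rcx) before each cmpl), while destination checks name the last byte of the coming store (7(%rdx) for an 8-byte mov, 15(%rdx) for a 16-byte movaps). A hedged per-element sketch of the source side:

#include <wchar.h>

/* Sketch: validate the i-th wide character of SRC before loading it.
   Like the patch's "bndcu 4*i(%rcx), %bnd1", the check is on the
   element's first byte.  */
static int
wide_nul_at (const wchar_t *src, size_t i)
{
  __builtin___bnd_chk_ptr_ubounds (src + i);
  return src[i] == L'\0';
}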
diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
index f4d5591..2f4cb25 100644
--- a/sysdeps/x86_64/rawmemchr.S
+++ b/sysdeps/x86_64/rawmemchr.S
@@ -20,11 +20,23 @@
 
 #include <sysdep.h>
 
+#ifdef __CHKP__
+# define RETURN \
+        bndcu  (%rax), %bnd0; \
+        ret
+#else
+# define RETURN ret
+#endif
+
 	.text
 ENTRY (rawmemchr)
 	movd	%rsi, %xmm1
 	mov	%rdi, %rcx
 
+#ifdef __CHKP__
+	bndcl  (%rdi), %bnd0
+#endif
+
 	punpcklbw %xmm1, %xmm1
 	punpcklbw %xmm1, %xmm1
 
@@ -63,7 +75,7 @@ L(crosscache):
 
 	add	%rdi, %rax
 	add	%rcx, %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(unaligned_no_match):
@@ -71,24 +83,36 @@ L(unaligned_no_match):
 
 	.p2align 4
 L(loop_prolog):
+#ifdef __CHKP__
+	bndcu 	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches)
 
+#ifdef __CHKP__
+	bndcu 	16(%rdi), %bnd0
+#endif
 	movdqa	16(%rdi), %xmm2
 	pcmpeqb	%xmm1, %xmm2
 	pmovmskb %xmm2, %eax
 	test	%eax, %eax
 	jnz	L(matches16)
 
+#ifdef __CHKP__
+	bndcu 	32(%rdi), %bnd0
+#endif
 	movdqa	32(%rdi), %xmm3
 	pcmpeqb	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32)
 
+#ifdef __CHKP__
+	bndcu 	48(%rdi), %bnd0
+#endif
 	movdqa	48(%rdi), %xmm4
 	pcmpeqb	%xmm1, %xmm4
 	add	$64, %rdi
@@ -99,24 +123,36 @@ L(loop_prolog):
 	test	$0x3f, %rdi
 	jz	L(align64_loop)
 
+#ifdef __CHKP__
+	bndcu 	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches)
 
+#ifdef __CHKP__
+	bndcu 	16(%rdi), %bnd0
+#endif
 	movdqa	16(%rdi), %xmm2
 	pcmpeqb	%xmm1, %xmm2
 	pmovmskb %xmm2, %eax
 	test	%eax, %eax
 	jnz	L(matches16)
 
+#ifdef __CHKP__
+	bndcu 	32(%rdi), %bnd0
+#endif
 	movdqa	32(%rdi), %xmm3
 	pcmpeqb	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32)
 
+#ifdef __CHKP__
+	bndcu 	48(%rdi), %bnd0
+#endif
 	movdqa	48(%rdi), %xmm3
 	pcmpeqb	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
@@ -129,6 +165,9 @@ L(loop_prolog):
 
 	.p2align 4
 L(align64_loop):
+#ifdef __CHKP__
+	bndcu 	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	movdqa	16(%rdi), %xmm2
 	movdqa	32(%rdi), %xmm3
@@ -170,36 +209,36 @@ L(align64_loop):
 	pmovmskb %xmm1, %eax
 	bsf	%eax, %eax
 	lea	48(%rdi, %rax), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches0):
 	bsf	%eax, %eax
 	lea	-16(%rax, %rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches):
 	bsf	%eax, %eax
 	add	%rdi, %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches16):
 	bsf	%eax, %eax
 	lea	16(%rax, %rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(matches32):
 	bsf	%eax, %eax
 	lea	32(%rax, %rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(return_null):
 	xor	%rax, %rax
-	ret
+	RETURN
 
 END (rawmemchr)
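
rawmemchr takes no length, so only the lower bound can be validated up front: the entry sequence has a lone bndcl, and the upper bound is enforced lazily by a bndcu ahead of each chunk load and again by RETURN on the match address. A byte-at-a-time sketch of that shape (the real code scans 16 and 64 bytes at a time):

/* Sketch: lower bound once at entry, upper bound re-checked at each
   read and implicitly on the returned pointer.  */
static void *
rawmemchr_sketch (const void *s, int c)
{
  const unsigned char *p = s;
  __builtin___bnd_chk_ptr_lbounds (p);        /* entry bndcl */
  for (;; ++p)
    {
      __builtin___bnd_chk_ptr_ubounds (p);    /* per-read bndcu */
      if (*p == (unsigned char) c)
        return (void *) p;
    }
}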
 
diff --git a/sysdeps/x86_64/stpcpy_chk-c.c b/sysdeps/x86_64/stpcpy_chk-c.c
new file mode 100644
index 0000000..5de29f9
--- /dev/null
+++ b/sysdeps/x86_64/stpcpy_chk-c.c
@@ -0,0 +1,3 @@
+#ifdef __CHKP__
+# include <debug/stpcpy_chk.c>
+#endif
diff --git a/sysdeps/x86_64/stpcpy_chk.S b/sysdeps/x86_64/stpcpy_chk.S
index 905e8d7..d4a2764 100644
--- a/sysdeps/x86_64/stpcpy_chk.S
+++ b/sysdeps/x86_64/stpcpy_chk.S
@@ -1,3 +1,5 @@
-#define USE_AS_STPCPY_CHK
-#define STRCPY_CHK __stpcpy_chk
-#include <sysdeps/x86_64/strcpy_chk.S>
+#ifndef __CHKP__
+# define USE_AS_STPCPY_CHK
+# define STRCPY_CHK __stpcpy_chk
+# include <sysdeps/x86_64/strcpy_chk.S>
+#endif
diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
index 8bea6fb..7832379 100644
--- a/sysdeps/x86_64/strcat.S
+++ b/sysdeps/x86_64/strcat.S
@@ -25,6 +25,11 @@
 
 	.text
 ENTRY (strcat)
+#ifdef __CHKP__
+	bndcl (%rdi), %bnd0
+	bndcl (%rsi), %bnd1
+#endif
+
 	movq %rdi, %rcx		/* Dest. register. */
 	andl $7, %ecx		/* mask alignment bits */
 	movq %rdi, %rax		/* Duplicate destination pointer.  */
@@ -36,7 +41,11 @@ ENTRY (strcat)
 	neg %ecx		/* We need to align to 8 bytes.  */
 	addl $8,%ecx
 	/* Search the first bytes directly.  */
-0:	cmpb $0x0,(%rax)	/* is byte NUL? */
+0:
+#ifdef __CHKP__
+	bndcu (%rax), %bnd0
+#endif
+	cmpb $0x0,(%rax)	/* is byte NUL? */
 	je 2f			/* yes => start copy */
 	incq %rax		/* increment pointer */
 	decl %ecx
@@ -48,6 +57,9 @@ ENTRY (strcat)
 	.p2align 4
 4:
 	/* First unroll.  */
+#ifdef __CHKP__
+	bndcu (%rax), %bnd0
+#endif
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
 	addq $8,%rax		/* adjust pointer for next word */
 	movq %r8, %rdx		/* magic value */
@@ -62,6 +74,9 @@ ENTRY (strcat)
 	jnz 3f			/* found NUL => return pointer */
 
 	/* Second unroll.  */
+#ifdef __CHKP__
+	bndcu (%rax), %bnd0
+#endif
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
 	addq $8,%rax		/* adjust pointer for next word */
 	movq %r8, %rdx		/* magic value */
@@ -76,6 +91,9 @@ ENTRY (strcat)
 	jnz 3f			/* found NUL => return pointer */
 
 	/* Third unroll.  */
+#ifdef __CHKP__
+	bndcu (%rax), %bnd0
+#endif
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
 	addq $8,%rax		/* adjust pointer for next word */
 	movq %r8, %rdx		/* magic value */
@@ -90,6 +108,9 @@ ENTRY (strcat)
 	jnz 3f			/* found NUL => return pointer */
 
 	/* Fourth unroll.  */
+#ifdef __CHKP__
+	bndcu (%rax), %bnd0
+#endif
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
 	addq $8,%rax		/* adjust pointer for next word */
 	movq %r8, %rdx		/* magic value */
@@ -163,6 +184,9 @@ ENTRY (strcat)
 	.p2align 4
 22:
 	/* 1st unroll.  */
+#ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+#endif
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
 	addq	$8, %rsi	/* Adjust pointer for next word.  */
 	movq	%rax, %r9	/* Save a copy for NUL finding.  */
@@ -177,10 +201,16 @@ ENTRY (strcat)
 
 	jnz	23f		/* found NUL => return pointer */
 
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movq	%rax, (%rdx)	/* Write value to destination.  */
 	addq	$8, %rdx	/* Adjust pointer.  */
 
 	/* 2nd unroll.  */
+#ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+#endif
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
 	addq	$8, %rsi	/* Adjust pointer for next word.  */
 	movq	%rax, %r9	/* Save a copy for NUL finding.  */
@@ -195,10 +225,16 @@ ENTRY (strcat)
 
 	jnz	23f		/* found NUL => return pointer */
 
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movq	%rax, (%rdx)	/* Write value to destination.  */
 	addq	$8, %rdx	/* Adjust pointer.  */
 
 	/* 3rd unroll.  */
+#ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+#endif
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
 	addq	$8, %rsi	/* Adjust pointer for next word.  */
 	movq	%rax, %r9	/* Save a copy for NUL finding.  */
@@ -213,10 +249,16 @@ ENTRY (strcat)
 
 	jnz	23f		/* found NUL => return pointer */
 
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movq	%rax, (%rdx)	/* Write value to destination.  */
 	addq	$8, %rdx	/* Adjust pointer.  */
 
 	/* 4th unroll.  */
+#ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+#endif
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
 	addq	$8, %rsi	/* Adjust pointer for next word.  */
 	movq	%rax, %r9	/* Save a copy for NUL finding.  */
@@ -231,6 +273,9 @@ ENTRY (strcat)
 
 	jnz	23f		/* found NUL => return pointer */
 
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movq	%rax, (%rdx)	/* Write value to destination.  */
 	addq	$8, %rdx	/* Adjust pointer.  */
 	jmp	22b		/* Next iteration.  */
@@ -239,10 +284,16 @@ ENTRY (strcat)
 	   The loop is unrolled twice.  */
 	.p2align 4
 23:
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movb	%al, (%rdx)	/* 1st byte.  */
 	testb	%al, %al	/* Is it NUL.  */
 	jz	24f		/* yes, finish.  */
 	incq	%rdx		/* Increment destination.  */
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movb	%ah, (%rdx)	/* 2nd byte.  */
 	testb	%ah, %ah	/* Is it NUL?  */
 	jz	24f		/* yes, finish.  */
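
The unrolled scan in strcat.S keeps its word-at-a-time NUL detection with the %r8 magic value and only prefixes each 8-byte load with a bndcu. For reference, a common C form of zero-byte detection in a 64-bit word (the standard idiom; this file uses a magic-constant variant of it):

#include <stdint.h>

/* A byte of v is zero iff subtracting 0x01..01 borrows into its top
   bit while that bit was clear in v itself.  */
static int
has_zero_byte (uint64_t v)
{
  return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
}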
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index d89f1eb..8519a81 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -22,6 +22,10 @@
 
 	.text
 ENTRY (strchr)
+#ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcu	(%rdi), %bnd0
+#endif
 	movd	%esi, %xmm1
 	movq	%rdi, %rcx
 	punpcklbw %xmm1, %xmm1
@@ -29,6 +33,9 @@ ENTRY (strchr)
 	pxor	%xmm2, %xmm2
 	punpcklbw %xmm1, %xmm1
 	orl	$0xffffffff, %esi
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pshufd	$0, %xmm1, %xmm1
 	subq	%rdi, %rcx
@@ -44,7 +51,11 @@ ENTRY (strchr)
 	orl	%edx, %ecx
 	jnz	1f
 
-2:	movdqa	(%rdi), %xmm0
+2:
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+#endif
+	movdqa	(%rdi), %xmm0
 	leaq	16(%rdi), %rdi
 	movdqa	%xmm0, %xmm3
 	pcmpeqb	%xmm1, %xmm0
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index d8c345b..3e4abfa 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -23,6 +23,10 @@
 
 	.text
 ENTRY (__strchrnul)
+#ifdef __CHKP__
+	bndcl 	(%rdi), %bnd0
+	bndcu 	(%rdi), %bnd0
+#endif
 	movd	%esi, %xmm1
 	movq	%rdi, %rcx
 	punpcklbw %xmm1, %xmm1
@@ -44,7 +48,11 @@ ENTRY (__strchrnul)
 	andl	%esi, %ecx
 	jnz	1f
 
-2:	movdqa	(%rdi), %xmm0
+2:
+#ifdef __CHKP__
+	bndcu 	(%rdi), %bnd0
+#endif
+	movdqa	(%rdi), %xmm0
 	leaq	16(%rdi), %rdi
 	movdqa	%xmm0, %xmm3
 	pcmpeqb	%xmm1, %xmm0
@@ -56,6 +64,9 @@ ENTRY (__strchrnul)
 
 1:	bsfl	%ecx, %edx
 	leaq	-16(%rdi,%rdx), %rax
+#ifdef __CHKP__
+	bndcu 	(%rax), %bnd0
+#endif
 	ret
 END (__strchrnul)
 
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 7680937..ece49c9 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -128,7 +128,16 @@ libc_hidden_def (__strncasecmp)
 ENTRY (STRCMP)
 #ifdef NOT_IN_libc
 /* Simple version since we can't use SSE registers in ld.so.  */
-L(oop):	movb	(%rdi), %al
+#ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcl	(%rsi), %bnd1
+#endif
+L(oop):
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+	bndcu	(%rsi), %bnd1
+#endif
+	movb	(%rdi), %al
 	cmpb	(%rsi), %al
 	jne	L(neq)
 	incq	%rdi
@@ -177,6 +186,12 @@ END (STRCMP)
 	je	LABEL(Byte0)
 	mov	%rdx, %r11
 # endif
+#ifdef __CHKP__
+	bndcl  	(%rdi), %bnd0
+	bndcu  	(%rdi), %bnd0
+	bndcl  	(%rsi), %bnd1
+	bndcu  	(%rsi), %bnd1
+#endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
 /* Use 64bit AND here to avoid long NOP padding.  */
@@ -243,6 +258,10 @@ END (STRCMP)
 # endif
 	add	$16, %rsi		/* prepare to search next 16 bytes */
 	add	$16, %rdi		/* prepare to search next 16 bytes */
+#ifdef __CHKP__
+	bndcu  	(%rdi), %bnd0
+	bndcu  	(%rsi), %bnd1
+#endif
 
 	/*
 	 * Determine source and destination string offsets from 16-byte alignment.
@@ -263,6 +282,11 @@ LABEL(crosscache):
 	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
 	xchg	%ecx, %eax
 	xchg	%rsi, %rdi
+#ifdef __CHKP__
+	bndmov 	%bnd0, %bnd2
+	bndmov 	%bnd1, %bnd0
+	bndmov 	%bnd2, %bnd1
+#endif
 LABEL(bigger):
 	lea	15(%rax), %r9
 	sub	%rcx, %r9
@@ -310,6 +334,10 @@ LABEL(ashr_0):
 	 */
 	.p2align 4
 LABEL(loop_ashr_0):
+#ifdef __CHKP__
+	bndcu	-1(%rdi, %rcx), %bnd0
+	bndcu	-1(%rsi, %rcx), %bnd1
+#endif
 	movdqa	(%rsi, %rcx), %xmm1
 	movdqa	(%rdi, %rcx), %xmm2
 	TOLOWER (%xmm1, %xmm2)
@@ -326,6 +354,10 @@ LABEL(loop_ashr_0):
 	jbe	LABEL(strcmp_exitz)
 # endif
 	add	$16, %rcx
+#ifdef __CHKP__
+	bndcu	-1(%rdi, %rcx), %bnd0
+	bndcu	-1(%rsi, %rcx), %bnd1
+#endif
 	movdqa	(%rsi, %rcx), %xmm1
 	movdqa	(%rdi, %rcx), %xmm2
 	TOLOWER (%xmm1, %xmm2)
@@ -377,6 +409,15 @@ LABEL(ashr_1):
 	lea	1(%rdi), %r10
 	and	$0xfff, %r10		/* offset into 4K page */
 	sub	$0x1000, %r10		/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_1)
+LABEL(ashr_1_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_1)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_1):
@@ -460,7 +501,11 @@ LABEL(nibble_ashr_1):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+# ifdef __CHKP__
+	ja	LABEL(ashr_1_check)
+# else
 	jmp	LABEL(gobble_ashr_1)
+# endif
 
 	/*
 	 * Once find null char, determine if there is a string mismatch
@@ -507,6 +552,15 @@ LABEL(ashr_2):
 	lea	2(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_2)
+LABEL(ashr_2_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_2)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_2):
@@ -588,7 +642,11 @@ LABEL(nibble_ashr_2):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_2_check)
+# else
 	jmp	LABEL(gobble_ashr_2)
+# endif
 
 	.p2align 4
 LABEL(ashr_2_exittail):
@@ -632,6 +690,15 @@ LABEL(ashr_3):
 	lea	3(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_3)
+LABEL(ashr_3_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_3)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_3):
@@ -713,7 +780,11 @@ LABEL(nibble_ashr_3):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_3_check)
+# else
 	jmp	LABEL(gobble_ashr_3)
+# endif
 
 	.p2align 4
 LABEL(ashr_3_exittail):
@@ -757,6 +828,15 @@ LABEL(ashr_4):
 	lea	4(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_4)
+LABEL(ashr_4_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_4)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_4):
@@ -838,7 +918,11 @@ LABEL(nibble_ashr_4):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_4_check)
+# else
 	jmp	LABEL(gobble_ashr_4)
+# endif
 
 	.p2align 4
 LABEL(ashr_4_exittail):
@@ -882,6 +966,15 @@ LABEL(ashr_5):
 	lea	5(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_5)
+LABEL(ashr_5_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_5)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_5):
@@ -963,7 +1056,11 @@ LABEL(nibble_ashr_5):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_5_check)
+# else
 	jmp	LABEL(gobble_ashr_5)
+# endif
 
 	.p2align 4
 LABEL(ashr_5_exittail):
@@ -1007,6 +1104,15 @@ LABEL(ashr_6):
 	lea	6(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_6)
+LABEL(ashr_6_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_6)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_6):
@@ -1088,7 +1194,11 @@ LABEL(nibble_ashr_6):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_6_check)
+# else
 	jmp	LABEL(gobble_ashr_6)
+# endif
 
 	.p2align 4
 LABEL(ashr_6_exittail):
@@ -1132,6 +1242,15 @@ LABEL(ashr_7):
 	lea	7(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_7)
+LABEL(ashr_7_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_7)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_7):
@@ -1213,7 +1332,11 @@ LABEL(nibble_ashr_7):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_7_check)
+# else
 	jmp	LABEL(gobble_ashr_7)
+# endif
 
 	.p2align 4
 LABEL(ashr_7_exittail):
@@ -1257,6 +1380,15 @@ LABEL(ashr_8):
 	lea	8(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_8)
+LABEL(ashr_8_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_8)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_8):
@@ -1338,7 +1470,11 @@ LABEL(nibble_ashr_8):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_8_check)
+# else
 	jmp	LABEL(gobble_ashr_8)
+# endif
 
 	.p2align 4
 LABEL(ashr_8_exittail):
@@ -1382,6 +1518,15 @@ LABEL(ashr_9):
 	lea	9(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_9)
+LABEL(ashr_9_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_9)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_9):
@@ -1463,7 +1608,11 @@ LABEL(nibble_ashr_9):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_9_check)
+# else
 	jmp	LABEL(gobble_ashr_9)
+# endif
 
 	.p2align 4
 LABEL(ashr_9_exittail):
@@ -1507,6 +1656,15 @@ LABEL(ashr_10):
 	lea	10(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_10)
+LABEL(ashr_10_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_10)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_10):
@@ -1588,7 +1746,11 @@ LABEL(nibble_ashr_10):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_10_check)
+# else
 	jmp	LABEL(gobble_ashr_10)
+# endif
 
 	.p2align 4
 LABEL(ashr_10_exittail):
@@ -1632,6 +1794,15 @@ LABEL(ashr_11):
 	lea	11(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_11)
+LABEL(ashr_11_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_11)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_11):
@@ -1713,7 +1884,11 @@ LABEL(nibble_ashr_11):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_11_check)
+# else
 	jmp	LABEL(gobble_ashr_11)
+# endif
 
 	.p2align 4
 LABEL(ashr_11_exittail):
@@ -1757,6 +1932,15 @@ LABEL(ashr_12):
 	lea	12(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_12)
+LABEL(ashr_12_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_12)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_12):
@@ -1838,7 +2022,11 @@ LABEL(nibble_ashr_12):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_12_check)
+# else
 	jmp	LABEL(gobble_ashr_12)
+# endif
 
 	.p2align 4
 LABEL(ashr_12_exittail):
@@ -1882,6 +2070,15 @@ LABEL(ashr_13):
 	lea	13(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_13)
+LABEL(ashr_13_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_13)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_13):
@@ -1963,7 +2160,11 @@ LABEL(nibble_ashr_13):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_13_check)
+# else
 	jmp	LABEL(gobble_ashr_13)
+# endif
 
 	.p2align 4
 LABEL(ashr_13_exittail):
@@ -2007,6 +2208,15 @@ LABEL(ashr_14):
 	lea	14(%rdi), %r10
 	and	$0xfff, %r10	/* offset into 4K page */
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_14)
+LABEL(ashr_14_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_14)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_14):
@@ -2088,7 +2298,11 @@ LABEL(nibble_ashr_14):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_14_check)
+# else
 	jmp	LABEL(gobble_ashr_14)
+# endif
 
 	.p2align 4
 LABEL(ashr_14_exittail):
@@ -2134,6 +2348,15 @@ LABEL(ashr_15):
 	and	$0xfff, %r10	/* offset into 4K page */
 
 	sub	$0x1000, %r10	/* subtract 4K pagesize */
+# ifdef __CHKP__
+	bndcu	-16(%rdi, %rcx), %bnd0
+	bndcu	-16(%rsi, %rcx), %bnd1
+	jmp 	LABEL(loop_ashr_15)
+LABEL(ashr_15_check):
+	bndcu	(%rdi, %rcx), %bnd0
+	bndcu	(%rsi, %rcx), %bnd1
+	jmp	LABEL(gobble_ashr_15)
+# endif
 
 	.p2align 4
 LABEL(loop_ashr_15):
@@ -2215,7 +2438,11 @@ LABEL(nibble_ashr_15):
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
+# ifdef __CHKP__
+	ja	LABEL(ashr_15_check)
+# else
 	jmp	LABEL(gobble_ashr_15)
+# endif
 
 	.p2align 4
 LABEL(ashr_15_exittail):
@@ -2240,6 +2467,11 @@ LABEL(less32bytes):
 	test	%r8d, %r8d
 	jz	LABEL(ret)
 	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
+#ifdef __CHKP__
+	bndmov	%bnd0, %bnd2
+	bndmov	%bnd1, %bnd0
+	bndmov	%bnd2, %bnd1
+#endif
 
 	.p2align 4
 LABEL(ret):
@@ -2250,6 +2482,10 @@ LABEL(less16bytes):
 	sub	%rdx, %r11
 	jbe	LABEL(strcmp_exitz)
 # endif
+/*#ifdef __CHKP__
+	bndcu	(%rdi, %rdx), %bnd0
+	bndcu	(%rsi, %rdx), %bnd1
+#endif*/
 	movzbl	(%rsi, %rdx), %ecx
 	movzbl	(%rdi, %rdx), %eax
 
diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S
index 6128247..2b78e95 100644
--- a/sysdeps/x86_64/strcpy.S
+++ b/sysdeps/x86_64/strcpy.S
@@ -26,6 +26,10 @@
 
 	.text
 ENTRY (STRCPY)
+#ifdef __CHKP__
+	bndcl (%rdi), %bnd0
+	bndcl (%rsi), %bnd1
+#endif
 	movq %rsi, %rcx		/* Source register. */
 	andl $7, %ecx		/* mask alignment bits */
 	movq %rdi, %rdx		/* Duplicate destination pointer.  */
@@ -36,8 +40,14 @@ ENTRY (STRCPY)
 	addl $8,%ecx
 	/* Search the first bytes directly.  */
 0:
+#ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+#endif
 	movb	(%rsi), %al	/* Fetch a byte */
 	testb	%al, %al	/* Is it NUL? */
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movb	%al, (%rdx)	/* Store it */
 	jz	4f		/* If it was NUL, done! */
 	incq	%rsi
@@ -54,6 +64,9 @@ ENTRY (STRCPY)
 	.p2align 4
 1:
 	/* 1st unroll.  */
+#ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+#endif
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
 	addq	$8, %rsi	/* Adjust pointer for next word.  */
 	movq	%rax, %r9	/* Save a copy for NUL finding.  */
@@ -68,10 +81,16 @@ ENTRY (STRCPY)
 
 	jnz	3f		/* found NUL => return pointer */
 
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movq	%rax, (%rdx)	/* Write value to destination.  */
 	addq	$8, %rdx	/* Adjust pointer.  */
 
 	/* 2nd unroll.  */
+#ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+#endif
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
 	addq	$8, %rsi	/* Adjust pointer for next word.  */
 	movq	%rax, %r9	/* Save a copy for NUL finding.  */
@@ -86,10 +105,16 @@ ENTRY (STRCPY)
 
 	jnz	3f		/* found NUL => return pointer */
 
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movq	%rax, (%rdx)	/* Write value to destination.  */
 	addq	$8, %rdx	/* Adjust pointer.  */
 
 	/* 3rd unroll.  */
+#ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+#endif
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
 	addq	$8, %rsi	/* Adjust pointer for next word.  */
 	movq	%rax, %r9	/* Save a copy for NUL finding.  */
@@ -104,10 +129,16 @@ ENTRY (STRCPY)
 
 	jnz	3f		/* found NUL => return pointer */
 
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movq	%rax, (%rdx)	/* Write value to destination.  */
 	addq	$8, %rdx	/* Adjust pointer.  */
 
 	/* 4th unroll.  */
+#ifdef __CHKP__
+	bndcu	(%rsi), %bnd1
+#endif
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
 	addq	$8, %rsi	/* Adjust pointer for next word.  */
 	movq	%rax, %r9	/* Save a copy for NUL finding.  */
@@ -122,6 +153,9 @@ ENTRY (STRCPY)
 
 	jnz	3f		/* found NUL => return pointer */
 
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movq	%rax, (%rdx)	/* Write value to destination.  */
 	addq	$8, %rdx	/* Adjust pointer.  */
 	jmp	1b		/* Next iteration.  */
@@ -132,10 +166,16 @@ ENTRY (STRCPY)
 3:
 	/* Note that stpcpy needs to return with the value of the NUL
 	   byte.  */
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movb	%al, (%rdx)	/* 1st byte.  */
 	testb	%al, %al	/* Is it NUL.  */
 	jz	4f		/* yes, finish.  */
 	incq	%rdx		/* Increment destination.  */
+#ifdef __CHKP__
+	bndcu	(%rdx), %bnd0
+#endif
 	movb	%ah, (%rdx)	/* 2nd byte.  */
 	testb	%ah, %ah	/* Is it NUL?  */
 	jz	4f		/* yes, finish.  */
diff --git a/sysdeps/x86_64/strcpy_chk-c.c b/sysdeps/x86_64/strcpy_chk-c.c
new file mode 100644
index 0000000..4deabcc
--- /dev/null
+++ b/sysdeps/x86_64/strcpy_chk-c.c
@@ -0,0 +1,3 @@
+#ifdef __CHKP__
+# include <debug/strcpy_chk.c>
+#endif
diff --git a/sysdeps/x86_64/strcpy_chk.S b/sysdeps/x86_64/strcpy_chk.S
index 7e171de..4b79124 100644
--- a/sysdeps/x86_64/strcpy_chk.S
+++ b/sysdeps/x86_64/strcpy_chk.S
@@ -18,6 +18,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifndef __CHKP__
 #include <sysdep.h>
 #include "asm-syntax.h"
 
@@ -206,3 +207,4 @@ ENTRY (STRCPY_CHK)
 	jmp	HIDDEN_JUMPTARGET (__chk_fail)
 
 END (STRCPY_CHK)
+#endif
diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
index 65f8a9e..0acca21 100644
--- a/sysdeps/x86_64/strcspn.S
+++ b/sysdeps/x86_64/strcspn.S
@@ -29,6 +29,12 @@
 
 	.text
 ENTRY (strcspn)
+# ifdef __CHKP__
+	bndcl (%rdi), %bnd0
+	bndcu (%rdi), %bnd0
+	bndcl (%rsi), %bnd1
+	bndcu (%rsi), %bnd1
+# endif
 
 	movq %rdi, %rdx		/* Save SRC.  */
 
@@ -54,21 +60,34 @@ ENTRY (strcspn)
    have a correct zero-extended 64-bit value in %rcx.  */
 
 	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from skipset */
+L(2):
+# ifdef __CHKP__
+	bndcu (%rax), %bnd1
+# endif
+	movb (%rax), %cl	/* get byte from skipset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
 
+# ifdef __CHKP__
+	bndcu 1(%rax), %bnd1
+# endif
 	movb 1(%rax), %cl	/* get byte from skipset */
 	testb $0xff, %cl	/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
 
+# ifdef __CHKP__
+	bndcu 2(%rax), %bnd1
+# endif
 	movb 2(%rax), %cl	/* get byte from skipset */
 	testb $0xff, %cl	/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
 
+# ifdef __CHKP__
+	bndcu 3(%rax), %bnd1
+# endif
 	movb 3(%rax), %cl	/* get byte from skipset */
 	addq $4, %rax		/* increment skipset pointer */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
@@ -89,18 +108,30 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	.p2align 4
 L(3):	addq $4, %rax		/* adjust pointer for full loop round */
 
+# ifdef __CHKP__
+	bndcu (%rax), %bnd0
+# endif
 	movb (%rax), %cl	/* get byte from string */
 	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
 	je L(4)			/* yes => return */
 
+# ifdef __CHKP__
+	bndcu 1(%rax), %bnd0
+# endif
 	movb 1(%rax), %cl	/* get byte from string */
 	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
 	je L(5)			/* yes => return */
 
+# ifdef __CHKP__
+	bndcu 2(%rax), %bnd0
+# endif
 	movb 2(%rax), %cl	/* get byte from string */
 	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
 	jz L(6)			/* yes => return */
 
+# ifdef __CHKP__
+	bndcu 3(%rax), %bnd0
+# endif
 	movb 3(%rax), %cl	/* get byte from string */
 	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
 	jne L(3)		/* no => start loop again */
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index eeb1092..065f0e6 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -63,6 +63,10 @@ L(n_nonzero):
 	mov	%rsi, %r11
 #endif
 
+#ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcu	(%rdi), %bnd0
+#endif
 	pxor	%xmm8, %xmm8
 	pxor	%xmm9, %xmm9
 	pxor	%xmm10, %xmm10
@@ -157,6 +161,9 @@ L(loop_init):
 L(loop):
 
 	addq	$64, %rax
+# ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+# endif
 	cmpq	%rax, %r10
 	je	L(exit_end)
 
@@ -182,6 +189,9 @@ L(first):
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
 	subq	%rdi, %rax
+# ifdef __CHKP__
+	bndcu	-1(%rdi, %rax), %bnd0
+# endif
 	ret
 
 	.p2align 4
@@ -192,6 +202,9 @@ L(exit):
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
 	subq	%rdi, %rax
+# ifdef __CHKP__
+	bndcu	-1(%rdi, %rax), %bnd0
+# endif
 	ret
 
 #else
@@ -199,6 +212,9 @@ L(exit):
 	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
 	.p2align 4
 L(loop):
+# ifdef __CHKP__
+	bndcu	64(%rax), %bnd0
+# endif
 
 	movdqa	64(%rax), %xmm8
 	pminub	80(%rax), %xmm8
@@ -231,6 +247,9 @@ L(exit0):
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
 	subq	%rdi, %rax
+# ifdef __CHKP__
+	bndcu	-1(%rdi, %rax), %bnd0
+# endif
 	ret
 
 #endif
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index e413b07..0bd3405 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -22,6 +22,10 @@
 
 	.text
 ENTRY (strrchr)
+# ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcu	(%rdi), %bnd0
+# endif
 	movd	%esi, %xmm1
 	movq	%rdi, %rcx
 	punpcklbw %xmm1, %xmm1
@@ -46,7 +50,11 @@ ENTRY (strrchr)
 	orl	%ecx, %esi
 	jnz	1f
 
-2:	movdqa	(%rdi), %xmm0
+2:
+# ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+# endif
+	movdqa	(%rdi), %xmm0
 	leaq	16(%rdi), %rdi
 	movdqa	%xmm0, %xmm3
 	pcmpeqb	%xmm1, %xmm0
@@ -73,6 +81,9 @@ ENTRY (strrchr)
 	bsrl	%edx, %edx
 	jz	4f
 	leaq	-16(%rdi,%rdx), %rax
+# ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+# endif
 4:	ret
 END (strrchr)
 
diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
index 2911da2..bd3be8a 100644
--- a/sysdeps/x86_64/strspn.S
+++ b/sysdeps/x86_64/strspn.S
@@ -25,6 +25,12 @@
 
 	.text
 ENTRY (strspn)
+#ifdef __CHKP__
+	bndcl (%rdi), %bnd0
+	bndcu (%rdi), %bnd0
+	bndcl (%rsi), %bnd1
+	bndcu (%rsi), %bnd1
+#endif
 
 	movq %rdi, %rdx		/* Save SRC.  */
 
@@ -50,21 +56,34 @@ ENTRY (strspn)
    have a correct zero-extended 64-bit value in %rcx.  */
 
 	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from stopset */
+L(2):
+#ifdef __CHKP__
+	bndcu (%rax), %bnd1
+#endif
+	movb (%rax), %cl	/* get byte from stopset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
 
+#ifdef __CHKP__
+	bndcu 1(%rax), %bnd1
+#endif
 	movb 1(%rax), %cl	/* get byte from stopset */
 	testb $0xff, %cl	/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
 
+#ifdef __CHKP__
+	bndcu 2(%rax), %bnd1
+#endif
 	movb 2(%rax), %cl	/* get byte from stopset */
 	testb $0xff, %cl	/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
 
+#ifdef __CHKP__
+	bndcu 3(%rax), %bnd1
+#endif
 	movb 3(%rax), %cl	/* get byte from stopset */
 	addq $4, %rax		/* increment stopset pointer */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
@@ -85,18 +104,30 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	.p2align 4
 L(3):	addq $4, %rax		/* adjust pointer for full loop round */
 
+#ifdef __CHKP__
+	bndcu (%rax), %bnd0
+#endif
 	movb (%rax), %cl	/* get byte from string */
 	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
 	jz L(4)			/* no => return */
 
+#ifdef __CHKP__
+	bndcu 1(%rax), %bnd0
+#endif
 	movb 1(%rax), %cl	/* get byte from string */
 	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
 	jz L(5)			/* no => return */
 
+#ifdef __CHKP__
+	bndcu 2(%rax), %bnd0
+#endif
 	movb 2(%rax), %cl	/* get byte from string */
 	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
 	jz L(6)			/* no => return */
 
+#ifdef __CHKP__
+	bndcu 3(%rax), %bnd0
+#endif
 	movb 3(%rax), %cl	/* get byte from string */
 	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
 	jnz L(3)		/* yes => start loop again */
diff --git a/sysdeps/x86_64/strtok.S b/sysdeps/x86_64/strtok.S
index 5636d9a..17e2521 100644
--- a/sysdeps/x86_64/strtok.S
+++ b/sysdeps/x86_64/strtok.S
@@ -90,6 +90,9 @@ ENTRY (FUNCTION)
 	   the last run.  */
 	cmpq $0, %rdx
 	cmove %rax, %rdx
+#ifdef __CHKP__
+	bndldx (,%rax,1),%bnd0
+#endif
 	testq %rdx, %rdx
 	jz L(returnNULL)
 	movq %rsi, %rax		/* Get start of delimiter set.  */
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
index 3f098dc..3ab1e47 100644
--- a/sysdeps/x86_64/wcschr.S
+++ b/sysdeps/x86_64/wcschr.S
@@ -22,6 +22,11 @@
 	.text
 ENTRY (wcschr)
 
+#ifdef __CHKP__
+	bndcl  	(%rdi), %bnd0
+	bndcu  	(%rdi), %bnd0
+#endif
+
 	movd	%rsi, %xmm1
 	pxor	%xmm2, %xmm2
 	mov	%rdi, %rcx
@@ -43,6 +48,9 @@ ENTRY (wcschr)
 
 	and	$-16, %rdi
 
+#ifdef __CHKP__
+	bndcu  	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	add	$16, %rdi
@@ -78,6 +86,9 @@ L(cross_cache):
 L(unaligned_match):
 	add	%rdi, %rax
 	add	%rcx, %rax
+#ifdef __CHKP__
+	bndcu  	(%rax), %bnd0
+#endif
 	ret
 
 	.p2align 4
@@ -91,6 +102,9 @@ L(unaligned_no_match):
 	.p2align 4
 /* Loop start on aligned string.  */
 L(loop):
+#ifdef __CHKP__
+	bndcu  	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	add	$16, %rdi
@@ -100,6 +114,9 @@ L(loop):
 	or	%rax, %rdx
 	jnz	L(matches)
 
+#ifdef __CHKP__
+	bndcu  	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	add	$16, %rdi
@@ -109,6 +126,9 @@ L(loop):
 	or	%rax, %rdx
 	jnz	L(matches)
 
+#ifdef __CHKP__
+	bndcu  	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	add	$16, %rdi
@@ -118,6 +138,9 @@ L(loop):
 	or	%rax, %rdx
 	jnz	L(matches)
 
+#ifdef __CHKP__
+	bndcu  	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	add	$16, %rdi
@@ -142,6 +165,9 @@ L(matches):
 L(match):
 	sub	$16, %rdi
 	add	%rdi, %rax
+#ifdef __CHKP__
+	bndcu  	(%rax), %bnd0
+#endif
 	ret
 
 	.p2align 4
diff --git a/sysdeps/x86_64/wcscmp.S b/sysdeps/x86_64/wcscmp.S
index d6b516b..38e2849 100644
--- a/sysdeps/x86_64/wcscmp.S
+++ b/sysdeps/x86_64/wcscmp.S
@@ -28,6 +28,14 @@ ENTRY (wcscmp)
 */
 	mov	%esi, %eax
 	mov	%edi, %edx
+
+#ifdef __CHKP__
+	bndcl  	(%rdi), %bnd0
+	bndcu  	(%rdi), %bnd0
+	bndcl  	(%rsi), %bnd1
+	bndcu  	(%rsi), %bnd1
+#endif
+
 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
 	mov	%al, %ch
 	mov	%dl, %cl
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index 5927352..a7d944f 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -21,20 +21,45 @@
 
 	.text
 ENTRY (__wcslen)
+#ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcu	(%rdi), %bnd0
+#endif
 	cmpl	$0, (%rdi)
 	jz	L(exit_tail0)
+#ifdef __CHKP__
+	bndcu	4(%rdi), %bnd0
+#endif
 	cmpl	$0, 4(%rdi)
 	jz	L(exit_tail1)
+#ifdef __CHKP__
+	bndcu	8(%rdi), %bnd0
+#endif
 	cmpl	$0, 8(%rdi)
 	jz	L(exit_tail2)
+#ifdef __CHKP__
+	bndcu	12(%rdi), %bnd0
+#endif
 	cmpl	$0, 12(%rdi)
 	jz	L(exit_tail3)
+#ifdef __CHKP__
+	bndcu	16(%rdi), %bnd0
+#endif
 	cmpl	$0, 16(%rdi)
 	jz	L(exit_tail4)
+#ifdef __CHKP__
+	bndcu	20(%rdi), %bnd0
+#endif
 	cmpl	$0, 20(%rdi)
 	jz	L(exit_tail5)
+#ifdef __CHKP__
+	bndcu	24(%rdi), %bnd0
+#endif
 	cmpl	$0, 24(%rdi)
 	jz	L(exit_tail6)
+#ifdef __CHKP__
+	bndcu	28(%rdi), %bnd0
+#endif
 	cmpl	$0, 28(%rdi)
 	jz	L(exit_tail7)
 
@@ -44,6 +69,9 @@ ENTRY (__wcslen)
 	lea	16(%rdi), %rcx
 	and	$-16, %rax
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm0
 	pmovmskb %xmm0, %edx
 	pxor	%xmm1, %xmm1
@@ -51,6 +79,9 @@ ENTRY (__wcslen)
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm1
 	pmovmskb %xmm1, %edx
 	pxor	%xmm2, %xmm2
@@ -58,6 +89,9 @@ ENTRY (__wcslen)
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm2
 	pmovmskb %xmm2, %edx
 	pxor	%xmm3, %xmm3
@@ -65,54 +99,81 @@ ENTRY (__wcslen)
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	lea	16(%rax), %rax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	pcmpeqd	(%rax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
@@ -123,6 +184,9 @@ ENTRY (__wcslen)
 
 	.p2align 4
 L(aligned_64_loop):
+#ifdef __CHKP__
+	bndcu	(%rax), %bnd0
+#endif
 	movaps	(%rax), %xmm0
 	movaps	16(%rax), %xmm1
 	movaps	32(%rax), %xmm2
@@ -173,6 +237,9 @@ L(exit):
 	mov	%dl, %cl
 	and	$15, %cl
 	jz	L(exit_1)
+#ifdef __CHKP__
+	bndcu	-1(%rdi, %rax, 4), %bnd0
+#endif
 	ret
 
 	.p2align 4
@@ -181,11 +248,17 @@ L(exit_high):
 	and	$15, %ch
 	jz	L(exit_3)
 	add	$2, %rax
+#ifdef __CHKP__
+	bndcu	-1(%rdi, %rax, 4), %bnd0
+#endif
 	ret
 
 	.p2align 4
 L(exit_1):
 	add	$1, %rax
+#ifdef __CHKP__
+	bndcu	-1(%rdi, %rax, 4), %bnd0
+#endif
 	ret
 
 	.p2align 4
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index ea1e2e5..8edfc46 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -19,9 +19,22 @@
 
 #include <sysdep.h>
 
+#ifdef __CHKP__
+# define RETURN \
+      bndcu  (%rax), %bnd0; \
+      ret
+#else
+# define RETURN ret
+#endif
+
+
 	.text
 ENTRY (wcsrchr)
 
+#ifdef __CHKP__
+	bndcl	(%rdi), %bnd0
+	bndcu	(%rdi), %bnd0
+#endif
 	movd	%rsi, %xmm1
 	mov	%rdi, %rcx
 	punpckldq %xmm1, %xmm1
@@ -92,6 +105,9 @@ L(unaligned_match):
 /* Loop start on aligned string.  */
 	.p2align 4
 L(loop):
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	add	$16, %rdi
@@ -101,6 +117,9 @@ L(loop):
 	or	%rax, %rcx
 	jnz	L(matches)
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm3
 	pcmpeqd	%xmm3, %xmm2
 	add	$16, %rdi
@@ -110,6 +129,9 @@ L(loop):
 	or	%rax, %rcx
 	jnz	L(matches)
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm4
 	pcmpeqd	%xmm4, %xmm2
 	add	$16, %rdi
@@ -119,6 +141,9 @@ L(loop):
 	or	%rax, %rcx
 	jnz	L(matches)
 
+#ifdef __CHKP__
+	bndcu	(%rdi), %bnd0
+#endif
 	movdqa	(%rdi), %xmm5
 	pcmpeqd	%xmm5, %xmm2
 	add	$16, %rdi
@@ -145,7 +170,7 @@ L(return_value):
 	test	$15 << 4, %al
 	jnz	L(match_second_wchar)
 	lea	-16(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(match):
@@ -175,14 +200,14 @@ L(find_zero):
 	test	$15 << 4, %al
 	jnz	L(match_second_wchar)
 	lea	-16(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(find_zero_in_first_wchar):
 	test	$1, %rax
 	jz	L(return_value)
 	lea	-16(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(find_zero_in_second_wchar):
@@ -192,7 +217,7 @@ L(find_zero_in_second_wchar):
 	test	$15 << 4, %al
 	jnz	L(match_second_wchar)
 	lea	-16(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(find_zero_in_third_wchar):
@@ -204,12 +229,12 @@ L(find_zero_in_third_wchar):
 	test	$15 << 4, %al
 	jnz	L(match_second_wchar)
 	lea	-16(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(prolog_find_zero):
 	add	%rcx, %rdi
-	mov     %rdx, %rcx
+	mov	%rdx, %rcx
 L(prolog_find_zero_1):
 	test	$15, %cl
 	jnz	L(prolog_find_zero_in_first_wchar)
@@ -228,14 +253,14 @@ L(prolog_find_zero_1):
 	test	$15 << 4, %al
 	jnz	L(match_second_wchar)
 	lea	-16(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(prolog_find_zero_in_first_wchar):
 	test	$1, %rax
 	jz	L(return_null)
 	lea	-16(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(prolog_find_zero_in_second_wchar):
@@ -245,7 +270,7 @@ L(prolog_find_zero_in_second_wchar):
 	test	$15 << 4, %al
 	jnz	L(match_second_wchar)
 	lea	-16(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(prolog_find_zero_in_third_wchar):
@@ -257,22 +282,22 @@ L(prolog_find_zero_in_third_wchar):
 	test	$15 << 4, %al
 	jnz	L(match_second_wchar)
 	lea	-16(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(match_second_wchar):
 	lea	-12(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(match_third_wchar):
 	lea	-8(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(match_fourth_wchar):
 	lea	-4(%rdi), %rax
-	ret
+	RETURN
 
 	.p2align 4
 L(return_null):
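
The instrumentation in the hunks above follows a single pattern: on entry, bndcl/bndcu validate the first byte of each pointer argument against the bound registers (%bnd0 for the first pointer, %bnd1 for the second); inside unrolled loops a lone bndcu on the highest offset about to be read covers the whole block; and pointer-returning routines funnel through a RETURN macro that re-checks the computed result before ret. A minimal sketch of the same idea (not part of the patch) applied to a naive byte-at-a-time strlen:

/* Sketch only: assumes an MPX-enabled build where __CHKP__ is defined
   and the caller has associated the bounds of the first pointer
   argument with %bnd0, as in the x86_64 routines above.  */
	.text
	.globl	toy_strlen
toy_strlen:
#ifdef __CHKP__
	bndcl	(%rdi), %bnd0		/* trap if below the lower bound */
	bndcu	(%rdi), %bnd0		/* trap if above the upper bound */
#endif
	mov	%rdi, %rax
1:
#ifdef __CHKP__
	bndcu	(%rax), %bnd0		/* re-check before every load */
#endif
	cmpb	$0, (%rax)		/* terminating NUL? */
	je	2f
	inc	%rax
	jmp	1b
2:
	sub	%rdi, %rax		/* length = end - start */
	ret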

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2c6f2eca5187fbd39bdbe267ce0d2c81fe0de696

commit 2c6f2eca5187fbd39bdbe267ce0d2c81fe0de696
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Fri May 24 13:18:17 2013 +0400

    Implemented bounds check support for string/memory routines for x86_32.
    Warning: Not complete and not yet tested.
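
Note for the hunks below: i386 passes pointer arguments on the stack, so their bounds cannot arrive in bound registers as on x86_64; instead each routine reloads them from the MPX bound table with bndldx, keyed by both the address of the argument slot and the pointer value. A hedged sketch of that prologue, mirroring the strlen/strcat hunks below (STR stands for the stack offset of the string argument):

/* Sketch only: STR(%esp) holds the pointer argument; bndldx looks its
   bounds up in the bound table, after which the usual bndcl/bndcu
   pair validates the first byte.  */
	movl	STR(%esp), %eax		/* fetch the pointer argument */
#ifdef __CHKP__
	bndldx	STR(%esp,%eax,1), %bnd0	/* reload its bounds from the table */
	bndcl	(%eax), %bnd0		/* first byte below lower bound? */
	bndcu	(%eax), %bnd0		/* first byte above upper bound? */
#endif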

diff --git a/sysdeps/i386/i486/strcat.S b/sysdeps/i386/i486/strcat.S
index 7d45862..af2602e 100644
--- a/sysdeps/i386/i486/strcat.S
+++ b/sysdeps/i386/i486/strcat.S
@@ -35,9 +35,19 @@ ENTRY (strcat)
 
 	movl DEST(%esp), %edx
 	movl SRC(%esp), %ecx
+#ifdef __CHKP__
+	bndldx DEST(%esp,%edx,1), %bnd0
+	bndldx SRC(%esp,%ecx,1), %bnd1
+	bndcl (%ecx), %bnd1
+	bndcu (%ecx), %bnd1
+#endif
 
 	testb $0xff, (%ecx)	/* Is source string empty? */
 	jz L(8)			/* yes => return */
+#ifdef __CHKP__
+	bndcl (%edx), %bnd0
+	bndcu (%edx), %bnd0
+#endif
 
 	/* Test the first bytes separately until destination is aligned.  */
 	testl $3, %edx		/* destination pointer aligned? */
@@ -66,7 +76,11 @@ ENTRY (strcat)
 
 L(4):	addl $16,%edx		/* increment destination pointer for round */
 
-L(1):	movl (%edx), %eax	/* get word (= 4 bytes) in question */
+L(1):
+#ifdef __CHKP__
+	bndcu (%edx), %bnd0
+#endif
+	movl (%edx), %eax	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 
 	/* If you compare this with the algorithm in memchr.S you will
@@ -98,6 +112,9 @@ L(1):	movl (%edx), %eax	/* get word (= 4 bytes) in question */
 	/* If at least one byte of the word is C we don't get 0 in %ecx.  */
 	jnz L(3)
 
+#ifdef __CHKP__
+	bndcu 4(%edx), %bnd0
+#endif
 	movl 4(%edx), %eax	/* get word from source */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %eax, %edi		/* add the magic value to the word.  We get
@@ -110,6 +127,9 @@ L(1):	movl (%edx), %eax	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(5)		/* one byte is NUL => stop copying */
 
+#ifdef __CHKP__
+	bndcu 8(%edx), %bnd0
+#endif
 	movl 8(%edx), %eax	/* get word from source */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %eax, %edi		/* add the magic value to the word.  We get
@@ -122,6 +142,9 @@ L(1):	movl (%edx), %eax	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(6)		/* one byte is NUL => stop copying */
 
+#ifdef __CHKP__
+	bndcu 12(%edx), %bnd0
+#endif
 	movl 12(%edx), %eax	/* get word from source */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %eax, %edi		/* add the magic value to the word.  We get
@@ -155,6 +178,10 @@ L(2):	subl %ecx, %edx		/* reduce number of loop variants */
 	/* Now we have to align the source pointer.  */
 	testl $3, %ecx		/* pointer correctly aligned? */
 	jz L(29)		/* yes => start copy loop */
+#ifdef __CHKP__
+	bndcu (%ecx), %bnd1
+	bndcu (%ecx, %edx), %bnd0
+#endif
 	movb (%ecx), %al	/* get first byte */
 	movb %al, (%ecx,%edx)	/* and store it */
 	andb %al, %al		/* is byte NUL? */
@@ -163,6 +190,10 @@ L(2):	subl %ecx, %edx		/* reduce number of loop variants */
 
 	testl $3, %ecx		/* pointer correctly aligned? */
 	jz L(29)		/* yes => start copy loop */
+#ifdef __CHKP__
+	bndcu (%ecx), %bnd1
+	bndcu (%ecx, %edx), %bnd0
+#endif
 	movb (%ecx), %al	/* get first byte */
 	movb %al, (%ecx,%edx)	/* and store it */
 	andb %al, %al		/* is byte NUL? */
@@ -171,6 +202,10 @@ L(2):	subl %ecx, %edx		/* reduce number of loop variants */
 
 	testl $3, %ecx		/* pointer correctly aligned? */
 	jz L(29)		/* yes => start copy loop */
+#ifdef __CHKP__
+	bndcu (%ecx), %bnd1
+	bndcu (%ecx, %edx), %bnd0
+#endif
 	movb (%ecx), %al	/* get first byte */
 	movb %al, (%ecx,%edx)	/* and store it */
 	andb %al, %al		/* is byte NUL? */
@@ -182,10 +217,18 @@ L(2):	subl %ecx, %edx		/* reduce number of loop variants */
 
 	ALIGN(4)
 
-L(28):	movl %eax, 12(%ecx,%edx)/* store word at destination */
+L(28):
+#ifdef __CHKP__
+	bndcu 12(%ecx, %edx), %bnd0
+#endif
+	movl %eax, 12(%ecx,%edx)/* store word at destination */
 	addl $16, %ecx		/* adjust pointer for full round */
 
-L(29):	movl (%ecx), %eax	/* get word from source */
+L(29):
+#ifdef __CHKP__
+	bndcu (%ecx), %bnd1
+#endif
+	movl (%ecx), %eax	/* get word from source */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %eax, %edi		/* add the magic value to the word.  We get
 				   carry bits reported for each byte which
@@ -196,8 +239,14 @@ L(29):	movl (%ecx), %eax	/* get word from source */
 	incl %edi		/* add 1: if one carry bit was *not* set
 				   the addition will not result in 0.  */
 	jnz L(9)		/* one byte is NUL => stop copying */
+#ifdef __CHKP__
+	bndcu (%ecx, %edx), %bnd0
+#endif
 	movl %eax, (%ecx,%edx)	/* store word to destination */
 
+#ifdef __CHKP__
+	bndcu 4(%ecx), %bnd1
+#endif
 	movl 4(%ecx), %eax	/* get word from source */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %eax, %edi		/* add the magic value to the word.  We get
@@ -209,8 +258,14 @@ L(29):	movl (%ecx), %eax	/* get word from source */
 	incl %edi		/* add 1: if one carry bit was *not* set
 				   the addition will not result in 0.  */
 	jnz L(91)		/* one byte is NUL => stop copying */
+#ifdef __CHKP__
+	bndcu 4(%ecx, %edx), %bnd0
+#endif
 	movl %eax, 4(%ecx,%edx)	/* store word to destination */
 
+#ifdef __CHKP__
+	bndcu 8(%ecx), %bnd1
+#endif
 	movl 8(%ecx), %eax	/* get word from source */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %eax, %edi		/* add the magic value to the word.  We get
@@ -222,8 +277,14 @@ L(29):	movl (%ecx), %eax	/* get word from source */
 	incl %edi		/* add 1: if one carry bit was *not* set
 				   the addition will not result in 0.  */
 	jnz L(92)		/* one byte is NUL => stop copying */
+#ifdef __CHKP__
+	bndcu 8(%ecx, %edx), %bnd0
+#endif
 	movl %eax, 8(%ecx,%edx)	/* store word to destination */
 
+#ifdef __CHKP__
+	bndcu 12(%ecx), %bnd1
+#endif
 	movl 12(%ecx), %eax	/* get word from source */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %eax, %edi		/* add the magic value to the word.  We get
@@ -240,15 +301,25 @@ L(93):	addl $4, %ecx		/* adjust pointer */
 L(92):	addl $4, %ecx
 L(91):	addl $4, %ecx
 
-L(9):	movb %al, (%ecx,%edx)	/* store first byte of last word */
+L(9):
+#ifdef __CHKP__
+	bndcu (%ecx, %edx), %bnd0
+#endif
+	movb %al, (%ecx,%edx)	/* store first byte of last word */
 	orb %al, %al		/* is it NUL? */
 	jz L(8)			/* yes => return */
 
+#ifdef __CHKP__
+	bndcu 1(%ecx, %edx), %bnd0
+#endif
 	movb %ah, 1(%ecx,%edx)	/* store second byte of last word */
 	orb %ah, %ah		/* is it NUL? */
 	jz L(8)			/* yes => return */
 
 	shrl $16, %eax		/* make upper bytes accessible */
+#ifdef __CHKP__
+	bndcu 2(%ecx, %edx), %bnd0
+#endif
 	movb %al, 2(%ecx,%edx)	/* store third byte of last word */
 	orb %al, %al		/* is it NUL? */
 	jz L(8)			/* yes => return */
diff --git a/sysdeps/i386/i586/strchr.S b/sysdeps/i386/i586/strchr.S
index 648d528..4efa935 100644
--- a/sysdeps/i386/i586/strchr.S
+++ b/sysdeps/i386/i586/strchr.S
@@ -54,6 +54,10 @@ ENTRY (strchr)
 
 	movl STR(%esp), %eax
 	movl CHR(%esp), %edx
+#ifdef __CHKP__
+	bndldx 	STR(%esp,%eax,1), %bnd0
+	bndcl  	(%eax), %bnd0
+#endif
 
 	movl %eax, %edi		/* duplicate string pointer for later */
 	cfi_rel_offset (edi, 12)
@@ -83,6 +87,9 @@ ENTRY (strchr)
 	xorb %dl, %cl		/* load single byte and test for NUL */
 	je L(3)			/* yes => return NULL */
 
+#ifdef __CHKP__
+	bndcu 1(%eax), %bnd0
+#endif
 	movb 1(%eax), %cl	/* load single byte */
 	incl %eax
 
@@ -97,7 +104,11 @@ ENTRY (strchr)
 
 	jne L(11)
 
-L(0):	movb (%eax), %cl	/* load single byte */
+L(0):
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
+	movb (%eax), %cl	/* load single byte */
 
 	cmpb %cl, %dl		/* is byte == C? */
 	je L(out)		/* aligned => return pointer */
@@ -115,7 +126,11 @@ L(0):	movb (%eax), %cl	/* load single byte */
 	   four instructions up to `L1' will not be executed in the loop
 	   because the same code is found at the end of the loop, but
 	   there it is executed in parallel with other instructions.  */
-L(11):	movl (%eax), %ecx
+L(11):
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
+	movl (%eax), %ecx
 	movl $magic, %ebp
 
 	movl $magic, %edi
@@ -159,6 +174,9 @@ L(1):	xorl %ecx, %ebp			/* (word^magic) */
 		movl $magic, %esi	/* load magic value */
 		xorl %edx, %ebx		/* clear words which are C */
 
+#ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+#endif
 					movl (%eax), %ecx
 		addl %ebx, %esi		/* (word+magic) */
 
@@ -189,6 +207,9 @@ L(1):	xorl %ecx, %ebp			/* (word^magic) */
 						movl $magic, %esi
 						xorl %edx, %ebx
 
+#ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+#endif
 	movl (%eax), %ecx
 						addl %ebx, %esi
 
@@ -219,6 +240,9 @@ L(1):	xorl %ecx, %ebp			/* (word^magic) */
 		movl $magic, %esi
 		xorl %edx, %ebx
 
+#ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+#endif
 					movl (%eax), %ecx
 		addl %ebx, %esi
 
@@ -249,6 +273,9 @@ L(1):	xorl %ecx, %ebp			/* (word^magic) */
 						movl $magic, %esi
 						xorl %edx, %ebx
 
+#ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+#endif
 	movl (%eax), %ecx
 						addl %ebx, %esi
 
diff --git a/sysdeps/i386/i586/strcpy.S b/sysdeps/i386/i586/strcpy.S
index c940369..6392a8e 100644
--- a/sysdeps/i386/i586/strcpy.S
+++ b/sysdeps/i386/i586/strcpy.S
@@ -45,6 +45,10 @@ ENTRY (STRCPY)
 	cfi_rel_offset (edi, 8)
 	movl	SRC(%esp), %esi
 	cfi_rel_offset (esi, 4)
+#ifdef __CHKP__
+	bndldx	DEST(%esp,%edi,1), %bnd0
+	bndldx	SRC(%esp,%esi,1), %bnd1
+#endif
 
 	xorl	%eax, %eax
 	leal	-1(%esi), %ecx
@@ -61,6 +65,9 @@ ENTRY (STRCPY)
 	/* 0xb is the distance between 2: and 1: but we avoid writing
 	   1f-2b because the assembler generates worse code.  */
 	leal	0xb(%edx,%ecx,8), %ecx
+# ifdef __CHKP__
+	jmp	L(1)
+# endif
 #else
 	leal	1f(,%ecx,8), %ecx
 #endif
diff --git a/sysdeps/i386/i586/strlen.S b/sysdeps/i386/i586/strlen.S
index b50fffa..9034625 100644
--- a/sysdeps/i386/i586/strlen.S
+++ b/sysdeps/i386/i586/strlen.S
@@ -41,6 +41,10 @@
 ENTRY (strlen)
 
 	movl STR(%esp), %eax
+#ifdef __CHKP__
+	bndldx STR(%esp,%eax,1), %bnd0
+	bndcu (%eax),%bnd0
+#endif
 	movl $3, %edx		/* load mask (= 3) */
 
 	andl %eax, %edx		/* separate last two bits of address */
@@ -48,10 +52,16 @@ ENTRY (strlen)
 	jz L(1)			/* aligned => start loop */
 	jp L(0)			/* exactly two bits set */
 
+#ifdef __CHKP__
+	bndcu (%eax),%bnd0
+#endif
 	cmpb %dh, (%eax)	/* is byte NUL? */
 	je L(2)			/* yes => return */
 
 	incl %eax		/* increment pointer */
+#ifdef __CHKP__
+	bndcu (%eax),%bnd0
+#endif
 	cmpb %dh, (%eax)	/* is byte NUL? */
 
 	je L(2)			/* yes => return */
@@ -61,7 +71,11 @@ ENTRY (strlen)
 
 	jz L(1)
 
-L(0):	cmpb %dh, (%eax)	/* is byte NUL? */
+L(0):
+#ifdef __CHKP__
+	bndcu (%eax),%bnd0
+#endif
+	cmpb %dh, (%eax)	/* is byte NUL? */
 	je L(2)			/* yes => return */
 
 	incl %eax		/* increment pointer */
@@ -174,7 +188,11 @@ L(3):	subl $4, %eax		/* correct too early pointer increment */
 
 	incl %eax		/* increment pointer */
 
-L(2):	subl STR(%esp), %eax	/* now compute the length as difference
+L(2):
+#ifdef __CHKP__
+	bndcu (%eax),%bnd0
+#endif
+	subl STR(%esp), %eax	/* now compute the length as difference
 				   between start and terminating NUL
 				   character */
 	ret
diff --git a/sysdeps/i386/i686/memcmp.S b/sysdeps/i386/i686/memcmp.S
index b8091a6..6cb03e7 100644
--- a/sysdeps/i386/i686/memcmp.S
+++ b/sysdeps/i386/i686/memcmp.S
@@ -48,9 +48,19 @@ ENTRY (memcmp)
 	movl	BLK1(%esp), %eax
 	movl	BLK2(%esp), %edx
 	movl	LEN(%esp), %ecx
+#ifdef __CHKP__
+	bndldx	BLK1(%esp,%eax,1), %bnd0
+	bndldx	BLK2(%esp,%edx,1), %bnd1
+#endif
 
 	cmpl 	$1, %ecx
 	jne	L(not_1)
+#ifdef __CHKP__
+	bndcl	(%eax), %bnd0
+	bndcu	(%eax), %bnd0
+	bndcl	(%edx), %bnd1
+	bndcu	(%edx), %bnd1
+#endif
 	movzbl	(%eax), %ecx		/* LEN == 1  */
 	cmpb	(%edx), %cl
 	jne	L(neq)
@@ -69,6 +79,12 @@ L(neq):
 	cfi_rel_offset (ebx, 0)
 L(not_1):
 	jl	L(bye)			/* LEN == 0  */
+#ifdef __CHKP__
+	bndcl	(%eax), %bnd0
+	bndcu	(%eax), %bnd0
+	bndcl	(%edx), %bnd1
+	bndcu	(%edx), %bnd1
+#endif
 
 	pushl	%esi
 	cfi_adjust_cfa_offset (4)
@@ -84,36 +100,64 @@ L(not_1):
 
 	ALIGN (4)
 L(28bytes):
+#ifdef __CHKP__
+	bndcu	-28(%esi), %bnd0
+	bndcu	-28(%edx), %bnd1
+#endif
 	movl	-28(%esi), %eax
 	movl	-28(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(24bytes):
+#ifdef __CHKP__
+	bndcu	-24(%esi), %bnd0
+	bndcu	-24(%edx), %bnd1
+#endif
 	movl	-24(%esi), %eax
 	movl	-24(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(20bytes):
+#ifdef __CHKP__
+	bndcu	-20(%esi), %bnd0
+	bndcu	-20(%edx), %bnd1
+#endif
 	movl	-20(%esi), %eax
 	movl	-20(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(16bytes):
+#ifdef __CHKP__
+	bndcu	-16(%esi), %bnd0
+	bndcu	-16(%edx), %bnd1
+#endif
 	movl	-16(%esi), %eax
 	movl	-16(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(12bytes):
+#ifdef __CHKP__
+	bndcu	-12(%esi), %bnd0
+	bndcu	-12(%edx), %bnd1
+#endif
 	movl	-12(%esi), %eax
 	movl	-12(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(8bytes):
+#ifdef __CHKP__
+	bndcu	-8(%esi), %bnd0
+	bndcu	-8(%edx), %bnd1
+#endif
 	movl	-8(%esi), %eax
 	movl	-8(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(4bytes):
+#ifdef __CHKP__
+	bndcu	-4(%esi), %bnd0
+	bndcu	-4(%edx), %bnd1
+#endif
 	movl	-4(%esi), %eax
 	movl	-4(%edx), %ecx
 	cmpl	%ecx, %eax
@@ -129,41 +173,73 @@ L(0bytes):
 	cfi_rel_offset (esi, 0)
 	cfi_rel_offset (ebx, 4)
 L(29bytes):
+#ifdef __CHKP__
+	bndcu	-29(%esi), %bnd0
+	bndcu	-29(%edx), %bnd1
+#endif
 	movl	-29(%esi), %eax
 	movl	-29(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(25bytes):
+#ifdef __CHKP__
+	bndcu	-25(%esi), %bnd0
+	bndcu	-25(%edx), %bnd1
+#endif
 	movl	-25(%esi), %eax
 	movl	-25(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(21bytes):
+#ifdef __CHKP__
+	bndcu	-21(%esi), %bnd0
+	bndcu	-21(%edx), %bnd1
+#endif
 	movl	-21(%esi), %eax
 	movl	-21(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(17bytes):
+#ifdef __CHKP__
+	bndcu	-17(%esi), %bnd0
+	bndcu	-17(%edx), %bnd1
+#endif
 	movl	-17(%esi), %eax
 	movl	-17(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(13bytes):
+#ifdef __CHKP__
+	bndcu	-13(%esi), %bnd0
+	bndcu	-13(%edx), %bnd1
+#endif
 	movl	-13(%esi), %eax
 	movl	-13(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(9bytes):
+#ifdef __CHKP__
+	bndcu	-9(%esi), %bnd0
+	bndcu	-9(%edx), %bnd1
+#endif
 	movl	-9(%esi), %eax
 	movl	-9(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(5bytes):
+#ifdef __CHKP__
+	bndcu	-5(%esi), %bnd0
+	bndcu	-5(%edx), %bnd1
+#endif
 	movl	-5(%esi), %eax
 	movl	-5(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(1bytes):
+#ifdef __CHKP__
+	bndcu	-1(%esi), %bnd0
+	bndcu	-1(%edx), %bnd1
+#endif
 	movzbl	-1(%esi), %eax
 	cmpb	-1(%edx), %al
 	jne	L(set)
@@ -177,41 +253,73 @@ L(1bytes):
 	cfi_rel_offset (esi, 0)
 	cfi_rel_offset (ebx, 4)
 L(30bytes):
+#ifdef __CHKP__
+	bndcu	-30(%esi), %bnd0
+	bndcu	-30(%edx), %bnd1
+#endif
 	movl	-30(%esi), %eax
 	movl	-30(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(26bytes):
+#ifdef __CHKP__
+	bndcu	-26(%esi), %bnd0
+	bndcu	-26(%edx), %bnd1
+#endif
 	movl	-26(%esi), %eax
 	movl	-26(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(22bytes):
+#ifdef __CHKP__
+	bndcu	-22(%esi), %bnd0
+	bndcu	-22(%edx), %bnd1
+#endif
 	movl	-22(%esi), %eax
 	movl	-22(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(18bytes):
+#ifdef __CHKP__
+	bndcu	-18(%esi), %bnd0
+	bndcu	-18(%edx), %bnd1
+#endif
 	movl	-18(%esi), %eax
 	movl	-18(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(14bytes):
+#ifdef __CHKP__
+	bndcu	-14(%esi), %bnd0
+	bndcu	-14(%edx), %bnd1
+#endif
 	movl	-14(%esi), %eax
 	movl	-14(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(10bytes):
+#ifdef __CHKP__
+	bndcu	-10(%esi), %bnd0
+	bndcu	-10(%edx), %bnd1
+#endif
 	movl	-10(%esi), %eax
 	movl	-10(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(6bytes):
+#ifdef __CHKP__
+	bndcu	-6(%esi), %bnd0
+	bndcu	-6(%edx), %bnd1
+#endif
 	movl	-6(%esi), %eax
 	movl	-6(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(2bytes):
+#ifdef __CHKP__
+	bndcu	-2(%esi), %bnd0
+	bndcu	-2(%edx), %bnd1
+#endif
 	movzwl	-2(%esi), %eax
 	movzwl	-2(%edx), %ecx
 	cmpb	%cl, %al
@@ -228,41 +336,73 @@ L(2bytes):
 	cfi_rel_offset (esi, 0)
 	cfi_rel_offset (ebx, 4)
 L(31bytes):
+#ifdef __CHKP__
+	bndcu	-31(%esi), %bnd0
+	bndcu	-31(%edx), %bnd1
+#endif
 	movl	-31(%esi), %eax
 	movl	-31(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(27bytes):
+#ifdef __CHKP__
+	bndcu	-27(%esi), %bnd0
+	bndcu	-27(%edx), %bnd1
+#endif
 	movl	-27(%esi), %eax
 	movl	-27(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(23bytes):
+#ifdef __CHKP__
+	bndcu	-23(%esi), %bnd0
+	bndcu	-23(%edx), %bnd1
+#endif
 	movl	-23(%esi), %eax
 	movl	-23(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(19bytes):
+#ifdef __CHKP__
+	bndcu	-19(%esi), %bnd0
+	bndcu	-19(%edx), %bnd1
+#endif
 	movl	-19(%esi), %eax
 	movl	-19(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(15bytes):
+#ifdef __CHKP__
+	bndcu	-15(%esi), %bnd0
+	bndcu	-15(%edx), %bnd1
+#endif
 	movl	-15(%esi), %eax
 	movl	-15(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(11bytes):
+#ifdef __CHKP__
+	bndcu	-11(%esi), %bnd0
+	bndcu	-11(%edx), %bnd1
+#endif
 	movl	-11(%esi), %eax
 	movl	-11(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(7bytes):
+#ifdef __CHKP__
+	bndcu	-7(%esi), %bnd0
+	bndcu	-7(%edx), %bnd1
+#endif
 	movl	-7(%esi), %eax
 	movl	-7(%edx), %ecx
 	cmpl	%ecx, %eax
 	jne	L(find_diff)
 L(3bytes):
+#ifdef __CHKP__
+	bndcu	-3(%esi), %bnd0
+	bndcu	-3(%edx), %bnd1
+#endif
 	movzwl	-3(%esi), %eax
 	movzwl	-3(%edx), %ecx
 	cmpb	%cl, %al
@@ -286,34 +426,66 @@ L(3bytes):
 L(32bytesormore):
 	subl	$32, %ecx
 
+#ifdef __CHKP__
+	bndcu	(%esi), %bnd0
+	bndcu	(%edx), %bnd1
+#endif
 	movl	(%esi), %eax
 	cmpl	(%edx), %eax
 	jne	L(load_ecx)
 
+#ifdef __CHKP__
+	bndcu	4(%esi), %bnd0
+	bndcu	4(%edx), %bnd1
+#endif
 	movl	4(%esi), %eax
 	cmpl	4(%edx), %eax
 	jne	L(load_ecx_4)
 
+#ifdef __CHKP__
+	bndcu	8(%esi), %bnd0
+	bndcu	8(%edx), %bnd1
+#endif
 	movl	8(%esi), %eax
 	cmpl	8(%edx), %eax
 	jne	L(load_ecx_8)
 
+#ifdef __CHKP__
+	bndcu	12(%esi), %bnd0
+	bndcu	12(%edx), %bnd1
+#endif
 	movl	12(%esi), %eax
 	cmpl	12(%edx), %eax
 	jne	L(load_ecx_12)
 
+#ifdef __CHKP__
+	bndcu	16(%esi), %bnd0
+	bndcu	16(%edx), %bnd1
+#endif
 	movl	16(%esi), %eax
 	cmpl	16(%edx), %eax
 	jne	L(load_ecx_16)
 
+#ifdef __CHKP__
+	bndcu	20(%esi), %bnd0
+	bndcu	20(%edx), %bnd1
+#endif
 	movl	20(%esi), %eax
 	cmpl	20(%edx), %eax
 	jne	L(load_ecx_20)
 
+#ifdef __CHKP__
+	bndcu	24(%esi), %bnd0
+	bndcu	24(%edx), %bnd1
+#endif
 	movl	24(%esi), %eax
 	cmpl	24(%edx), %eax
 	jne	L(load_ecx_24)
 
+#ifdef __CHKP__
+	bndcu	28(%esi), %bnd0
+	bndcu	28(%edx), %bnd1
+#endif
 	movl	28(%esi), %eax
 	cmpl	28(%edx), %eax
 	jne	L(load_ecx_28)
diff --git a/sysdeps/i386/i686/memset.S b/sysdeps/i386/i686/memset.S
index aed79a8..3fd4370 100644
--- a/sysdeps/i386/i686/memset.S
+++ b/sysdeps/i386/i686/memset.S
@@ -50,6 +50,11 @@ ENTRY (memset)
 	cfi_adjust_cfa_offset (4)
 	movl	DEST(%esp), %edx
 	movl	LEN(%esp), %ecx
+#ifdef __CHKP__
+	bndldx  DEST(%esp,%edx,1),%bnd0
+	bndcl  	(%edx), %bnd0
+	bndcu  	-1(%edx, %ecx), %bnd0
+#endif
 #if BZERO_P
 	xorl	%eax, %eax	/* fill with 0 */
 #else
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 8946bfa..7a4999a 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -6,9 +6,7 @@ endif
 
 ifeq ($(subdir),string)
 gen-as-const-headers += locale-defines.sym
-sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
-		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
-		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
+sysdep_routines += bzero-sse2 memset-sse2 \
 		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
 		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
 		   memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \
@@ -23,7 +21,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   strnlen-sse2 strnlen-c \
 		   strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
 		   strncase_l-c strncase-c strncase_l-ssse3 \
-		   strcasecmp_l-sse4 strncase_l-sse4
+		   strcasecmp_l-sse4 strncase_l-sse4 mpx_memcpy_nobnd \
+		   mpx_mempcpy_nobnd mpx_memmove_nobnd
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/Versions b/sysdeps/i386/i686/multiarch/Versions
index 59b185a..7f0cbbc 100644
--- a/sysdeps/i386/i686/multiarch/Versions
+++ b/sysdeps/i386/i686/multiarch/Versions
@@ -2,4 +2,11 @@ libc {
   GLIBC_PRIVATE {
     __get_cpu_features;
   }
+%ifdef __CHKP__
+  GLIBC_2.14 {
+   mpx_memcpy_nobnd;
+   mpx_memmove_nobnd;
+   mpx_mempcpy_nobnd;
+  }
+%endif
 }
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/__bcopy.S
similarity index 100%
rename from sysdeps/i386/i686/multiarch/bcopy.S
rename to sysdeps/i386/i686/multiarch/__bcopy.S
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/__memcpy.S
similarity index 100%
rename from sysdeps/i386/i686/multiarch/memcpy.S
rename to sysdeps/i386/i686/multiarch/__memcpy.S
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/__memcpy_chk.S
similarity index 100%
rename from sysdeps/i386/i686/multiarch/memcpy_chk.S
rename to sysdeps/i386/i686/multiarch/__memcpy_chk.S
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/__memmove.S
similarity index 100%
rename from sysdeps/i386/i686/multiarch/memmove.S
rename to sysdeps/i386/i686/multiarch/__memmove.S
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/__memmove_chk.S
similarity index 100%
rename from sysdeps/i386/i686/multiarch/memmove_chk.S
rename to sysdeps/i386/i686/multiarch/__memmove_chk.S
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/__mempcpy.S
similarity index 100%
rename from sysdeps/i386/i686/multiarch/mempcpy.S
rename to sysdeps/i386/i686/multiarch/__mempcpy.S
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/__mempcpy_chk.S
similarity index 100%
rename from sysdeps/i386/i686/multiarch/mempcpy_chk.S
rename to sysdeps/i386/i686/multiarch/__mempcpy_chk.S
diff --git a/sysdeps/i386/i686/multiarch/bcopy.c b/sysdeps/i386/i686/multiarch/bcopy.c
new file mode 100644
index 0000000..6f5efba
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bcopy.c
@@ -0,0 +1,8 @@
+#include <stddef.h>
+#include <string.h>	/* declare memmove */
+
+void
+bcopy (const void *src, void *dst, size_t n)
+{
+  memmove (dst, src, n);
+}
diff --git a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
index 2c282bd..63f0704 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
+++ b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -37,11 +37,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   size_t i = 0;
 
   /* Support sysdeps/i386/i686/multiarch/bcopy.S.  */
-  IFUNC_IMPL (i, name, bcopy,
-	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3,
-			      __bcopy_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3, __bcopy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
+//  IFUNC_IMPL (i, name, bcopy,
+//	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3,
+//			      __bcopy_ssse3_rep)
+//	      IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3, __bcopy_ssse3)
+//	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/bzero.S.  */
   IFUNC_IMPL (i, name, bzero,
@@ -64,21 +64,21 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memmove_chk.S.  */
-  IFUNC_IMPL (i, name, __memmove_chk,
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
-			      __memmove_chk_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
-			      __memmove_chk_ssse3)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
-			      __memmove_chk_ia32))
+//  IFUNC_IMPL (i, name, __memmove_chk,
+//	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+//			      __memmove_chk_ssse3_rep)
+//	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+//			      __memmove_chk_ssse3)
+//	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+//			      __memmove_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memmove.S.  */
-  IFUNC_IMPL (i, name, memmove,
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
-			      __memmove_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
-			      __memmove_ssse3)
-	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
+//  IFUNC_IMPL (i, name, memmove,
+//	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
+//			      __memmove_ssse3_rep)
+//	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
+//			      __memmove_ssse3)
+//	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memrchr.S.  */
   IFUNC_IMPL (i, name, memrchr,
@@ -274,37 +274,37 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
 #ifdef SHARED
   /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S.  */
-  IFUNC_IMPL (i, name, __memcpy_chk,
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
-			      __memcpy_chk_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
-			      __memcpy_chk_ssse3)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
-			      __memcpy_chk_ia32))
+//  IFUNC_IMPL (i, name, __memcpy_chk,
+//	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
+//			      __memcpy_chk_ssse3_rep)
+//	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
+//			      __memcpy_chk_ssse3)
+//	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+//			      __memcpy_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/memcpy.S.  */
-  IFUNC_IMPL (i, name, memcpy,
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
-			      __memcpy_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
+//  IFUNC_IMPL (i, name, memcpy,
+//	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
+//			      __memcpy_ssse3_rep)
+//	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+//	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S.  */
-  IFUNC_IMPL (i, name, __mempcpy_chk,
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
-			      __mempcpy_chk_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
-			      __mempcpy_chk_ssse3)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
-			      __mempcpy_chk_ia32))
+//  IFUNC_IMPL (i, name, __mempcpy_chk,
+//	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+//			      __mempcpy_chk_ssse3_rep)
+//	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+//			      __mempcpy_chk_ssse3)
+//	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+//			      __mempcpy_chk_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/mempcpy.S.  */
-  IFUNC_IMPL (i, name, mempcpy,
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
-			      __mempcpy_ssse3_rep)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
-			      __mempcpy_ssse3)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
+//  IFUNC_IMPL (i, name, mempcpy,
+//	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
+//			      __mempcpy_ssse3_rep)
+//	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
+//			      __mempcpy_ssse3)
+//	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
 
   /* Support sysdeps/i386/i686/multiarch/strlen.S.  */
   IFUNC_IMPL (i, name, strlen,
diff --git a/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
index d364177..80be0d9 100644
--- a/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
+++ b/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -58,6 +58,12 @@ ENTRY (MEMCHR)
 # endif
 	mov	%ecx, %eax
 
+# ifdef __CHKP__
+	bndldx	STR1(%esp,%ecx,1), %bnd0
+	bndcl 	(%ecx), %bnd0
+	bndcu 	(%ecx), %bnd0
+# endif
+
 	punpcklbw %xmm1, %xmm1
 	punpcklbw %xmm1, %xmm1
 
@@ -79,9 +85,18 @@ ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	sub	%ecx, %edx
 	jbe	L(return_null_1)
-# endif
 	add	%ecx, %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
+	ret
+# else
+	add	%ecx, %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	ret
+# endif
 
 	.p2align 4
 L(unaligned_no_match_1):
@@ -163,8 +178,15 @@ L(loop_prolog):
 # ifndef USE_AS_RAWMEMCHR
 	sub	$64, %edx
 	jbe	L(exit_loop)
+#  ifdef __CHKP__
+	bndcu   (%edi), %bnd0
+#  endif
 	movdqa	(%edi), %xmm0
 # else
+
+#  ifdef __CHKP__
+	bndcu   (%edx), %bnd0
+#  endif
 	movdqa	(%edx), %xmm0
 # endif
 	pcmpeqb	%xmm1, %xmm0
@@ -173,8 +195,15 @@ L(loop_prolog):
 	jnz	L(matches)
 
 # ifndef USE_AS_RAWMEMCHR
+#  ifdef __CHKP__
+	bndcu   16(%edi), %bnd0
+#  endif
 	movdqa	16(%edi), %xmm2
 # else
+
+#  ifdef __CHKP__
+	bndcu   16(%edx), %bnd0
+#  endif
 	movdqa	16(%edx), %xmm2
 # endif
 	pcmpeqb	%xmm1, %xmm2
@@ -183,8 +212,15 @@ L(loop_prolog):
 	jnz	L(matches16)
 
 # ifndef USE_AS_RAWMEMCHR
+#  ifdef __CHKP__
+	bndcu   32(%edi), %bnd0
+#  endif
 	movdqa	32(%edi), %xmm3
 # else
+
+#  ifdef __CHKP__
+	bndcu   32(%edx), %bnd0
+#  endif
 	movdqa	32(%edx), %xmm3
 # endif
 	pcmpeqb	%xmm1, %xmm3
@@ -193,8 +229,15 @@ L(loop_prolog):
 	jnz	L(matches32)
 
 # ifndef USE_AS_RAWMEMCHR
+#  ifdef __CHKP__
+	bndcu   48(%edi), %bnd0
+#  endif
 	movdqa	48(%edi), %xmm4
 # else
+
+#  ifdef __CHKP__
+	bndcu   48(%edx), %bnd0
+#  endif
 	movdqa	48(%edx), %xmm4
 # endif
 	pcmpeqb	%xmm1, %xmm4
@@ -277,11 +320,18 @@ L(align64_loop):
 # ifndef USE_AS_RAWMEMCHR
 	sub	$64, %edx
 	jbe	L(exit_loop)
+#  ifdef __CHKP__
+	bndcu   (%edi), %bnd0
+#  endif
 	movdqa	(%edi), %xmm0
 	movdqa	16(%edi), %xmm2
 	movdqa	32(%edi), %xmm3
 	movdqa	48(%edi), %xmm4
 # else
+
+#  ifdef __CHKP__
+	bndcu   (%edx), %bnd0
+#  endif
 	movdqa	(%edx), %xmm0
 	movdqa	16(%edx), %xmm2
 	movdqa	32(%edx), %xmm3
@@ -342,9 +392,15 @@ L(align64_loop):
 
 # ifndef USE_AS_RAWMEMCHR
 	lea	48(%edi, %eax), %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	RETURN
 # else
 	lea	48(%edx, %eax), %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	ret
 # endif
 
@@ -404,9 +460,15 @@ L(matches0):
 	bsf	%eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	lea	-16(%eax, %edi), %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	RETURN
 # else
 	lea	-16(%eax, %edx), %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	ret
 # endif
 
@@ -415,9 +477,15 @@ L(matches):
 	bsf	%eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	add	%edi, %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	RETURN
 # else
 	add	%edx, %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	ret
 # endif
 
@@ -426,9 +494,15 @@ L(matches16):
 	bsf	%eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	lea	16(%eax, %edi), %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	RETURN
 # else
 	lea	16(%eax, %edx), %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	ret
 # endif
 
@@ -437,9 +511,15 @@ L(matches32):
 	bsf	%eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	lea	32(%eax, %edi), %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	RETURN
 # else
 	lea	32(%eax, %edx), %eax
+#  ifdef __CHKP__
+	bndcu 	(%eax), %bnd0
+#  endif
 	ret
 # endif
 
diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
index 2984a37..3ccfe66 100644
--- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S
+++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -91,6 +91,15 @@ ENTRY (MEMCMP)
 	jbe	L(less1bytes)
 # endif
 
+# ifdef __CHKP__
+	bndldx 	BLK1(%esp,%eax,1), %bnd0
+	bndldx 	BLK2(%esp,%edx,1), %bnd1
+	bndcl	(%eax), %bnd0
+	bndcl	(%edx), %bnd1
+	bndcu	(%eax), %bnd0
+	bndcu	(%edx), %bnd1
+# endif
+
 	pxor	%xmm0, %xmm0
 	cmp	$64, %ecx
 	ja	L(64bytesormore)
@@ -115,6 +124,10 @@ L(less8bytes):
 	cmpb	(%edx), %bl
 	jne	L(nonzero)
 
+# ifdef __CHKP__
+	bndcu	1(%eax), %bnd0
+	bndcu	1(%edx), %bnd1
+# endif
 	mov	1(%eax), %bl
 	cmpb	1(%edx), %bl
 	jne	L(nonzero)
@@ -122,6 +135,10 @@ L(less8bytes):
 	cmp	$2, %ecx
 	jz	L(0bytes)
 
+# ifdef __CHKP__
+	bndcu	2(%eax), %bnd0
+	bndcu	2(%edx), %bnd1
+# endif
 	mov	2(%eax), %bl
 	cmpb	2(%edx), %bl
 	jne	L(nonzero)
@@ -129,6 +146,10 @@ L(less8bytes):
 	cmp	$3, %ecx
 	jz	L(0bytes)
 
+# ifdef __CHKP__
+	bndcu	3(%eax), %bnd0
+	bndcu	3(%edx), %bnd1
+# endif
 	mov	3(%eax), %bl
 	cmpb	3(%edx), %bl
 	jne	L(nonzero)
@@ -136,6 +157,10 @@ L(less8bytes):
 	cmp	$4, %ecx
 	jz	L(0bytes)
 
+# ifdef __CHKP__
+	bndcu	4(%eax), %bnd0
+	bndcu	4(%edx), %bnd1
+# endif
 	mov	4(%eax), %bl
 	cmpb	4(%edx), %bl
 	jne	L(nonzero)
@@ -143,6 +168,10 @@ L(less8bytes):
 	cmp	$5, %ecx
 	jz	L(0bytes)
 
+# ifdef __CHKP__
+	bndcu	5(%eax), %bnd0
+	bndcu	5(%edx), %bnd1
+# endif
 	mov	5(%eax), %bl
 	cmpb	5(%edx), %bl
 	jne	L(nonzero)
@@ -150,6 +179,10 @@ L(less8bytes):
 	cmp	$6, %ecx
 	jz	L(0bytes)
 
+# ifdef __CHKP__
+	bndcu	6(%eax), %bnd0
+	bndcu	6(%edx), %bnd1
+# endif
 	mov	6(%eax), %bl
 	cmpb	6(%edx), %bl
 	je	L(0bytes)
@@ -198,6 +231,14 @@ L(return0):
 	.p2align 4
 L(less1bytes):
 	jb	L(0bytesend)
+# ifdef __CHKP__
+	bndldx 	BLK1(%esp,%eax,1), %bnd0
+	bndldx 	BLK2(%esp,%edx,1), %bnd1
+	bndcl	(%eax), %bnd0
+	bndcl	(%edx), %bnd1
+	bndcu	(%eax), %bnd0
+	bndcu	(%edx), %bnd1
+# endif
 	movzbl	(%eax), %eax
 	movzbl	(%edx), %edx
 	sub	%edx, %eax
@@ -221,18 +262,30 @@ L(64bytesormore_loop):
 	ptest	%xmm2, %xmm0
 	jnc	L(find_16diff)
 
+# ifdef __CHKP__
+	bndcu	16(%eax), %bnd0
+	bndcu	16(%edx), %bnd1
+# endif
 	movdqu	16(%eax), %xmm1
 	movdqu	16(%edx), %xmm2
 	pxor	%xmm1, %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(find_32diff)
 
+# ifdef __CHKP__
+	bndcu	32(%eax), %bnd0
+	bndcu	32(%edx), %bnd1
+# endif
 	movdqu	32(%eax), %xmm1
 	movdqu	32(%edx), %xmm2
 	pxor	%xmm1, %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(find_48diff)
 
+# ifdef __CHKP__
+	bndcu	48(%eax), %bnd0
+	bndcu	48(%edx), %bnd1
+# endif
 	movdqu	48(%eax), %xmm1
 	movdqu	48(%edx), %xmm2
 	pxor	%xmm1, %xmm2
diff --git a/sysdeps/i386/i686/multiarch/memcpy.c b/sysdeps/i386/i686/multiarch/memcpy.c
new file mode 100644
index 0000000..824cdcb
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy.c
@@ -0,0 +1,40 @@
+#include <stddef.h>
+
+void *
+__memcpy (void *dst, const void *src, size_t n)
+{
+  const char *s = src;
+  char *d = dst;
+  void *ret = dst;
+  size_t offset_src = ((size_t) s) & (sizeof(size_t) - 1);
+  size_t offset_dst = ((size_t) d) & (sizeof(size_t) - 1);
+
+  if (offset_src != offset_dst)
+  {
+    while (n--)
+      *d++ = *s++;
+  }
+  else
+  {
+    if (offset_src)
+      offset_src = sizeof(size_t) - offset_src;
+    while (n-- && offset_src--)
+      *d++ = *s++;
+    n++;
+    if (!n) return ret;
+    void **d1 = (void **)d;
+    void **s1 = (void **)s;
+    while (n >= sizeof(void *))
+    {
+      n -= sizeof(void *);
+      *d1++ = *s1++;
+    }
+    s = (char *)s1;
+    d = (char *)d1;
+    while (n--)
+      *d++ = *s++;
+  }
+  return ret;
+}
+
+weak_alias (__memcpy, __GI_memcpy)
+weak_alias (__memcpy, memcpy)
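The C fallback above copies single bytes until the source pointer reaches
word alignment, then moves whole words.  A minimal sketch of that prefix
computation next to a plain call (illustrative code only, not part of the
patch; the buffer names are hypothetical):

    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      char src[32] = "mpx fallback memcpy";
      char dst[32];

      /* Same prefix computation as __memcpy: bytes to copy before a
         (deliberately misaligned) pointer becomes word aligned.  */
      size_t offset = (size_t) (src + 1) & (sizeof (size_t) - 1);
      if (offset)
        offset = sizeof (size_t) - offset;
      printf ("prefix bytes: %zu\n", offset);

      memcpy (dst, src, sizeof src);
      puts (dst);
      return 0;
    }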
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.c b/sysdeps/i386/i686/multiarch/memcpy_chk.c
new file mode 100644
index 0000000..1eee86c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.c
@@ -0,0 +1 @@
+#include <debug/memcpy_chk.c>
diff --git a/sysdeps/i386/i686/multiarch/memmove.c b/sysdeps/i386/i686/multiarch/memmove.c
new file mode 100644
index 0000000..9e5ad6d
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove.c
@@ -0,0 +1,76 @@
+#include <stddef.h>
+
+void *
+__memmove (void *dst, const void *src, size_t n)
+{
+  const char *s = src;
+  char *d = dst;
+  void *ret = dst;
+  size_t offset_src = ((size_t) s) & (sizeof(size_t) - 1);
+  size_t offset_dst = ((size_t) d) & (sizeof(size_t) - 1);
+
+  if (offset_src != offset_dst)
+  {
+    if (s < d)
+    {
+      /* Backward copy.  */
+      d += n;
+      s += n;
+      while (n--)
+        *--d = *--s;
+    }
+    else
+      /* Forward copy.  */
+      while (n--)
+        *d++ = *s++;
+  }
+  else
+  {
+    if (s < d)
+    {
+      /* Backward copy: the alignment prefix is taken from the end of
+         the buffers, where copying starts.  */
+      offset_src = ((size_t) src + n) & (sizeof(size_t) - 1);
+      d += n;
+      s += n;
+      while (n-- && offset_src--)
+        *--d = *--s;
+      n++;
+      if (!n) return ret;
+      void **d1 = (void **)d;
+      void **s1 = (void **)s;
+      while (n >= sizeof(void *))
+      {
+        n -= sizeof(void *);
+        *--d1 = *--s1;
+      }
+      s = (char *)s1;
+      d = (char *)d1;
+      while (n--)
+        *--d = *--s;
+    }
+    else
+    {
+      /* Forward copy: align on the start of the buffers.  */
+      if (offset_src)
+        offset_src = sizeof(size_t) - offset_src;
+      while (n-- && offset_src--)
+        *d++ = *s++;
+      n++;
+      if (!n) return ret;
+      void **d1 = (void **)d;
+      void **s1 = (void **)s;
+      while (n >= sizeof(void *))
+      {
+        n -= sizeof(void *);
+        *d1++ = *s1++;
+      }
+      s = (char *)s1;
+      d = (char *)d1;
+      while (n--)
+        *d++ = *s++;
+    }
+  }
+  return ret;
+}
+
+weak_alias (__memmove, __GI_memmove)
+weak_alias (__memmove, memmove)
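__memmove above chooses the copy direction from the pointer order: when the
destination starts inside the source range, it copies from the end backward
so no byte is overwritten before it has been read.  A short demonstration
(illustrative, not part of the patch):

    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      char buf[16] = "abcdefgh";

      /* dst > src with overlap: a naive forward copy would clobber the
         tail of the source, so the backward path is taken.  */
      memmove (buf + 2, buf, 8);
      printf ("%.10s\n", buf);	/* prints "ababcdefgh" */
      return 0;
    }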
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.c b/sysdeps/i386/i686/multiarch/memmove_chk.c
new file mode 100644
index 0000000..bbf53d0
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.c
@@ -0,0 +1 @@
+#include <debug/memmove_chk.c>
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.c b/sysdeps/i386/i686/multiarch/mempcpy.c
new file mode 100644
index 0000000..6cbdad1
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy.c
@@ -0,0 +1,40 @@
+#include <stddef.h>
+
+void *
+mempcpy (void *dst, const void *src, size_t n)
+{
+  const char *s = src;
+  char *d = dst;
+  void *ret = (char *) dst + n;
+  size_t offset_src = ((size_t) s) & (sizeof(size_t) - 1);
+  size_t offset_dst = ((size_t) d) & (sizeof(size_t) - 1);
+
+  if (offset_src != offset_dst)
+  {
+    while (n--)
+      *d++ = *s++;
+  }
+  else
+  {
+    if (offset_src)
+      offset_src = sizeof(size_t) - offset_src;
+    while (n-- && offset_src--)
+      *d++ = *s++;
+    n++;
+    if (!n) return ret;
+    void **d1 = (void **)d;
+    void **s1 = (void **)s;
+    while (n >= sizeof(void *))
+    {
+      n -= sizeof(void *);
+      *d1++ = *s1++;
+    }
+    s = (char *)s1;
+    d = (char *)d1;
+    while (n--)
+      *d++ = *s++;
+  }
+  return ret;
+}
+
+weak_alias (mempcpy, __GI_mempcpy)
+weak_alias (mempcpy, __mempcpy)
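mempcpy differs from memcpy only in its return value, a pointer just past
the last byte written (dst + n above), which makes chained copies cheap.
A usage sketch (illustrative, not part of the patch):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      char out[32];
      char *p = out;

      /* Each call resumes where the previous one stopped.  */
      p = mempcpy (p, "bound ", 6);
      p = mempcpy (p, "checked", 8);	/* copies the terminating NUL too */
      printf ("%s\n", out);		/* prints "bound checked" */
      return 0;
    }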
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.c b/sysdeps/i386/i686/multiarch/mempcpy_chk.c
new file mode 100644
index 0000000..ba17078
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.c
@@ -0,0 +1 @@
+#include <debug/mempcpy_chk.c>
diff --git a/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
index c5c3e97..75c947c 100644
--- a/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
+++ b/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -45,6 +45,12 @@ ENTRY (MEMCHR)
 	movd	STR2(%esp), %xmm1
 	mov	LEN(%esp), %edx
 
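+/* MPX: memrchr scans backward, so check both the first and the last
+   byte of the buffer up front.  */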
+# ifdef __CHKP__
+	bndldx 	STR1(%esp,%ecx,1), %bnd0
+	bndcl  	(%ecx), %bnd0
+	bndcu  	-1(%ecx, %edx), %bnd0
+# endif
+
 	sub	$16, %edx
 	jbe	L(length_less16)
 
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
index bcea296..ce112b1 100644
--- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
+++ b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -90,6 +90,7 @@ ENTRY (__memset_sse2_rep)
 	ENTRANCE
 
 	movl	LEN(%esp), %ecx
+
 #ifdef USE_AS_BZERO
 	xor	%eax, %eax
 #else
@@ -101,6 +102,11 @@ ENTRY (__memset_sse2_rep)
 	or	%edx, %eax
 #endif
 	movl	DEST(%esp), %edx
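+/* MPX: load DEST's bounds and check the first and last byte of the
+   [DEST, DEST+LEN) range before storing.  */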
+#ifdef __CHKP__
+	bndldx  DEST(%esp,%edx,1),%bnd0
+	bndcl  	(%edx), %bnd0
+	bndcu  	-1(%edx, %ecx), %bnd0
+#endif
 	cmp	$32, %ecx
 	jae	L(32bytesormore)
 
diff --git a/sysdeps/i386/i686/multiarch/mpx_memcpy_nobnd.S b/sysdeps/i386/i686/multiarch/mpx_memcpy_nobnd.S
new file mode 100644
index 0000000..b7f4e0e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mpx_memcpy_nobnd.S
@@ -0,0 +1,1803 @@
+/* memcpy with SSSE3 and REP string.
+   Copyright (C) 2010-2013 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		mpx_memcpy_nobnd
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC		PARMS
+# define DEST		SRC+4
+# define LEN		DEST+4
+#else
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register that contains
+   the index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    SETUP_PIC_REG(bx);						\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+#else
+# define PARMS		4
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN		RETURN_END
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register that contains the index into
+   the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
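+/* MPX: fetch the bounds of SRC and DEST and validate the first and
+   last byte of each range before copying.  */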
+#ifdef __CHKP__
+	bndldx 	SRC(%esp,%eax,1), %bnd1
+	bndldx 	DEST(%esp,%edx,1), %bnd0
+	bndcl  	(%eax), %bnd1
+	bndcu  	-1(%eax, %ecx), %bnd1
+	bndcl  	(%edx), %bnd0
+	bndcu  	-1(%edx, %ecx), %bnd0
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$48, %ecx
+	jb	L(bk_write_less48bytes)
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+#endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jb	L(bk_write)
+#endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+	ALIGN (4)
+/* At least 48 bytes to copy; EDX is not yet 16-byte aligned.  */
+L(48bytesormore):
+	movdqu	(%eax), %xmm0
+	PUSH (%edi)
+	movl	%edx, %edi
+	and	$-16, %edx
+	PUSH (%esi)
+	cfi_remember_state
+	add	$16, %edx
+	movl	%edi, %esi
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+	mov	%eax, %edi
+	jae	L(large_page)
+	and	$0xf, %edi
+	jz	L(shl_0)
+
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+	ALIGN (4)
+L(shl_0):
+	movdqu	%xmm0, (%esi)
+	xor	%edi, %edi
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size_half@GOTOFF(%ebx), %edi
+# else
+	mov	__x86_data_cache_size_half, %edi
+# endif
+#endif
+	mov	%edi, %esi
+	shr	$3, %esi
+	sub	%esi, %edi
+	cmp	%edi, %ecx
+	jae	L(shl_0_gobble_mem_start)
+	sub	$128, %ecx
+	ALIGN (4)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_0_gobble_mem_start):
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+	sub	$128, %ecx
+L(shl_0_gobble_mem_loop):
+	prefetchnta 0x1c0(%eax)
+	prefetchnta 0x280(%eax)
+	prefetchnta 0x1c0(%edx)
+	prefetchnta 0x280(%edx)
+
+	movdqa	(%eax), %xmm0
+	movaps	0x10(%eax), %xmm1
+	movaps	0x20(%eax), %xmm2
+	movaps	0x30(%eax), %xmm3
+	movaps	0x40(%eax), %xmm4
+	movaps	0x50(%eax), %xmm5
+	movaps	0x60(%eax), %xmm6
+	movaps	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	%xmm4, 0x40(%edx)
+	movaps	%xmm5, 0x50(%edx)
+	movaps	%xmm6, 0x60(%edx)
+	movaps	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	POP (%esi)
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_1):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$1, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_1_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_1_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_1_loop)
+
+L(shl_1_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_2):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$2, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_2_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_2_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_2_loop)
+
+L(shl_2_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_3):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$3, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_3_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_3_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_3_loop)
+
+L(shl_3_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	3(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_4):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$4, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_4_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_4_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_4_loop)
+
+L(shl_4_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	4(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_5):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$5, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_5_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_5_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	palignr	$5, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_5_loop)
+
+L(shl_5_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	5(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_6):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$6, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_6_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_6_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	palignr	$6, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_6_loop)
+
+L(shl_6_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	6(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_7):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$7, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_7_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_7_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	palignr	$7, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_7_loop)
+
+L(shl_7_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	7(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_8):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$8, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_8_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_8_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	palignr	$8, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_8_loop)
+
+L(shl_8_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	8(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_9):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$9, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_9_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_9_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	palignr	$9, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_9_loop)
+
+L(shl_9_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	9(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_10):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$10, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_10_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_10_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	palignr	$10, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_10_loop)
+
+L(shl_10_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	10(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_11):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$11, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_11_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_11_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	palignr	$11, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_11_loop)
+
+L(shl_11_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	11(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_12):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$12, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_12_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_12_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	palignr	$12, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_12_loop)
+
+L(shl_12_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	12(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_13):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$13, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_13_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_13_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	palignr	$13, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_13_loop)
+
+L(shl_13_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	13(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_14):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$14, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_14_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_14_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	palignr	$14, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_14_loop)
+
+L(shl_14_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	14(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shl_15):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	sub	$15, %eax
+	movaps	(%eax), %xmm1
+	xor	%edi, %edi
+	sub	$32, %ecx
+	movdqu	%xmm0, (%esi)
+	POP (%esi)
+L(shl_15_loop):
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jb	L(shl_15_end)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	palignr	$15, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+
+	jae	L(shl_15_loop)
+
+L(shl_15_end):
+	add	$32, %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	15(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+	ALIGN (4)
+L(fwd_write_44bytes):
+	movl	-44(%eax), %ecx
+	movl	%ecx, -44(%edx)
+L(fwd_write_40bytes):
+	movl	-40(%eax), %ecx
+	movl	%ecx, -40(%edx)
+L(fwd_write_36bytes):
+	movl	-36(%eax), %ecx
+	movl	%ecx, -36(%edx)
+L(fwd_write_32bytes):
+	movl	-32(%eax), %ecx
+	movl	%ecx, -32(%edx)
+L(fwd_write_28bytes):
+	movl	-28(%eax), %ecx
+	movl	%ecx, -28(%edx)
+L(fwd_write_24bytes):
+	movl	-24(%eax), %ecx
+	movl	%ecx, -24(%edx)
+L(fwd_write_20bytes):
+	movl	-20(%eax), %ecx
+	movl	%ecx, -20(%edx)
+L(fwd_write_16bytes):
+	movl	-16(%eax), %ecx
+	movl	%ecx, -16(%edx)
+L(fwd_write_12bytes):
+	movl	-12(%eax), %ecx
+	movl	%ecx, -12(%edx)
+L(fwd_write_8bytes):
+	movl	-8(%eax), %ecx
+	movl	%ecx, -8(%edx)
+L(fwd_write_4bytes):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_5bytes):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_45bytes):
+	movl	-45(%eax), %ecx
+	movl	%ecx, -45(%edx)
+L(fwd_write_41bytes):
+	movl	-41(%eax), %ecx
+	movl	%ecx, -41(%edx)
+L(fwd_write_37bytes):
+	movl	-37(%eax), %ecx
+	movl	%ecx, -37(%edx)
+L(fwd_write_33bytes):
+	movl	-33(%eax), %ecx
+	movl	%ecx, -33(%edx)
+L(fwd_write_29bytes):
+	movl	-29(%eax), %ecx
+	movl	%ecx, -29(%edx)
+L(fwd_write_25bytes):
+	movl	-25(%eax), %ecx
+	movl	%ecx, -25(%edx)
+L(fwd_write_21bytes):
+	movl	-21(%eax), %ecx
+	movl	%ecx, -21(%edx)
+L(fwd_write_17bytes):
+	movl	-17(%eax), %ecx
+	movl	%ecx, -17(%edx)
+L(fwd_write_13bytes):
+	movl	-13(%eax), %ecx
+	movl	%ecx, -13(%edx)
+L(fwd_write_9bytes):
+	movl	-9(%eax), %ecx
+	movl	%ecx, -9(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+L(fwd_write_1bytes):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_46bytes):
+	movl	-46(%eax), %ecx
+	movl	%ecx, -46(%edx)
+L(fwd_write_42bytes):
+	movl	-42(%eax), %ecx
+	movl	%ecx, -42(%edx)
+L(fwd_write_38bytes):
+	movl	-38(%eax), %ecx
+	movl	%ecx, -38(%edx)
+L(fwd_write_34bytes):
+	movl	-34(%eax), %ecx
+	movl	%ecx, -34(%edx)
+L(fwd_write_30bytes):
+	movl	-30(%eax), %ecx
+	movl	%ecx, -30(%edx)
+L(fwd_write_26bytes):
+	movl	-26(%eax), %ecx
+	movl	%ecx, -26(%edx)
+L(fwd_write_22bytes):
+	movl	-22(%eax), %ecx
+	movl	%ecx, -22(%edx)
+L(fwd_write_18bytes):
+	movl	-18(%eax), %ecx
+	movl	%ecx, -18(%edx)
+L(fwd_write_14bytes):
+	movl	-14(%eax), %ecx
+	movl	%ecx, -14(%edx)
+L(fwd_write_10bytes):
+	movl	-10(%eax), %ecx
+	movl	%ecx, -10(%edx)
+L(fwd_write_6bytes):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+L(fwd_write_2bytes):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_47bytes):
+	movl	-47(%eax), %ecx
+	movl	%ecx, -47(%edx)
+L(fwd_write_43bytes):
+	movl	-43(%eax), %ecx
+	movl	%ecx, -43(%edx)
+L(fwd_write_39bytes):
+	movl	-39(%eax), %ecx
+	movl	%ecx, -39(%edx)
+L(fwd_write_35bytes):
+	movl	-35(%eax), %ecx
+	movl	%ecx, -35(%edx)
+L(fwd_write_31bytes):
+	movl	-31(%eax), %ecx
+	movl	%ecx, -31(%edx)
+L(fwd_write_27bytes):
+	movl	-27(%eax), %ecx
+	movl	%ecx, -27(%edx)
+L(fwd_write_23bytes):
+	movl	-23(%eax), %ecx
+	movl	%ecx, -23(%edx)
+L(fwd_write_19bytes):
+	movl	-19(%eax), %ecx
+	movl	%ecx, -19(%edx)
+L(fwd_write_15bytes):
+	movl	-15(%eax), %ecx
+	movl	%ecx, -15(%edx)
+L(fwd_write_11bytes):
+	movl	-11(%eax), %ecx
+	movl	%ecx, -11(%edx)
+L(fwd_write_7bytes):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+L(fwd_write_3bytes):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN_END
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(large_page):
+	movdqu	(%eax), %xmm1
+	movdqu	%xmm0, (%esi)
+	movntdq	%xmm1, (%edx)
+	add	$0x10, %eax
+	add	$0x10, %edx
+	sub	$0x10, %ecx
+	cmp	%al, %dl
+	je	L(copy_page_by_rep)
+L(large_page_loop_init):
+	POP (%esi)
+	sub	$0x80, %ecx
+	POP (%edi)
+L(large_page_loop):
+	prefetchnta	0x1c0(%eax)
+	prefetchnta	0x280(%eax)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	movdqu	0x40(%eax), %xmm4
+	movdqu	0x50(%eax), %xmm5
+	movdqu	0x60(%eax), %xmm6
+	movdqu	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	lfence
+	sub	$0x80, %ecx
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	movntdq	%xmm4, 0x40(%edx)
+	movntdq	%xmm5, 0x50(%edx)
+	movntdq	%xmm6, 0x60(%edx)
+	movntdq	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+	jae	L(large_page_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(large_page_less_64bytes)
+
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	movdqu	0x20(%eax), %xmm2
+	movdqu	0x30(%eax), %xmm3
+	lea	0x40(%eax), %eax
+
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	movntdq	%xmm2, 0x20(%edx)
+	movntdq	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+	sub	$0x40, %ecx
+L(large_page_less_64bytes):
+	cmp	$32, %ecx
+	jb	L(large_page_less_32bytes)
+	movdqu	(%eax), %xmm0
+	movdqu	0x10(%eax), %xmm1
+	lea	0x20(%eax), %eax
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+	sub	$0x20, %ecx
+L(large_page_less_32bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(copy_page_by_rep):
+	mov	%eax, %esi
+	mov	%edx, %edi
+	mov	%ecx, %edx
+	shr	$2, %ecx
+	and	$3, %edx
+	rep	movsl
+	jz	L(copy_page_by_rep_exit)
+	cmp	$2, %edx
+	jb	L(copy_page_by_rep_left_1)
+	movzwl	(%esi), %eax
+	movw	%ax, (%edi)
+	add	$2, %esi
+	add	$2, %edi
+	sub	$2, %edx
+	jz	L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+	movzbl	(%esi), %eax
+	movb	%al, (%edi)
+L(copy_page_by_rep_exit):
+	POP (%esi)
+	POP (%edi)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_44bytes):
+	movl	40(%eax), %ecx
+	movl	%ecx, 40(%edx)
+L(bk_write_40bytes):
+	movl	36(%eax), %ecx
+	movl	%ecx, 36(%edx)
+L(bk_write_36bytes):
+	movl	32(%eax), %ecx
+	movl	%ecx, 32(%edx)
+L(bk_write_32bytes):
+	movl	28(%eax), %ecx
+	movl	%ecx, 28(%edx)
+L(bk_write_28bytes):
+	movl	24(%eax), %ecx
+	movl	%ecx, 24(%edx)
+L(bk_write_24bytes):
+	movl	20(%eax), %ecx
+	movl	%ecx, 20(%edx)
+L(bk_write_20bytes):
+	movl	16(%eax), %ecx
+	movl	%ecx, 16(%edx)
+L(bk_write_16bytes):
+	movl	12(%eax), %ecx
+	movl	%ecx, 12(%edx)
+L(bk_write_12bytes):
+	movl	8(%eax), %ecx
+	movl	%ecx, 8(%edx)
+L(bk_write_8bytes):
+	movl	4(%eax), %ecx
+	movl	%ecx, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_45bytes):
+	movl	41(%eax), %ecx
+	movl	%ecx, 41(%edx)
+L(bk_write_41bytes):
+	movl	37(%eax), %ecx
+	movl	%ecx, 37(%edx)
+L(bk_write_37bytes):
+	movl	33(%eax), %ecx
+	movl	%ecx, 33(%edx)
+L(bk_write_33bytes):
+	movl	29(%eax), %ecx
+	movl	%ecx, 29(%edx)
+L(bk_write_29bytes):
+	movl	25(%eax), %ecx
+	movl	%ecx, 25(%edx)
+L(bk_write_25bytes):
+	movl	21(%eax), %ecx
+	movl	%ecx, 21(%edx)
+L(bk_write_21bytes):
+	movl	17(%eax), %ecx
+	movl	%ecx, 17(%edx)
+L(bk_write_17bytes):
+	movl	13(%eax), %ecx
+	movl	%ecx, 13(%edx)
+L(bk_write_13bytes):
+	movl	9(%eax), %ecx
+	movl	%ecx, 9(%edx)
+L(bk_write_9bytes):
+	movl	5(%eax), %ecx
+	movl	%ecx, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_46bytes):
+	movl	42(%eax), %ecx
+	movl	%ecx, 42(%edx)
+L(bk_write_42bytes):
+	movl	38(%eax), %ecx
+	movl	%ecx, 38(%edx)
+L(bk_write_38bytes):
+	movl	34(%eax), %ecx
+	movl	%ecx, 34(%edx)
+L(bk_write_34bytes):
+	movl	30(%eax), %ecx
+	movl	%ecx, 30(%edx)
+L(bk_write_30bytes):
+	movl	26(%eax), %ecx
+	movl	%ecx, 26(%edx)
+L(bk_write_26bytes):
+	movl	22(%eax), %ecx
+	movl	%ecx, 22(%edx)
+L(bk_write_22bytes):
+	movl	18(%eax), %ecx
+	movl	%ecx, 18(%edx)
+L(bk_write_18bytes):
+	movl	14(%eax), %ecx
+	movl	%ecx, 14(%edx)
+L(bk_write_14bytes):
+	movl	10(%eax), %ecx
+	movl	%ecx, 10(%edx)
+L(bk_write_10bytes):
+	movl	6(%eax), %ecx
+	movl	%ecx, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_47bytes):
+	movl	43(%eax), %ecx
+	movl	%ecx, 43(%edx)
+L(bk_write_43bytes):
+	movl	39(%eax), %ecx
+	movl	%ecx, 39(%edx)
+L(bk_write_39bytes):
+	movl	35(%eax), %ecx
+	movl	%ecx, 35(%edx)
+L(bk_write_35bytes):
+	movl	31(%eax), %ecx
+	movl	%ecx, 31(%edx)
+L(bk_write_31bytes):
+	movl	27(%eax), %ecx
+	movl	%ecx, 27(%edx)
+L(bk_write_27bytes):
+	movl	23(%eax), %ecx
+	movl	%ecx, 23(%edx)
+L(bk_write_23bytes):
+	movl	19(%eax), %ecx
+	movl	%ecx, 19(%edx)
+L(bk_write_19bytes):
+	movl	15(%eax), %ecx
+	movl	%ecx, 15(%edx)
+L(bk_write_15bytes):
+	movl	11(%eax), %ecx
+	movl	%ecx, 11(%edx)
+L(bk_write_11bytes):
+	movl	7(%eax), %ecx
+	movl	%ecx, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	ALIGN (2)
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	ALIGN (2)
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (2)
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):
+	PUSH (%esi)
+	movl	%eax, %esi
+	add	%ecx, %edx
+	add	%ecx, %esi
+	testl	$0x3, %edx
+	jnz	L(bk_align)
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movl	-4(%esi), %eax
+	movl	%eax, -4(%edx)
+	movl	-8(%esi), %eax
+	movl	%eax, -8(%edx)
+	movl	-12(%esi), %eax
+	movl	%eax, -12(%edx)
+	movl	-16(%esi), %eax
+	movl	%eax, -16(%edx)
+	movl	-20(%esi), %eax
+	movl	%eax, -20(%edx)
+	movl	-24(%esi), %eax
+	movl	%eax, -24(%edx)
+	movl	-28(%esi), %eax
+	movl	%eax, -28(%edx)
+	movl	-32(%esi), %eax
+	movl	%eax, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %esi
+
+L(bk_write_less32bytes):
+	movl	%esi, %eax
+	sub	%ecx, %edx
+	sub	%ecx, %eax
+	POP (%esi)
+L(bk_write_less48bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	CFI_PUSH (%esi)
+	ALIGN (4)
+L(bk_align):
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)
+	testl	$1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %esi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%esi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %esi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%esi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	ALIGN (4)
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes.  */
+L(bk_ssse3_align):
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %esi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%esi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jb	L(bk_write_more32bytes)
+
+L(bk_ssse3_cpy):
+	sub	$64, %esi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%esi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%esi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%esi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%esi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+#endif
+
+END (MEMCPY)
diff --git a/sysdeps/i386/i686/multiarch/mpx_memmove_nobnd.S b/sysdeps/i386/i686/multiarch/mpx_memmove_nobnd.S
new file mode 100644
index 0000000..caaa89a
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mpx_memmove_nobnd.S
@@ -0,0 +1,3 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		mpx_memmove_nobnd
+#include "mpx_memcpy_nobnd.S"
diff --git a/sysdeps/i386/i686/multiarch/mpx_mempcpy_nobnd.S b/sysdeps/i386/i686/multiarch/mpx_mempcpy_nobnd.S
new file mode 100644
index 0000000..4b0af49
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mpx_mempcpy_nobnd.S
@@ -0,0 +1,3 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		mpx_mempcpy_nobnd
+#include "mpx_memcpy_nobnd.S"
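Both wrappers above reuse mpx_memcpy_nobnd.S by predefining MEMCPY and a
USE_AS_* macro before including it; the shared body already branches on
USE_AS_MEMMOVE, USE_AS_MEMPCPY and USE_AS_BCOPY for argument layout and
return value.  A hypothetical bcopy variant would follow the same
three-line pattern:

    #define USE_AS_BCOPY
    #define MEMCPY		mpx_bcopy_nobnd
    #include "mpx_memcpy_nobnd.S"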
diff --git a/sysdeps/i386/i686/multiarch/strcat-sse2.S b/sysdeps/i386/i686/multiarch/strcat-sse2.S
index 62d60cd..b1d39ae 100644
--- a/sysdeps/i386/i686/multiarch/strcat-sse2.S
+++ b/sysdeps/i386/i686/multiarch/strcat-sse2.S
@@ -95,10 +95,20 @@ ENTRY (STRCAT)
 	test	%ebx, %ebx
 	jz	L(ExitZero)
 # endif
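+/* MPX: load the bounds of both strings and check the first byte of
+   the append source.  */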
+# ifdef __CHKP__
+	bndldx 	STR1(%esp,%eax,1), %bnd0
+	bndldx 	STR2(%esp,%esi,1), %bnd1
+	bndcl  	(%esi), %bnd1
+	bndcu  	(%esi), %bnd1
+# endif
 	cmpb	$0, (%esi)
 	mov	%esi, %ecx
 	mov	%eax, %edx
 	jz	L(ExitZero)
+# ifdef __CHKP__
+	bndcl  	(%eax), %bnd0
+	bndcu  	(%eax), %bnd0
+# endif
 
 	and	$63, %ecx
 	and	$63, %edx
@@ -113,6 +123,9 @@ ENTRY (STRCAT)
 	movdqu	(%eax), %xmm1
 	movdqu	(%esi), %xmm5
 	pcmpeqb	%xmm1, %xmm0
+# ifdef __CHKP__
+	bndcu  	16(%esi), %bnd1
+# endif
 	movdqu	16(%esi), %xmm6
 	pmovmskb %xmm0, %ecx
 	pcmpeqb	%xmm5, %xmm4
@@ -132,6 +145,9 @@ L(alignment_prolog):
 	and	$-16, %eax
 	pcmpeqb	(%eax), %xmm0
 	movdqu	(%esi), %xmm5
+# ifdef __CHKP__
+	bndcu  	16(%esi), %bnd1
+# endif
 	movdqu	16(%esi), %xmm6
 	pmovmskb %xmm0, %edx
 	pcmpeqb	%xmm5, %xmm4
@@ -148,21 +164,33 @@ L(loop_prolog):
 	pxor	%xmm3, %xmm3
 	.p2align 4
 L(align16_loop):
+# ifdef __CHKP__
+	bndcu  	16(%eax), %bnd0
+# endif
 	pcmpeqb	16(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	jnz	L(exit16)
 
+# ifdef __CHKP__
+	bndcu  	32(%eax), %bnd0
+# endif
 	pcmpeqb	32(%eax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	jnz	L(exit32)
 
+# ifdef __CHKP__
+	bndcu  	48(%eax), %bnd0
+# endif
 	pcmpeqb	48(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	jnz	L(exit48)
 
+# ifdef __CHKP__
+	bndcu  	64(%eax), %bnd0
+# endif
 	pcmpeqb	64(%eax), %xmm3
 	pmovmskb %xmm3, %edx
 	lea	64(%eax), %eax
@@ -212,6 +240,9 @@ L(StartStrcpyPart):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesTail1)
 
+# ifdef __CHKP__
+	bndcu  	15(%eax), %bnd0
+# endif
 	movdqu	%xmm5, (%eax)
 	pmovmskb %xmm7, %edx
 # ifdef USE_AS_STRNCAT
@@ -250,21 +281,33 @@ L(StrlenCore7_1):
 
 	.p2align 4
 L(align16_loop_1):
+# ifdef __CHKP__
+	bndcu  	16(%eax), %bnd0
+# endif
 	pcmpeqb	16(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	jnz	L(exit16_1)
 
+# ifdef __CHKP__
+	bndcu  	32(%eax), %bnd0
+# endif
 	pcmpeqb	32(%eax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	jnz	L(exit32_1)
 
+# ifdef __CHKP__
+	bndcu  	48(%eax), %bnd0
+# endif
 	pcmpeqb	48(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	jnz	L(exit48_1)
 
+# ifdef __CHKP__
+	bndcu  	64(%eax), %bnd0
+# endif
 	pcmpeqb	64(%eax), %xmm3
 	pmovmskb %xmm3, %edx
 	lea	64(%eax), %eax
@@ -323,6 +366,9 @@ L(StartStrcpyPart_1):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesTail)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi), %bnd1
+# endif
 	pcmpeqb	16(%esi), %xmm0
 	pmovmskb %xmm0, %edx
 # ifdef USE_AS_STRNCAT
@@ -341,6 +387,9 @@ L(Unalign16Both):
 	mov	$16, %ecx
 	movdqa	(%esi, %ecx), %xmm1
 	movaps	16(%esi, %ecx), %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%eax, %ecx), %bnd0
+# endif
 	movdqu	%xmm1, (%eax, %ecx)
 	pcmpeqb	%xmm2, %xmm0
 	pmovmskb %xmm0, %edx
@@ -352,6 +401,10 @@ L(Unalign16Both):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16Bytes)
 L(Unalign16BothBigN):
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%eax, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm3
 	movdqu	%xmm2, (%eax, %ecx)
 	pcmpeqb	%xmm3, %xmm0
@@ -364,6 +417,10 @@ L(Unalign16BothBigN):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%eax, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm4
 	movdqu	%xmm3, (%eax, %ecx)
 	pcmpeqb	%xmm4, %xmm0
@@ -376,6 +433,10 @@ L(Unalign16BothBigN):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%eax, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm1
 	movdqu	%xmm4, (%eax, %ecx)
 	pcmpeqb	%xmm1, %xmm0
@@ -388,6 +449,10 @@ L(Unalign16BothBigN):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%eax, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm2
 	movdqu	%xmm1, (%eax, %ecx)
 	pcmpeqb	%xmm2, %xmm0
@@ -400,6 +465,10 @@ L(Unalign16BothBigN):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%eax, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm3
 	movdqu	%xmm2, (%eax, %ecx)
 	pcmpeqb	%xmm3, %xmm0
@@ -412,6 +481,9 @@ L(Unalign16BothBigN):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	15(%eax, %ecx), %bnd0
+# endif
 	movdqu	%xmm3, (%eax, %ecx)
 	mov	%esi, %edx
 	lea	16(%esi, %ecx), %esi
@@ -421,6 +493,9 @@ L(Unalign16BothBigN):
 # ifdef USE_AS_STRNCAT
 	lea	128(%ebx, %edx), %ebx
 # endif
+# ifdef __CHKP__
+	bndcu  	(%esi), %bnd1
+# endif
 	movaps	(%esi), %xmm2
 	movaps	%xmm2, %xmm4
 	movaps	16(%esi), %xmm5
@@ -443,6 +518,10 @@ L(Unalign16BothBigN):
 L(Unaligned64Loop_start):
 	add	$64, %eax
 	add	$64, %esi
+# ifdef __CHKP__
+	bndcu  	(%esi), %bnd1
+	bndcu  	-1(%eax), %bnd0
+# endif
 	movdqu	%xmm4, -64(%eax)
 	movaps	(%esi), %xmm2
 	movdqa	%xmm2, %xmm4
@@ -485,11 +564,18 @@ L(Unaligned64Leave):
 	jnz	L(CopyFrom1To16BytesUnaligned_32)
 
 	bsf	%ecx, %edx
+# ifdef __CHKP__
+	bndcu  	47(%eax), %bnd0
+# endif
 	movdqu	%xmm4, (%eax)
 	movdqu	%xmm5, 16(%eax)
 	movdqu	%xmm6, 32(%eax)
 	add	$48, %esi
 	add	$48, %eax
+# ifdef __CHKP__
+	bndcu  	(%eax, %edx), %bnd0
+	bndcu  	(%esi, %edx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 # ifdef USE_AS_STRNCAT
@@ -501,12 +587,18 @@ L(BigN):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesTail)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi), %bnd1
+# endif
 	pcmpeqb	16(%esi), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	jnz	L(CopyFrom1To32Bytes)
 
 	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+# ifdef __CHKP__
+	bndcu  	15(%eax), %bnd0
+# endif
 	movdqu	%xmm1, (%eax)
 	sub	%ecx, %eax
 	sub     $48, %ebx
@@ -515,6 +607,9 @@ L(BigN):
 	mov	$16, %ecx
 	movdqa	(%esi, %ecx), %xmm1
 	movaps	16(%esi, %ecx), %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%eax, %ecx), %bnd0
+# endif
 	movdqu	%xmm1, (%eax, %ecx)
 	pcmpeqb	%xmm2, %xmm0
 	pmovmskb %xmm0, %edx
@@ -532,12 +627,20 @@ L(CopyFrom1To16Bytes):
 	add	%ecx, %eax
 	add	%ecx, %esi
 	bsf	%edx, %edx
+# ifdef __CHKP__
+	bndcu  	(%eax, %edx), %bnd0
+	bndcu  	(%esi, %edx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 	.p2align 4
 L(CopyFrom1To16BytesTail):
 	add	%ecx, %esi
 	bsf	%edx, %edx
+# ifdef __CHKP__
+	bndcu  	(%eax, %edx), %bnd0
+	bndcu  	(%esi, %edx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 	.p2align 4
@@ -546,6 +649,10 @@ L(CopyFrom1To32Bytes1):
 	add	$16, %eax
 L(CopyFrom1To16BytesTail1):
 	bsf	%edx, %edx
+# ifdef __CHKP__
+	bndcu  	(%eax, %edx), %bnd0
+	bndcu  	(%esi, %edx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 	.p2align 4
@@ -554,34 +661,60 @@ L(CopyFrom1To32Bytes):
 	add	%ecx, %esi
 	add	$16, %edx
 	sub	%ecx, %edx
+# ifdef __CHKP__
+	bndcu  	(%eax, %edx), %bnd0
+	bndcu  	(%esi, %edx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 	.p2align 4
 L(CopyFrom1To16BytesUnaligned_0):
 	bsf	%edx, %edx
+# ifdef __CHKP__
+	bndcu  	(%eax, %edx), %bnd0
+	bndcu  	(%esi, %edx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 	.p2align 4
 L(CopyFrom1To16BytesUnaligned_16):
 	bsf	%ecx, %edx
+# ifdef __CHKP__
+	bndcu  	15(%eax), %bnd0
+# endif
 	movdqu	%xmm4, (%eax)
 	add	$16, %esi
 	add	$16, %eax
+# ifdef __CHKP__
+	bndcu  	(%eax, %edx), %bnd0
+	bndcu  	(%esi, %edx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 	.p2align 4
 L(CopyFrom1To16BytesUnaligned_32):
 	bsf	%edx, %edx
+# ifdef __CHKP__
+	bndcu  	31(%eax), %bnd0
+# endif
 	movdqu	%xmm4, (%eax)
 	movdqu	%xmm5, 16(%eax)
 	add	$32, %esi
 	add	$32, %eax
+# ifdef __CHKP__
+	bndcu  	(%eax, %edx), %bnd0
+	bndcu  	(%esi, %edx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 # ifdef USE_AS_STRNCAT
 
 	.p2align 4
 L(CopyFrom1To16BytesExit):
+# ifdef __CHKP__
+	bndcu  	(%eax, %edx), %bnd0
+	bndcu  	(%esi, %edx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 /* Case2 */
@@ -594,6 +727,10 @@ L(CopyFrom1To16BytesCase2):
 	bsf	%edx, %edx
 	cmp	%ebx, %edx
 	jb	L(CopyFrom1To16BytesExit)
+# ifdef __CHKP__
+	bndcu  	1(%eax, %ebx), %bnd0
+	bndcu  	1(%esi, %ebx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
 
 	.p2align 4
@@ -605,6 +742,10 @@ L(CopyFrom1To32BytesCase2):
 	sub	%ecx, %edx
 	cmp	%ebx, %edx
 	jb	L(CopyFrom1To16BytesExit)
+# ifdef __CHKP__
+	bndcu  	1(%eax, %ebx), %bnd0
+	bndcu  	1(%esi, %ebx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
 
 L(CopyFrom1To16BytesTailCase2):
@@ -613,12 +754,20 @@ L(CopyFrom1To16BytesTailCase2):
 	bsf	%edx, %edx
 	cmp	%ebx, %edx
 	jb	L(CopyFrom1To16BytesExit)
+# ifdef __CHKP__
+	bndcu  	1(%eax, %ebx), %bnd0
+	bndcu  	1(%esi, %ebx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
 
 L(CopyFrom1To16BytesTail1Case2):
 	bsf	%edx, %edx
 	cmp	%ebx, %edx
 	jb	L(CopyFrom1To16BytesExit)
+# ifdef __CHKP__
+	bndcu  	1(%eax, %ebx), %bnd0
+	bndcu  	1(%esi, %ebx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
 
 /* Case2 or Case3,  Case3 */
@@ -631,6 +780,10 @@ L(CopyFrom1To16BytesCase3):
 	add	$16, %ebx
 	add	%ecx, %eax
 	add	%ecx, %esi
+# ifdef __CHKP__
+	bndcu  	1(%eax, %ebx), %bnd0
+	bndcu  	1(%esi, %ebx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
 
 	.p2align 4
@@ -639,6 +792,10 @@ L(CopyFrom1To32BytesCase2OrCase3):
 	jnz	L(CopyFrom1To32BytesCase2)
 	sub	%ecx, %ebx
 	add	%ecx, %esi
+# ifdef __CHKP__
+	bndcu  	1(%eax, %ebx), %bnd0
+	bndcu  	1(%esi, %ebx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
 
 	.p2align 4
@@ -647,6 +804,10 @@ L(CopyFrom1To16BytesTailCase2OrCase3):
 	jnz	L(CopyFrom1To16BytesTailCase2)
 	sub	%ecx, %ebx
 	add	%ecx, %esi
+# ifdef __CHKP__
+	bndcu  	1(%eax, %ebx), %bnd0
+	bndcu  	1(%esi, %ebx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
 
 	.p2align 4
@@ -657,6 +818,10 @@ L(CopyFrom1To32Bytes1Case2OrCase3):
 L(CopyFrom1To16BytesTail1Case2OrCase3):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesTail1Case2)
+# ifdef __CHKP__
+	bndcu  	1(%eax, %ebx), %bnd0
+	bndcu  	1(%esi, %ebx), %bnd1
+# endif
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
 
 # endif
@@ -1110,15 +1275,27 @@ L(Unaligned64LeaveCase3):
 	and	$-16, %ecx
 	add	$48, %ebx
 	jl	L(CopyFrom1To16BytesCase3)
+# ifdef __CHKP__
+	bndcu  	15(%eax), %bnd0
+# endif
 	movdqu	%xmm4, (%eax)
 	sub	$16, %ebx
 	jb	L(CopyFrom1To16BytesCase3)
+# ifdef __CHKP__
+	bndcu  	31(%eax), %bnd0
+# endif
 	movdqu	%xmm5, 16(%eax)
 	sub	$16, %ebx
 	jb	L(CopyFrom1To16BytesCase3)
+# ifdef __CHKP__
+	bndcu  	47(%eax), %bnd0
+# endif
 	movdqu	%xmm6, 32(%eax)
 	sub	$16, %ebx
 	jb	L(CopyFrom1To16BytesCase3)
+# ifdef __CHKP__
+	bndcu  	63(%eax), %bnd0
+# endif
 	movdqu	%xmm7, 48(%eax)
 	xor	%bh, %bh
 	movb	%bh, 64(%eax)
@@ -1137,6 +1314,9 @@ L(Unaligned64LeaveCase2):
 
 	pcmpeqb	%xmm5, %xmm0
 	pmovmskb %xmm0, %edx
+# ifdef __CHKP__
+	bndcu  	15(%eax), %bnd0
+# endif
 	movdqu	%xmm4, (%eax)
 	add	$16, %ecx
 	sub	$16, %ebx
@@ -1146,6 +1326,9 @@ L(Unaligned64LeaveCase2):
 
 	pcmpeqb	%xmm6, %xmm0
 	pmovmskb %xmm0, %edx
+# ifdef __CHKP__
+	bndcu  	31(%eax), %bnd0
+# endif
 	movdqu	%xmm5, 16(%eax)
 	add	$16, %ecx
 	sub	$16, %ebx
@@ -1155,6 +1338,9 @@ L(Unaligned64LeaveCase2):
 
 	pcmpeqb	%xmm7, %xmm0
 	pmovmskb %xmm0, %edx
+# ifdef __CHKP__
+	bndcu  	47(%eax), %bnd0
+# endif
 	movdqu	%xmm6, 32(%eax)
 	lea	16(%eax, %ecx), %eax
 	lea	16(%esi, %ecx), %esi
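
Every hunk in these strcat/strncat paths instantiates the same guard: just before a 16-byte block is moved, the first byte of the next source load and the last byte of the pending store are checked against the upper bounds held in %bnd1 (source) and %bnd0 (destination); a failed check raises a #BR fault instead of silently overrunning. The pattern, lifted from the loop body above with comments added:

# ifdef __CHKP__
	bndcu	16(%esi, %ecx), %bnd1	/* base of the next 16-byte source load */
	bndcu	15(%eax, %ecx), %bnd0	/* last byte of the 16-byte store below */
# endif
	movaps	16(%esi, %ecx), %xmm3	/* read the next source chunk */
	movdqu	%xmm2, (%eax, %ecx)	/* write out the previous chunk */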
diff --git a/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
index 938d74d..1e59581 100644
--- a/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
+++ b/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
@@ -46,6 +46,12 @@ ENTRY (__strchr_sse2_bsf)
 	mov	STR1(%esp), %ecx
 	movd	STR2(%esp), %xmm1
 
+# ifdef __CHKP__
+	bndldx 	STR1(%esp,%ecx,1), %bnd0
+	bndcl  	(%ecx), %bnd0
+	bndcu  	(%ecx), %bnd0
+# endif
+
 	pxor	%xmm2, %xmm2
 	mov	%ecx, %edi
 	punpcklbw %xmm1, %xmm1
@@ -81,6 +87,9 @@ ENTRY (__strchr_sse2_bsf)
 L(unaligned_match):
 	add	%edi, %eax
 	add	%ecx, %eax
+# ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+# endif
 	RETURN
 
 	.p2align 4
@@ -94,6 +103,9 @@ L(unaligned_no_match):
 	.p2align 4
 /* Loop start on aligned string.  */
 L(loop):
+# ifdef __CHKP__
+	bndcu   (%edi), %bnd0
+# endif
 	movdqa	(%edi), %xmm0
 	pcmpeqb	%xmm0, %xmm2
 	add	$16, %edi
@@ -103,6 +115,9 @@ L(loop):
 	or	%eax, %edx
 	jnz	L(matches)
 
+# ifdef __CHKP__
+	bndcu   (%edi), %bnd0
+# endif
 	movdqa	(%edi), %xmm0
 	pcmpeqb	%xmm0, %xmm2
 	add	$16, %edi
@@ -112,6 +127,9 @@ L(loop):
 	or	%eax, %edx
 	jnz	L(matches)
 
+# ifdef __CHKP__
+	bndcu   (%edi), %bnd0
+# endif
 	movdqa	(%edi), %xmm0
 	pcmpeqb	%xmm0, %xmm2
 	add	$16, %edi
@@ -121,6 +139,9 @@ L(loop):
 	or	%eax, %edx
 	jnz	L(matches)
 
+# ifdef __CHKP__
+	bndcu   (%edi), %bnd0
+# endif
 	movdqa	(%edi), %xmm0
 	pcmpeqb	%xmm0, %xmm2
 	add	$16, %edi
@@ -146,6 +167,9 @@ L(matches):
 L(match):
 	sub	$16, %edi
 	add	%edi, %eax
+# ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+# endif
 	RETURN
 
 /* Return NULL.  */
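
Note that the aligned search loop checks only the base address of each movdqa, not its last byte — presumably because an aligned 16-byte load cannot cross a page boundary and a well-formed argument keeps its terminator inside the object, so checking the chunk base is enough to stop the loop from running away past the upper bound. One iteration from the loop above, annotated:

L(loop):
# ifdef __CHKP__
	bndcu	(%edi), %bnd0	/* trap once the chunk base passes the upper bound */
# endif
	movdqa	(%edi), %xmm0	/* aligned load: bytes 1..15 are in the same chunk */
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi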
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
index 355ed4e..1958b36 100644
--- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -222,6 +222,12 @@ L(ascii):
 	test	REM, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndldx 	STR1(%esp,%edx,1), %bnd0
+	bndldx 	STR2(%esp,%eax,1), %bnd1
+	bndcl  	(%edx), %bnd0
+	bndcl  	(%eax), %bnd1
+#endif
 	mov	%dx, %cx
 	and	$0xfff, %cx
 	cmp	$0xff0, %cx
@@ -280,6 +286,10 @@ L(ascii):
 	add	$16, %edx
 	add	$16, %eax
 L(first4bytes):
+#ifdef __CHKP__
+	bndcu  	(%edx), %bnd0
+	bndcu  	(%eax), %bnd1
+#endif
 	movzbl	(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	(%edx), %edi
@@ -303,6 +313,10 @@ L(first4bytes):
 	je	L(eq)
 #endif
 
+#ifdef __CHKP__
+	bndcu  	1(%edx), %bnd0
+	bndcu  	1(%eax), %bnd1
+#endif
 	movzbl	1(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	1(%edx), %edi
@@ -325,6 +339,10 @@ L(first4bytes):
 	cmp	$2, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	2(%edx), %bnd0
+	bndcu  	2(%eax), %bnd1
+#endif
 	movzbl	2(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	2(%edx), %edi
@@ -347,6 +365,10 @@ L(first4bytes):
 	cmp	$3, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	3(%edx), %bnd0
+	bndcu  	3(%eax), %bnd1
+#endif
 	movzbl	3(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	3(%edx), %edi
@@ -369,6 +391,10 @@ L(first4bytes):
 	cmp	$4, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	4(%edx), %bnd0
+	bndcu  	4(%eax), %bnd1
+#endif
 	movzbl	4(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	4(%edx), %edi
@@ -391,6 +417,10 @@ L(first4bytes):
 	cmp	$5, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	5(%edx), %bnd0
+	bndcu  	5(%eax), %bnd1
+#endif
 	movzbl	5(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	5(%edx), %edi
@@ -413,6 +443,10 @@ L(first4bytes):
 	cmp	$6, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	6(%edx), %bnd0
+	bndcu  	6(%eax), %bnd1
+#endif
 	movzbl	6(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	6(%edx), %edi
@@ -435,6 +469,10 @@ L(first4bytes):
 	cmp	$7, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	7(%edx), %bnd0
+	bndcu  	7(%eax), %bnd1
+#endif
 	movzbl	7(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	7(%edx), %edi
@@ -483,6 +521,10 @@ L(check_offset):
 	testl	%edx, %edx
 	jg	L(crosspage)
 L(loop):
+#ifdef __CHKP__
+	bndcu  	(%edi,%edx), %bnd0
+	bndcu  	(%esi,%edx), %bnd1
+#endif
 	movdqu	(%esi,%edx), %xmm2
 	movdqu	(%edi,%edx), %xmm1
 	TOLOWER (%xmm2, %xmm1)
@@ -497,6 +539,10 @@ L(loop):
 	add	$16, %edx
 	jle	L(loop)
 L(crosspage):
+#ifdef __CHKP__
+	bndcu  	(%edi,%edx), %bnd0
+	bndcu  	(%esi,%edx), %bnd1
+#endif
 	movzbl	(%edi,%edx), %eax
 	movzbl	(%esi,%edx), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
@@ -625,6 +671,10 @@ L(less16bytes):
 	add	$8, %eax
 L(less4bytes):
 
+#ifdef __CHKP__
+	bndcu  	(%edx), %bnd0
+	bndcu  	(%eax), %bnd1
+#endif
 	movzbl	(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	(%edx), %edi
@@ -647,6 +697,10 @@ L(less4bytes):
 	cmp	$1, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	1(%edx), %bnd0
+	bndcu  	1(%eax), %bnd1
+#endif
 	movzbl	1(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	1(%edx), %edi
@@ -670,6 +724,10 @@ L(less4bytes):
 	je	L(eq)
 #endif
 
+#ifdef __CHKP__
+	bndcu  	2(%edx), %bnd0
+	bndcu  	2(%eax), %bnd1
+#endif
 	movzbl	2(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	2(%edx), %edi
@@ -692,6 +750,10 @@ L(less4bytes):
 	cmp	$3, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	3(%edx), %bnd0
+	bndcu  	3(%eax), %bnd1
+#endif
 	movzbl	3(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	3(%edx), %edi
@@ -715,6 +777,10 @@ L(more4bytes):
 	cmp	$4, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	4(%edx), %bnd0
+	bndcu  	4(%eax), %bnd1
+#endif
 	movzbl	4(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	4(%edx), %edi
@@ -738,6 +804,10 @@ L(more4bytes):
 	cmp	$5, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	5(%edx), %bnd0
+	bndcu  	5(%eax), %bnd1
+#endif
 	movzbl	5(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	5(%edx), %edi
@@ -760,6 +830,10 @@ L(more4bytes):
 	cmp	$6, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	6(%edx), %bnd0
+	bndcu  	6(%eax), %bnd1
+#endif
 	movzbl	6(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	6(%edx), %edi
@@ -782,6 +856,10 @@ L(more4bytes):
 	cmp	$7, REM
 	je	L(eq)
 #endif
+#ifdef __CHKP__
+	bndcu  	7(%edx), %bnd0
+	bndcu  	7(%eax), %bnd1
+#endif
 	movzbl	7(%eax), %ecx
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	movzbl	7(%edx), %edi
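
Routines that take two strings use two bound registers. bndldx looks the bounds up in the Bounds Table, keyed on both the stack slot holding the pointer argument and the pointer value carried in the index register (the caller is presumed to have stored matching bounds there with bndstx); if the stored value does not match, the architecture hands back the always-valid INIT bounds. The entry sequence used above, with comments:

#ifdef __CHKP__
	bndldx	STR1(%esp,%edx,1), %bnd0	/* bounds of the first string */
	bndldx	STR2(%esp,%eax,1), %bnd1	/* bounds of the second string */
	bndcl	(%edx), %bnd0			/* both start pointers must sit */
	bndcl	(%eax), %bnd1			/* at or above their lower bounds */
#endif

After this prologue only bndcu upper-bound checks are needed, since the scan only moves the pointers forward.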
diff --git a/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/sysdeps/i386/i686/multiarch/strcpy-sse2.S
index d942ac2..4fdf7e0 100644
--- a/sysdeps/i386/i686/multiarch/strcpy-sse2.S
+++ b/sysdeps/i386/i686/multiarch/strcpy-sse2.S
@@ -85,6 +85,14 @@ ENTRY (STRCPY)
 	movl	LEN(%esp), %ebx
 	test	%ebx, %ebx
 	jz	L(ExitZero)
+# ifdef __CHKP__
+	bndldx 	STR1(%esp,%edi,1), %bnd0
+	bndldx 	STR2(%esp,%esi,1), %bnd1
+	bndcl  	(%esi), %bnd1
+	bndcu  	(%esi), %bnd1
+	bndcl  	(%edi), %bnd0
+	bndcu  	-1(%edi, %ebx), %bnd0
+# endif
 
 	mov	%esi, %ecx
 # ifndef USE_AS_STPCPY
@@ -111,6 +119,9 @@ ENTRY (STRCPY)
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesTail)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi), %bnd1
+# endif
 	pcmpeqb	16(%esi), %xmm0
 	pmovmskb %xmm0, %edx
 # ifdef USE_AS_STPCPY
@@ -124,6 +135,9 @@ ENTRY (STRCPY)
 	jnz	L(CopyFrom1To32Bytes)
 
 	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+# ifdef __CHKP__
+	bndcu  	15(%edi), %bnd0
+# endif
 	movdqu	%xmm1, (%edi)
 
 	sub	%ecx, %edi
@@ -132,6 +146,10 @@ ENTRY (STRCPY)
 	.p2align 4
 L(Unalign16Both):
 	mov	$16, %ecx
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%edi, %ecx), %bnd0
+# endif
 	movdqa	(%esi, %ecx), %xmm1
 	movaps	16(%esi, %ecx), %xmm2
 	movdqu	%xmm1, (%edi, %ecx)
@@ -143,6 +161,10 @@ L(Unalign16Both):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%edi, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm3
 	movdqu	%xmm2, (%edi, %ecx)
 	pcmpeqb	%xmm3, %xmm0
@@ -153,6 +175,10 @@ L(Unalign16Both):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%edi, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm4
 	movdqu	%xmm3, (%edi, %ecx)
 	pcmpeqb	%xmm4, %xmm0
@@ -163,6 +189,10 @@ L(Unalign16Both):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%edi, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm1
 	movdqu	%xmm4, (%edi, %ecx)
 	pcmpeqb	%xmm1, %xmm0
@@ -173,6 +203,10 @@ L(Unalign16Both):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%edi, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm2
 	movdqu	%xmm1, (%edi, %ecx)
 	pcmpeqb	%xmm2, %xmm0
@@ -183,6 +217,10 @@ L(Unalign16Both):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%edi, %ecx), %bnd0
+# endif
 	movaps	16(%esi, %ecx), %xmm3
 	movdqu	%xmm2, (%edi, %ecx)
 	pcmpeqb	%xmm3, %xmm0
@@ -193,6 +231,10 @@ L(Unalign16Both):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
 
+# ifdef __CHKP__
+	bndcu  	16(%esi, %ecx), %bnd1
+	bndcu  	15(%edi, %ecx), %bnd0
+# endif
 	movdqu	%xmm3, (%edi, %ecx)
 	mov	%esi, %edx
 	lea	16(%esi, %ecx), %esi
@@ -202,6 +244,9 @@ L(Unalign16Both):
 	lea	128(%ebx, %edx), %ebx
 
 L(Unaligned64Loop):
+# ifdef __CHKP__
+	bndcu  	48(%esi), %bnd1
+# endif
 	movaps	(%esi), %xmm2
 	movaps	%xmm2, %xmm4
 	movaps	16(%esi), %xmm5
@@ -220,6 +265,10 @@ L(Unaligned64Loop):
 L(Unaligned64Loop_start):
 	add	$64, %edi
 	add	$64, %esi
+# ifdef __CHKP__
+	bndcu  	(%esi), %bnd1
+	bndcu  	(%edi), %bnd0
+# endif
 	movdqu	%xmm4, -64(%edi)
 	movaps	(%esi), %xmm2
 	movdqa	%xmm2, %xmm4
@@ -259,15 +308,27 @@ L(Unaligned64Leave):
 	jnz	L(CopyFrom1To16BytesUnaligned_32)
 
 	bsf	%ecx, %edx
+# ifdef __CHKP__
+	bndcu  	47(%edi), %bnd0
+# endif
 	movdqu	%xmm4, (%edi)
 	movdqu	%xmm5, 16(%edi)
 	movdqu	%xmm6, 32(%edi)
 # ifdef USE_AS_STPCPY
+#  ifdef __CHKP__
+	bndcu  	48(%edi, %edx), %bnd0
+#  endif
 	lea	48(%edi, %edx), %eax
 # endif
+# ifdef __CHKP__
+	bndcu  	63(%edi), %bnd0
+# endif
 	movdqu	%xmm7, 48(%edi)
 	add	$15, %ebx
 	sub	%edx, %ebx
+# ifdef __CHKP__
+	bndcu  	49(%edi, %edx), %bnd0
+# endif
 	lea	49(%edi, %edx), %edi
 	jmp	L(StrncpyFillTailWithZero)
 
@@ -288,6 +349,10 @@ L(SourceStringAlignmentZero):
 	test	%edx, %edx
 	jnz	L(CopyFrom1To16BytesTail1)
 
+# ifdef __CHKP__
+	bndcu  	15(%edi), %bnd0
+	bndcu  	16(%esi), %bnd1
+# endif
 	pcmpeqb	16(%esi), %xmm0
 	movdqu	%xmm1, (%edi)
 	pmovmskb %xmm0, %edx
@@ -313,7 +378,7 @@ L(CopyFrom1To16BytesTail):
 	bsf	%edx, %edx
 	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
 
 	.p2align 4
 L(CopyFrom1To32Bytes1):
 	add	$16, %esi
 	add	$16, %edi
@@ -337,6 +402,9 @@ L(CopyFrom1To16BytesUnaligned_0):
 # ifdef USE_AS_STPCPY
 	lea	(%edi, %edx), %eax
 # endif
+# ifdef __CHKP__
+        bndcu   15(%edi), %bnd0
+# endif
 	movdqu	%xmm4, (%edi)
 	add	$63, %ebx
 	sub	%edx, %ebx
@@ -350,6 +418,9 @@ L(CopyFrom1To16BytesUnaligned_16):
 # ifdef USE_AS_STPCPY
 	lea	16(%edi, %edx), %eax
 # endif
+# ifdef __CHKP__
+        bndcu   31(%edi), %bnd0
+# endif
 	movdqu	%xmm5, 16(%edi)
 	add	$47, %ebx
 	sub	%edx, %ebx
@@ -364,6 +435,9 @@ L(CopyFrom1To16BytesUnaligned_32):
 # ifdef USE_AS_STPCPY
 	lea	32(%edi, %edx), %eax
 # endif
+# ifdef __CHKP__
+        bndcu   47(%edi), %bnd0
+# endif
 	movdqu	%xmm6, 32(%edi)
 	add	$31, %ebx
 	sub	%edx, %ebx
@@ -495,6 +569,9 @@ L(Exit1):
 	.p2align 4
 L(Exit2):
 	movw	(%esi), %dx
+# ifdef __CHKP__
+        bndcu   1(%edi), %bnd0
+# endif
 	movw	%dx, (%edi)
 # ifdef USE_AS_STPCPY
 	lea	1(%edi), %eax
@@ -507,6 +584,9 @@ L(Exit2):
 	.p2align 4
 L(Exit3):
 	movw	(%esi), %cx
+# ifdef __CHKP__
+        bndcu   2(%edi), %bnd0
+# endif
 	movw	%cx, (%edi)
 	movb	%dh, 2(%edi)
 # ifdef USE_AS_STPCPY
@@ -520,6 +600,9 @@ L(Exit3):
 	.p2align 4
 L(Exit4):
 	movl	(%esi), %edx
+# ifdef __CHKP__
+        bndcu   3(%edi), %bnd0
+# endif
 	movl	%edx, (%edi)
 # ifdef USE_AS_STPCPY
 	lea	3(%edi), %eax
@@ -532,6 +615,9 @@ L(Exit4):
 	.p2align 4
 L(Exit5):
 	movl	(%esi), %ecx
+# ifdef __CHKP__
+        bndcu   4(%edi), %bnd0
+# endif
 	movb	%dh, 4(%edi)
 	movl	%ecx, (%edi)
 # ifdef USE_AS_STPCPY
@@ -546,6 +632,9 @@ L(Exit5):
 L(Exit6):
 	movl	(%esi), %ecx
 	movw	4(%esi), %dx
+# ifdef __CHKP__
+        bndcu   5(%edi), %bnd0
+# endif
 	movl	%ecx, (%edi)
 	movw	%dx, 4(%edi)
 # ifdef USE_AS_STPCPY
@@ -560,6 +649,9 @@ L(Exit6):
 L(Exit7):
 	movl	(%esi), %ecx
 	movl	3(%esi), %edx
+# ifdef __CHKP__
+        bndcu   6(%edi), %bnd0
+# endif
 	movl	%ecx, (%edi)
 	movl	%edx, 3(%edi)
 # ifdef USE_AS_STPCPY
@@ -573,6 +665,9 @@ L(Exit7):
 	.p2align 4
 L(Exit8):
 	movlpd	(%esi), %xmm0
+# ifdef __CHKP__
+        bndcu   7(%edi), %bnd0
+# endif
 	movlpd	%xmm0, (%edi)
 # ifdef USE_AS_STPCPY
 	lea	7(%edi), %eax
@@ -585,6 +680,9 @@ L(Exit8):
 	.p2align 4
 L(Exit9):
 	movlpd	(%esi), %xmm0
+# ifdef __CHKP__
+        bndcu   8(%edi), %bnd0
+# endif
 	movb	%dh, 8(%edi)
 	movlpd	%xmm0, (%edi)
 # ifdef USE_AS_STPCPY
@@ -599,6 +697,9 @@ L(Exit9):
 L(Exit10):
 	movlpd	(%esi), %xmm0
 	movw	8(%esi), %dx
+# ifdef __CHKP__
+        bndcu   9(%edi), %bnd0
+# endif
 	movlpd	%xmm0, (%edi)
 	movw	%dx, 8(%edi)
 # ifdef USE_AS_STPCPY
@@ -613,6 +714,9 @@ L(Exit10):
 L(Exit11):
 	movlpd	(%esi), %xmm0
 	movl	7(%esi), %edx
+# ifdef __CHKP__
+        bndcu   10(%edi), %bnd0
+# endif
 	movlpd	%xmm0, (%edi)
 	movl	%edx, 7(%edi)
 # ifdef USE_AS_STPCPY
@@ -627,6 +731,9 @@ L(Exit11):
 L(Exit12):
 	movlpd	(%esi), %xmm0
 	movl	8(%esi), %edx
+# ifdef __CHKP__
+        bndcu   11(%edi), %bnd0
+# endif
 	movlpd	%xmm0, (%edi)
 	movl	%edx, 8(%edi)
 # ifdef USE_AS_STPCPY
@@ -641,6 +748,9 @@ L(Exit12):
 L(Exit13):
 	movlpd	(%esi), %xmm0
 	movlpd	5(%esi), %xmm1
+# ifdef __CHKP__
+        bndcu   12(%edi), %bnd0
+# endif
 	movlpd	%xmm0, (%edi)
 	movlpd	%xmm1, 5(%edi)
 # ifdef USE_AS_STPCPY
@@ -655,6 +765,9 @@ L(Exit13):
 L(Exit14):
 	movlpd	(%esi), %xmm0
 	movlpd	6(%esi), %xmm1
+# ifdef __CHKP__
+        bndcu   13(%edi), %bnd0
+# endif
 	movlpd	%xmm0, (%edi)
 	movlpd	%xmm1, 6(%edi)
 # ifdef USE_AS_STPCPY
@@ -669,6 +782,9 @@ L(Exit14):
 L(Exit15):
 	movlpd	(%esi), %xmm0
 	movlpd	7(%esi), %xmm1
+# ifdef __CHKP__
+        bndcu   14(%edi), %bnd0
+# endif
 	movlpd	%xmm0, (%edi)
 	movlpd	%xmm1, 7(%edi)
 # ifdef USE_AS_STPCPY
@@ -682,6 +798,9 @@ L(Exit15):
 	.p2align 4
 L(Exit16):
 	movdqu	(%esi), %xmm0
+# ifdef __CHKP__
+        bndcu   15(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 # ifdef USE_AS_STPCPY
 	lea	15(%edi), %eax
@@ -694,6 +813,9 @@ L(Exit16):
 	.p2align 4
 L(Exit17):
 	movdqu	(%esi), %xmm0
+# ifdef __CHKP__
+        bndcu   16(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movb	%dh, 16(%edi)
 # ifdef USE_AS_STPCPY
@@ -708,6 +830,9 @@ L(Exit17):
 L(Exit18):
 	movdqu	(%esi), %xmm0
 	movw	16(%esi), %cx
+# ifdef __CHKP__
+        bndcu   17(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movw	%cx, 16(%edi)
 # ifdef USE_AS_STPCPY
@@ -722,6 +847,9 @@ L(Exit18):
 L(Exit19):
 	movdqu	(%esi), %xmm0
 	movl	15(%esi), %ecx
+# ifdef __CHKP__
+        bndcu   18(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movl	%ecx, 15(%edi)
 # ifdef USE_AS_STPCPY
@@ -736,6 +864,9 @@ L(Exit19):
 L(Exit20):
 	movdqu	(%esi), %xmm0
 	movl	16(%esi), %ecx
+# ifdef __CHKP__
+        bndcu   19(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movl	%ecx, 16(%edi)
 # ifdef USE_AS_STPCPY
@@ -750,6 +881,9 @@ L(Exit20):
 L(Exit21):
 	movdqu	(%esi), %xmm0
 	movl	16(%esi), %ecx
+# ifdef __CHKP__
+        bndcu   20(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movl	%ecx, 16(%edi)
 	movb	%dh, 20(%edi)
@@ -765,6 +899,9 @@ L(Exit21):
 L(Exit22):
 	movdqu	(%esi), %xmm0
 	movlpd	14(%esi), %xmm3
+# ifdef __CHKP__
+        bndcu   21(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movlpd	%xmm3, 14(%edi)
 # ifdef USE_AS_STPCPY
@@ -779,6 +916,9 @@ L(Exit22):
 L(Exit23):
 	movdqu	(%esi), %xmm0
 	movlpd	15(%esi), %xmm3
+# ifdef __CHKP__
+        bndcu   22(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movlpd	%xmm3, 15(%edi)
 # ifdef USE_AS_STPCPY
@@ -793,6 +933,9 @@ L(Exit23):
 L(Exit24):
 	movdqu	(%esi), %xmm0
 	movlpd	16(%esi), %xmm2
+# ifdef __CHKP__
+        bndcu   23(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movlpd	%xmm2, 16(%edi)
 # ifdef USE_AS_STPCPY
@@ -807,6 +950,9 @@ L(Exit24):
 L(Exit25):
 	movdqu	(%esi), %xmm0
 	movlpd	16(%esi), %xmm2
+# ifdef __CHKP__
+        bndcu   24(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movlpd	%xmm2, 16(%edi)
 	movb	%dh, 24(%edi)
@@ -823,6 +969,9 @@ L(Exit26):
 	movdqu	(%esi), %xmm0
 	movlpd	16(%esi), %xmm2
 	movw	24(%esi), %cx
+# ifdef __CHKP__
+        bndcu   25(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movlpd	%xmm2, 16(%edi)
 	movw	%cx, 24(%edi)
@@ -839,6 +988,9 @@ L(Exit27):
 	movdqu	(%esi), %xmm0
 	movlpd	16(%esi), %xmm2
 	movl	23(%esi), %ecx
+# ifdef __CHKP__
+        bndcu   26(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movlpd	%xmm2, 16(%edi)
 	movl	%ecx, 23(%edi)
@@ -855,6 +1007,9 @@ L(Exit28):
 	movdqu	(%esi), %xmm0
 	movlpd	16(%esi), %xmm2
 	movl	24(%esi), %ecx
+# ifdef __CHKP__
+        bndcu   27(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movlpd	%xmm2, 16(%edi)
 	movl	%ecx, 24(%edi)
@@ -870,6 +1025,9 @@ L(Exit28):
 L(Exit29):
 	movdqu	(%esi), %xmm0
 	movdqu	13(%esi), %xmm2
+# ifdef __CHKP__
+        bndcu   28(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movdqu	%xmm2, 13(%edi)
 # ifdef USE_AS_STPCPY
@@ -884,6 +1042,9 @@ L(Exit29):
 L(Exit30):
 	movdqu	(%esi), %xmm0
 	movdqu	14(%esi), %xmm2
+# ifdef __CHKP__
+        bndcu   29(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movdqu	%xmm2, 14(%edi)
 # ifdef USE_AS_STPCPY
@@ -899,6 +1060,9 @@ L(Exit30):
 L(Exit31):
 	movdqu	(%esi), %xmm0
 	movdqu	15(%esi), %xmm2
+# ifdef __CHKP__
+        bndcu   30(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movdqu	%xmm2, 15(%edi)
 # ifdef USE_AS_STPCPY
@@ -913,6 +1077,9 @@ L(Exit31):
 L(Exit32):
 	movdqu	(%esi), %xmm0
 	movdqu	16(%esi), %xmm2
+# ifdef __CHKP__
+        bndcu   31(%edi), %bnd0
+# endif
 	movdqu	%xmm0, (%edi)
 	movdqu	%xmm2, 16(%edi)
 # ifdef USE_AS_STPCPY
@@ -1612,37 +1779,90 @@ ENTRY (STRCPY)
 	ENTRANCE
 	mov	STR1(%esp), %edx
 	mov	STR2(%esp), %ecx
+# ifdef __CHKP__
+	bndldx 	STR1(%esp,%edx,1), %bnd0
+	bndldx 	STR2(%esp,%ecx,1), %bnd1
+	bndcl  	(%ecx), %bnd1
+	bndcu  	(%ecx), %bnd1
+	bndcl  	(%edx), %bnd0
+	bndcu  	(%edx), %bnd0
+# endif
 
 	cmpb	$0, (%ecx)
 	jz	L(ExitTail1)
+# ifdef __CHKP__
+	bndcu  	1(%ecx), %bnd1
+# endif
 	cmpb	$0, 1(%ecx)
 	jz	L(ExitTail2)
+# ifdef __CHKP__
+	bndcu  	2(%ecx), %bnd1
+# endif
 	cmpb	$0, 2(%ecx)
 	jz	L(ExitTail3)
+# ifdef __CHKP__
+	bndcu  	3(%ecx), %bnd1
+# endif
 	cmpb	$0, 3(%ecx)
 	jz	L(ExitTail4)
+# ifdef __CHKP__
+	bndcu  	4(%ecx), %bnd1
+# endif
 	cmpb	$0, 4(%ecx)
 	jz	L(ExitTail5)
+# ifdef __CHKP__
+	bndcu  	5(%ecx), %bnd1
+# endif
 	cmpb	$0, 5(%ecx)
 	jz	L(ExitTail6)
+# ifdef __CHKP__
+	bndcu  	6(%ecx), %bnd1
+# endif
 	cmpb	$0, 6(%ecx)
 	jz	L(ExitTail7)
+# ifdef __CHKP__
+	bndcu  	7(%ecx), %bnd1
+# endif
 	cmpb	$0, 7(%ecx)
 	jz	L(ExitTail8)
+# ifdef __CHKP__
+	bndcu  	8(%ecx), %bnd1
+# endif
 	cmpb	$0, 8(%ecx)
 	jz	L(ExitTail9)
+# ifdef __CHKP__
+	bndcu  	9(%ecx), %bnd1
+# endif
 	cmpb	$0, 9(%ecx)
 	jz	L(ExitTail10)
+# ifdef __CHKP__
+	bndcu  	10(%ecx), %bnd1
+# endif
 	cmpb	$0, 10(%ecx)
 	jz	L(ExitTail11)
+# ifdef __CHKP__
+	bndcu  	11(%ecx), %bnd1
+# endif
 	cmpb	$0, 11(%ecx)
 	jz	L(ExitTail12)
+# ifdef __CHKP__
+	bndcu  	12(%ecx), %bnd1
+# endif
 	cmpb	$0, 12(%ecx)
 	jz	L(ExitTail13)
+# ifdef __CHKP__
+	bndcu  	13(%ecx), %bnd1
+# endif
 	cmpb	$0, 13(%ecx)
 	jz	L(ExitTail14)
+# ifdef __CHKP__
+	bndcu  	14(%ecx), %bnd1
+# endif
 	cmpb	$0, 14(%ecx)
 	jz	L(ExitTail15)
+# ifdef __CHKP__
+	bndcu  	15(%ecx), %bnd1
+# endif
 	cmpb	$0, 15(%ecx)
 	jz	L(ExitTail16)
 
@@ -1654,6 +1874,9 @@ ENTRY (STRCPY)
 	and	$-16, %ebx
 	pxor	%xmm0, %xmm0
 	movdqu	(%ecx), %xmm1
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+# endif
 	movdqu	%xmm1, (%edx)
 	pcmpeqb	(%ebx), %xmm0
 	pmovmskb %xmm0, %eax
@@ -1669,6 +1892,10 @@ ENTRY (STRCPY)
 	xor	%ebx, %ebx
 
 	.p2align 4
+# ifdef __CHKP__
+	bndcu  	16(%ecx), %bnd1
+	bndcu  	15(%edx), %bnd0
+# endif
 	movdqa	(%ecx), %xmm1
 	movaps	16(%ecx), %xmm2
 	movdqu	%xmm1, (%edx)
@@ -1678,6 +1905,10 @@ ENTRY (STRCPY)
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%ecx, %ebx), %bnd1
+	bndcu  	15(%edx, %ebx), %bnd0
+# endif
 	movaps	16(%ecx, %ebx), %xmm3
 	movdqu	%xmm2, (%edx, %ebx)
 	pcmpeqb	%xmm3, %xmm0
@@ -1686,6 +1917,10 @@ ENTRY (STRCPY)
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%ecx, %ebx), %bnd1
+	bndcu  	15(%edx, %ebx), %bnd0
+# endif
 	movaps	16(%ecx, %ebx), %xmm4
 	movdqu	%xmm3, (%edx, %ebx)
 	pcmpeqb	%xmm4, %xmm0
@@ -1694,6 +1929,10 @@ ENTRY (STRCPY)
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%ecx, %ebx), %bnd1
+	bndcu  	15(%edx, %ebx), %bnd0
+# endif
 	movaps	16(%ecx, %ebx), %xmm1
 	movdqu	%xmm4, (%edx, %ebx)
 	pcmpeqb	%xmm1, %xmm0
@@ -1702,6 +1941,10 @@ ENTRY (STRCPY)
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%ecx, %ebx), %bnd1
+	bndcu  	15(%edx, %ebx), %bnd0
+# endif
 	movaps	16(%ecx, %ebx), %xmm2
 	movdqu	%xmm1, (%edx, %ebx)
 	pcmpeqb	%xmm2, %xmm0
@@ -1710,6 +1953,10 @@ ENTRY (STRCPY)
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	16(%ecx, %ebx), %bnd1
+	bndcu  	15(%edx, %ebx), %bnd0
+# endif
 	movaps	16(%ecx, %ebx), %xmm3
 	movdqu	%xmm2, (%edx, %ebx)
 	pcmpeqb	%xmm3, %xmm0
@@ -1718,6 +1965,9 @@ ENTRY (STRCPY)
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	15(%edx, %ebx), %bnd0
+# endif
 	movdqu	%xmm3, (%edx, %ebx)
 	mov	%ecx, %eax
 	lea	16(%ecx, %ebx), %ecx
@@ -1726,6 +1976,9 @@ ENTRY (STRCPY)
 	sub	%eax, %edx
 
 L(Aligned64Loop):
+# ifdef __CHKP__
+	bndcu  	(%ecx), %bnd1
+# endif
 	movaps	(%ecx), %xmm2
 	movaps	%xmm2, %xmm4
 	movaps	16(%ecx), %xmm5
@@ -1742,6 +1995,10 @@ L(Aligned64Loop):
 	test	%eax, %eax
 	jnz	L(Aligned64Leave)
 L(Aligned64Loop_start):
+# ifdef __CHKP__
+	bndcu  	(%ecx), %bnd1
+	bndcu  	-1(%edx), %bnd0
+# endif
 	movdqu	%xmm4, -64(%edx)
 	movaps	(%ecx), %xmm2
 	movdqa	%xmm2, %xmm4
@@ -1771,6 +2028,9 @@ L(Aligned64Leave):
 
 	pcmpeqb	%xmm5, %xmm0
 	pmovmskb %xmm0, %eax
+# ifdef __CHKP__
+	bndcu  	-49(%edx), %bnd0
+# endif
 	movdqu	%xmm4, -64(%edx)
 	test	%eax, %eax
 	lea	16(%ebx), %ebx
@@ -1778,11 +2038,17 @@ L(Aligned64Leave):
 
 	pcmpeqb	%xmm6, %xmm0
 	pmovmskb %xmm0, %eax
+# ifdef __CHKP__
+	bndcu  	-33(%edx), %bnd0
+# endif
 	movdqu	%xmm5, -48(%edx)
 	test	%eax, %eax
 	lea	16(%ebx), %ebx
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	-17(%edx), %bnd0
+# endif
 	movdqu	%xmm6, -32(%edx)
 	pcmpeqb	%xmm7, %xmm0
 	pmovmskb %xmm0, %eax
@@ -1813,6 +2079,10 @@ L(CopyFrom1To16Bytes):
 	test	$0x40, %al
 	jnz	L(Exit7)
 	/* Exit 8 */
+# ifdef __CHKP__
+	bndcu  	7(%edx), %bnd0
+	bndcu  	7(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
@@ -1841,6 +2111,10 @@ L(ExitHigh):
 	test	$0x40, %ah
 	jnz	L(Exit15)
 	/* Exit 16 */
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	15(%ecx), %bnd1
+# endif
 	movlpd	(%ecx), %xmm0
 	movlpd	%xmm0, (%edx)
 	movlpd	8(%ecx), %xmm0
@@ -1854,6 +2128,10 @@ L(ExitHigh):
 
 	.p2align 4
 L(Exit1):
+# ifdef __CHKP__
+	bndcu  	(%edx), %bnd0
+	bndcu  	(%ecx), %bnd1
+# endif
 	movb	(%ecx), %al
 	movb	%al, (%edx)
 # ifdef USE_AS_STPCPY
@@ -1865,6 +2143,10 @@ L(Exit1):
 
 	.p2align 4
 L(Exit2):
+# ifdef __CHKP__
+	bndcu  	1(%edx), %bnd0
+	bndcu  	1(%ecx), %bnd1
+# endif
 	movw	(%ecx), %ax
 	movw	%ax, (%edx)
 # ifdef USE_AS_STPCPY
@@ -1876,6 +2158,10 @@ L(Exit2):
 
 	.p2align 4
 L(Exit3):
+# ifdef __CHKP__
+	bndcu  	2(%edx), %bnd0
+	bndcu  	2(%ecx), %bnd1
+# endif
 	movw	(%ecx), %ax
 	movw	%ax, (%edx)
 	movb	2(%ecx), %al
@@ -1889,6 +2175,10 @@ L(Exit3):
 
 	.p2align 4
 L(Exit4):
+# ifdef __CHKP__
+	bndcu  	3(%edx), %bnd0
+	bndcu  	3(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 # ifdef USE_AS_STPCPY
@@ -1900,6 +2190,10 @@ L(Exit4):
 
 	.p2align 4
 L(Exit5):
+# ifdef __CHKP__
+	bndcu  	4(%edx), %bnd0
+	bndcu  	4(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 	movb	4(%ecx), %al
@@ -1913,6 +2207,10 @@ L(Exit5):
 
 	.p2align 4
 L(Exit6):
+# ifdef __CHKP__
+	bndcu  	5(%edx), %bnd0
+	bndcu  	5(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 	movw	4(%ecx), %ax
@@ -1926,6 +2224,10 @@ L(Exit6):
 
 	.p2align 4
 L(Exit7):
+# ifdef __CHKP__
+	bndcu  	6(%edx), %bnd0
+	bndcu  	6(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 	movl	3(%ecx), %eax
@@ -1939,6 +2241,10 @@ L(Exit7):
 
 	.p2align 4
 L(Exit9):
+# ifdef __CHKP__
+	bndcu  	8(%edx), %bnd0
+	bndcu  	8(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
@@ -1954,6 +2260,10 @@ L(Exit9):
 
 	.p2align 4
 L(Exit10):
+# ifdef __CHKP__
+	bndcu  	9(%edx), %bnd0
+	bndcu  	9(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
@@ -1969,6 +2279,10 @@ L(Exit10):
 
 	.p2align 4
 L(Exit11):
+# ifdef __CHKP__
+	bndcu  	10(%edx), %bnd0
+	bndcu  	10(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
@@ -1984,6 +2298,10 @@ L(Exit11):
 
 	.p2align 4
 L(Exit12):
+# ifdef __CHKP__
+	bndcu  	11(%edx), %bnd0
+	bndcu  	11(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
@@ -1999,6 +2317,10 @@ L(Exit12):
 
 	.p2align 4
 L(Exit13):
+# ifdef __CHKP__
+	bndcu  	12(%edx), %bnd0
+	bndcu  	12(%ecx), %bnd1
+# endif
 	movlpd	(%ecx), %xmm0
 	movlpd	%xmm0, (%edx)
 	movlpd	5(%ecx), %xmm0
@@ -2012,6 +2334,10 @@ L(Exit13):
 
 	.p2align 4
 L(Exit14):
+# ifdef __CHKP__
+	bndcu  	13(%edx), %bnd0
+	bndcu  	13(%ecx), %bnd1
+# endif
 	movlpd	(%ecx), %xmm0
 	movlpd	%xmm0, (%edx)
 	movlpd	6(%ecx), %xmm0
@@ -2025,6 +2351,10 @@ L(Exit14):
 
 	.p2align 4
 L(Exit15):
+# ifdef __CHKP__
+	bndcu  	14(%edx), %bnd0
+	bndcu  	14(%ecx), %bnd1
+# endif
 	movlpd	(%ecx), %xmm0
 	movlpd	%xmm0, (%edx)
 	movlpd	7(%ecx), %xmm0
@@ -2040,6 +2370,9 @@ CFI_POP (%edi)
 
 	.p2align 4
 L(ExitTail1):
+# ifdef __CHKP__
+	bndcu  	(%edx), %bnd0
+# endif
 	movb	(%ecx), %al
 	movb	%al, (%edx)
 	movl	%edx, %eax
@@ -2048,6 +2381,9 @@ L(ExitTail1):
 	.p2align 4
 L(ExitTail2):
 	movw	(%ecx), %ax
+# ifdef __CHKP__
+	bndcu  	1(%edx), %bnd0
+# endif
 	movw	%ax, (%edx)
 # ifdef USE_AS_STPCPY
 	lea	1(%edx), %eax
@@ -2059,6 +2395,9 @@ L(ExitTail2):
 	.p2align 4
 L(ExitTail3):
 	movw	(%ecx), %ax
+# ifdef __CHKP__
+	bndcu  	2(%edx), %bnd0
+# endif
 	movw	%ax, (%edx)
 	movb	2(%ecx), %al
 	movb	%al, 2(%edx)
@@ -2072,6 +2411,9 @@ L(ExitTail3):
 	.p2align 4
 L(ExitTail4):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	3(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 # ifdef USE_AS_STPCPY
 	lea	3(%edx), %eax
@@ -2083,6 +2425,9 @@ L(ExitTail4):
 	.p2align 4
 L(ExitTail5):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	4(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 	movb	4(%ecx), %al
 	movb	%al, 4(%edx)
@@ -2096,6 +2441,9 @@ L(ExitTail5):
 	.p2align 4
 L(ExitTail6):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	5(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 	movw	4(%ecx), %ax
 	movw	%ax, 4(%edx)
@@ -2109,6 +2457,9 @@ L(ExitTail6):
 	.p2align 4
 L(ExitTail7):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	6(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 	movl	3(%ecx), %eax
 	movl	%eax, 3(%edx)
@@ -2122,6 +2473,9 @@ L(ExitTail7):
 	.p2align 4
 L(ExitTail8):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	7(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
 	movl	%eax, 4(%edx)
@@ -2135,6 +2489,9 @@ L(ExitTail8):
 	.p2align 4
 L(ExitTail9):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	8(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
 	movl	%eax, 4(%edx)
@@ -2150,6 +2507,9 @@ L(ExitTail9):
 	.p2align 4
 L(ExitTail10):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	9(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
 	movl	%eax, 4(%edx)
@@ -2165,6 +2525,9 @@ L(ExitTail10):
 	.p2align 4
 L(ExitTail11):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	10(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
 	movl	%eax, 4(%edx)
@@ -2180,6 +2543,9 @@ L(ExitTail11):
 	.p2align 4
 L(ExitTail12):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	11(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 	movl	4(%ecx), %eax
 	movl	%eax, 4(%edx)
@@ -2195,6 +2561,9 @@ L(ExitTail12):
 	.p2align 4
 L(ExitTail13):
 	movlpd	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	12(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	movlpd	5(%ecx), %xmm0
 	movlpd	%xmm0, 5(%edx)
@@ -2208,6 +2577,9 @@ L(ExitTail13):
 	.p2align 4
 L(ExitTail14):
 	movlpd	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	13(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	movlpd	6(%ecx), %xmm0
 	movlpd	%xmm0, 6(%edx)
@@ -2221,6 +2593,9 @@ L(ExitTail14):
 	.p2align 4
 L(ExitTail15):
 	movlpd	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	14(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	movlpd	7(%ecx), %xmm0
 	movlpd	%xmm0, 7(%edx)
@@ -2234,6 +2609,9 @@ L(ExitTail15):
 	.p2align 4
 L(ExitTail16):
 	movlpd	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	movlpd	8(%ecx), %xmm0
 	movlpd	%xmm0, 8(%edx)
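
Each exit stub checks the highest byte its stores will touch, so a copy of N bytes checks destination offset N-1 before writing. One stub from above with the rule spelled out:

L(Exit16):				/* copy exactly 16 bytes */
	movdqu	(%esi), %xmm0
# ifdef __CHKP__
	bndcu	15(%edi), %bnd0		/* byte 15 is the last one written */
# endif
	movdqu	%xmm0, (%edi)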
diff --git a/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
index 32db65c..ab537c1 100644
--- a/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
+++ b/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -41,6 +41,11 @@
 ENTRY ( __strlen_sse2_bsf)
 	ENTRANCE
 	mov	STR(%esp), %edi
+#ifdef __CHKP__
+	bndldx 	STR(%esp,%edi,1), %bnd0
+	bndcl  	(%edi),%bnd0
+	bndcu  	(%edi),%bnd0
+#endif
 	xor	%eax, %eax
 	mov	%edi, %ecx
 	and	$0x3f, %ecx
@@ -73,21 +78,33 @@ L(align16_start):
 	pxor	%xmm3, %xmm3
 	.p2align 4
 L(align16_loop):
+#ifdef __CHKP__
+        bndcu   16(%eax), %bnd0
+#endif
 	pcmpeqb	16(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	jnz	L(exit16)
 
+#ifdef __CHKP__
+        bndcu   32(%eax), %bnd0
+#endif
 	pcmpeqb	32(%eax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	jnz	L(exit32)
 
+#ifdef __CHKP__
+        bndcu   48(%eax), %bnd0
+#endif
 	pcmpeqb	48(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	jnz	L(exit48)
 
+#ifdef __CHKP__
+        bndcu   64(%eax), %bnd0
+#endif
 	pcmpeqb	64(%eax), %xmm3
 	pmovmskb %xmm3, %edx
 	lea	64(%eax), %eax
@@ -98,24 +115,36 @@ L(exit):
 L(exit_less16):
 	bsf	%edx, %edx
 	add	%edx, %eax
+#ifdef __CHKP__
+	bndcu 	-1(%edi, %eax), %bnd0
+#endif
 	RETURN
 L(exit16):
 	sub	%edi, %eax
 	bsf	%edx, %edx
 	add	%edx, %eax
 	add	$16, %eax
+#ifdef __CHKP__
+	bndcu 	-1(%edi, %eax), %bnd0
+#endif
 	RETURN
 L(exit32):
 	sub	%edi, %eax
 	bsf	%edx, %edx
 	add	%edx, %eax
 	add	$32, %eax
+#ifdef __CHKP__
+	bndcu 	-1(%edi, %eax), %bnd0
+#endif
 	RETURN
 L(exit48):
 	sub	%edi, %eax
 	bsf	%edx, %edx
 	add	%edx, %eax
 	add	$48, %eax
+#ifdef __CHKP__
+	bndcu	-1(%edi, %eax), %bnd0
+#endif
 	POP (%edi)
 	POP (%esi)
 	ret
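
strlen writes nothing, so the only cost added is a final range check on the computed length: the last byte that was counted has to lie below the string's upper bound. One exit path from above, annotated (%eax ends up holding the length):

L(exit16):
	sub	%edi, %eax
	bsf	%edx, %edx		/* bit index = NUL position in the chunk */
	add	%edx, %eax
	add	$16, %eax		/* %eax = string length */
#ifdef __CHKP__
	bndcu	-1(%edi, %eax), %bnd0	/* last counted byte within bounds */
#endif
	RETURN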
diff --git a/sysdeps/i386/i686/multiarch/strlen-sse2.S b/sysdeps/i386/i686/multiarch/strlen-sse2.S
index a4f2806..3d0743e 100644
--- a/sysdeps/i386/i686/multiarch/strlen-sse2.S
+++ b/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -41,7 +41,14 @@
 #   define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
 #   define POP(REG)	popl	REG;	CFI_POP (REG)
 #   undef RETURN
+#   ifdef __CHKP__
+#    define RETURN	\
+	mov	STR+4(%esp), %edx;	\
+	bndcu	-1(%edx,%eax), %bnd0;	\
+	POP (%edi); CFI_PUSH(%edi); ret
+#   else
 #   define RETURN	POP (%edi); CFI_PUSH(%edi); ret
+#   endif
 #  endif
 
 #  ifndef STRLEN
@@ -51,12 +54,19 @@
 	atom_text_section
 ENTRY (STRLEN)
 	mov	STR(%esp), %edx
+#  ifdef __CHKP__
+	bndldx 	STR(%esp,%edx,1), %bnd0
+#  endif
 #  ifdef USE_AS_STRNLEN
 	PUSH	(%edi)
 	movl	LEN(%esp), %edi
 	sub	$4, %edi
 	jbe	L(len_less4_prolog)
 #  endif
+#  ifdef __CHKP__
+	bndcl  	(%edx),%bnd0
+	bndcu  	(%edx),%bnd0
+#  endif
 # endif
 	xor	%eax, %eax
 	cmpb	$0, (%edx)
@@ -122,6 +132,9 @@ ENTRY (STRLEN)
 	jbe	L(len_less64)
 # endif
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	pxor	%xmm1, %xmm1
@@ -129,6 +142,9 @@ ENTRY (STRLEN)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm1
 	pmovmskb %xmm1, %edx
 	pxor	%xmm2, %xmm2
@@ -136,6 +152,9 @@ ENTRY (STRLEN)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	pxor	%xmm3, %xmm3
@@ -143,6 +162,9 @@ ENTRY (STRLEN)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
@@ -154,24 +176,36 @@ ENTRY (STRLEN)
 	jbe	L(len_less64)
 # endif
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
@@ -183,24 +217,36 @@ ENTRY (STRLEN)
 	jbe	L(len_less64)
 # endif
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
@@ -212,24 +258,36 @@ ENTRY (STRLEN)
 	jbe	L(len_less64)
 # endif
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm1
 	pmovmskb %xmm1, %edx
 	test	%edx, %edx
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	pcmpeqb	(%eax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
@@ -250,6 +308,9 @@ L(aligned_64_loop):
 	sub	$64, %edi
 	jbe	L(len_less64)
 # endif
+#ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+#endif
 	movaps	(%eax), %xmm0
 	movaps	16(%eax), %xmm1
 	movaps	32(%eax), %xmm2
@@ -535,6 +596,10 @@ L(len_less4_prolog):
 	add	$4, %edi
 	jz	L(exit_tail0)
 
+#  ifdef __CHKP__
+	bndcl  	(%edx),%bnd0
+	bndcu  	(%edx),%bnd0
+#  endif
 	cmpb	$0, (%edx)
 	jz	L(exit_tail0)
 	cmp	$1, %edi
diff --git a/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
index e026c40..1c907a4 100644
--- a/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
+++ b/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
@@ -42,6 +42,12 @@ ENTRY (__strrchr_sse2_bsf)
 	mov	STR1(%esp), %ecx
 	movd	STR2(%esp), %xmm1
 
+# ifdef __CHKP__
+	bndldx 	STR1(%esp,%ecx,1), %bnd0
+	bndcl  	(%ecx), %bnd0
+	bndcu  	(%ecx), %bnd0
+# endif
+
 	PUSH	(%edi)
 	pxor	%xmm2, %xmm2
 	mov	%ecx, %edi
@@ -90,6 +96,9 @@ L(unaligned_return_value1):
 	jz	L(return_null)
 	bsr	%eax, %eax
 	add	%edi, %eax
+# ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+# endif
 	POP	(%edi)
 	ret
 	CFI_PUSH	(%edi)
@@ -156,6 +165,9 @@ L(unaligned_return_value):
 	jz	L(return_null)
 	bsr	%eax, %eax
 	add	%edi, %eax
+# ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+# endif
 	POP	(%edi)
 	ret
 	CFI_PUSH	(%edi)
@@ -175,6 +187,9 @@ L(unaligned_match):
 /* Loop start on aligned string.  */
 	.p2align 4
 L(loop):
+# ifdef __CHKP__
+        bndcu   (%edi), %bnd0
+# endif
 	movdqa	(%edi), %xmm0
 	pcmpeqb	%xmm0, %xmm2
 	add	$16, %edi
@@ -184,6 +199,9 @@ L(loop):
 	or	%eax, %ecx
 	jnz	L(matches)
 
+# ifdef __CHKP__
+        bndcu   (%edi), %bnd0
+# endif
 	movdqa	(%edi), %xmm0
 	pcmpeqb	%xmm0, %xmm2
 	add	$16, %edi
@@ -193,6 +211,9 @@ L(loop):
 	or	%eax, %ecx
 	jnz	L(matches)
 
+# ifdef __CHKP__
+        bndcu   (%edi), %bnd0
+# endif
 	movdqa	(%edi), %xmm0
 	pcmpeqb	%xmm0, %xmm2
 	add	$16, %edi
@@ -202,6 +223,9 @@ L(loop):
 	or	%eax, %ecx
 	jnz	L(matches)
 
+# ifdef __CHKP__
+        bndcu   (%edi), %bnd0
+# endif
 	movdqa	(%edi), %xmm0
 	pcmpeqb	%xmm0, %xmm2
 	add	$16, %edi
@@ -224,6 +248,9 @@ L(return_value):
 	POP	(%esi)
 
 	sub	$16, %eax
+# ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+# endif
 	POP	(%edi)
 	ret
 
@@ -255,6 +282,9 @@ L(return_value_1):
 	bsr	%eax, %eax
 	add	%edi, %eax
 	sub	$16, %eax
+# ifdef __CHKP__
+	bndcu   (%eax), %bnd0
+# endif
 	POP	(%edi)
 	ret
 
diff --git a/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/sysdeps/i386/i686/multiarch/wcschr-sse2.S
index 63101d9..e06274a 100644
--- a/sysdeps/i386/i686/multiarch/wcschr-sse2.S
+++ b/sysdeps/i386/i686/multiarch/wcschr-sse2.S
@@ -40,7 +40,11 @@ ENTRY (__wcschr_sse2)
 
 	mov	STR1(%esp), %ecx
 	movd	STR2(%esp), %xmm1
-
+# ifdef __CHKP__
+	bndldx 	STR1(%esp,%ecx,1), %bnd0
+	bndcl  	(%ecx),%bnd0
+	bndcu  	(%ecx),%bnd0
+# endif
 	mov	%ecx, %eax
 	punpckldq %xmm1, %xmm1
 	pxor	%xmm2, %xmm2
@@ -90,6 +94,9 @@ L(cross_cache):
 	test	$15, %dl
 	jnz	L(return_null)
 	lea	4(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	(%eax),%bnd0
+# endif
 	ret
 
 	CFI_PUSH (%edi)
@@ -108,6 +115,9 @@ L(unaligned_no_match):
 	.p2align 4
 L(loop):
 	add	$16, %ecx
+# ifdef __CHKP__
+	bndcu  	(%ecx),%bnd0
+# endif
 	movdqa	(%ecx), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	pcmpeqd	%xmm1, %xmm0
@@ -117,6 +127,9 @@ L(loop):
 	jnz	L(matches)
 	add	$16, %ecx
 
+# ifdef __CHKP__
+	bndcu  	(%ecx),%bnd0
+# endif
 	movdqa	(%ecx), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	pcmpeqd	%xmm1, %xmm0
@@ -126,6 +139,9 @@ L(loop):
 	jnz	L(matches)
 	add	$16, %ecx
 
+# ifdef __CHKP__
+	bndcu  	(%ecx),%bnd0
+# endif
 	movdqa	(%ecx), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	pcmpeqd	%xmm1, %xmm0
@@ -135,6 +151,9 @@ L(loop):
 	jnz	L(matches)
 	add	$16, %ecx
 
+# ifdef __CHKP__
+	bndcu  	(%ecx),%bnd0
+# endif
 	movdqa	(%ecx), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	pcmpeqd	%xmm1, %xmm0
@@ -160,11 +179,17 @@ L(match_case2):
 	test	$15, %dl
 	jnz	L(return_null)
 	lea	4(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	(%eax),%bnd0
+# endif
 	ret
 
 	.p2align 4
 L(match_case2_4):
 	mov	%ecx, %eax
+# ifdef __CHKP__
+	bndcu  	(%eax),%bnd0
+# endif
 	ret
 
 	.p2align 4
@@ -176,11 +201,17 @@ L(match_higth_case2):
 	test	$15, %dh
 	jnz	L(return_null)
 	lea	12(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	(%eax),%bnd0
+# endif
 	ret
 
 	.p2align 4
 L(match_case2_12):
 	lea	8(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	(%eax),%bnd0
+# endif
 	ret
 
 	.p2align 4
@@ -191,6 +222,9 @@ L(match_case1):
 	test	$0x01, %al
 	jnz	L(exit0)
 	lea	4(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	(%eax),%bnd0
+# endif
 	ret
 
 	.p2align 4
@@ -198,16 +232,25 @@ L(match_higth_case1):
 	test	$0x01, %ah
 	jnz	L(exit3)
 	lea	12(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	(%eax),%bnd0
+# endif
 	ret
 
 	.p2align 4
 L(exit0):
 	mov	%ecx, %eax
+# ifdef __CHKP__
+	bndcu  	(%eax),%bnd0
+# endif
 	ret
 
 	.p2align 4
 L(exit3):
 	lea	8(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	(%eax),%bnd0
+# endif
 	ret
 
 	.p2align 4
diff --git a/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
index 9b248c1..108e7fb 100644
--- a/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
+++ b/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
@@ -47,6 +47,14 @@ ENTRY (__wcscmp_sse2)
 */
 	mov	STR1(%esp), %edx
 	mov	STR2(%esp), %eax
+#ifdef __CHKP__
+	bndldx 	STR1(%esp,%edx,1), %bnd0
+	bndldx 	STR2(%esp,%eax,1), %bnd1
+	bndcl  	(%edx), %bnd0
+	bndcl  	(%eax), %bnd1
+	bndcu  	(%edx), %bnd0
+	bndcu  	(%eax), %bnd1
+#endif
 
 	mov	(%eax), %ecx
 	cmp	%ecx, (%edx)
diff --git a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
index 47fb516..708ef41 100644
--- a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -41,13 +41,29 @@
 ENTRY (__wcscpy_ssse3)
 	mov	STR1(%esp), %edx
 	mov	STR2(%esp), %ecx
+# ifdef __CHKP__
+	bndldx 	STR1(%esp,%edx,1), %bnd0
+	bndldx 	STR2(%esp,%ecx,1), %bnd1
+	bndcl  	(%edx), %bnd0
+	bndcl  	(%ecx), %bnd1
+	bndcu  	(%ecx), %bnd1
+# endif
 
 	cmp	$0, (%ecx)
 	jz	L(ExitTail4)
+# ifdef __CHKP__
+	bndcu  	4(%ecx), %bnd1
+# endif
 	cmp	$0, 4(%ecx)
 	jz	L(ExitTail8)
+# ifdef __CHKP__
+	bndcu  	8(%ecx), %bnd1
+# endif
 	cmp	$0, 8(%ecx)
 	jz	L(ExitTail12)
+# ifdef __CHKP__
+	bndcu  	12(%ecx), %bnd1
+# endif
 	cmp	$0, 12(%ecx)
 	jz	L(ExitTail16)
 
@@ -61,6 +77,9 @@ ENTRY (__wcscpy_ssse3)
 	pxor	%xmm0, %xmm0
 	pcmpeqd	(%esi), %xmm0
 	movdqu	(%ecx), %xmm1
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+# endif
 	movdqu	%xmm1, (%edx)
 
 	pmovmskb %xmm0, %eax
@@ -87,6 +106,10 @@ ENTRY (__wcscpy_ssse3)
 	jmp	L(Shl12)
 
 L(Align16Both):
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	16(%ecx), %bnd1
+# endif
 	movaps	(%ecx), %xmm1
 	movaps	16(%ecx), %xmm2
 	movaps	%xmm1, (%edx)
@@ -97,6 +120,10 @@ L(Align16Both):
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	15(%edx, %esi), %bnd0
+	bndcu  	16(%ecx, %esi), %bnd1
+# endif
 	movaps	16(%ecx, %esi), %xmm3
 	movaps	%xmm2, (%edx, %esi)
 	pcmpeqd	%xmm3, %xmm0
@@ -106,6 +133,10 @@ L(Align16Both):
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	15(%edx, %esi), %bnd0
+	bndcu  	16(%ecx, %esi), %bnd1
+# endif
 	movaps	16(%ecx, %esi), %xmm4
 	movaps	%xmm3, (%edx, %esi)
 	pcmpeqd	%xmm4, %xmm0
@@ -115,6 +146,10 @@ L(Align16Both):
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	15(%edx, %esi), %bnd0
+	bndcu  	16(%ecx, %esi), %bnd1
+# endif
 	movaps	16(%ecx, %esi), %xmm1
 	movaps	%xmm4, (%edx, %esi)
 	pcmpeqd	%xmm1, %xmm0
@@ -124,6 +159,10 @@ L(Align16Both):
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	15(%edx, %esi), %bnd0
+	bndcu  	16(%ecx, %esi), %bnd1
+# endif
 	movaps	16(%ecx, %esi), %xmm2
 	movaps	%xmm1, (%edx, %esi)
 	pcmpeqd	%xmm2, %xmm0
@@ -133,6 +172,10 @@ L(Align16Both):
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	15(%edx, %esi), %bnd0
+	bndcu  	16(%ecx, %esi), %bnd1
+# endif
 	movaps	16(%ecx, %esi), %xmm3
 	movaps	%xmm2, (%edx, %esi)
 	pcmpeqd	%xmm3, %xmm0
@@ -142,6 +185,9 @@ L(Align16Both):
 	test	%eax, %eax
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	15(%edx, %esi), %bnd0
+# endif
 	movaps	%xmm3, (%edx, %esi)
 	mov	%ecx, %eax
 	lea	16(%ecx, %esi), %ecx
@@ -152,6 +198,9 @@ L(Align16Both):
 	mov	$-0x40, %esi
 
 L(Aligned64Loop):
+# ifdef __CHKP__
+	bndcu  	(%ecx), %bnd1
+# endif
 	movaps	(%ecx), %xmm2
 	movaps	32(%ecx), %xmm3
 	movaps	%xmm2, %xmm4
@@ -168,6 +217,9 @@ L(Aligned64Loop):
 
 	test	%eax, %eax
 	jnz	L(Aligned64Leave)
+# ifdef __CHKP__
+	bndcu  	-1(%edx), %bnd0
+# endif
 	movaps	%xmm4, -64(%edx)
 	movaps	%xmm5, -48(%edx)
 	movaps	%xmm6, -32(%edx)
@@ -182,6 +234,9 @@ L(Aligned64Leave):
 
 	pcmpeqd	%xmm5, %xmm0
 	pmovmskb %xmm0, %eax
+# ifdef __CHKP__
+	bndcu  	-49(%edx), %bnd0
+# endif
 	movaps	%xmm4, -64(%edx)
 	test	%eax, %eax
 	lea	16(%esi), %esi
@@ -189,11 +244,17 @@ L(Aligned64Leave):
 
 	pcmpeqd	%xmm6, %xmm0
 	pmovmskb %xmm0, %eax
+# ifdef __CHKP__
+	bndcu  	-33(%edx), %bnd0
+# endif
 	movaps	%xmm5, -48(%edx)
 	test	%eax, %eax
 	lea	16(%esi), %esi
 	jnz	L(CopyFrom1To16Bytes)
 
+# ifdef __CHKP__
+	bndcu  	-17(%edx), %bnd0
+# endif
 	movaps	%xmm6, -32(%edx)
 	pcmpeqd	%xmm7, %xmm0
 	pmovmskb %xmm0, %eax
@@ -202,11 +263,17 @@ L(Aligned64Leave):
 	jnz	L(CopyFrom1To16Bytes)
 
 	mov	$-0x40, %esi
+# ifdef __CHKP__
+	bndcu  	-1(%edx), %bnd0
+# endif
 	movaps	%xmm7, -16(%edx)
 	jmp	L(Aligned64Loop)
 
 	.p2align 4
 L(Shl4):
+# ifdef __CHKP__
+	bndcu  	12(%ecx), %bnd1
+# endif
 	movaps	-4(%ecx), %xmm1
 	movaps	12(%ecx), %xmm2
 L(Shl4Start):
@@ -218,6 +285,10 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	28(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	movaps	28(%ecx), %xmm2
 
@@ -231,6 +302,10 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	28(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	movaps	28(%ecx), %xmm2
 
@@ -244,6 +319,10 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	28(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	movaps	28(%ecx), %xmm2
 
@@ -256,6 +335,10 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	28(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	lea	28(%ecx), %ecx
 	lea	16(%edx), %edx
@@ -269,6 +352,9 @@ L(Shl4Start):
 	movaps	-4(%ecx), %xmm1
 
 L(Shl4LoopStart):
+# ifdef __CHKP__
+	bndcu  	12(%ecx), %bnd1
+# endif
 	movaps	12(%ecx), %xmm2
 	movaps	28(%ecx), %xmm3
 	movaps	%xmm3, %xmm6
@@ -290,6 +376,9 @@ L(Shl4LoopStart):
 	lea	64(%ecx), %ecx
 	palignr	$4, %xmm1, %xmm2
 	movaps	%xmm7, %xmm1
+# ifdef __CHKP__
+	bndcu  	63(%edx), %bnd0
+# endif
 	movaps	%xmm5, 48(%edx)
 	movaps	%xmm4, 32(%edx)
 	movaps	%xmm3, 16(%edx)
@@ -300,6 +389,9 @@ L(Shl4LoopStart):
 L(Shl4LoopExit):
 	movlpd	(%ecx), %xmm0
 	movl	8(%ecx), %esi
+# ifdef __CHKP__
+	bndcu  	11(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	movl	%esi, 8(%edx)
 	POP	(%esi)
@@ -310,6 +403,9 @@ L(Shl4LoopExit):
 	test	$0x01, %al
 	jnz	L(Exit4)
 	movlpd	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	movl	%edi, %eax
 	RETURN
@@ -318,6 +414,9 @@ L(Shl4LoopExit):
 
 	.p2align 4
 L(Shl8):
+# ifdef __CHKP__
+	bndcu  	8(%ecx), %bnd1
+# endif
 	movaps	-8(%ecx), %xmm1
 	movaps	8(%ecx), %xmm2
 L(Shl8Start):
@@ -329,6 +428,10 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	24(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	movaps	24(%ecx), %xmm2
 
@@ -342,6 +445,10 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	24(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	movaps	24(%ecx), %xmm2
 
@@ -355,6 +462,10 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	24(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	movaps	24(%ecx), %xmm2
 
@@ -367,6 +478,9 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+# endif
 	movaps	%xmm2, (%edx)
 	lea	24(%ecx), %ecx
 	lea	16(%edx), %edx
@@ -380,6 +494,9 @@ L(Shl8Start):
 	movaps	-8(%ecx), %xmm1
 
 L(Shl8LoopStart):
+# ifdef __CHKP__
+	bndcu  	8(%ecx), %bnd1
+# endif
 	movaps	8(%ecx), %xmm2
 	movaps	24(%ecx), %xmm3
 	movaps	%xmm3, %xmm6
@@ -401,6 +518,9 @@ L(Shl8LoopStart):
 	lea	64(%ecx), %ecx
 	palignr	$8, %xmm1, %xmm2
 	movaps	%xmm7, %xmm1
+# ifdef __CHKP__
+	bndcu  	63(%edx), %bnd0
+# endif
 	movaps	%xmm5, 48(%edx)
 	movaps	%xmm4, 32(%edx)
 	movaps	%xmm3, 16(%edx)
@@ -410,6 +530,9 @@ L(Shl8LoopStart):
 
 L(Shl8LoopExit):
 	movlpd	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	7(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	POP	(%esi)
 	add	$8, %edx
@@ -419,6 +542,9 @@ L(Shl8LoopExit):
 	test	$0x01, %al
 	jnz	L(Exit4)
 	movlpd	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	7(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	movl	%edi, %eax
 	RETURN
@@ -427,6 +553,9 @@ L(Shl8LoopExit):
 
 	.p2align 4
 L(Shl12):
+# ifdef __CHKP__
+	bndcu  	4(%ecx), %bnd1
+# endif
 	movaps	-12(%ecx), %xmm1
 	movaps	4(%ecx), %xmm2
 L(Shl12Start):
@@ -438,6 +567,10 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	20(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	movaps	20(%ecx), %xmm2
 
@@ -451,6 +584,10 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	20(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	movaps	20(%ecx), %xmm2
 
@@ -464,6 +601,10 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	20(%ecx), %bnd1
+# endif
 	movaps	%xmm2, (%edx)
 	movaps	20(%ecx), %xmm2
 
@@ -476,6 +617,9 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm3, %xmm2
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+# endif
 	movaps	%xmm2, (%edx)
 	lea	20(%ecx), %ecx
 	lea	16(%edx), %edx
@@ -489,6 +633,9 @@ L(Shl12Start):
 	movaps	-12(%ecx), %xmm1
 
 L(Shl12LoopStart):
+# ifdef __CHKP__
+	bndcu  	4(%ecx), %bnd1
+# endif
 	movaps	4(%ecx), %xmm2
 	movaps	20(%ecx), %xmm3
 	movaps	%xmm3, %xmm6
@@ -510,6 +657,9 @@ L(Shl12LoopStart):
 	lea	64(%ecx), %ecx
 	palignr	$12, %xmm1, %xmm2
 	movaps	%xmm7, %xmm1
+# ifdef __CHKP__
+	bndcu  	63(%edx), %bnd0
+# endif
 	movaps	%xmm5, 48(%edx)
 	movaps	%xmm4, 32(%edx)
 	movaps	%xmm3, 16(%edx)
@@ -519,6 +669,9 @@ L(Shl12LoopStart):
 
 L(Shl12LoopExit):
 	movl	(%ecx), %esi
+# ifdef __CHKP__
+	bndcu	3(%edx), %bnd0
+# endif
 	movl	%esi, (%edx)
 	mov	$4, %esi
 
@@ -533,6 +686,10 @@ L(CopyFrom1To16Bytes):
 	test	$0x01, %al
 	jnz	L(Exit4)
 L(Exit8):
+# ifdef __CHKP__
+	bndcu  	7(%edx), %bnd0
+	bndcu  	7(%ecx), %bnd1
+# endif
 	movlpd	(%ecx), %xmm0
 	movlpd	%xmm0, (%edx)
 	movl	%edi, %eax
@@ -543,6 +700,10 @@ L(ExitHigh):
 	test	$0x01, %ah
 	jnz	L(Exit12)
 L(Exit16):
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+	bndcu  	15(%ecx), %bnd1
+# endif
 	movdqu	(%ecx), %xmm0
 	movdqu	%xmm0, (%edx)
 	movl	%edi, %eax
@@ -550,6 +711,10 @@ L(Exit16):
 
 	.p2align 4
 L(Exit4):
+# ifdef __CHKP__
+	bndcu  	3(%edx), %bnd0
+	bndcu  	3(%ecx), %bnd1
+# endif
 	movl	(%ecx), %eax
 	movl	%eax, (%edx)
 	movl	%edi, %eax
@@ -557,6 +722,10 @@ L(Exit4):
 
 	.p2align 4
 L(Exit12):
+# ifdef __CHKP__
+	bndcu  	11(%edx), %bnd0
+	bndcu  	11(%ecx), %bnd1
+# endif
 	movlpd	(%ecx), %xmm0
 	movlpd	%xmm0, (%edx)
 	movl	8(%ecx), %eax
@@ -569,6 +738,9 @@ CFI_POP	(%edi)
 	.p2align 4
 L(ExitTail4):
 	movl	(%ecx), %eax
+# ifdef __CHKP__
+	bndcu  	3(%edx), %bnd0
+# endif
 	movl	%eax, (%edx)
 	movl	%edx, %eax
 	ret
@@ -576,6 +748,9 @@ L(ExitTail4):
 	.p2align 4
 L(ExitTail8):
 	movlpd	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	7(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	movl	%edx, %eax
 	ret
@@ -583,6 +758,9 @@ L(ExitTail8):
 	.p2align 4
 L(ExitTail12):
 	movlpd	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	11(%edx), %bnd0
+# endif
 	movlpd	%xmm0, (%edx)
 	movl	8(%ecx), %eax
 	movl	%eax, 8(%edx)
@@ -592,6 +770,9 @@ L(ExitTail12):
 	.p2align 4
 L(ExitTail16):
 	movdqu	(%ecx), %xmm0
+# ifdef __CHKP__
+	bndcu  	15(%edx), %bnd0
+# endif
 	movdqu	%xmm0, (%edx)
 	movl	%edx, %eax
 	ret
diff --git a/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/sysdeps/i386/i686/multiarch/wcslen-sse2.S
index a92b92f..9c53149 100644
--- a/sysdeps/i386/i686/multiarch/wcslen-sse2.S
+++ b/sysdeps/i386/i686/multiarch/wcslen-sse2.S
@@ -24,21 +24,47 @@
 	.text
 ENTRY (__wcslen_sse2)
 	mov	STR(%esp), %edx
+# ifdef __CHKP__
+	bndldx 	STR(%esp,%edx,1), %bnd0
+	bndcl  	(%edx),%bnd0
+	bndcu  	(%edx),%bnd0
+# endif
 
 	cmp	$0, (%edx)
 	jz	L(exit_tail0)
+# ifdef __CHKP__
+        bndcu   4(%edx), %bnd0
+# endif
 	cmp	$0, 4(%edx)
 	jz	L(exit_tail1)
+# ifdef __CHKP__
+        bndcu   8(%edx), %bnd0
+# endif
 	cmp	$0, 8(%edx)
 	jz	L(exit_tail2)
+# ifdef __CHKP__
+        bndcu   12(%edx), %bnd0
+# endif
 	cmp	$0, 12(%edx)
 	jz	L(exit_tail3)
+# ifdef __CHKP__
+        bndcu   16(%edx), %bnd0
+# endif
 	cmp	$0, 16(%edx)
 	jz	L(exit_tail4)
+# ifdef __CHKP__
+        bndcu   20(%edx), %bnd0
+# endif
 	cmp	$0, 20(%edx)
 	jz	L(exit_tail5)
+# ifdef __CHKP__
+        bndcu   24(%edx), %bnd0
+# endif
 	cmp	$0, 24(%edx)
 	jz	L(exit_tail6)
+# ifdef __CHKP__
+        bndcu   28(%edx), %bnd0
+# endif
 	cmp	$0, 28(%edx)
 	jz	L(exit_tail7)
 
@@ -48,6 +74,9 @@ ENTRY (__wcslen_sse2)
 	lea	16(%edx), %ecx
 	and	$-16, %eax
 
+# ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+# endif
 	pcmpeqd	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	pxor	%xmm1, %xmm1
@@ -55,6 +84,9 @@ ENTRY (__wcslen_sse2)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+# ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+# endif
 	pcmpeqd	(%eax), %xmm1
 	pmovmskb %xmm1, %edx
 	pxor	%xmm2, %xmm2
@@ -62,6 +94,9 @@ ENTRY (__wcslen_sse2)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+# ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+# endif
 	pcmpeqd	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	pxor	%xmm3, %xmm3
@@ -69,6 +104,9 @@ ENTRY (__wcslen_sse2)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+# ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+# endif
 	pcmpeqd	(%eax), %xmm3
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
@@ -79,6 +117,9 @@ ENTRY (__wcslen_sse2)
 
 	.p2align 4
 L(aligned_64_loop):
+# ifdef __CHKP__
+        bndcu   (%eax), %bnd0
+# endif
 	movaps	(%eax), %xmm0
 	movaps	16(%eax), %xmm1
 	movaps	32(%eax), %xmm2
@@ -129,6 +170,10 @@ L(exit):
 	mov	%dl, %cl
 	and	$15, %cl
 	jz	L(exit_1)
+# ifdef __CHKP__
+	mov	STR(%esp), %edx
+        bndcu   -1(%edx, %eax, 4), %bnd0
+# endif
 	ret
 
 	.p2align 4
@@ -137,16 +182,28 @@ L(exit_high):
 	and	$15, %ch
 	jz	L(exit_3)
 	add	$2, %eax
+# ifdef __CHKP__
+	mov	STR(%esp), %edx
+        bndcu   -1(%edx, %eax, 4), %bnd0
+# endif
 	ret
 
 	.p2align 4
 L(exit_1):
 	add	$1, %eax
+# ifdef __CHKP__
+	mov	STR(%esp), %edx
+        bndcu   -1(%edx, %eax, 4), %bnd0
+# endif
 	ret
 
 	.p2align 4
 L(exit_3):
 	add	$3, %eax
+# ifdef __CHKP__
+	mov	STR(%esp), %edx
+        bndcu   -1(%edx, %eax, 4), %bnd0
+# endif
 	ret
 
 	.p2align 4
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
index d31e48e..f7c70e6 100644
--- a/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -36,12 +36,23 @@
 # define STR1	PARMS
 # define STR2	STR1+4
 
+# ifdef __CHKP__
+# undef RETURN
+# define RETURN bndcu (%eax),%bnd0; \
+	 POP (%edi); ret; CFI_PUSH (%edi);
+# endif
+
 	atom_text_section
 ENTRY (__wcsrchr_sse2)
 
 	ENTRANCE
 	mov	STR1(%esp), %ecx
 	movd	STR2(%esp), %xmm1
+# ifdef __CHKP__
+	bndldx 	STR1(%esp,%ecx,1), %bnd0
+	bndcl  	(%ecx),%bnd0
+	bndcu  	(%ecx),%bnd0
+# endif
 
 	mov	%ecx, %edi
 	punpckldq %xmm1, %xmm1
@@ -137,6 +148,9 @@ L(unaligned_match):
 /* Loop start on aligned string.  */
 	.p2align 4
 L(loop):
+# ifdef __CHKP__
+	bndcu  	(%edi),%bnd0
+# endif
 	movdqa	(%edi), %xmm0
 	pcmpeqd	%xmm0, %xmm2
 	add	$16, %edi
@@ -146,6 +160,9 @@ L(loop):
 	or	%eax, %ecx
 	jnz	L(matches)
 
+# ifdef __CHKP__
+	bndcu  	(%edi),%bnd0
+# endif
 	movdqa	(%edi), %xmm3
 	pcmpeqd	%xmm3, %xmm2
 	add	$16, %edi
@@ -155,6 +172,9 @@ L(loop):
 	or	%eax, %ecx
 	jnz	L(matches)
 
+# ifdef __CHKP__
+	bndcu  	(%edi),%bnd0
+# endif
 	movdqa	(%edi), %xmm4
 	pcmpeqd	%xmm4, %xmm2
 	add	$16, %edi
@@ -164,6 +184,9 @@ L(loop):
 	or	%eax, %ecx
 	jnz	L(matches)
 
+# ifdef __CHKP__
+	bndcu  	(%edi),%bnd0
+# endif
 	movdqa	(%edi), %xmm5
 	pcmpeqd	%xmm5, %xmm2
 	add	$16, %edi
diff --git a/sysdeps/i386/i686/strcmp.S b/sysdeps/i386/i686/strcmp.S
index 6ca6220..67134af 100644
--- a/sysdeps/i386/i686/strcmp.S
+++ b/sysdeps/i386/i686/strcmp.S
@@ -29,8 +29,19 @@ ENTRY (strcmp)
 
 	movl	STR1(%esp), %ecx
 	movl	STR2(%esp), %edx
-
-L(oop):	movb	(%ecx), %al
+#ifdef __CHKP__
+	bndldx	STR1(%esp,%ecx,1), %bnd0
+	bndldx	STR2(%esp,%edx,1), %bnd1
+	bndcl	(%ecx), %bnd0
+	bndcl	(%edx), %bnd1
+#endif
+
+L(oop):
+#ifdef __CHKP__
+	bndcu	(%ecx), %bnd0
+	bndcu	(%edx), %bnd1
+#endif
+	movb	(%ecx), %al
 	cmpb	(%edx), %al
 	jne	L(neq)
 	incl	%ecx
diff --git a/sysdeps/i386/i686/strtok.S b/sysdeps/i386/i686/strtok.S
index 8848faf..78a2ea9 100644
--- a/sysdeps/i386/i686/strtok.S
+++ b/sysdeps/i386/i686/strtok.S
@@ -121,6 +121,14 @@ ENTRY (FUNCTION)
 	testl %edx, %edx
 	jz L(returnNULL)
 	movl DELIM(%esp), %eax		/* Get start of delimiter set.  */
+#ifdef __CHKP__
+	bndldx STR(%esp,%edx,1),%bnd0
+	bndldx DELIM(%esp,%eax,1),%bnd1
+	bndcl (%edx), %bnd0
+	bndcu (%edx), %bnd0
+	bndcl (%eax), %bnd1
+	bndcu (%eax), %bnd1
+#endif
 
 /* For understanding the following code remember that %ecx == 0 now.
    Although all the following instruction only modify %cl we always
diff --git a/sysdeps/i386/memchr.S b/sysdeps/i386/memchr.S
index 6799500..39fe616 100644
--- a/sysdeps/i386/memchr.S
+++ b/sysdeps/i386/memchr.S
@@ -51,6 +51,11 @@ ENTRY (__memchr)
 	movl LEN(%esp), %esi	/* len: length of memory block.  */
 	cfi_rel_offset (esi, 4)
 
+#ifdef __CHKP__
+	bndldx STR(%esp,%eax,1), %bnd0
+	bndcl (%eax), %bnd0
+#endif
+
 	/* If we must not test more than three characters, test
 	   them one by one.  This is especially true for 0.  */
 	cmpl $4, %esi
@@ -72,6 +77,9 @@ ENTRY (__memchr)
 
 	testb $3, %al		/* correctly aligned ? */
 	je L(2)			/* yes => begin loop */
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	cmpb %dl, (%eax)	/* compare byte */
 	je L(9)			/* target found => return */
 	incl %eax		/* increment source pointer */
@@ -80,6 +88,9 @@ ENTRY (__memchr)
 
 	testb $3, %al		/* correctly aligned ? */
 	je L(2)			/* yes => begin loop */
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	cmpb %dl, (%eax)	/* compare byte */
 	je L(9)			/* target found => return */
 	incl %eax		/* increment source pointer */
@@ -88,6 +99,9 @@ ENTRY (__memchr)
 
 	testb $3, %al		/* correctly aligned ? */
 	je L(2)			/* yes => begin loop */
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	cmpb %dl, (%eax)	/* compare byte */
 	je L(9)			/* target found => return */
 	incl %eax		/* increment source pointer */
@@ -127,7 +141,11 @@ ENTRY (__memchr)
 
 	ALIGN (4)
 
-L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+L(1):
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
 				   are now 0 */
@@ -162,6 +180,9 @@ L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 	   (following LL(13) below).  Even the len can be compared with
 	   constants instead of decrementing each time.  */
 
+#ifdef __CHKP__
+	bndcu 4(%eax), %bnd0
+#endif
 	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
@@ -176,6 +197,9 @@ L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(7)		/* found it => return pointer */
 
+#ifdef __CHKP__
+	bndcu 8(%eax), %bnd0
+#endif
 	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
@@ -190,6 +214,9 @@ L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(6)		/* found it => return pointer */
 
+#ifdef __CHKP__
+	bndcu 12(%eax), %bnd0
+#endif
 	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
@@ -213,6 +240,9 @@ L(2):	subl $16, %esi
 	cmpl $4-16, %esi	/* rest < 4 bytes? */
 	jb L(3)			/* yes, then test byte by byte */
 
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
@@ -231,6 +261,9 @@ L(2):	subl $16, %esi
 	cmpl $8-16, %esi	/* rest < 8 bytes? */
 	jb L(3)			/* yes, then test byte by byte */
 
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
@@ -249,6 +282,9 @@ L(2):	subl $16, %esi
 	cmpl $12-16, %esi	/* rest < 12 bytes? */
 	jb L(3)			/* yes, then test byte by byte */
 
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
@@ -268,18 +304,27 @@ L(2):	subl $16, %esi
 L(3):	andl $3, %esi		/* mask out uninteresting bytes */
 	jz L(4)			/* no remaining bytes => return NULL */
 
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	cmpb %dl, (%eax)	/* compare byte with CHR */
 	je L(9)			/* equal, then return pointer */
 	incl %eax		/* increment source pointer */
 	decl %esi		/* decrement length */
 	jz L(4)			/* no remaining bytes => return NULL */
 
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	cmpb %dl, (%eax)	/* compare byte with CHR */
 	je L(9)			/* equal, then return pointer */
 	incl %eax		/* increment source pointer */
 	decl %esi		/* decrement length */
 	jz L(4)			/* no remaining bytes => return NULL */
 
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	cmpb %dl, (%eax)	/* compare byte with CHR */
 	je L(9)			/* equal, then return pointer */
 
diff --git a/sysdeps/i386/memcmp.S b/sysdeps/i386/memcmp.S
index 21e0bfc..7beab65 100644
--- a/sysdeps/i386/memcmp.S
+++ b/sysdeps/i386/memcmp.S
@@ -37,6 +37,12 @@ ENTRY (memcmp)
 	cfi_rel_offset (esi, 0)
 	movl BLK2(%esp), %edi
 	movl LEN(%esp), %ecx
+#ifdef __CHKP__
+	bndldx BLK1(%esp,%esi,1), %bnd0
+	bndldx BLK2(%esp,%edi,1), %bnd1
+	bndcl (%esi), %bnd0
+	bndcl (%edi), %bnd1
+#endif
 
 	cld			/* Set direction of comparison.  */
 
@@ -59,7 +65,13 @@ ENTRY (memcmp)
 	   Note that the following operation does not change 0xffffffff.  */
 	orb $1, %al		/* Change 0 to 1.  */
 
-L(1):	popl %esi		/* Restore registers.  */
+L(1):
+#ifdef __CHKP__
+	bndcu (%esi), %bnd0
+	bndcu (%edi), %bnd1
+#endif
+	popl %esi		/* Restore registers.  */
+
 	cfi_adjust_cfa_offset (-4)
 	cfi_restore (esi)
 	movl %edx, %edi
diff --git a/sysdeps/i386/rawmemchr.S b/sysdeps/i386/rawmemchr.S
index 2bd20e0..27441dd 100644
--- a/sysdeps/i386/rawmemchr.S
+++ b/sysdeps/i386/rawmemchr.S
@@ -46,6 +46,11 @@ ENTRY (__rawmemchr)
 	movl STR(%esp), %eax
 	movl CHR(%esp), %edx
 
+#ifdef __CHKP__
+	bndldx STR(%esp,%eax,1), %bnd0
+	bndcl (%eax), %bnd0
+#endif
+
 	/* At the moment %edx contains C.  What we need for the
 	   algorithm is C in all bytes of the dword.  Avoid
 	   operations on 16 bit words because these require an
@@ -62,18 +67,27 @@ ENTRY (__rawmemchr)
 
 	testb $3, %al		/* correctly aligned ? */
 	je L(1)			/* yes => begin loop */
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	cmpb %dl, (%eax)	/* compare byte */
 	je L(9)			/* target found => return */
 	incl %eax		/* increment source pointer */
 
 	testb $3, %al		/* correctly aligned ? */
 	je L(1)			/* yes => begin loop */
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	cmpb %dl, (%eax)	/* compare byte */
 	je L(9)			/* target found => return */
 	incl %eax		/* increment source pointer */
 
 	testb $3, %al		/* correctly aligned ? */
 	je L(1)			/* yes => begin loop */
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	cmpb %dl, (%eax)	/* compare byte */
 	je L(9)			/* target found => return */
 	incl %eax		/* increment source pointer */
@@ -108,7 +122,11 @@ ENTRY (__rawmemchr)
 	/* Each round the main loop processes 16 bytes.  */
 	ALIGN (4)
 
-L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+L(1):
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
 				   are now 0 */
@@ -143,6 +161,9 @@ L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 	   (following LL(13) below).  Even the len can be compared with
 	   constants instead of decrementing each time.  */
 
+#ifdef __CHKP__
+	bndcu 4(%eax), %bnd0
+#endif
 	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
@@ -157,6 +178,9 @@ L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(7)		/* found it => return pointer */
 
+#ifdef __CHKP__
+	bndcu 8(%eax), %bnd0
+#endif
 	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
@@ -171,6 +195,9 @@ L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(6)		/* found it => return pointer */
 
+#ifdef __CHKP__
+	bndcu 12(%eax), %bnd0
+#endif
 	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
@@ -211,6 +238,9 @@ L(8):	testb %cl, %cl		/* test first byte in dword */
 	/* No further test needed; we know it is one of the four bytes.  */
 
 L(9):
+#ifdef __CHKP__
+	bndcu (%eax), %bnd0
+#endif
 	popl %edi		/* pop saved register */
 	cfi_adjust_cfa_offset (-4)
 	cfi_restore (edi)
diff --git a/sysdeps/i386/stpncpy.S b/sysdeps/i386/stpncpy.S
index b23e820..22d727a 100644
--- a/sysdeps/i386/stpncpy.S
+++ b/sysdeps/i386/stpncpy.S
@@ -42,6 +42,14 @@ ENTRY (__stpncpy)
 	movl SRC(%esp), %esi
 	cfi_rel_offset (esi, 0)
 	movl LEN(%esp), %ecx
+#ifdef __CHKP__
+	bndldx DEST(%esp,%eax,1), %bnd0
+	bndldx SRC(%esp,%esi,1), %bnd1
+	bndcl (%eax), %bnd0
+	bndcu -1(%eax, %ecx), %bnd0
+	bndcl (%esi), %bnd1
+	bndcu (%esi), %bnd1
+#endif
 
 	subl %eax, %esi		/* magic: reduce number of loop variants
 				   to one using addressing mode */
diff --git a/sysdeps/i386/strchrnul.S b/sysdeps/i386/strchrnul.S
index 7ceb88e..86bf770 100644
--- a/sysdeps/i386/strchrnul.S
+++ b/sysdeps/i386/strchrnul.S
@@ -38,6 +38,11 @@ ENTRY (__strchrnul)
 	movl STR(%esp), %eax
 	movl CHR(%esp), %edx
 
+# ifdef __CHKP__
+	bndldx STR(%esp,%eax,1), %bnd0
+	bndcl (%eax), %bnd0
+	bndcu (%eax), %bnd0
+# endif
 	/* At the moment %edx contains CHR.  What we need for the
 	   algorithm is CHR in all bytes of the dword.  Avoid
 	   operations on 16 bit words because these require an
@@ -60,6 +65,9 @@ ENTRY (__strchrnul)
 
 	testb $3, %al		/* correctly aligned ? */
 	jz L(11)		/* yes => begin loop */
+# ifdef __CHKP__
+	bndcu (%eax), %bnd0
+# endif
 	movb (%eax), %cl	/* load byte in question (we need it twice) */
 	cmpb %cl, %dl		/* compare byte */
 	je L(6)			/* target found => return */
@@ -69,6 +77,9 @@ ENTRY (__strchrnul)
 
 	testb $3, %al		/* correctly aligned ? */
 	jz L(11)		/* yes => begin loop */
+# ifdef __CHKP__
+	bndcu (%eax), %bnd0
+# endif
 	movb (%eax), %cl	/* load byte in question (we need it twice) */
 	cmpb %cl, %dl		/* compare byte */
 	je L(6)			/* target found => return */
@@ -78,6 +89,9 @@ ENTRY (__strchrnul)
 
 	testb $3, %al		/* correctly aligned ? */
 	jz L(11)		/* yes => begin loop */
+# ifdef __CHKP__
+	bndcu (%eax), %bnd0
+# endif
 	movb (%eax), %cl	/* load byte in question (we need it twice) */
 	cmpb %cl, %dl		/* compare byte */
 	je L(6)			/* target found => return */
@@ -120,7 +134,11 @@ ENTRY (__strchrnul)
 
 L(1):	addl $16, %eax		/* adjust pointer for whole round */
 
-L(11):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+L(11):
+# ifdef __CHKP__
+	bndcu (%eax), %bnd0
+# endif
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
 				   are now 0 */
 	movl $0xfefefeff, %edi	/* magic value */
@@ -164,6 +182,9 @@ L(11):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(7)		/* found NUL => return NULL */
 
+# ifdef __CHKP__
+	bndcu 4(%eax), %bnd0
+# endif
 	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
 				   are now 0 */
@@ -189,6 +210,9 @@ L(11):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(71)		/* found NUL => return NULL */
 
+# ifdef __CHKP__
+	bndcu 8(%eax), %bnd0
+# endif
 	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
 				   are now 0 */
@@ -214,6 +238,9 @@ L(11):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(72)		/* found NUL => return NULL */
 
+# ifdef __CHKP__
+	bndcu 12(%eax), %bnd0
+# endif
 	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
 	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
 				   are now 0 */
@@ -268,7 +295,11 @@ L(7):	testb %cl, %cl		/* is first byte CHR? */
 	/* It must be in the fourth byte and it cannot be NUL.  */
 	incl %eax
 
-L(6):	popl %edi		/* restore saved register content */
+L(6):
+# ifdef __CHKP__
+	bndcu (%eax), %bnd0
+# endif
+	popl %edi		/* restore saved register content */
 	cfi_adjust_cfa_offset (-4)
 	cfi_restore (edi)
 
diff --git a/sysdeps/i386/strcspn.S b/sysdeps/i386/strcspn.S
index 0c262d6..1352b03 100644
--- a/sysdeps/i386/strcspn.S
+++ b/sysdeps/i386/strcspn.S
@@ -32,6 +32,14 @@ ENTRY (strcspn)
 
 	movl STR(%esp), %edx
 	movl STOP(%esp), %eax
+#ifdef __CHKP__
+	bndldx STR(%esp,%edx,1), %bnd0
+	bndldx STOP(%esp,%eax,1), %bnd1
+	bndcl (%edx), %bnd0
+	bndcl (%eax), %bnd1
+	bndcu (%edx), %bnd0
+	bndcu (%eax), %bnd1
+#endif
 
 	/* First we create a table with flags for all possible characters.
 	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
diff --git a/sysdeps/i386/strpbrk.S b/sysdeps/i386/strpbrk.S
index 246ae27..7190a06 100644
--- a/sysdeps/i386/strpbrk.S
+++ b/sysdeps/i386/strpbrk.S
@@ -33,6 +33,14 @@ ENTRY (strpbrk)
 
 	movl STR(%esp), %edx
 	movl STOP(%esp), %eax
+#ifdef __CHKP__
+	bndldx STR(%esp,%edx,1), %bnd0
+	bndldx STOP(%esp,%eax,1), %bnd1
+	bndcl (%edx), %bnd0
+	bndcl (%eax), %bnd1
+	bndcu (%edx), %bnd0
+	bndcu (%eax), %bnd1
+#endif
 
 	/* First we create a table with flags for all possible characters.
 	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
diff --git a/sysdeps/i386/strrchr.S b/sysdeps/i386/strrchr.S
index 31b8a45..858bba4 100644
--- a/sysdeps/i386/strrchr.S
+++ b/sysdeps/i386/strrchr.S
@@ -40,6 +40,10 @@ ENTRY (strrchr)
 	movl STR(%esp), %esi
 	cfi_rel_offset (esi, 0)
 	movl CHR(%esp), %ecx
+#ifdef __CHKP__
+	bndldx STR(%esp,%esi,1), %bnd0
+	bndcl (%esi), %bnd0
+#endif
 
 	/* At the moment %ecx contains C.  What we need for the
 	   algorithm is C in all bytes of the dword.  Avoid
@@ -63,6 +67,9 @@ ENTRY (strrchr)
 
 	testl $3, %esi		/* correctly aligned ? */
 	jz L(19)		/* yes => begin loop */
+#ifdef __CHKP__
+	bndcu (%esi), %bnd0
+#endif
 	movb (%esi), %dl	/* load byte in question (we need it twice) */
 	cmpb %dl, %cl		/* compare byte */
 	jne L(11)			/* target found => return */
@@ -73,6 +80,9 @@ L(11):	orb %dl, %dl		/* is NUL? */
 
 	testl $3, %esi		/* correctly aligned ? */
 	jz L(19)		/* yes => begin loop */
+#ifdef __CHKP__
+	bndcu (%esi), %bnd0
+#endif
 	movb (%esi), %dl	/* load byte in question (we need it twice) */
 	cmpb %dl, %cl		/* compare byte */
 	jne L(12)			/* target found => return */
@@ -83,6 +93,9 @@ L(12):	orb %dl, %dl		/* is NUL? */
 
 	testl $3, %esi		/* correctly aligned ? */
 	jz L(19)		/* yes => begin loop */
+#ifdef __CHKP__
+	bndcu (%esi), %bnd0
+#endif
 	movb (%esi), %dl	/* load byte in question (we need it twice) */
 	cmpb %dl, %cl		/* compare byte */
 	jne L(13)			/* target found => return */
@@ -170,7 +183,11 @@ L(51):
 
 L(1):	addl $16, %esi		/* increment pointer for full round */
 
-L(19):	movl (%esi), %edx	/* get word (= 4 bytes) in question */
+L(19):
+#ifdef __CHKP__
+	bndcu (%esi), %bnd0
+#endif
+	movl (%esi), %edx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %edx, %edi		/* add the magic value to the word.  We get
 				   carry bits reported for each byte which
@@ -214,6 +231,9 @@ L(19):	movl (%esi), %edx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(3)		/* C is detected in the word => examine it */
 
+#ifdef __CHKP__
+	bndcu 4(%esi), %bnd0
+#endif
 	movl 4(%esi), %edx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %edx, %edi		/* add the magic value to the word.  We get
@@ -238,6 +258,9 @@ L(19):	movl (%esi), %edx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(31)		/* C is detected in the word => examine it */
 
+#ifdef __CHKP__
+	bndcu 8(%esi), %bnd0
+#endif
 	movl 8(%esi), %edx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %edx, %edi		/* add the magic value to the word.  We get
@@ -262,6 +285,9 @@ L(19):	movl (%esi), %edx	/* get word (= 4 bytes) in question */
 				   the addition will not result in 0.  */
 	jnz L(32)		/* C is detected in the word => examine it */
 
+#ifdef __CHKP__
+	bndcu 12(%esi), %bnd0
+#endif
 	movl 12(%esi), %edx	/* get word (= 4 bytes) in question */
 	movl $0xfefefeff, %edi	/* magic value */
 	addl %edx, %edi		/* add the magic value to the word.  We get
diff --git a/sysdeps/i386/strtok.S b/sysdeps/i386/strtok.S
index 79d540b..cfee507 100644
--- a/sysdeps/i386/strtok.S
+++ b/sysdeps/i386/strtok.S
@@ -67,6 +67,11 @@ ENTRY (FUNCTION)
 	movl STR(%esp), %edx
 	movl DELIM(%esp), %eax
 
+#ifdef __CHKP__
+	bndldx STR(%esp,%edx,1), %bnd0
+	bndldx DELIM(%esp,%eax,1), %bnd1
+#endif
+
 #if !defined USE_AS_STRTOK_R && defined PIC
 	pushl %ebx			/* Save PIC register.  */
 	cfi_adjust_cfa_offset (4)
@@ -336,6 +341,9 @@ L(11):
 	/* Store the pointer to the next character.  */
 #ifdef USE_AS_STRTOK_R
 	movl SAVE(%esp), %ecx
+# ifdef __CHKP__
+	bndmov %bnd2, %bnd0
+# endif
 #endif
 	movl %edx, SAVE_PTR
 
@@ -351,6 +359,9 @@ L(returnNULL):
 	xorl %eax, %eax
 #ifdef USE_AS_STRTOK_R
 	movl SAVE(%esp), %ecx
+# ifdef __CHKP__
+	bndmov %bnd2, %bnd0
+# endif
 #endif
 	movl %edx, SAVE_PTR
 	jmp L(epilogue)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b06794c2e76de78232cd20bf331ce8913f2fd764

commit b06794c2e76de78232cd20bf331ce8913f2fd764
Author: Liubov Dmitrieva <ldmitrie@sourceware.org>
Date:   Mon Aug 26 16:51:26 2013 +0400

    Warning! Temporarily use -O0 for the vfprintf.c file if MPX is enabled, because of a compiler bug related to MPX.

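    (Presumably -D__OPTIMIZE__ is added back by hand because vfprintf.c
    contains code guarded by #ifdef __OPTIMIZE__ that still has to compile
    once -O0 stops defining that macro.)
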
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 658804b..12befcc 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -77,6 +77,9 @@ $(objpfx)tst-printf.out: tst-printf.sh $(objpfx)tst-printf
 endif
 
 CFLAGS-vfprintf.c = -Wno-uninitialized
+ifeq ($(enable-mpx), yes)
+CFLAGS-vfprintf.c += -O0 -D__OPTIMIZE__
+endif
 CFLAGS-vfwprintf.c = -Wno-uninitialized
 CFLAGS-tst-printf.c = -Wno-format
 CFLAGS-tstdiomisc.c = -Wno-format

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b18de4c90e1c54330af6d2ebf43c3cfa6e6600b6

commit b18de4c90e1c54330af6d2ebf43c3cfa6e6600b6
Author: Liubov Dmitrieva <ldmitrie@sourceware.org>
Date:   Mon Sep 2 13:21:47 2013 +0400

    Add --enable-mpx option to configure for Intel MPX support.
    
    Conflicts:
    	elf/Makefile

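    With this in place, an MPX-instrumented build is requested at configure
    time via --enable-mpx (e.g. ../glibc/configure --enable-mpx; the path is
    illustrative), which appends -fcheck-pointers -mmpx to CFLAGS and the
    corresponding flags to ASFLAGS, as the hunks below show.
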
diff --git a/config.make.in b/config.make.in
index 7b04568..8c1228d 100644
--- a/config.make.in
+++ b/config.make.in
@@ -96,12 +96,14 @@ build-nscd = @build_nscd@
 use-nscd = @use_nscd@
 build-hardcoded-path-in-tests= @hardcoded_path_in_tests@
 build-pt-chown = @build_pt_chown@
+enable-mpx = @enable_mpx@
 
 # Build tools.
 CC = @CC@
 CXX = @CXX@
 BUILD_CC = @BUILD_CC@
 CFLAGS = @CFLAGS@
+ASFLAGS = @ASFLAGS@
 CPPFLAGS-config = @CPPFLAGS@
 CPPUNDEFS = @CPPUNDEFS@
 ASFLAGS-config = @ASFLAGS_config@
diff --git a/configure b/configure
index afe7821..7bb8bf2 100755
--- a/configure
+++ b/configure
@@ -653,6 +653,8 @@ link_obsolete_rpc
 libc_cv_nss_crypt
 all_warnings
 force_install
+ASFLAGS
+enable_mpx
 bindnow
 hardcoded_path_in_tests
 oldest_abi
@@ -747,6 +749,7 @@ enable_lock_elision
 enable_add_ons
 enable_hidden_plt
 enable_bind_now
+enable_mpx
 enable_static_nss
 enable_force_install
 enable_kernel
@@ -1409,6 +1412,7 @@ Optional Features:
                           for add-ons if no parameter given
   --disable-hidden-plt    do not hide internal function calls to avoid PLT
   --enable-bind-now       disable lazy relocations in DSOs
+  --enable-mpx            turn on Intel MPX extension
   --enable-static-nss     build static NSS modules [default=no]
   --disable-force-install don't force installation of files from this package,
                           even if they are older than the installed files
@@ -3519,6 +3523,24 @@ fi
 
 
 
+# Check whether --enable-mpx was given.
+if test "${enable_mpx+set}" = set; then :
+  enableval=$enable_mpx; enable_mpx=$enableval
+else
+  enable_mpx=no
+fi
+
+
+
+
+if test "$ac_test_CFLAGS" != set && test "$enable_mpx" = yes ; then
+	CFLAGS="$CFLAGS -g -fcheck-pointers -mmpx -fno-chkp-check-incomplete-type";
+fi
+
+if test "$enable_mpx" = yes ; then
+   ASFLAGS="$ASFLAGS -g -fcheck-pointers -mmpx -Wa,-madd-bnd-prefix"
+fi
+
 # Check whether --enable-static-nss was given.
 if test "${enable_static_nss+set}" = set; then :
   enableval=$enable_static_nss; static_nss=$enableval
diff --git a/configure.in b/configure.in
index 9172ad1..d7eb9a6 100644
--- a/configure.in
+++ b/configure.in
@@ -216,6 +216,22 @@ AC_ARG_ENABLE([bind-now],
 	      [bindnow=no])
 AC_SUBST(bindnow)
 
+AC_ARG_ENABLE([mpx],
+	      AC_HELP_STRING([--enable-mpx],
+			     [turn on Intel MPX extension]),
+	      [enable_mpx=$enableval],
+	      [enable_mpx=no])
+
+AC_SUBST(enable_mpx)
+AC_SUBST(ASFLAGS)
+if test "$ac_test_CFLAGS" != set && test "$enable_mpx" = yes ; then
+	CFLAGS="$CFLAGS -g -fcheck-pointers -mmpx -fno-chkp-check-incomplete-type";
+fi
+
+if test "$enable_mpx" = yes ; then
+   ASFLAGS="$ASFLAGS -g -fcheck-pointers -mmpx -Wa,-madd-bnd-prefix"
+fi
+
 dnl On some platforms we cannot use dynamic loading.  We must provide
 dnl static NSS modules.
 AC_ARG_ENABLE([static-nss],
diff --git a/elf/Makefile b/elf/Makefile
index 4ef80c9..2bdf045 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -49,7 +49,10 @@ all-rtld-routines = $(rtld-routines) $(sysdep-rtld-routines)
 CFLAGS-dl-runtime.c = -fexceptions -fasynchronous-unwind-tables
 CFLAGS-dl-lookup.c = -fexceptions -fasynchronous-unwind-tables
 CFLAGS-dl-iterate-phdr.c = $(uses-callbacks)
+
+ifeq ($(enable-mpx), yes)
 CFLAGS-dl-init.c = -fno-check-pointers
+endif
 
 ifeq ($(unwind-find-fde),yes)
 routines += unwind-dw2-fde-glibc
diff --git a/manual/install.texi b/manual/install.texi
index 4575d22..68dab0d 100644
--- a/manual/install.texi
+++ b/manual/install.texi
@@ -177,6 +177,9 @@ setuid and owned by @code{root}.  The use of @file{pt_chown} introduces
 additional security risks to the system and you should enable it only if
 you understand and accept those risks.
 
+@item --enable-mpx
+By default, the Intel MPX extension is disabled.  This option turns it on.
+
 @item --build=@var{build-system}
 @itemx --host=@var{host-system}
 These options are for cross-compiling.  If you specify both options and

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1002f6c2a45ba235dd53d7e02ee9ad24cd5743b8

commit 1002f6c2a45ba235dd53d7e02ee9ad24cd5743b8
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Sun May 19 18:30:05 2013 +0400

    Support new siginfo in Glibc for Intel MPX.

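    A minimal sketch of how a handler could consume the new fields; the
    handler name and the printing are illustrative, and si_lower/si_upper
    exist only when glibc is built with __CHKP__ defined:

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* Report the violated bounds for an MPX-raised SIGSEGV.
	   (fprintf is not async-signal-safe; demo use only.)  */
	static void
	bounds_handler (int sig, siginfo_t *si, void *ctx)
	{
	#ifdef __CHKP__
	  fprintf (stderr, "access at %p outside [%p, %p]\n",
		   si->si_addr, si->si_lower, si->si_upper);
	#endif
	  _exit (1);
	}

	int
	main (void)
	{
	  struct sigaction sa;
	  memset (&sa, 0, sizeof sa);
	  sa.sa_sigaction = bounds_handler;
	  sa.sa_flags = SA_SIGINFO;
	  sigemptyset (&sa.sa_mask);
	  sigaction (SIGSEGV, &sa, NULL);
	  /* ... code whose pointer accesses are bounds-checked ... */
	  return 0;
	}
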
diff --git a/sysdeps/unix/sysv/linux/x86/bits/siginfo.h b/sysdeps/unix/sysv/linux/x86/bits/siginfo.h
index bfc6aa3..23d946c 100644
--- a/sysdeps/unix/sysv/linux/x86/bits/siginfo.h
+++ b/sysdeps/unix/sysv/linux/x86/bits/siginfo.h
@@ -108,6 +108,10 @@ typedef struct
 	  {
 	    void *si_addr;	/* Faulting insn/memory ref.  */
 	    short int si_addr_lsb;	/* Valid LSB of the reported address.  */
+# ifdef __CHKP__
+	    void *si_lower;
+	    void *si_upper;
+# endif
 	  } _sigfault;
 
 	/* SIGPOLL.  */
@@ -141,6 +145,10 @@ typedef struct
 # define si_ptr		_sifields._rt.si_sigval.sival_ptr
 # define si_addr	_sifields._sigfault.si_addr
 # define si_addr_lsb	_sifields._sigfault.si_addr_lsb
+# ifdef __CHKP__
+#  define si_lower	_sifields._sigfault.si_lower
+#  define si_upper	_sifields._sigfault.si_upper
+# endif
 # define si_band	_sifields._sigpoll.si_band
 # define si_fd		_sifields._sigpoll.si_fd
 # define si_call_addr 	_sifields._sigsys._call_addr

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9096323b8cb7bfe15b5f7137dc24f92f5ba6dd17

commit 9096323b8cb7bfe15b5f7137dc24f92f5ba6dd17
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Fri Jan 25 18:40:50 2013 +0400

    Intel MPX support for x86_64 and x86_32 pthread routines.
    Always use INIT bounds in __tls_get_addr.
    Set bounds manually in _Unwind_Resume.

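    For reference, the two compiler intrinsics this work leans on are GCC's
    MPX builtins: __bnd_init_ptr_bounds returns its argument carrying INIT
    (widest) bounds, so subsequent checks always pass, and
    __bnd_set_ptr_bounds returns it bounded to the given size.  A minimal
    sketch (the wrapper name is illustrative):

	#include <stddef.h>

	/* Return P with INIT bounds, or narrowed to SIZE bytes.  */
	static void *
	rebound (void *p, size_t size, int widen)
	{
	#ifdef __CHKP__
	  return widen ? __bnd_init_ptr_bounds (p)
		       : __bnd_set_ptr_bounds (p, size);
	#else
	  (void) size;
	  (void) widen;
	  return p;
	#endif
	}
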
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 576d9a1..ee84fa6 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -767,6 +767,9 @@ update_get_addr (GET_ADDR_ARGS)
 void *
 __tls_get_addr (GET_ADDR_ARGS)
 {
+#ifdef __CHKP__
+  GET_ADDR_PARAM = __bnd_init_ptr_bounds (GET_ADDR_PARAM);
+#endif
   dtv_t *dtv = THREAD_DTV ();
 
   if (__builtin_expect (dtv[0].counter != GL(dl_tls_generation), 0))
diff --git a/nptl/sysdeps/unix/sysv/linux/i386/i486/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/i386/i486/pthread_cond_timedwait.S
index a6d6bc4..973ff0e 100644
--- a/nptl/sysdeps/unix/sysv/linux/i386/i486/pthread_cond_timedwait.S
+++ b/nptl/sysdeps/unix/sysv/linux/i386/i486/pthread_cond_timedwait.S
@@ -94,6 +94,13 @@ __pthread_cond_timedwait:
 	je	.Lreltmo
 #endif
 
+#ifdef __CHKP__
+	bndldx	(%esp,%ebx,1), %bnd0
+	bndldx	28(%esp,%ebp,1), %bnd2
+	bndmov	%bnd0, 48(%esp)
+	bndmov	%bnd2, 80(%esp)
+#endif
+
 	/* Get internal lock.  */
 	movl	$1, %edx
 	xorl	%eax, %eax
@@ -109,12 +116,24 @@ __pthread_cond_timedwait:
 	   different value in there this is a bad user bug.  */
 2:	cmpl	$-1, dep_mutex(%ebx)
 	movl	24(%esp), %eax
+#ifdef __CHKP__
+	bndldx	4(%esp,%eax,1), %bnd1
+	bndmov	%bnd1, 64(%esp)
+#endif
 	je	17f
 	movl	%eax, dep_mutex(%ebx)
 
 	/* Unlock the mutex.  */
 17:	xorl	%edx, %edx
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_unlock_usercnt
+#ifdef __CHKP__
+	bndmov 48(%esp), %bnd0
+	bndmov 64(%esp), %bnd1
+	bndmov 80(%esp), %bnd2
+#endif
 
 	testl	%eax, %eax
 	jne	16f
@@ -296,9 +315,25 @@ __pthread_cond_timedwait:
 	   should always succeed or else the kernel did not lock the mutex
 	   correctly.  */
 	movl	dep_mutex(%ebx), %eax
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 48(%esp), %bnd0
+	bndmov 64(%esp), %bnd1
+	bndmov 80(%esp), %bnd2
+#endif
 	xorl	%edx, %edx
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_unlock_usercnt
+#ifdef __CHKP__
+	bndmov 48(%esp), %bnd0
+	bndmov 64(%esp), %bnd1
+	bndmov 80(%esp), %bnd2
+#endif
 	jmp	8b
 
 28:	addl	$1, wakeup_seq(%ebx)
@@ -356,8 +391,15 @@ __pthread_cond_timedwait:
 	movl	16(%esp), %ecx
 	testl	%ecx, %ecx
 	jnz	27f
-
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_cond_lock
+#ifdef __CHKP__
+	bndmov 48(%esp), %bnd0
+	bndmov 64(%esp), %bnd1
+	bndmov 80(%esp), %bnd2
+#endif
 26:	addl	$FRAME_SIZE, %esp
 	cfi_adjust_cfa_offset(-FRAME_SIZE)
 
@@ -388,7 +430,16 @@ __pthread_cond_timedwait:
 
 	cfi_restore_state
 
-27:	call	__pthread_mutex_cond_lock_adjust
+27:
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
+	call	__pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 48(%esp), %bnd0
+	bndmov 64(%esp), %bnd1
+	bndmov 80(%esp), %bnd2
+#endif
 	xorl	%eax, %eax
 	jmp	26b
 
@@ -529,7 +580,15 @@ __pthread_cond_timedwait:
 
 	/* Unlock the mutex.  */
 117:	xorl	%edx, %edx
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_unlock_usercnt
+#ifdef __CHKP__
+	bndmov 48(%esp), %bnd0
+	bndmov 64(%esp), %bnd1
+	bndmov 80(%esp), %bnd2
+#endif
 
 	testl	%eax, %eax
 	jne	16b
@@ -899,10 +958,27 @@ __condvar_tw_cleanup:
 	cmpl	%ebx, %gs:TID
 	jne	8f
 	/* We managed to get the lock.  Fix it up before returning.  */
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 48(%esp), %bnd0
+	bndmov 64(%esp), %bnd1
+	bndmov 80(%esp), %bnd2
+#endif
 	jmp	9f
 
-8:	call	__pthread_mutex_cond_lock
+8:
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
+	call	__pthread_mutex_cond_lock
+#ifdef __CHKP__
+	bndmov 48(%esp), %bnd0
+	bndmov 64(%esp), %bnd1
+	bndmov 80(%esp), %bnd2
+#endif
 
 9:	movl	%esi, (%esp)
 .LcallUR:
diff --git a/nptl/sysdeps/unix/sysv/linux/i386/i486/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/i386/i486/pthread_cond_wait.S
index 9695dcb..af53cbf 100644
--- a/nptl/sysdeps/unix/sysv/linux/i386/i486/pthread_cond_wait.S
+++ b/nptl/sysdeps/unix/sysv/linux/i386/i486/pthread_cond_wait.S
@@ -60,6 +60,10 @@ __pthread_cond_wait:
 
 	xorl	%esi, %esi
 	movl	20(%esp), %ebx
+#ifdef __CHKP__
+	bndldx	(%esp,%ebx,1), %bnd0
+	bndmov	%bnd0, 32(%esp)
+#endif
 
 	LIBC_PROBE (cond_wait, 2, 24(%esp), %ebx)
 
@@ -78,12 +82,23 @@ __pthread_cond_wait:
 	   different value in there this is a bad user bug.  */
 2:	cmpl	$-1, dep_mutex(%ebx)
 	movl	24(%esp), %eax
+#ifdef __CHKP__
+	bndldx	4(%esp,%eax,1), %bnd1
+	bndmov	%bnd1, 48(%esp)
+#endif
 	je	15f
 	movl	%eax, dep_mutex(%ebx)
 
 	/* Unlock the mutex.  */
 15:	xorl	%edx, %edx
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_unlock_usercnt
+#ifdef __CHKP__
+	bndmov 32(%esp), %bnd0
+	bndmov 48(%esp), %bnd1
+#endif
 
 	testl	%eax, %eax
 	jne	12f
@@ -270,7 +285,14 @@ __pthread_cond_wait:
 	testl	%ecx, %ecx
 	jnz	21f
 
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_cond_lock
+#ifdef __CHKP__
+	bndmov 32(%esp), %bnd0
+	bndmov 48(%esp), %bnd1
+#endif
 20:	addl	$FRAME_SIZE, %esp
 	cfi_adjust_cfa_offset(-FRAME_SIZE);
 
@@ -292,7 +314,15 @@ __pthread_cond_wait:
 
 	cfi_restore_state
 
-21:	call	__pthread_mutex_cond_lock_adjust
+21:
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
+	call	__pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 32(%esp), %bnd0
+	bndmov 48(%esp), %bnd1
+#endif
 	xorl	%eax, %eax
 	jmp	20b
 
@@ -308,9 +338,23 @@ __pthread_cond_wait:
 	   should always succeed or else the kernel did not lock the mutex
 	   correctly.  */
 	movl	dep_mutex(%ebx), %eax
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call    __pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 32(%esp), %bnd0
+	bndmov 48(%esp), %bnd1
+#endif
 	xorl	%edx, %edx
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_unlock_usercnt
+#ifdef __CHKP__
+	bndmov 32(%esp), %bnd0
+	bndmov 48(%esp), %bnd1
+#endif
 	jmp	8b
 
 	/* Initial locking failed.  */
@@ -581,10 +625,25 @@ __condvar_w_cleanup:
 	cmpl	%ebx, %gs:TID
 	jne	8f
 	/* We managed to get the lock.  Fix it up before returning.  */
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	call	__pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 32(%esp), %bnd0
+	bndmov 48(%esp), %bnd1
+#endif
 	jmp	9f
 
-8:	call	__pthread_mutex_cond_lock
+8:
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
+	call	__pthread_mutex_cond_lock
+#ifdef __CHKP__
+	bndmov 32(%esp), %bnd0
+	bndmov 48(%esp), %bnd1
+#endif
 
 9:	movl	%esi, (%esp)
 .LcallUR:
diff --git a/nptl/sysdeps/unix/sysv/linux/i386/pthread_once.S b/nptl/sysdeps/unix/sysv/linux/i386/pthread_once.S
index b405b9e..7104fba 100644
--- a/nptl/sysdeps/unix/sysv/linux/i386/pthread_once.S
+++ b/nptl/sysdeps/unix/sysv/linux/i386/pthread_once.S
@@ -114,6 +114,9 @@ __pthread_once:
 	jne	7f
 
 	leal	8(%esp), %eax
+#ifdef __CHKP__
+	bndldx	8(%esp,%eax,1), %bnd0
+#endif
 	call	HIDDEN_JUMPTARGET(__pthread_register_cancel)
 
 	/* Call the user-provided initialization function.  */
@@ -121,6 +124,9 @@ __pthread_once:
 
 	/* Pop the cleanup handler.  */
 	leal	8(%esp), %eax
+#ifdef __CHKP__
+	bndldx	8(%esp,%eax,1), %bnd0
+#endif
 	call	HIDDEN_JUMPTARGET(__pthread_unregister_cancel)
 	addl	$UNWINDBUFSIZE+8, %esp
 	cfi_adjust_cfa_offset (-UNWINDBUFSIZE-8)
@@ -168,6 +174,9 @@ __pthread_once:
 	ENTER_KERNEL
 
 	leal	8(%esp), %eax
+#ifdef __CHKP__
+	bndldx	8(%esp,%eax,1), %bnd0
+#endif
 	call	HIDDEN_JUMPTARGET (__pthread_unwind_next)
 	/* NOTREACHED */
 	hlt
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
index 6c1a75f..dc15345 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
@@ -99,6 +99,12 @@ __pthread_cond_timedwait:
 	movq	%rsi, 16(%rsp)
 	movq	%rdx, %r13
 
+#ifdef __CHKP__
+	bndmov %bnd0, 72(%rsp)
+	bndmov %bnd1, 88(%rsp)
+	bndmov %bnd2, 104(%rsp)
+#endif
+
 	je	22f
 	mov	%RSI_LP, dep_mutex(%rdi)
 
@@ -128,7 +134,15 @@ __pthread_cond_timedwait:
 	/* Unlock the mutex.  */
 32:	movq	16(%rsp), %rdi
 	xorl	%esi, %esi
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	callq	__pthread_mutex_unlock_usercnt
+#ifdef __CHKP__
+	bndmov 72(%rsp), %bnd0
+	bndmov 88(%rsp), %bnd1
+	bndmov 104(%rsp), %bnd2
+#endif
 
 	testl	%eax, %eax
 	jne	46f
@@ -338,7 +352,15 @@ __pthread_cond_timedwait:
 	testb	%r15b, %r15b
 	jnz	64f
 
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	callq	__pthread_mutex_cond_lock
+#ifdef __CHKP__
+	bndmov 72(%rsp), %bnd0
+	bndmov 88(%rsp), %bnd1
+	bndmov 104(%rsp), %bnd2
+#endif
 
 63:	testq	%rax, %rax
 	cmoveq	%r14, %rax
@@ -362,7 +384,16 @@ __pthread_cond_timedwait:
 
 	cfi_restore_state
 
-64:	callq	__pthread_mutex_cond_lock_adjust
+64:
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
+	callq	__pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 72(%rsp), %bnd0
+	bndmov 88(%rsp), %bnd1
+	bndmov 104(%rsp), %bnd2
+#endif
 	movq	%r14, %rax
 	jmp	48b
 
@@ -457,7 +488,15 @@ __pthread_cond_timedwait:
 	/* Unlock the mutex.  */
 2:	movq	16(%rsp), %rdi
 	xorl	%esi, %esi
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	callq	__pthread_mutex_unlock_usercnt
+#ifdef __CHKP__
+	bndmov 72(%rsp), %bnd0
+	bndmov 88(%rsp), %bnd1
+	bndmov 104(%rsp), %bnd2
+#endif
 
 	testl	%eax, %eax
 	jne	46b
@@ -786,7 +825,15 @@ __condvar_cleanup2:
 	cmpl	%eax, %fs:TID
 	jne	7f
 	/* We managed to get the lock.  Fix it up before returning.  */
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	callq	__pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 72(%rsp), %bnd0
+	bndmov 88(%rsp), %bnd1
+	bndmov 104(%rsp), %bnd2
+#endif
 	jmp	8f
 
 7:	callq	__pthread_mutex_cond_lock
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
index f0f6683..32b8d69 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
@@ -74,6 +74,11 @@ __pthread_cond_wait:
 	movq	%rdi, 8(%rsp)
 	movq	%rsi, 16(%rsp)
 
+#ifdef __CHKP__
+	bndmov %bnd0, 32(%rsp)
+	bndmov %bnd1, 48(%rsp)
+#endif
+
 	je	15f
 	mov	%RSI_LP, dep_mutex(%rdi)
 
@@ -91,7 +96,14 @@ __pthread_cond_wait:
 	/* Unlock the mutex.  */
 2:	movq	16(%rsp), %rdi
 	xorl	%esi, %esi
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	callq	__pthread_mutex_unlock_usercnt
+#ifdef __CHKP__
+	bndmov 32(%rsp), %bnd0
+	bndmov 48(%rsp), %bnd1
+#endif
 
 	testl	%eax, %eax
 	jne	12f
@@ -256,7 +268,14 @@ __pthread_cond_wait:
 	testb	%r8b, %r8b
 	jnz	18f
 
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	callq	__pthread_mutex_cond_lock
+#ifdef __CHKP__
+	bndmov 32(%rsp), %bnd0
+	bndmov 48(%rsp), %bnd1
+#endif
 
 14:	leaq	FRAME_SIZE(%rsp), %rsp
 	cfi_adjust_cfa_offset(-FRAME_SIZE)
@@ -266,7 +285,15 @@ __pthread_cond_wait:
 
 	cfi_adjust_cfa_offset(FRAME_SIZE)
 
-18:	callq	__pthread_mutex_cond_lock_adjust
+18:
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
+	callq	__pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 32(%rsp), %bnd0
+	bndmov 48(%rsp), %bnd1
+#endif
 	xorl	%eax, %eax
 	jmp	14b
 
@@ -510,10 +537,16 @@ __condvar_cleanup1:
 	cmpl	%eax, %fs:TID
 	jne	7f
 	/* We managed to get the lock.  Fix it up before returning.  */
+#ifdef __CHKP__
+	bndmov %bnd1, %bnd0
+#endif
 	callq	__pthread_mutex_cond_lock_adjust
+#ifdef __CHKP__
+	bndmov 32(%rsp), %bnd0
+	bndmov 48(%rsp), %bnd1
+#endif
 	jmp	8f
 
-
 7:	callq	__pthread_mutex_cond_lock
 
 8:	movq	24(%rsp), %rdi
diff --git a/sysdeps/gnu/unwind-resume.c b/sysdeps/gnu/unwind-resume.c
index df845cd..19e06b2 100644
--- a/sysdeps/gnu/unwind-resume.c
+++ b/sysdeps/gnu/unwind-resume.c
@@ -46,6 +46,9 @@ init (void)
 void
 _Unwind_Resume (struct _Unwind_Exception *exc)
 {
+#ifdef __CHKP__
+  exc = (struct _Unwind_Exception *) __bnd_set_ptr_bounds (exc, sizeof (struct _Unwind_Exception));
+#endif
   if (__builtin_expect (libgcc_s_resume == NULL, 0))
     init ();
   libgcc_s_resume (exc);

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b98cec04223e2dc8191af5b1ee85d0f49a9eca51

commit b98cec04223e2dc8191af5b1ee85d0f49a9eca51
Author: Liubov Dmitrieva <ldmitrie@sourceware.org>
Date:   Thu Aug 29 16:33:47 2013 +0400

    Buffer overrun detected by Intel MPX in wcschr test. Fixed.

diff --git a/string/test-strchr.c b/string/test-strchr.c
index cbcf53e..572671f 100644
--- a/string/test-strchr.c
+++ b/string/test-strchr.c
@@ -219,9 +219,14 @@ do_random_tests (void)
 static void
 check1 (void)
 {
-  char s[] __attribute__((aligned(16))) = "\xff";
-  char c = '\xfe';
-  char *exp_result = stupid_STRCHR (s, c);
+  CHAR s[] __attribute__((aligned(16))) =
+#ifdef WIDE
+  L"\xff";
+#else
+  "\xff";
+#endif
+  CHAR c = '\xfe';
+  CHAR *exp_result = stupid_STRCHR (s, c);
 
   FOR_EACH_IMPL (impl, 0)
     check_result (impl, s, c, exp_result);

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b825d28a47d31d525fa1042257a88a705545268a

commit b825d28a47d31d525fa1042257a88a705545268a
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Sat Dec 22 20:51:45 2012 +0400

    [BZ 15698] Buffer overrun detected by Intel MPX at sysdeps/unix/sysv/linux/ifaddrs.c

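    For example, building an IPv4 netmask with preflen == max_prefixlen ==
    32: the loop has already stored all four 0xff bytes and cp points one
    past the buffer, so the unconditional *cp = c wrote out of bounds; the
    new preflen < max_prefixlen guard skips the store in exactly that case.
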
diff --git a/sysdeps/unix/sysv/linux/ifaddrs.c b/sysdeps/unix/sysv/linux/ifaddrs.c
index 89fda15..4f5f7b5 100644
--- a/sysdeps/unix/sysv/linux/ifaddrs.c
+++ b/sysdeps/unix/sysv/linux/ifaddrs.c
@@ -782,9 +782,11 @@ getifaddrs_internal (struct ifaddrs **ifap)
 
 		      for (i = 0; i < (preflen / 8); i++)
 			*cp++ = 0xff;
-		      c = 0xff;
-		      c <<= (8 - (preflen % 8));
-		      *cp = c;
+		      if (preflen < max_prefixlen)
+			{
+			  c = 0xff;
+			  c <<= (8 - (preflen % 8));
+			  *cp = c;
+			}
 		    }
 		}
 	    }

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=980b18db54bf573d5352cd79c0a1985f59b171e5

commit 980b18db54bf573d5352cd79c0a1985f59b171e5
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Wed Dec 19 18:56:40 2012 +0400

    Buffer overrun detected by Intel MPX in stdio-common/scanf13.c. Fixed.

diff --git a/stdio-common/scanf13.c b/stdio-common/scanf13.c
index 720224a..aa58dd5 100644
--- a/stdio-common/scanf13.c
+++ b/stdio-common/scanf13.c
@@ -59,6 +59,7 @@ main (void)
     }
 
   memset (buf, '/', sizeof (buf));
+  buf[sizeof(buf) - 1] = 0;
   buf[0] = '\t';
   buf[1] = ' ';
   buf[2] = 0xc3;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3b564b91f33e3ca230eec121514ee505d0d54437

commit 3b564b91f33e3ca230eec121514ee505d0d54437
Author: ienkovic <ilya.enkovich@intel.com>
Date:   Tue Dec 25 15:16:28 2012 +0400

    Do not block SIGSEGV signal because Intel MPX runtime uses it.

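    The MPX runtime reports bound violations through SIGSEGV (see the
    siginfo change above), so a mask built with sigfillset would swallow
    those faults; each such mask below therefore gets a matching sigdelset
    for SIGSEGV.
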
diff --git a/nptl/sysdeps/pthread/gai_misc.h b/nptl/sysdeps/pthread/gai_misc.h
index 6026085..46305ca 100644
--- a/nptl/sysdeps/pthread/gai_misc.h
+++ b/nptl/sysdeps/pthread/gai_misc.h
@@ -82,6 +82,9 @@ __gai_start_notify_thread (void)
   sigset_t ss;
   sigemptyset (&ss);
   INTERNAL_SYSCALL_DECL (err);
+#ifdef __CHKP__
+  __sigdelset(&ss, SIGSEGV);
+#endif
   INTERNAL_SYSCALL (rt_sigprocmask, err, 4, SIG_SETMASK, &ss, NULL, _NSIG / 8);
 }
 
@@ -106,6 +109,9 @@ __gai_create_helper_thread (pthread_t *threadp, void *(*tf) (void *),
   sigset_t oss;
   sigfillset (&ss);
   INTERNAL_SYSCALL_DECL (err);
+#ifdef __CHKP__
+  __sigdelset(&ss, SIGSEGV);
+#endif
   INTERNAL_SYSCALL (rt_sigprocmask, err, 4, SIG_SETMASK, &ss, &oss, _NSIG / 8);
 
   int ret = pthread_create (threadp, &attr, tf, arg);
diff --git a/nptl/sysdeps/unix/sysv/linux/aio_misc.h b/nptl/sysdeps/unix/sysv/linux/aio_misc.h
index 2649dc1..3994f98 100644
--- a/nptl/sysdeps/unix/sysv/linux/aio_misc.h
+++ b/nptl/sysdeps/unix/sysv/linux/aio_misc.h
@@ -32,6 +32,9 @@ __aio_start_notify_thread (void)
   sigset_t ss;
   sigemptyset (&ss);
   INTERNAL_SYSCALL_DECL (err);
+#ifdef __CHKP__
+  __sigdelset(&ss, SIGSEGV);
+#endif
   INTERNAL_SYSCALL (rt_sigprocmask, err, 4, SIG_SETMASK, &ss, NULL, _NSIG / 8);
 }
 
@@ -54,6 +57,9 @@ __aio_create_helper_thread (pthread_t *threadp, void *(*tf) (void *),
   sigset_t oss;
   sigfillset (&ss);
   INTERNAL_SYSCALL_DECL (err);
+#ifdef __CHKP__
+  __sigdelset(&ss, SIGSEGV);
+#endif
   INTERNAL_SYSCALL (rt_sigprocmask, err, 4, SIG_SETMASK, &ss, &oss, _NSIG / 8);
 
   int ret = pthread_create (threadp, &attr, tf, arg);
diff --git a/nptl/sysdeps/unix/sysv/linux/mq_notify.c b/nptl/sysdeps/unix/sysv/linux/mq_notify.c
index 6bc34ba..b9250df 100644
--- a/nptl/sysdeps/unix/sysv/linux/mq_notify.c
+++ b/nptl/sysdeps/unix/sysv/linux/mq_notify.c
@@ -78,6 +78,9 @@ change_sigmask (int how, sigset_t *oss)
 {
   sigset_t ss;
   sigfillset (&ss);
+#ifdef __CHKP__
+  sigdelset (&ss, SIGSEGV);
+#endif
   return pthread_sigmask (how, &ss, oss);
 }
 
diff --git a/nptl/sysdeps/unix/sysv/linux/timer_routines.c b/nptl/sysdeps/unix/sysv/linux/timer_routines.c
index 57f115f..1979adc 100644
--- a/nptl/sysdeps/unix/sysv/linux/timer_routines.c
+++ b/nptl/sysdeps/unix/sysv/linux/timer_routines.c
@@ -174,6 +174,9 @@ __start_helper_thread (void)
   sigset_t oss;
   sigfillset (&ss);
   __sigaddset (&ss, SIGCANCEL);
+#ifdef __CHKP__
+  __sigdelset (&ss, SIGSEGV);
+#endif
   INTERNAL_SYSCALL_DECL (err);
   INTERNAL_SYSCALL (rt_sigprocmask, err, 4, SIG_SETMASK, &ss, &oss, _NSIG / 8);
 
diff --git a/nptl/tst-cancel7.c b/nptl/tst-cancel7.c
index ad40b9c..7e8a860 100644
--- a/nptl/tst-cancel7.c
+++ b/nptl/tst-cancel7.c
@@ -65,6 +65,9 @@ sl (void)
 
   sigset_t ss;
   sigfillset (&ss);
+#ifdef __CHKP__
+  sigdelset (&ss, SIGSEGV);
+#endif
   sigsuspend (&ss);
   exit (0);
 }
diff --git a/nptl/tst-signal1.c b/nptl/tst-signal1.c
index 81dd161..0345701 100644
--- a/nptl/tst-signal1.c
+++ b/nptl/tst-signal1.c
@@ -68,6 +68,9 @@ receiver (void)
 
   sigfillset (&ss);
 
+#ifdef __CHKP__
+  sigdelset(&ss, SIGSEGV);
+#endif
   if (pthread_sigmask (SIG_SETMASK, &ss, NULL) != 0)
     {
       puts ("1st pthread_sigmask failed");
diff --git a/nptl/tst-signal2.c b/nptl/tst-signal2.c
index 87f3bb8..23cda43 100644
--- a/nptl/tst-signal2.c
+++ b/nptl/tst-signal2.c
@@ -71,6 +71,9 @@ receiver (void)
   alarm (10);
 
   sigfillset (&ss);
+#ifdef __CHKP__
+  sigdelset(&ss, SIGSEGV);
+#endif
 
   if (pthread_sigmask (SIG_SETMASK, &ss, NULL) != 0)
     {
diff --git a/nptl/tst-signal3.c b/nptl/tst-signal3.c
index fc34f66..ae5fea6 100644
--- a/nptl/tst-signal3.c
+++ b/nptl/tst-signal3.c
@@ -96,6 +96,9 @@ do_test (void)
   /* Block all signals.  */
   sigset_t ss;
   sigfillset (&ss);
+#ifdef __CHKP__
+  sigdelset(&ss, SIGSEGV);
+#endif
 
   th_main = pthread_self ();
 
@@ -118,6 +121,9 @@ do_test (void)
 	};
       sigfillset (&sa.sa_mask);
 
+#ifdef __CHKP__
+      sigdelset (&sa.sa_mask, SIGSEGV);
+#endif
       if (sigaction (sig0 + i, &sa, NULL) != 0)
 	{
 	  printf ("sigaction for signal %d failed\n", i);
diff --git a/sysdeps/posix/profil.c b/sysdeps/posix/profil.c
index 86d36a9..28613af 100644
--- a/sysdeps/posix/profil.c
+++ b/sysdeps/posix/profil.c
@@ -106,6 +106,9 @@ __profil (u_short *sample_buffer, size_t size, size_t offset, u_int scale)
   act.sa_handler = (sighandler_t) &profil_counter;
   act.sa_flags = SA_RESTART;
   __sigfillset (&act.sa_mask);
+#ifdef __CHKP__
+  __sigdelset (&act.sa_mask, SIGSEGV);
+#endif
   if (__sigaction (SIGPROF, &act, oact_ptr) < 0)
     return -1;
 
diff --git a/sysdeps/posix/sigwait.c b/sysdeps/posix/sigwait.c
index b0ea14d..a980647 100644
--- a/sysdeps/posix/sigwait.c
+++ b/sysdeps/posix/sigwait.c
@@ -42,11 +42,17 @@ do_sigwait (const sigset_t *set, int *sig)
 
   /* Prepare set.  */
   __sigfillset (&tmp_mask);
+#ifdef __CHKP__
+  __sigdelset (&tmp_mask, SIGSEGV);
+#endif
 
   /* Unblock all signals in the SET and register our nice handler.  */
   action.sa_handler = ignore_signal;
   action.sa_flags = 0;
   __sigfillset (&action.sa_mask);	/* Block all signals for handler.  */
+#ifdef __CHKP__
+  __sigdelset (&action.sa_mask, SIGSEGV);
+#endif
 
   /* Make sure we recognize error conditions by setting WAS_SIG to a
      value which does not describe a legal signal number.  */
diff --git a/sysdeps/posix/sprofil.c b/sysdeps/posix/sprofil.c
index 1447a4f..42c43cd 100644
--- a/sysdeps/posix/sprofil.c
+++ b/sysdeps/posix/sprofil.c
@@ -339,6 +339,9 @@ __sprofil (struct prof *profp, int profcnt, struct timeval *tvp,
     act.sa_handler = (sighandler_t) &profil_counter_ushort;
   act.sa_flags = SA_RESTART;
   __sigfillset (&act.sa_mask);
+#ifdef __CHKP__
+  __sigdelset (&act.sa_mask, SIGSEGV);
+#endif
   if (__sigaction (SIGPROF, &act, &prof_info.saved_action) < 0)
     return -1;
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c4f6e8dc89935bbd7b3170b20590ab94a62c7cc2

commit c4f6e8dc89935bbd7b3170b20590ab94a62c7cc2
Author: Liubov Dmitrieva <ldmitrie@sourceware.org>
Date:   Thu Aug 29 17:08:14 2013 +0400

    Inappropriate code style for Intel MPX in string/strcpy.c and wcsmbs/wcscpy.c
    Fix the code if MPX is enabled.

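    The offending idiom rebases one pointer onto the other: every store to
    DEST goes through a pointer derived from SRC and is therefore checked
    against SRC's bounds.  A minimal sketch of the pattern being replaced
    (the function name is illustrative):

	#include <stddef.h>

	/* Old-style copy: s[off] addresses DEST through SRC, so under MPX
	   the store is checked against SRC's bounds and faults.  */
	static char *
	old_style_copy (char *dest, const char *src)
	{
	  char *s = (char *) src;
	  ptrdiff_t off = dest - s - 1;
	  char c;

	  do
	    {
	      c = *s++;
	      s[off] = c;
	    }
	  while (c != '\0');

	  return dest;
	}

    The plain loops below copy through DEST directly, so each pointer is
    checked against its own bounds.
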
diff --git a/string/strcpy.c b/string/strcpy.c
index b71f753..04278ec 100644
--- a/string/strcpy.c
+++ b/string/strcpy.c
@@ -26,6 +26,7 @@ char *
 strcpy (dest, src)
      char *dest;
      const char *src;
+#ifndef __CHKP__
 {
   char c;
   char *s = (char *) src;
@@ -40,4 +41,12 @@ strcpy (dest, src)
 
   return dest;
 }
+#else
+{
+  char *ret = dest;
+  while ((*dest++ = *src++) != '\0');
+  return ret;
+}
+#endif
+
 libc_hidden_builtin_def (strcpy)
diff --git a/wcsmbs/wcscpy.c b/wcsmbs/wcscpy.c
index 3b1e0c6..3113cf5 100644
--- a/wcsmbs/wcscpy.c
+++ b/wcsmbs/wcscpy.c
@@ -25,6 +25,7 @@ wchar_t *
 wcscpy (dest, src)
      wchar_t *dest;
      const wchar_t *src;
+#ifndef __CHKP__
 {
   wint_t c;
   wchar_t *wcp;
@@ -56,3 +57,11 @@ wcscpy (dest, src)
 
   return dest;
 }
+#else
+{
+  wchar_t *ret = dest;
+  while ((*dest++ = *src++) != L'\0');
+  return ret;
+}
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=26ad8ed923d04dc3dc107ba3d92cb369cb5f7d74

commit 26ad8ed923d04dc3dc107ba3d92cb369cb5f7d74
Author: Liubov Dmitrieva <ldmitrie@sourceware.org>
Date:   Thu Aug 29 19:25:35 2013 +0400

    Inappropriate code style for Intel MPX in debug/wcscpy_chk.c. Fix the code if MPX is enabled.

diff --git a/debug/wcscpy_chk.c b/debug/wcscpy_chk.c
index 61092c3..3e6d185 100644
--- a/debug/wcscpy_chk.c
+++ b/debug/wcscpy_chk.c
@@ -23,6 +23,7 @@
 /* Copy SRC to DEST.  */
 wchar_t *
 __wcscpy_chk (wchar_t *dest, const wchar_t *src, size_t n)
+#ifndef __CHKP__
 {
   wint_t c;
   wchar_t *wcp;
@@ -58,3 +59,22 @@ __wcscpy_chk (wchar_t *dest, const wchar_t *src, size_t n)
 
   return dest;
 }
+#else
+{
+  const wchar_t *result = dest;
+  dest--;
+  wint_t c;
+
+  do
+    {
+      if (__builtin_expect (n-- == 0, 0))
+	__chk_fail ();
+      c = src[0];
+      *++dest = c;
+      ++src;
+    }
+  while (c != L'\0');
+
+  return result;
+}
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2e0268156ce3896e0f26a7d624c49e408f55c862

commit 2e0268156ce3896e0f26a7d624c49e408f55c862
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Mon Mar 11 17:06:38 2013 +0400

    Inappropriate code style for Intel MPX in debug/wcpcpy_chk. Fix the code if MPX is enabled.

diff --git a/debug/wcpcpy_chk.c b/debug/wcpcpy_chk.c
index 7c836e6..d90f293 100644
--- a/debug/wcpcpy_chk.c
+++ b/debug/wcpcpy_chk.c
@@ -26,6 +26,7 @@
    DEST.  Check for overflows.  */
 wchar_t *
 __wcpcpy_chk (wchar_t *dest, const wchar_t *src, size_t destlen)
+#ifndef __CHKP__
 {
   wchar_t *wcp = (wchar_t *) dest - 1;
   wint_t c;
@@ -42,3 +43,21 @@ __wcpcpy_chk (wchar_t *dest, const wchar_t *src, size_t destlen)
 
   return wcp;
 }
+#else
+{
+  dest--;
+  wint_t c;
+
+  do
+    {
+      if (__builtin_expect (destlen-- == 0, 0))
+	__chk_fail ();
+      c = src[0];
+      *++dest = c;
+      ++src;
+    }
+  while (c != L'\0');
+
+  return dest;
+}
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=76e7a112597a0de1ba0ee505311f8fe10654b041

commit 76e7a112597a0de1ba0ee505311f8fe10654b041
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Mon May 27 18:54:53 2013 +0400

    Inappropriate code style for Intel MPX at wcsmbs/wcpcpy.c.  Use another implementation if MPX is enabled.

diff --git a/wcsmbs/wcpcpy.c b/wcsmbs/wcpcpy.c
index 6f952b4..4c541b1 100644
--- a/wcsmbs/wcpcpy.c
+++ b/wcsmbs/wcpcpy.c
@@ -18,8 +18,9 @@
 
 #include <wchar.h>
 
-#define __need_ptrdiff_t
-#include <stddef.h>
+#ifndef __CHKP__
+# define __need_ptrdiff_t
+# include <stddef.h>
 
 
 /* Copy SRC to DEST, returning the address of the terminating L'\0' in
@@ -42,5 +43,14 @@ __wcpcpy (dest, src)
 
   return wcp;
 }
+#else
 
+wchar_t *
+__wcpcpy (wchar_t *dst, const wchar_t *src)
+{
+  while ((*dst++ = *src++) != L'\0');
+  return dst - 1;
+}
+
+#endif
 weak_alias (__wcpcpy, wcpcpy)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d77ddb29368d694593f1b2a3347a9c5d64c07eb9

commit d77ddb29368d694593f1b2a3347a9c5d64c07eb9
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Thu Dec 20 18:46:38 2012 +0400

    Inappropriate code style for Intel MPX at posix/fnmatch_loop.c. Fixed.

diff --git a/posix/fnmatch_loop.c b/posix/fnmatch_loop.c
index 078b982..802eb18 100644
--- a/posix/fnmatch_loop.c
+++ b/posix/fnmatch_loop.c
@@ -313,7 +313,7 @@ FCT (pattern, string, string_end, no_leading_period, flags, ends, alloca_used)
 		      /* Invalid character class name.  */
 		      return FNM_NOMATCH;
 
-# if defined _LIBC && ! WIDE_CHAR_VERSION
+# if defined _LIBC && ! WIDE_CHAR_VERSION && !defined __CHKP__
 		    /* The following code is glibc specific but does
 		       there a good job in speeding up the code since
 		       we can avoid the btowc() call.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=785be3d06b090e255e108239dcc3dc9a85056353

commit 785be3d06b090e255e108239dcc3dc9a85056353
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Thu Dec 20 18:23:10 2012 +0400

    Inappropriate code style for Intel MPX at argp/argp-help.c. Fixed.

diff --git a/argp/argp-help.c b/argp/argp-help.c
index ace71b4..8054785 100644
--- a/argp/argp-help.c
+++ b/argp/argp-help.c
@@ -867,7 +867,10 @@ hol_append (struct hol *hol, struct hol *more)
 
 	  /* Fix up the short options pointers from HOL.  */
 	  for (e = entries, left = hol->num_entries; left > 0; e++, left--)
-	    e->short_options += (short_options - hol->short_options);
+	    {
+	      unsigned long offset = e->short_options - hol->short_options;
+	      e->short_options = (char *) (short_options + offset);
+	    }
 
 	  /* Now add the short options from MORE, fixing up its entries
 	     too.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=56e443131aca7b05c0220c195cd7500a5f8a7803

commit 56e443131aca7b05c0220c195cd7500a5f8a7803
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Wed Dec 19 17:03:44 2012 +0400

    Inappropriate code style for Intel MPX. Expand bounds in crypt/crypt.c

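    The sb0..sb3 tables are laid out back to back and the DES rounds index
    across a pair of them through a single pointer, so the pointers are
    re-bounded to span exactly the two tables they traverse.
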
diff --git a/crypt/crypt.c b/crypt/crypt.c
index e429950..96ec2eb 100644
--- a/crypt/crypt.c
+++ b/crypt/crypt.c
@@ -43,7 +43,13 @@ _ufc_doit_r(itr, __data, res)
   int i;
   long32 s, *k;
   long32 *sb01 = (long32*)__data->sb0;
+#ifdef __CHKP__
+  sb01 = __bnd_set_ptr_bounds (sb01, sizeof(__data->sb0) + sizeof(__data->sb1));
+#endif
   long32 *sb23 = (long32*)__data->sb2;
+#ifdef __CHKP__
+  sb23 = __bnd_set_ptr_bounds (sb23, sizeof(__data->sb2) + sizeof(__data->sb3));
+#endif
   long32 l1, l2, r1, r2;
 
   l1 = (long32)res[0]; l2 = (long32)res[1];
@@ -89,7 +95,13 @@ _ufc_doit_r(itr, __data, res)
   int i;
   long64 l, r, s, *k;
   long64 *sb01 = (long64*)__data->sb0;
+#ifdef __CHKP__
+  sb01 = __bnd_set_ptr_bounds (sb01, sizeof(__data->sb0) + sizeof(__data->sb1));
+#endif
   long64 *sb23 = (long64*)__data->sb2;
+#ifdef __CHKP__
+  sb23 = __bnd_set_ptr_bounds (sb23, sizeof(__data->sb2) + sizeof(__data->sb3));
+#endif
 
   l = (((long64)res[0]) << 32) | ((long64)res[1]);
   r = (((long64)res[2]) << 32) | ((long64)res[3]);

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ebf5b900bbee969751db9d132024acecd28fbf62

commit ebf5b900bbee969751db9d132024acecd28fbf62
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Wed Dec 19 14:55:21 2012 +0400

    Inappropriate code style for Intel MPX in libio/fileops.c.
    Use INIT (maximum) bounds as it is hard to rewrite the algorithm.

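    Here the INIT bounds effectively opt the mapping out of checking: the
    mmap'ed buffer's extent is threaded through the stdio machinery in ways
    the checker cannot follow, so widest bounds are safer than wrong ones.
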
diff --git a/libio/fileops.c b/libio/fileops.c
index e92f85b..a17504b 100644
--- a/libio/fileops.c
+++ b/libio/fileops.c
@@ -758,6 +758,9 @@ decide_maybe_mmap (_IO_FILE *fp)
       void *p;
 
       p = __mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED, fp->_fileno, 0);
+#ifdef __CHKP__
+      p = __bnd_init_ptr_bounds (p);
+#endif
       if (p != MAP_FAILED)
 	{
 	  /* OK, we managed to map the file.  Set the buffer up and use a

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=af1d2d1ffb534abeadb2a82365f0b6ef6fc96e3a

commit af1d2d1ffb534abeadb2a82365f0b6ef6fc96e3a
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Thu Nov 8 16:35:39 2012 +0400

    Inappropriate code style for Intel MPX in elf/dl-close.c.
    The cast implies a memory access with a bounds violation; allow it.

diff --git a/elf/dl-close.c b/elf/dl-close.c
index fe3014c..15775ec 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -347,6 +347,10 @@ _dl_close_worker (struct link_map *map)
 		struct link_map *tmap = (struct link_map *)
 		  ((char *) imap->l_scope[cnt]
 		   - offsetof (struct link_map, l_searchlist));
+#ifdef __CHKP__
+		tmap = __bnd_set_ptr_bounds(tmap, sizeof(struct link_map));
+#endif
+
 		assert (tmap->l_ns == nsid);
 		if (tmap->l_idx == IDX_STILL_USED)
 		  ++remain;
@@ -393,6 +397,9 @@ _dl_close_worker (struct link_map *map)
 		      struct link_map *tmap = (struct link_map *)
 			((char *) imap->l_scope[cnt]
 			 - offsetof (struct link_map, l_searchlist));
+#ifdef __CHKP__
+			tmap = __bnd_set_ptr_bounds(tmap, sizeof(struct link_map));
+#endif
 		      if (tmap->l_idx != IDX_STILL_USED)
 			{
 			  /* Remove the scope.  Or replace with own map's
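
The pattern here is a container-of: imap->l_scope[cnt] points at the
l_searchlist member, and subtracting offsetof steps back to the start of
the enclosing link_map, below the member pointer's lower bound.
Re-establishing bounds sized to the whole struct makes the following field
accesses checkable again.  A generic sketch (struct outer and
container_of_member are illustrative):

#include <stddef.h>

struct outer { int x; int member; };

/* Recover the enclosing struct from a pointer to one of its members.
   The subtraction moves below the member's lower bound, so under MPX
   the bounds must be re-established for the whole struct.  */
static struct outer *
container_of_member (int *m)
{
  struct outer *o = (struct outer *)
    ((char *) m - offsetof (struct outer, member));
#ifdef __CHKP__
  o = __bnd_set_ptr_bounds (o, sizeof (struct outer));
#endif
  return o;
}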

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fcad0ab9dc3deded2cfd8e7bc467e204d8b55a63

commit fcad0ab9dc3deded2cfd8e7bc467e204d8b55a63
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Tue Dec 18 19:42:52 2012 +0400

    Inappropriate code style for Intel MPX in crypt/crypt_util.c. Fixed.

diff --git a/crypt/crypt_util.c b/crypt/crypt_util.c
index 2409079..8b58668 100644
--- a/crypt/crypt_util.c
+++ b/crypt/crypt_util.c
@@ -487,7 +487,7 @@ small_tables_done:
    * DES round.
    *
    */
-
+#ifndef __CHKP__
   if (__data->sb0 + sizeof (__data->sb0) == __data->sb1
       && __data->sb1 + sizeof (__data->sb1) == __data->sb2
       && __data->sb2 + sizeof (__data->sb2) == __data->sb3)
@@ -497,11 +497,14 @@ small_tables_done:
 		  + (int)sizeof(__data->sb2)
 		  + (int)sizeof(__data->sb3));
   else {
+#endif
     _ufc_clearmem(__data->sb0, (int)sizeof(__data->sb0));
     _ufc_clearmem(__data->sb1, (int)sizeof(__data->sb1));
     _ufc_clearmem(__data->sb2, (int)sizeof(__data->sb2));
     _ufc_clearmem(__data->sb3, (int)sizeof(__data->sb3));
+#ifndef __CHKP__
   }
+#endif
 
   for(sg = 0; sg < 4; sg++) {
     int j1, j2;
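
This takes the opposite approach from the crypt.c commit above: instead of
widening bounds across the four adjacent tables, it drops the single-block
fast path and clears each array within its own bounds.  The two options
side by side (struct tabs and clear_both are illustrative):

#include <string.h>

struct tabs { long a[4]; long b[4]; };

static void
clear_both (struct tabs *t)
{
#ifndef __CHKP__
  /* One pass over both adjacent arrays; overruns a's bounds under MPX. */
  memset (t->a, 0, sizeof (t->a) + sizeof (t->b));
#else
  /* Stay within each member's own bounds.  */
  memset (t->a, 0, sizeof (t->a));
  memset (t->b, 0, sizeof (t->b));
#endif
}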

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0a6a6f8037cc93a6d165d925d6c029fe42998acc

commit 0a6a6f8037cc93a6d165d925d6c029fe42998acc
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Mon Oct 15 15:01:09 2012 +0400

    Inappropriate code style for Intel MPX. Fix missing bounds in sysdeps/generic/unwind-dw2-fde.h.

diff --git a/sysdeps/generic/unwind-dw2-fde.h b/sysdeps/generic/unwind-dw2-fde.h
index fad46bf..7fce24c 100644
--- a/sysdeps/generic/unwind-dw2-fde.h
+++ b/sysdeps/generic/unwind-dw2-fde.h
@@ -147,7 +147,7 @@ typedef struct dwarf_fde fde;
 static inline struct dwarf_cie *
 get_cie (struct dwarf_fde *f)
 {
-  return (void *)&f->CIE_delta - f->CIE_delta;
+  return (char *)f + offsetof (struct dwarf_fde, CIE_delta)  - f->CIE_delta;
 }
 
 static inline fde *
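
Both expressions compute the same address, but MPX bounds follow the
pointer an expression is derived from: &f->CIE_delta may carry bounds
narrowed to that one field, so stepping back out of it leaves a pointer
whose later dereference traps, while (char *) f keeps whatever bounds f
itself carries.  A compact sketch of the same contrast (struct S and
back_from are illustrative):

#include <stddef.h>

struct S { unsigned delta; /* more fields follow in the real FDE */ };

static void *
back_from (struct S *s)
{
  /* Bad under MPX: (void *) &s->delta - s->delta derives the result
     from a possibly field-narrowed pointer.  Deriving from s instead
     keeps s's own bounds on the result.  */
  return (char *) s + offsetof (struct S, delta) - s->delta;
}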

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=01d45f7e37130cb47d0ef788283e99bc07564f34

commit 01d45f7e37130cb47d0ef788283e99bc07564f34
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Fri Dec 14 18:41:37 2012 +0400

    Inappropriate code style for Intel MPX in debug/strcpy_chk.c. Use a different version if MPX is enabled.

diff --git a/debug/strcpy_chk.c b/debug/strcpy_chk.c
index 81bf46f..ba6da70 100644
--- a/debug/strcpy_chk.c
+++ b/debug/strcpy_chk.c
@@ -27,6 +27,7 @@ __strcpy_chk (dest, src, destlen)
      char *dest;
      const char *src;
      size_t destlen;
+#ifndef __CHKP__
 {
   char c;
   char *s = (char *) src;
@@ -65,3 +66,45 @@ __strcpy_chk (dest, src, destlen)
 
   return dest;
 }
+#else
+{
+  char c;
+  char *s = (char *) src;
+  char *d = (char *) dest;
+
+  while (__builtin_expect (destlen >= 4, 0))
+    {
+      c = s[0];
+      d[0] = c;
+      if (c == '\0')
+        return dest;
+      c = s[1];
+      d[1] = c;
+      if (c == '\0')
+        return dest;
+      c = s[2];
+      d[2] = c;
+      if (c == '\0')
+        return dest;
+      c = s[3];
+      d[3] = c;
+      if (c == '\0')
+        return dest;
+      destlen -= 4;
+      d += 4;
+      s += 4;
+    }
+
+  do
+    {
+      if (__builtin_expect (destlen-- == 0, 0))
+        __chk_fail ();
+      c = *s;
+      *(d++) = c;
+      s++;
+    }
+  while (c != '\0');
+
+  return dest;
+}
+#endif
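
Under _FORTIFY_SOURCE the compiler rewrites strcpy (dst, src) into
__strcpy_chk (dst, src, __builtin_object_size (dst, 1)) when it can see
the destination's size, and both versions above must keep the same
contract.  A reduced model of that contract (standalone; abort () stands
in for glibc's internal __chk_fail):

#include <stdlib.h>

/* destlen is the true size of dest: writing past it must abort
   instead of corrupting memory.  */
static char *
strcpy_checked (char *dest, const char *src, size_t destlen)
{
  char *d = dest;
  do
    {
      if (destlen-- == 0)
        abort ();               /* glibc calls __chk_fail () here */
    }
  while ((*d++ = *src++) != '\0');
  return dest;
}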

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=00ae469c06aeed1e7bd988875d241cc5a6339d01

commit 00ae469c06aeed1e7bd988875d241cc5a6339d01
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Fri Nov 23 18:50:27 2012 +0400

    If Intel MPX is enabled: always compile elf/dl-init.c with -fno-check-pointers
    because this file contains code executing before runtime library
    initialization happens.

diff --git a/elf/Makefile b/elf/Makefile
index 3b58649..4ef80c9 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -49,6 +49,7 @@ all-rtld-routines = $(rtld-routines) $(sysdep-rtld-routines)
 CFLAGS-dl-runtime.c = -fexceptions -fasynchronous-unwind-tables
 CFLAGS-dl-lookup.c = -fexceptions -fasynchronous-unwind-tables
 CFLAGS-dl-iterate-phdr.c = $(uses-callbacks)
+CFLAGS-dl-init.c = -fno-check-pointers
 
 ifeq ($(unwind-find-fde),yes)
 routines += unwind-dw2-fde-glibc

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4cd77a6b091db5450ec634eeaeab8e36ea3bb1dd

commit 4cd77a6b091db5450ec634eeaeab8e36ea3bb1dd
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Mon Dec 17 13:44:21 2012 +0400

    Add the __bnd_variable_size attribute to make the use of flexible-size arrays Intel MPX compliant.

diff --git a/bits/dirent.h b/bits/dirent.h
index 2117a7c..77cae84 100644
--- a/bits/dirent.h
+++ b/bits/dirent.h
@@ -32,7 +32,7 @@ struct dirent
     unsigned char d_namlen;	/* Length of the file name.  */
 
     /* Only this member is in the POSIX standard.  */
-    char d_name[1];		/* File name (actually longer).  */
+    char d_name[1] __attribute__((bnd_variable_size));		/* File name (actually longer).  */
   };
 
 #ifdef __USE_LARGEFILE64
@@ -43,7 +43,7 @@ struct dirent64
     unsigned char d_type;
     unsigned char d_namlen;
 
-    char d_name[1];
+    char d_name[1] __attribute__((bnd_variable_size));
   };
 #endif
 
diff --git a/bits/sched.h b/bits/sched.h
index 0c200a9..0a9513a 100644
--- a/bits/sched.h
+++ b/bits/sched.h
@@ -65,7 +65,7 @@ typedef unsigned long int __cpu_mask;
 /* Data structure to describe CPU mask.  */
 typedef struct
 {
-  __cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS];
+  __cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS] __attribute__((bnd_variable_size));
 } cpu_set_t;
 
 /* Access functions for CPU masks.  */
diff --git a/debug/tst-chk1.c b/debug/tst-chk1.c
index 6ca8d9d..9783d3a 100644
--- a/debug/tst-chk1.c
+++ b/debug/tst-chk1.c
@@ -137,8 +137,8 @@ do_test (void)
     }
   setenv ("LIBC_FATAL_STDERR_", "1", 1);
 
-  struct A { char buf1[9]; char buf2[1]; } a;
-  struct wA { wchar_t buf1[9]; wchar_t buf2[1]; } wa;
+  struct A { char buf1[9] __attribute__((bnd_variable_size)); char buf2[1]; } a;
+  struct wA { wchar_t buf1[9] __attribute__((bnd_variable_size)); wchar_t buf2[1]; } wa;
 
   printf ("Test checking routines at fortify level %d\n",
 #ifdef __USE_FORTIFY_LEVEL
diff --git a/dlfcn/dlfcn.h b/dlfcn/dlfcn.h
index 1ed47b1..0fab755 100644
--- a/dlfcn/dlfcn.h
+++ b/dlfcn/dlfcn.h
@@ -180,7 +180,7 @@ typedef struct
 {
   size_t dls_size;		/* Size in bytes of the whole buffer.  */
   unsigned int dls_cnt;		/* Number of elements in `dls_serpath'.  */
-  Dl_serpath dls_serpath[1];	/* Actually longer, dls_cnt elements.  */
+  Dl_serpath dls_serpath[1] __attribute__((bnd_variable_size));	/* Actually longer, dls_cnt elements.  */
 } Dl_serinfo;
 #endif /* __USE_GNU */
 
diff --git a/include/link.h b/include/link.h
index 1682467..ca253eb 100644
--- a/include/link.h
+++ b/include/link.h
@@ -318,7 +318,7 @@ struct link_map
     {
       uintptr_t cookie;
       unsigned int bindflags;
-    } l_audit[0];
+    } l_audit[0] __attribute__((bnd_variable_size));
   };
 
 
diff --git a/inet/netinet/in.h b/inet/netinet/in.h
index 89e3813..12294d0 100644
--- a/inet/netinet/in.h
+++ b/inet/netinet/in.h
@@ -319,7 +319,7 @@ struct ip_msfilter
     /* Number of source addresses.  */
     uint32_t imsf_numsrc;
     /* Source addresses.  */
-    struct in_addr imsf_slist[1];
+    struct in_addr imsf_slist[1] __attribute__((bnd_variable_size));
   };
 
 #define IP_MSFILTER_SIZE(numsrc) (sizeof (struct ip_msfilter) \
@@ -340,7 +340,7 @@ struct group_filter
     /* Number of source addresses.  */
     uint32_t gf_numsrc;
     /* Source addresses.  */
-    struct sockaddr_storage gf_slist[1];
+    struct sockaddr_storage gf_slist[1] __attribute__((bnd_variable_size));
 };
 
 #define GROUP_FILTER_SIZE(numsrc) (sizeof (struct group_filter) \
diff --git a/inet/protocols/routed.h b/inet/protocols/routed.h
index befd865..457d792 100644
--- a/inet/protocols/routed.h
+++ b/inet/protocols/routed.h
@@ -52,8 +52,8 @@ struct rip {
 	u_char	rip_vers;		/* protocol version # */
 	u_char	rip_res1[2];		/* pad to 32-bit boundary */
 	union {
-		struct	netinfo ru_nets[1];	/* variable length... */
-		char	ru_tracefile[1];	/* ditto ... */
+		struct	netinfo ru_nets[1] __attribute__((bnd_variable_size));	/* variable length... */
+		char	ru_tracefile[1] __attribute__((bnd_variable_size));	/* ditto ... */
 	} ripun;
 #define	rip_nets	ripun.ru_nets
 #define	rip_tracefile	ripun.ru_tracefile
diff --git a/intl/dcigettext.c b/intl/dcigettext.c
index f4aa215..9885a13 100644
--- a/intl/dcigettext.c
+++ b/intl/dcigettext.c
@@ -204,7 +204,7 @@ struct known_translation_t
   /* Pointer to the string in question.  */
   union
     {
-      char appended[ZERO];  /* used if domain != NULL */
+      char appended[ZERO] __attribute__((bnd_variable_size));  /* used if domain != NULL */
       const char *ptr;      /* used if domain == NULL */
     }
   msgid;
@@ -342,7 +342,7 @@ struct block_list
 typedef struct transmem_list
 {
   struct transmem_list *next;
-  char data[ZERO];
+  char data[ZERO] __attribute__((bnd_variable_size));
 } transmem_block_t;
 static struct transmem_list *transmem_list;
 #else
diff --git a/intl/gettextP.h b/intl/gettextP.h
index d1ec644..79f0a4c 100644
--- a/intl/gettextP.h
+++ b/intl/gettextP.h
@@ -160,7 +160,7 @@ struct binding
   struct binding *next;
   char *dirname;
   char *codeset;
-  char domainname[ZERO];
+  char domainname[ZERO] __attribute__((bnd_variable_size));
 };
 
 /* A counter which is incremented each time some previous translations
diff --git a/intl/gmo.h b/intl/gmo.h
index 7b50597..b4c48cc 100644
--- a/intl/gmo.h
+++ b/intl/gmo.h
@@ -137,7 +137,7 @@ struct sysdep_string
     nls_uint32 segsize;
     /* Reference to system dependent string segment, or ~0 at the end.  */
     nls_uint32 sysdepref;
-  } segments[1];
+  } segments[1] __attribute__((bnd_variable_size));
 };
 
 /* Marker for the end of the segments[] array.  This has the value 0xFFFFFFFF,
diff --git a/intl/loadinfo.h b/intl/loadinfo.h
index 7563624..8004233 100644
--- a/intl/loadinfo.h
+++ b/intl/loadinfo.h
@@ -58,7 +58,7 @@ struct loaded_l10nfile
   const void *data;
 
   struct loaded_l10nfile *next;
-  struct loaded_l10nfile *successor[1];
+  struct loaded_l10nfile *successor[1] __attribute__((bnd_variable_size));
 };
 
 
diff --git a/io/fts.h b/io/fts.h
index 0a070ba..93f94f8 100644
--- a/io/fts.h
+++ b/io/fts.h
@@ -116,7 +116,7 @@ typedef struct _ftsent {
 	u_short fts_instr;		/* fts_set() instructions */
 
 	struct stat *fts_statp;		/* stat(2) information */
-	char fts_name[1];		/* file name */
+	char fts_name[1] __attribute__((bnd_variable_size));		/* file name */
 } FTSENT;
 
 __BEGIN_DECLS
diff --git a/locale/localeinfo.h b/locale/localeinfo.h
index 3142726..8dbb598 100644
--- a/locale/localeinfo.h
+++ b/locale/localeinfo.h
@@ -84,7 +84,7 @@ struct __locale_data
     const char *string;
     unsigned int word;		/* Note endian issues vs 64-bit pointers.  */
   }
-  values __flexarr;	/* Items, usually pointers into `filedata'.  */
+  values __flexarr __attribute__((bnd_variable_size));	/* Items, usually pointers into `filedata'.  */
 };
 
 /* We know three kinds of collation sorting rules.  */
@@ -185,7 +185,7 @@ extern const union catnamestr_t
 #include "categories.def"
 #undef DEFINE_CATEGORY
   };
-  char str[0];
+  char str[0] __attribute__((bnd_variable_size));
 } _nl_category_names attribute_hidden;
 extern const uint8_t _nl_category_name_idxs[__LC_LAST] attribute_hidden;
 extern const uint8_t _nl_category_name_sizes[__LC_LAST] attribute_hidden;
diff --git a/misc/search.h b/misc/search.h
index e3b3dfd..63a7768 100644
--- a/misc/search.h
+++ b/misc/search.h
@@ -35,7 +35,7 @@ struct qelem
   {
     struct qelem *q_forw;
     struct qelem *q_back;
-    char q_data[1];
+    char q_data[1] __attribute__((bnd_variable_size));
   };
 # endif
 
diff --git a/nptl/descr.h b/nptl/descr.h
index 58176ea..a175bb0 100644
--- a/nptl/descr.h
+++ b/nptl/descr.h
@@ -162,7 +162,7 @@ struct pthread
   };
 
   /* This descriptor's link on the `stack_used' or `__stack_user' list.  */
-  list_t list;
+  list_t list __attribute__((bnd_variable_size));
 
   /* Thread ID - which is also a 'is this thread descriptor (and
      therefore stack) used' flag.  */
@@ -174,7 +174,10 @@ struct pthread
   /* List of robust mutexes the thread is holding.  */
 #ifdef __PTHREAD_MUTEX_HAVE_PREV
   void *robust_prev;
-  struct robust_list_head robust_head;
+  struct robust_list_head robust_head __attribute__((bnd_variable_size));
+  /* sometimes we want to cast pair {robust_prev (void *) and the
+   * first field of struct robust_list_head (void *)}
+   * to __pthread_list_t (struct consists of two pointers: __prev, __next) */
 
   /* The list above is strange.  It is basically a double linked list
      but the pointer to the next/previous element of the list points
@@ -186,7 +189,7 @@ struct pthread
 # define ENQUEUE_MUTEX_BOTH(mutex, val)					      \
   do {									      \
     __pthread_list_t *next = (__pthread_list_t *)			      \
-      ((((uintptr_t) THREAD_GETMEM (THREAD_SELF, robust_head.list)) & ~1ul)   \
+      ((char *)(((uintptr_t) THREAD_GETMEM (THREAD_SELF, robust_head.list)) & ~1ul)   \
        - QUEUE_PTR_ADJUST);						      \
     next->__prev = (void *) &mutex->__data.__list.__next;		      \
     mutex->__data.__list.__next = THREAD_GETMEM (THREAD_SELF,		      \
diff --git a/nptl/sysdeps/unix/sysv/linux/x86/bits/pthreadtypes.h b/nptl/sysdeps/unix/sysv/linux/x86/bits/pthreadtypes.h
index 28b49bd..0adb200 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86/bits/pthreadtypes.h
+++ b/nptl/sysdeps/unix/sysv/linux/x86/bits/pthreadtypes.h
@@ -75,7 +75,7 @@ typedef union pthread_attr_t pthread_attr_t;
 typedef struct __pthread_internal_list
 {
   struct __pthread_internal_list *__prev;
-  struct __pthread_internal_list *__next;
+  struct __pthread_internal_list *__next __attribute__((bnd_variable_size));
 } __pthread_list_t;
 #else
 typedef struct __pthread_internal_slist
diff --git a/stdio-common/psiginfo-define.h b/stdio-common/psiginfo-define.h
index e1d1a35..d76cb6b 100644
--- a/stdio-common/psiginfo-define.h
+++ b/stdio-common/psiginfo-define.h
@@ -3,7 +3,7 @@ static const union C(codestrs_t_, NOW) {
 #define P(n, s) char MF(__LINE__)[sizeof (s)];
 #include "psiginfo-data.h"
   };
-  char str[0];
+  char str[0] __attribute__((bnd_variable_size));
 } C(codestrs_, NOW) = { {
 #define P(n, s) s,
 #include "psiginfo-data.h"
diff --git a/sunrpc/clnt_udp.c b/sunrpc/clnt_udp.c
index 1b6a20b..eca7122 100644
--- a/sunrpc/clnt_udp.c
+++ b/sunrpc/clnt_udp.c
@@ -96,7 +96,7 @@ struct cu_data
     u_int cu_sendsz;
     char *cu_outbuf;
     u_int cu_recvsz;
-    char cu_inbuf[1];
+    char cu_inbuf[1] __attribute__((bnd_variable_size)) ;
   };
 
 /*
diff --git a/sysdeps/gnu/netinet/ip_icmp.h b/sysdeps/gnu/netinet/ip_icmp.h
index 136fb47..5c2cb0c 100644
--- a/sysdeps/gnu/netinet/ip_icmp.h
+++ b/sysdeps/gnu/netinet/ip_icmp.h
@@ -189,7 +189,7 @@ struct icmp
     } id_ip;
     struct icmp_ra_addr id_radv;
     u_int32_t   id_mask;
-    u_int8_t    id_data[1];
+    u_int8_t    id_data[1] __attribute__((bnd_variable_size));
   } icmp_dun;
 #define	icmp_otime	icmp_dun.id_ts.its_otime
 #define	icmp_rtime	icmp_dun.id_ts.its_rtime
diff --git a/sysdeps/unix/sysv/linux/bits/sched.h b/sysdeps/unix/sysv/linux/bits/sched.h
index e42dee8..5086380 100644
--- a/sysdeps/unix/sysv/linux/bits/sched.h
+++ b/sysdeps/unix/sysv/linux/bits/sched.h
@@ -124,7 +124,7 @@ typedef unsigned long int __cpu_mask;
 /* Data structure to describe CPU mask.  */
 typedef struct
 {
-  __cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS];
+  __cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS] __attribute__((bnd_variable_size));
 } cpu_set_t;
 
 /* Access functions for CPU masks.  */
diff --git a/sysvipc/sys/msg.h b/sysvipc/sys/msg.h
index a0b38f0..c06424f 100644
--- a/sysvipc/sys/msg.h
+++ b/sysvipc/sys/msg.h
@@ -51,7 +51,7 @@ typedef __ssize_t ssize_t;
 struct msgbuf
   {
     __syscall_slong_t mtype;	/* type of received/sent message */
-    char mtext[1];		/* text of the message */
+    char mtext[1] __attribute__((bnd_variable_size));		/* text of the message */
   };
 #endif
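
The attribute tells the bounds-checking compiler not to narrow a pointer to
such a member down to its declared [1] (or [0]) size: the trailing array is
really as long as the enclosing allocation, so indexing past element 0
should be checked against the malloc'ed size instead.  A self-contained
sketch (struct msg and msg_new are illustrative; the macro keeps it
compiling on non-MPX compilers):

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#ifdef __CHKP__
# define BND_VARIABLE_SIZE __attribute__ ((bnd_variable_size))
#else
# define BND_VARIABLE_SIZE      /* attribute only matters under MPX */
#endif

struct msg
{
  size_t len;
  char text[1] BND_VARIABLE_SIZE;  /* actually len bytes long */
};

static struct msg *
msg_new (const char *s)
{
  size_t len = strlen (s) + 1;
  struct msg *m = malloc (offsetof (struct msg, text) + len);
  if (m != NULL)
    {
      m->len = len;
      memcpy (m->text, s, len);    /* checked against the allocation */
    }
  return m;
}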
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5d92ac866acfc532a4d49af0b1c2b69c260c0ce3

commit 5d92ac866acfc532a4d49af0b1c2b69c260c0ce3
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Mon Jan 21 15:35:12 2013 +0400

    Use C code instead of inline assembler in macros of tls.h for i386 (for Intel MPX only).

diff --git a/nptl/sysdeps/i386/tls.h b/nptl/sysdeps/i386/tls.h
index 3d18b1d..bf30088 100644
--- a/nptl/sysdeps/i386/tls.h
+++ b/nptl/sysdeps/i386/tls.h
@@ -259,11 +259,24 @@ union user_desc_init
    assignments like
 	pthread_descr self = thread_self();
    do not get optimized away.  */
-# define THREAD_SELF \
+
+# ifndef __CHKP__
+#  define THREAD_SELF \
   ({ struct pthread *__self;						      \
      asm ("movl %%gs:%c1,%0" : "=r" (__self)				      \
 	  : "i" (offsetof (struct pthread, header.self)));		      \
      __self;})
+# else
+#  define THREAD_SELF \
+  ({ struct pthread *__self;						      \
+     asm ("movl %%gs:%c1,%0" : "=r" (__self)				      \
+	  : "i" (offsetof (struct pthread, header.self)));	 	      \
+    /* Set first minimum bounds to make possible reading stackblock and stackblock_size. */ \
+	  __self = __bnd_set_ptr_bounds(__self, TLS_INIT_TCB_SIZE);	 	      \
+    /* Set actual correct bounds. */ \
+     (struct pthread*) __bnd_copy_ptr_bounds(__self, __bnd_set_ptr_bounds(__self->stackblock, \
+      __self->stackblock_size)); })
+# endif
 
 /* Magic for libthread_db to know how to do THREAD_SELF.  */
 # define DB_THREAD_SELF \
@@ -272,7 +285,8 @@ union user_desc_init
 
 
 /* Read member of the thread descriptor directly.  */
-# define THREAD_GETMEM(descr, member) \
+# ifndef __CHKP__
+#  define THREAD_GETMEM(descr, member) \
   ({ __typeof (descr->member) __value;					      \
      if (sizeof (__value) == 1)						      \
        asm volatile ("movb %%gs:%P2,%b0"				      \
@@ -296,10 +310,15 @@ union user_desc_init
 			 "i" (offsetof (struct pthread, member) + 4));	      \
        }								      \
      __value; })
-
+# else
+#  define THREAD_GETMEM(descr, member) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->member; })
+# endif
 
 /* Same as THREAD_GETMEM, but the member offset can be non-constant.  */
-# define THREAD_GETMEM_NC(descr, member, idx) \
+# ifndef __CHKP__
+#  define THREAD_GETMEM_NC(descr, member, idx) \
   ({ __typeof (descr->member[0]) __value;				      \
      if (sizeof (__value) == 1)						      \
        asm volatile ("movb %%gs:%P2(%3),%b0"				      \
@@ -325,10 +344,15 @@ union user_desc_init
 			  "r" (idx));					      \
        }								      \
      __value; })
-
+# else
+#  define THREAD_GETMEM_NC(descr, member, idx) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->member[idx]; })
+# endif
 
 /* Same as THREAD_SETMEM, but the member offset can be non-constant.  */
-# define THREAD_SETMEM(descr, member, value) \
+# ifndef __CHKP__
+#  define THREAD_SETMEM(descr, member, value) \
   ({ if (sizeof (descr->member) == 1)					      \
        asm volatile ("movb %b0,%%gs:%P1" :				      \
 		     : "iq" (value),					      \
@@ -350,10 +374,15 @@ union user_desc_init
 			 "i" (offsetof (struct pthread, member)),	      \
 			 "i" (offsetof (struct pthread, member) + 4));	      \
        }})
-
+# else
+#  define THREAD_SETMEM(descr, member, value) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->member = value; })
+#endif
 
 /* Set member of the thread descriptor directly.  */
-# define THREAD_SETMEM_NC(descr, member, idx, value) \
+# ifndef __CHKP__
+#  define THREAD_SETMEM_NC(descr, member, idx, value) \
   ({ if (sizeof (descr->member[0]) == 1)				      \
        asm volatile ("movb %b0,%%gs:%P1(%2)" :				      \
 		     : "iq" (value),					      \
@@ -377,7 +406,11 @@ union user_desc_init
 			 "i" (offsetof (struct pthread, member)),	      \
 			 "r" (idx));					      \
        }})
-
+# else
+#  define THREAD_SETMEM_NC(descr, member, idx, value) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->member[idx] = value; })
+# endif
 
 /* Atomic compare and exchange on TLS, returning old value.  */
 #define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \
@@ -417,7 +450,8 @@ union user_desc_init
 
 
 /* Call the user-provided thread function.  */
-#define CALL_THREAD_FCT(descr) \
+#ifndef __CHKP__
+# define CALL_THREAD_FCT(descr) \
   ({ void *__res;							      \
      int __ignore1, __ignore2;						      \
      asm volatile ("pushl %%eax\n\t"					      \
@@ -430,7 +464,11 @@ union user_desc_init
 		   : "i" (offsetof (struct pthread, start_routine)),	      \
 		     "i" (offsetof (struct pthread, arg)));		      \
      __res; })
-
+# else
+#  define CALL_THREAD_FCT(descr) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->start_routine(__self->arg); })
+# endif
 
 /* Set the stack guard field in TCB head.  */
 #define THREAD_SET_STACK_GUARD(value) \

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=022ff6731b1e0938048df6904d40e9bf876625b1

commit 022ff6731b1e0938048df6904d40e9bf876625b1
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Sat Nov 10 12:22:56 2012 +0400

    Use C code instead of inline assembler in macros of tls.h for x86_64 (for Intel MPX only).

diff --git a/nptl/sysdeps/x86_64/tls.h b/nptl/sysdeps/x86_64/tls.h
index 61df1af..d3bf15a 100644
--- a/nptl/sysdeps/x86_64/tls.h
+++ b/nptl/sysdeps/x86_64/tls.h
@@ -89,6 +89,7 @@ typedef struct
 
 
 #ifndef __ASSEMBLER__
+
 /* Get system call information.  */
 # include <sysdep.h>
 
@@ -166,10 +167,15 @@ typedef struct
 
 
 /* Return the address of the dtv for the current thread.  */
-# define THREAD_DTV() \
+# ifndef __CHKP__
+#  define THREAD_DTV() \
   ({ struct pthread *__pd;						      \
      THREAD_GETMEM (__pd, header.dtv); })
-
+# else
+#  define THREAD_DTV() \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      GET_DTV(__self); })
+# endif
 
 /* Return the thread descriptor for the current thread.
 
@@ -177,18 +183,31 @@ typedef struct
    assignments like
 	pthread_descr self = thread_self();
    do not get optimized away.  */
-# define THREAD_SELF \
+# ifndef __CHKP__
+#  define THREAD_SELF \
   ({ struct pthread *__self;						      \
      asm ("mov %%fs:%c1,%0" : "=r" (__self)				      \
 	  : "i" (offsetof (struct pthread, header.self)));	 	      \
-     __self;})
+     __self; })
+# else
+#  define THREAD_SELF \
+  ({ struct pthread *__self;						      \
+     asm ("mov %%fs:%c1,%0" : "=r" (__self)				      \
+	  : "i" (offsetof (struct pthread, header.self)));	 	      \
+    /* Set first minimum bounds to make possible reading stackblock and stackblock_size. */ \
+	  __self = __bnd_set_ptr_bounds(__self, TLS_INIT_TCB_SIZE);	 	      \
+    /* Set actual correct bounds. */ \
+     (struct pthread*) __bnd_copy_ptr_bounds(__self, __bnd_set_ptr_bounds(__self->stackblock, \
+      __self->stackblock_size)); })
+# endif
 
 /* Magic for libthread_db to know how to do THREAD_SELF.  */
 # define DB_THREAD_SELF_INCLUDE  <sys/reg.h> /* For the FS constant.  */
 # define DB_THREAD_SELF CONST_THREAD_AREA (64, FS)
 
 /* Read member of the thread descriptor directly.  */
-# define THREAD_GETMEM(descr, member) \
+# ifndef __CHKP__
+#  define THREAD_GETMEM(descr, member) \
   ({ __typeof (descr->member) __value;					      \
      if (sizeof (__value) == 1)						      \
        asm volatile ("movb %%fs:%P2,%b0"				      \
@@ -202,7 +221,7 @@ typedef struct
        {								      \
 	 if (sizeof (__value) != 8)					      \
 	   /* There should not be any value with a size other than 1,	      \
-	      4 or 8.  */						      \
+	      4 or 8.  */ 						      \
 	   abort ();							      \
 									      \
 	 asm volatile ("movq %%fs:%P1,%q0"				      \
@@ -210,10 +229,15 @@ typedef struct
 		       : "i" (offsetof (struct pthread, member)));	      \
        }								      \
      __value; })
-
+# else
+#  define THREAD_GETMEM(descr, member) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->member; })
+# endif
 
 /* Same as THREAD_GETMEM, but the member offset can be non-constant.  */
-# define THREAD_GETMEM_NC(descr, member, idx) \
+# ifndef __CHKP__
+#  define THREAD_GETMEM_NC(descr, member, idx) \
   ({ __typeof (descr->member[0]) __value;				      \
      if (sizeof (__value) == 1)						      \
        asm volatile ("movb %%fs:%P2(%q3),%b0"				      \
@@ -228,7 +252,7 @@ typedef struct
        {								      \
 	 if (sizeof (__value) != 8)					      \
 	   /* There should not be any value with a size other than 1,	      \
-	      4 or 8.  */						      \
+	      4 or 8.  */  						      \
 	   abort ();							      \
 									      \
 	 asm volatile ("movq %%fs:%P1(,%q2,8),%q0"			      \
@@ -237,7 +261,11 @@ typedef struct
 			 "r" (idx));					      \
        }								      \
      __value; })
-
+# else
+#  define THREAD_GETMEM_NC(descr, member, idx) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->member[idx]; })
+# endif
 
 /* Loading addresses of objects on x86-64 needs to be treated special
    when generating PIC code.  */
@@ -249,7 +277,8 @@ typedef struct
 
 
 /* Same as THREAD_SETMEM, but the member offset can be non-constant.  */
-# define THREAD_SETMEM(descr, member, value) \
+# ifndef __CHKP__
+#  define THREAD_SETMEM(descr, member, value) \
   ({ if (sizeof (descr->member) == 1)					      \
        asm volatile ("movb %b0,%%fs:%P1" :				      \
 		     : "iq" (value),					      \
@@ -262,17 +291,22 @@ typedef struct
        {								      \
 	 if (sizeof (descr->member) != 8)				      \
 	   /* There should not be any value with a size other than 1,	      \
-	      4 or 8.  */						      \
+	      4 or 8.	*/					      \
 	   abort ();							      \
 									      \
 	 asm volatile ("movq %q0,%%fs:%P1" :				      \
 		       : IMM_MODE ((uint64_t) cast_to_integer (value)),	      \
 			 "i" (offsetof (struct pthread, member)));	      \
        }})
-
+# else
+#  define THREAD_SETMEM(descr, member, value) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->member = value; })
+# endif
 
 /* Set member of the thread descriptor directly.  */
-# define THREAD_SETMEM_NC(descr, member, idx, value) \
+# ifndef __CHKP__
+#  define THREAD_SETMEM_NC(descr, member, idx, value) \
   ({ if (sizeof (descr->member[0]) == 1)				      \
        asm volatile ("movb %b0,%%fs:%P1(%q2)" :				      \
 		     : "iq" (value),					      \
@@ -287,7 +321,7 @@ typedef struct
        {								      \
 	 if (sizeof (descr->member[0]) != 8)				      \
 	   /* There should not be any value with a size other than 1,	      \
-	      4 or 8.  */						      \
+	      4 or 8.	*/				      \
 	   abort ();							      \
 									      \
 	 asm volatile ("movq %q0,%%fs:%P1(,%q2,8)" :			      \
@@ -295,7 +329,11 @@ typedef struct
 			 "i" (offsetof (struct pthread, member[0])),	      \
 			 "r" (idx));					      \
        }})
-
+# else
+#  define THREAD_SETMEM_NC(descr, member, idx, value) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->member[idx] = value; })
+# endif
 
 /* Atomic compare and exchange on TLS, returning old value.  */
 # define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \
@@ -333,8 +371,8 @@ typedef struct
 	      /* Not necessary for other sizes in the moment.  */	      \
 	      abort (); })
 
-
-# define CALL_THREAD_FCT(descr) \
+# ifndef __CHKP__
+#  define CALL_THREAD_FCT(descr) \
   ({ void *__res;							      \
      asm volatile ("movq %%fs:%P2, %%rdi\n\t"				      \
 		   "callq *%%fs:%P1"					      \
@@ -344,7 +382,11 @@ typedef struct
 		   : "di", "si", "cx", "dx", "r8", "r9", "r10", "r11",	      \
 		     "memory", "cc");					      \
      __res; })
-
+# else
+#  define CALL_THREAD_FCT(descr) \
+   ({ struct pthread *__self = THREAD_SELF;  \
+      __self->start_routine(__self->arg); })
+# endif
 
 /* Set the stack guard field in TCB head.  */
 # define THREAD_SET_STACK_GUARD(value) \
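
THREAD_SELF has a bootstrap problem under MPX: the %fs-relative load yields
a raw pointer with no useful bounds, yet the descriptor's true extent is
only recorded in fields inside it.  Hence the two-step shape: provisional
bounds of TLS_INIT_TCB_SIZE, just wide enough to read stackblock and
stackblock_size, then the real bounds taken from those fields.  The same
shape in plain C (struct pthread_like, TCB_MIN and raw_self are
illustrative stand-ins for the real descriptor, size and %fs load):

#include <stddef.h>

struct pthread_like
{
  void *self;
  void *stackblock;          /* start of the descriptor's allocation */
  size_t stackblock_size;    /* its true extent */
};

/* Stands in for TLS_INIT_TCB_SIZE.  */
#define TCB_MIN sizeof (struct pthread_like)

static struct pthread_like *
thread_self (struct pthread_like *raw_self)
{
#ifdef __CHKP__
  /* Step 1: minimal bounds, enough to read the two fields.  */
  struct pthread_like *p = __bnd_set_ptr_bounds (raw_self, TCB_MIN);
  /* Step 2: real bounds recorded in the descriptor itself.  */
  return (struct pthread_like *) __bnd_copy_ptr_bounds
    (p, __bnd_set_ptr_bounds (p->stackblock, p->stackblock_size));
#else
  return raw_self;
#endif
}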

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=558bf1c0479495b1b7759bbe58b5f0b455fe7b46

commit 558bf1c0479495b1b7759bbe58b5f0b455fe7b46
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Wed Oct 24 16:00:49 2012 +0400

    Intel MPX support for the mmap and mremap syscall wrappers for x86_32 and x86_64.
    Create bounds.
    Use a C syscall wrapper instead of the assembler wrapper for x86_64.

diff --git a/sysdeps/unix/sysv/linux/i386/Makefile b/sysdeps/unix/sysv/linux/i386/Makefile
index acc3021..f38f4b2 100644
--- a/sysdeps/unix/sysv/linux/i386/Makefile
+++ b/sysdeps/unix/sysv/linux/i386/Makefile
@@ -2,7 +2,7 @@
 default-abi := 32
 
 ifeq ($(subdir),misc)
-sysdep_routines += ioperm iopl vm86 call_pselect6 call_fallocate
+sysdep_routines += ioperm iopl vm86 call_pselect6 call_fallocate mremap
 endif
 
 ifeq ($(subdir),elf)
diff --git a/sysdeps/unix/sysv/linux/i386/mmap.S b/sysdeps/unix/sysv/linux/i386/mmap.S
index 0addf65..035a698 100644
--- a/sysdeps/unix/sysv/linux/i386/mmap.S
+++ b/sysdeps/unix/sysv/linux/i386/mmap.S
@@ -74,6 +74,11 @@ L(skip):
 	ja SYSCALL_ERROR_LABEL
 
 	/* Successful; return the syscall's value.  */
+	mov 8(%esp), %ecx
+#ifdef __CHKP__
+	bndmk -1(%eax, %ecx), %bnd0
+#endif
+
 	ret
 
 PSEUDO_END (__mmap)
diff --git a/sysdeps/unix/sysv/linux/i386/mmap64.S b/sysdeps/unix/sysv/linux/i386/mmap64.S
index 31a0f67..8b44c6e 100644
--- a/sysdeps/unix/sysv/linux/i386/mmap64.S
+++ b/sysdeps/unix/sysv/linux/i386/mmap64.S
@@ -89,6 +89,10 @@ L(do_syscall):
 	ja SYSCALL_ERROR_LABEL
 
 	/* Successful; return the syscall's value.  */
+	mov 8(%esp), %ecx
+#ifdef __CHKP__
+	bndmk -1(%eax, %ecx), %bnd0
+#endif
 	ret
 
 	cfi_adjust_cfa_offset (16)
diff --git a/sysdeps/unix/sysv/linux/i386/mremap.c b/sysdeps/unix/sysv/linux/i386/mremap.c
new file mode 100644
index 0000000..ad55d9d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/i386/mremap.c
@@ -0,0 +1,36 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sysdeps/unix/sysv/linux/i386/sysdep.h>
+
+void *
+__mremap (void *old_address, size_t old_size, size_t new_size, int flags, ...)
+{
+  void *p = INLINE_SYSCALL (mremap, 4, old_address, old_size, new_size, flags);
+  if ((long) p == -1) return MAP_FAILED;
+#ifdef __CHKP__
+  return __bnd_set_ptr_bounds (p, new_size);
+#else
+  return p;
+#endif
+}
+
+weak_alias (__mremap, mremap)
diff --git a/sysdeps/unix/sysv/linux/x86_64/mmap.c b/sysdeps/unix/sysv/linux/x86_64/mmap.c
new file mode 100644
index 0000000..1ee6f96
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/mmap.c
@@ -0,0 +1,52 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sysdeps/unix/sysv/linux/x86_64/sysdep.h>
+
+void *
+__mmap (void *addr, size_t len, int prot, int flags, int fd, off_t offset)
+{
+  void *p = INLINE_SYSCALL (mmap, 6, addr, len, prot, flags, fd, offset);
+  if ((long) p == -1) return MAP_FAILED;
+#ifdef __CHKP__
+  return __bnd_set_ptr_bounds (p, len);
+#else
+  return p;
+#endif
+}
+
+weak_alias (__mmap, mmap64)
+weak_alias (__mmap, __mmap64)
+weak_alias (__mmap, mmap)
+
+void *
+__mremap (void *old_address, size_t old_size, size_t new_size, int flags, ...)
+{
+  void *p = INLINE_SYSCALL (mremap, 4, old_address, old_size, new_size, flags);
+  if ((long) p  == -1) return MAP_FAILED;
+#ifdef __CHKP__
+  return __bnd_set_ptr_bounds (p, new_size);
+#else
+  return p;
+#endif
+}
+
+weak_alias (__mremap, mremap)
diff --git a/sysdeps/unix/sysv/linux/x86_64/mmap64.c b/sysdeps/unix/sysv/linux/x86_64/mmap64.c
new file mode 100644
index 0000000..e69de29
diff --git a/sysdeps/unix/sysv/linux/x86_64/mremap.c b/sysdeps/unix/sysv/linux/x86_64/mremap.c
new file mode 100644
index 0000000..e69de29
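
With the C wrappers in place, a pointer returned by mmap carries bounds of
exactly len bytes (and mremap re-creates them for new_size), so any access
one past the end traps.  A small caller showing what the wrapper guarantees
(plain POSIX, nothing glibc-internal):

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

int
main (void)
{
  size_t len = 4096;
  char *p = mmap (NULL, len, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (p == MAP_FAILED)
    return 1;
  p[len - 1] = 'x';    /* last valid byte: in bounds */
  /* p[len] = 'x';        one past the end: #BR trap under MPX */
  return munmap (p, len);
}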

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=152d00ad923c702a8753499421baba027d0beffc

commit 152d00ad923c702a8753499421baba027d0beffc
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Wed Oct 10 19:28:57 2012 +0400

    Save/restore bounds in the x86_64 and x86_32 versions of _dl_runtime_resolve.

diff --git a/sysdeps/i386/dl-trampoline.S b/sysdeps/i386/dl-trampoline.S
index 945708f..5f3acb6 100644
--- a/sysdeps/i386/dl-trampoline.S
+++ b/sysdeps/i386/dl-trampoline.S
@@ -31,9 +31,29 @@ _dl_runtime_resolve:
 	cfi_adjust_cfa_offset (4)
 	pushl %edx
 	cfi_adjust_cfa_offset (4)
+#ifndef __CHKP__
 	movl 16(%esp), %edx	# Copy args pushed by PLT in register.  Note
 	movl 12(%esp), %eax	# that `fixup' takes its parameters in regs.
+#else
+	subl $32, %esp
+	cfi_adjust_cfa_offset (32)
+	bndmov %bnd0, (%esp)
+	bndmov %bnd1, 8(%esp)
+	bndmov %bnd2, 16(%esp)
+	bndmov %bnd3, 24(%esp)
+	movl 48(%esp), %edx	# Copy args pushed by PLT in register.  Note
+	movl 44(%esp), %eax	# that `fixup' takes its parameters in regs.
+	bndldx 44(%esp, %eax), %bnd0
+#endif
 	call _dl_fixup		# Call resolver.
+#ifdef __CHKP__
+	bndmov 24(%esp), %bnd3
+	bndmov 16(%esp), %bnd2
+	bndmov 8(%esp), %bnd1
+	bndmov (%esp), %bnd0
+	addl $32, %esp
+	cfi_adjust_cfa_offset (-32)
+#endif
 	popl %edx		# Get register content back.
 	cfi_adjust_cfa_offset (-4)
 	movl (%esp), %ecx
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index a25e390..e07c192 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -31,8 +31,13 @@
 	cfi_startproc
 _dl_runtime_resolve:
 	cfi_adjust_cfa_offset(16) # Incorporate PLT
+#ifndef __CHKP__
 	subq $56,%rsp
 	cfi_adjust_cfa_offset(56)
+#else
+	subq $120,%rsp
+	cfi_adjust_cfa_offset(120)
+#endif
 	movq %rax,(%rsp)	# Preserve registers otherwise clobbered.
 	movq %rcx, 8(%rsp)
 	movq %rdx, 16(%rsp)
@@ -40,10 +45,26 @@ _dl_runtime_resolve:
 	movq %rdi, 32(%rsp)
 	movq %r8, 40(%rsp)
 	movq %r9, 48(%rsp)
+#ifndef __CHKP__
 	movq 64(%rsp), %rsi	# Copy args pushed by PLT in register.
 	movq 56(%rsp), %rdi	# %rdi: link_map, %rsi: reloc_index
+#else
+	bndmov %bnd0, 56(%rsp)
+	bndmov %bnd1, 72(%rsp)
+	bndmov %bnd2, 88(%rsp)
+	bndmov %bnd3, 104(%rsp)
+	movq 128(%rsp), %rsi	# Copy args pushed by PLT in register.
+	movq 120(%rsp), %rdi	# %rdi: link_map, %rsi: reloc_index
+	bndldx 120(%rsp, %rdi), %bnd0
+#endif
 	call _dl_fixup		# Call resolver.
 	movq %rax, %r11		# Save return value
+#ifdef __CHKP__
+	bndmov 104(%rsp), %bnd3
+	bndmov 88(%rsp), %bnd2
+	bndmov 72(%rsp), %bnd1
+	bndmov 56(%rsp), %bnd0
+#endif
 	movq 48(%rsp), %r9	# Get register content back.
 	movq 40(%rsp), %r8
 	movq 32(%rsp), %rdi
@@ -51,8 +72,13 @@ _dl_runtime_resolve:
 	movq 16(%rsp), %rdx
 	movq 8(%rsp), %rcx
 	movq (%rsp), %rax
+#ifndef __CHKP__
 	addq $72, %rsp		# Adjust stack(PLT did 2 pushes)
 	cfi_adjust_cfa_offset(-72)
+#else
+	addq $136, %rsp		# Adjust stack(PLT did 2 pushes)
+	cfi_adjust_cfa_offset(-136)
+#endif
 	jmp *%r11		# Jump to function address.
 	cfi_endproc
 	.size _dl_runtime_resolve, .-_dl_runtime_resolve

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3e938a73037cfad920442148c39208d321248485

commit 3e938a73037cfad920442148c39208d321248485
Author: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
Date:   Mon Jul 23 19:39:27 2012 +0400

    Add Intel MPX support to malloc allocator.

diff --git a/malloc/arena.c b/malloc/arena.c
index 12a48ad..2aaf1b8 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -131,9 +131,15 @@ int __malloc_initialized = -1;
 #endif
 
 /* find the heap and corresponding arena for a given ptr */
-
-#define heap_for_ptr(ptr) \
+#ifndef __CHKP__
+# define heap_for_ptr(ptr) \
  ((heap_info *)((unsigned long)(ptr) & ~(HEAP_MAX_SIZE-1)))
+#else
+static heap_info *heap_for_ptr (void *ptr) {
+  heap_info *t = (heap_info *)((unsigned long)(ptr) & ~(HEAP_MAX_SIZE-1));
+  return __bnd_set_ptr_bounds(t, sizeof(heap_info));
+}
+#endif
 #define arena_for_chunk(ptr) \
  (chunk_non_main_arena(ptr) ? heap_for_ptr(ptr)->ar_ptr : &main_arena)
 
diff --git a/malloc/hooks.c b/malloc/hooks.c
index 8c25846..c5c682f 100644
--- a/malloc/hooks.c
+++ b/malloc/hooks.c
@@ -171,6 +171,9 @@ mem2chunk_check(void* mem, unsigned char **magic_p)
 			    next_chunk(prev_chunk(p))!=p) ))
       return NULL;
     magic = MAGICBYTE(p);
+#ifdef __CHKP__
+	 p = (mchunkptr) __bnd_set_ptr_bounds(p, sz + SIZE_SZ);
+#endif
     for(sz += SIZE_SZ-1; (c = ((unsigned char*)p)[sz]) != magic; sz -= c) {
       if(c<=0 || sz<(c+2*SIZE_SZ)) return NULL;
     }
diff --git a/malloc/malloc.c b/malloc/malloc.c
index dd295f5..e8fe610 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -218,7 +218,6 @@
 
 #include <malloc-machine.h>
 #include <malloc-sysdep.h>
-
 #include <atomic.h>
 #include <_itoa.h>
 #include <bits/wordsize.h>
@@ -1222,11 +1221,12 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 /*
   ---------- Size and alignment checks and conversions ----------
 */
-
+#ifndef __CHKP__
 /* conversion from malloc headers to user pointers, and back */
 
-#define chunk2mem(p)   ((void*)((char*)(p) + 2*SIZE_SZ))
-#define mem2chunk(mem) ((mchunkptr)((char*)(mem) - 2*SIZE_SZ))
+# define chunk2mem(p)   ((void*)((char*)(p) + 2*SIZE_SZ))
+# define mem2chunk(mem) ((mchunkptr)((char*)(mem) - 2*SIZE_SZ))
+#endif
 
 /* The smallest possible chunk */
 #define MIN_CHUNK_SIZE        (offsetof(struct malloc_chunk, fd_nextsize))
@@ -1239,12 +1239,11 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 /* Check if m has acceptable alignment */
 
 #define aligned_OK(m)  (((unsigned long)(m) & MALLOC_ALIGN_MASK) == 0)
+#define aligned_chunk_OK(p)  (((unsigned long)((char *)(p) + 2 * SIZE_SZ) & MALLOC_ALIGN_MASK) == 0)
 
 #define misaligned_chunk(p) \
-  ((uintptr_t)(MALLOC_ALIGNMENT == 2 * SIZE_SZ ? (p) : chunk2mem (p)) \
+  ((uintptr_t)(MALLOC_ALIGNMENT == 2 * SIZE_SZ ? (p) : ((char *)(p)  + 2 * SIZE_SZ)) \
    & MALLOC_ALIGN_MASK)
-
-
 /*
    Check if a request is so large that it would wrap around zero when
    padded and aligned. To simplify some other code, the bound is made
@@ -1312,49 +1311,116 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 /* Get size, ignoring use bits */
 #define chunksize(p)         ((p)->size & ~(SIZE_BITS))
 
+#ifdef __CHKP__
+static void* chunk2mem (void *p) {
+  void *t = p + 2 * SIZE_SZ;
+  if (chunk_is_mmapped((mchunkptr)p))
+    return (void *) __bnd_narrow_ptr_bounds (t, t, chunksize((mchunkptr)p) - 2 * SIZE_SZ);
+  /* prev_size field of the next chunk can be used */
+  return (void *) __bnd_set_ptr_bounds(t, chunksize((mchunkptr)p) - SIZE_SZ);
+}
+
+static mchunkptr mem2chunk(void *mem) {
+  mchunkptr temp = (mchunkptr)((char *)(mem) - 2 * SIZE_SZ);
+  temp = __bnd_set_ptr_bounds (temp, sizeof (struct malloc_chunk));
+  return  __bnd_set_ptr_bounds (temp, chunksize(temp) > sizeof(struct malloc_chunk) ?
+         chunksize(temp) : sizeof(struct malloc_chunk));
+}
+
+static mchunkptr next_chunk (mchunkptr p) {
+  mchunkptr temp = (mchunkptr)((char*) p + chunksize(p));
+  return (mchunkptr) __bnd_set_ptr_bounds ((void *) temp, sizeof(struct malloc_chunk));
+}
 
-/* Ptr to next physical malloc_chunk. */
-#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->size & ~SIZE_BITS) ))
+static mchunkptr prev_chunk (mchunkptr p) {
+  mchunkptr temp = (mchunkptr)((char*) p - p->prev_size);
+  return (mchunkptr) __bnd_set_ptr_bounds ((void *) temp, sizeof(struct malloc_chunk));
+}
+
+static mchunkptr chunk_at_offset (mchunkptr p, INTERNAL_SIZE_T s) {
+  mchunkptr temp = (mchunkptr)((char*) p + s);
+  return (mchunkptr) __bnd_set_ptr_bounds ((void *) temp, sizeof(struct malloc_chunk));
+}
+
+static int inuse (mchunkptr p) {
+  return next_chunk(p)->size & PREV_INUSE;
+}
+
+static int  inuse_bit_at_offset (mchunkptr p, INTERNAL_SIZE_T s) {
+  return chunk_at_offset(p, s)->size & PREV_INUSE;
+}
+
+static void set_inuse_bit_at_offset (mchunkptr p, INTERNAL_SIZE_T s) {
+  chunk_at_offset(p, s)->size |= PREV_INUSE;
+}
+
+static void clear_inuse_bit_at_offset (mchunkptr p, INTERNAL_SIZE_T s) {
+  chunk_at_offset(p, s)->size &= ~(PREV_INUSE);
+}
+
+/* Set size at head, without disturbing its use bit */
+# define set_head_size(p, s)                                              \
+{                                                                        \
+  (p) = (__typeof(p)) __bnd_set_ptr_bounds ((void *) (p), (s) > sizeof (__typeof(p)) ? \
+        (s) : sizeof(__typeof(p)));                              \
+  (p)->size = ((p)->size & SIZE_BITS) | (s);                             \
+}
+
+/* Set size/use field */
+# define set_head(p, s)                                           \
+({                                                                 \
+  (p) = (__typeof(p)) __bnd_set_ptr_bounds ((void *) (p),          \
+       ((s) & ~(SIZE_BITS)) > sizeof (__typeof(p)) ?               \
+            ((s) & ~(SIZE_BITS)): sizeof (__typeof(p)));           \
+  (p)->size = (s);                                                 \
+})
+
+/* Set size at footer (only when chunk is not in use) */
+static void set_foot (mchunkptr p, INTERNAL_SIZE_T s) {
+   chunk_at_offset(p, s)->prev_size = s;
+}
+#else
+/* Ptr to next physical malloc_chunk. */
+# define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->size & ~SIZE_BITS) ))
 
 /* Ptr to previous physical malloc_chunk */
-#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_size) ))
+# define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_size) ))
 
 /* Treat space at ptr + offset as a chunk */
-#define chunk_at_offset(p, s)  ((mchunkptr)(((char*)(p)) + (s)))
+# define chunk_at_offset(p, s)  ((mchunkptr)(((char*)(p)) + (s)))
 
 /* extract p's inuse bit */
-#define inuse(p)\
+# define inuse(p)\
 ((((mchunkptr)(((char*)(p))+((p)->size & ~SIZE_BITS)))->size) & PREV_INUSE)
 
 /* set/clear chunk as being inuse without otherwise disturbing */
-#define set_inuse(p)\
+# define set_inuse(p)\
 ((mchunkptr)(((char*)(p)) + ((p)->size & ~SIZE_BITS)))->size |= PREV_INUSE
 
-#define clear_inuse(p)\
+# define clear_inuse(p)\
 ((mchunkptr)(((char*)(p)) + ((p)->size & ~SIZE_BITS)))->size &= ~(PREV_INUSE)
 
 
 /* check/set/clear inuse bits in known places */
-#define inuse_bit_at_offset(p, s)\
+# define inuse_bit_at_offset(p, s)\
  (((mchunkptr)(((char*)(p)) + (s)))->size & PREV_INUSE)
 
-#define set_inuse_bit_at_offset(p, s)\
+# define set_inuse_bit_at_offset(p, s)\
  (((mchunkptr)(((char*)(p)) + (s)))->size |= PREV_INUSE)
 
-#define clear_inuse_bit_at_offset(p, s)\
+# define clear_inuse_bit_at_offset(p, s)\
  (((mchunkptr)(((char*)(p)) + (s)))->size &= ~(PREV_INUSE))
 
 
 /* Set size at head, without disturbing its use bit */
-#define set_head_size(p, s)  ((p)->size = (((p)->size & SIZE_BITS) | (s)))
+# define set_head_size(p, s)  ((p)->size = (((p)->size & SIZE_BITS) | (s)))
 
 /* Set size/use field */
-#define set_head(p, s)       ((p)->size = (s))
+# define set_head(p, s)       ((p)->size = (s))
 
 /* Set size at footer (only when chunk is not in use) */
-#define set_foot(p, s)       (((mchunkptr)((char*)(p) + (s)))->prev_size = (s))
-
-
+# define set_foot(p, s)       (((mchunkptr)((char*)(p) + (s)))->prev_size = (s))
+#endif
 /*
   -------------------- Internal data structures --------------------
 
@@ -1945,7 +2011,7 @@ static void do_check_chunk(mstate av, mchunkptr p)
     /* chunk is page-aligned */
     assert(((p->prev_size + sz) & (GLRO(dl_pagesize)-1)) == 0);
     /* mem is aligned */
-    assert(aligned_OK(chunk2mem(p)));
+    assert(aligned_chunk_OK(p));
   }
 }
 
@@ -1968,7 +2034,7 @@ static void do_check_free_chunk(mstate av, mchunkptr p)
   if ((unsigned long)(sz) >= MINSIZE)
   {
     assert((sz & MALLOC_ALIGN_MASK) == 0);
-    assert(aligned_OK(chunk2mem(p)));
+    assert(aligned_chunk_OK(p));
     /* ... matching footer field */
     assert(next->prev_size == sz);
     /* ... and is fully consolidated */
@@ -2042,7 +2108,7 @@ static void do_check_remalloced_chunk(mstate av, mchunkptr p, INTERNAL_SIZE_T s)
   assert((sz & MALLOC_ALIGN_MASK) == 0);
   assert((unsigned long)(sz) >= MINSIZE);
   /* ... and alignment */
-  assert(aligned_OK(chunk2mem(p)));
+  assert(aligned_chunk_OK(p));
   /* chunk is less than MINSIZE more than request */
   assert((long)(sz) - (long)(s) >= 0);
   assert((long)(sz) - (long)(s + MINSIZE) < 0);
@@ -2313,16 +2379,16 @@ static void* sysmalloc(INTERNAL_SIZE_T nb, mstate av)
 	    /* For glibc, chunk2mem increases the address by 2*SIZE_SZ and
 	       MALLOC_ALIGN_MASK is 2*SIZE_SZ-1.  Each mmap'ed area is page
 	       aligned and therefore definitely MALLOC_ALIGN_MASK-aligned.  */
-	    assert (((INTERNAL_SIZE_T)chunk2mem(mm) & MALLOC_ALIGN_MASK) == 0);
+	    assert (((INTERNAL_SIZE_T)((void *)mm + 2 * SIZE_SZ) & MALLOC_ALIGN_MASK) == 0);
 	    front_misalign = 0;
 	  }
 	else
-	  front_misalign = (INTERNAL_SIZE_T)chunk2mem(mm) & MALLOC_ALIGN_MASK;
+	  front_misalign = (INTERNAL_SIZE_T)((void *)mm + 2 * SIZE_SZ) & MALLOC_ALIGN_MASK;
 	if (front_misalign > 0) {
 	  correction = MALLOC_ALIGNMENT - front_misalign;
 	  p = (mchunkptr)(mm + correction);
-	  p->prev_size = correction;
 	  set_head(p, (size - correction) |IS_MMAPPED);
+	  p->prev_size = correction;
 	}
 	else
 	  {
@@ -2349,7 +2415,11 @@ static void* sysmalloc(INTERNAL_SIZE_T nb, mstate av)
   /* Record incoming configuration of top */
 
   old_top  = av->top;
-  old_size = chunksize(old_top);
+  if (old_top == initial_top(av)) {
+		  old_size = 0;
+  } else {
+        old_size = chunksize(old_top);
+  }
   old_end  = (char*)(chunk_at_offset(old_top, old_size));
 
   brk = snd_brk = (char*)(MORECORE_FAILURE);
@@ -2399,9 +2469,9 @@ static void* sysmalloc(INTERNAL_SIZE_T nb, mstate av)
 	 become the top chunk again later.  Note that a footer is set
 	 up, too, although the chunk is marked in use. */
       old_size = (old_size - MINSIZE) & ~MALLOC_ALIGN_MASK;
-      set_head(chunk_at_offset(old_top, old_size + 2*SIZE_SZ), 0|PREV_INUSE);
+      chunk_at_offset(old_top, old_size + 2*SIZE_SZ)->size = 0|PREV_INUSE;
       if (old_size >= MINSIZE) {
-	set_head(chunk_at_offset(old_top, old_size), (2*SIZE_SZ)|PREV_INUSE);
+         chunk_at_offset(old_top, old_size)->size = (2*SIZE_SZ)|PREV_INUSE;
 	set_foot(chunk_at_offset(old_top, old_size), (2*SIZE_SZ));
 	set_head(old_top, old_size|PREV_INUSE|NON_MAIN_ARENA);
 	_int_free(av, old_top, 1);
@@ -2545,7 +2615,7 @@ static void* sysmalloc(INTERNAL_SIZE_T nb, mstate av)
 
 	/* Guarantee alignment of first new chunk made from this space */
 
-	front_misalign = (INTERNAL_SIZE_T)chunk2mem(brk) & MALLOC_ALIGN_MASK;
+	front_misalign = (INTERNAL_SIZE_T)((void *)brk + 2*SIZE_SZ) & MALLOC_ALIGN_MASK;
 	if (front_misalign > 0) {
 
 	  /*
@@ -2599,9 +2669,9 @@ static void* sysmalloc(INTERNAL_SIZE_T nb, mstate av)
       else {
 	if (MALLOC_ALIGNMENT == 2 * SIZE_SZ)
 	  /* MORECORE/mmap must correctly align */
-	  assert(((unsigned long)chunk2mem(brk) & MALLOC_ALIGN_MASK) == 0);
+	  assert(((unsigned long)((void *)brk + 2*SIZE_SZ) & MALLOC_ALIGN_MASK) == 0);
 	else {
-	  front_misalign = (INTERNAL_SIZE_T)chunk2mem(brk) & MALLOC_ALIGN_MASK;
+	  front_misalign = (INTERNAL_SIZE_T)((void *)brk + 2*SIZE_SZ) & MALLOC_ALIGN_MASK;
 	  if (front_misalign > 0) {
 
 	    /*
@@ -2676,8 +2746,12 @@ static void* sysmalloc(INTERNAL_SIZE_T nb, mstate av)
 
   /* finally, do the allocation */
   p = av->top;
-  size = chunksize(p);
-
+  if (p != initial_top(av)) {
+    size = chunksize(p);
+  }
+  else {
+    size = 0;
+  }
   /* check that one of the above allocation paths succeeded */
   if ((unsigned long)(size) >= (unsigned long)(nb + MINSIZE)) {
     remainder_size = size - nb;
@@ -2820,11 +2894,10 @@ mremap_chunk(mchunkptr p, size_t new_size)
 
   p = (mchunkptr)(cp + offset);
 
-  assert(aligned_OK(chunk2mem(p)));
-
-  assert((p->prev_size == offset));
+  assert(aligned_chunk_OK(p));
   set_head(p, (new_size - offset)|IS_MMAPPED);
 
+  assert((p->prev_size == offset));
   mp_.mmapped_mem -= size + offset;
   mp_.mmapped_mem += new_size;
   if ((unsigned long)mp_.mmapped_mem > (unsigned long)mp_.max_mmapped_mem)
@@ -2863,7 +2936,11 @@ __libc_malloc(size_t bytes)
     (void)mutex_unlock(&ar_ptr->mutex);
   assert(!victim || chunk_is_mmapped(mem2chunk(victim)) ||
 	 ar_ptr == arena_for_chunk(mem2chunk(victim)));
+#ifndef __CHKP__
   return victim;
+#else
+  return __bnd_narrow_ptr_bounds (victim, victim, bytes);
+#endif
 }
 libc_hidden_def(__libc_malloc)
 
@@ -2951,7 +3028,12 @@ __libc_realloc(void* oldmem, size_t bytes)
     if(newp) return chunk2mem(newp);
 #endif
     /* Note the extra SIZE_SZ overhead. */
-    if(oldsize - SIZE_SZ >= nb) return oldmem; /* do nothing */
+    if(oldsize - SIZE_SZ >= nb)
+#ifndef __CHKP__
+		 return oldmem; /* do nothing */
+#else
+		 return __bnd_narrow_ptr_bounds(oldmem, oldmem, bytes); /* do nothing */
+#endif
     /* Must alloc, copy, free. */
     newmem = __libc_malloc(bytes);
     if (newmem == 0) return 0; /* propagate failure */
@@ -2993,8 +3075,11 @@ __libc_realloc(void* oldmem, size_t bytes)
 	  _int_free(ar_ptr, oldp, 0);
 	}
     }
-
+#ifndef __CHKP__
   return newp;
+#else
+  return __bnd_narrow_ptr_bounds(newp, newp, bytes);
+#endif
 }
 libc_hidden_def (__libc_realloc)
 
@@ -3029,7 +3114,11 @@ __libc_memalign(size_t alignment, size_t bytes)
     (void)mutex_unlock(&ar_ptr->mutex);
   assert(!p || chunk_is_mmapped(mem2chunk(p)) ||
 	 ar_ptr == arena_for_chunk(mem2chunk(p)));
+#ifndef __CHKP__
   return p;
+#else
+  return __bnd_narrow_ptr_bounds(p, p, bytes);
+#endif
 }
 /* For ISO C11.  */
 weak_alias (__libc_memalign, aligned_alloc)
@@ -3065,8 +3154,11 @@ __libc_valloc(size_t bytes)
     (void)mutex_unlock (&ar_ptr->mutex);
   assert(!p || chunk_is_mmapped(mem2chunk(p)) ||
 	 ar_ptr == arena_for_chunk(mem2chunk(p)));
-
+#ifndef __CHKP__
   return p;
+#else
+  return __bnd_narrow_ptr_bounds(p, p, bytes);
+#endif
 }
 
 void*
@@ -3100,7 +3192,11 @@ __libc_pvalloc(size_t bytes)
   assert(!p || chunk_is_mmapped(mem2chunk(p)) ||
 	 ar_ptr == arena_for_chunk(mem2chunk(p)));
 
+#ifndef __CHKP__
   return p;
+#else
+  return __bnd_narrow_ptr_bounds(p, p, bytes);
+#endif
 }
 
 void*
@@ -3132,6 +3228,9 @@ __libc_calloc(size_t n, size_t elem_size)
     mem = (*hook)(sz, RETURN_ADDRESS (0));
     if(mem == 0)
       return 0;
+#ifdef __CHKP__
+    mem = __bnd_narrow_ptr_bounds(mem, mem, sz);
+#endif
     return memset(mem, 0, sz);
   }
 
@@ -3145,7 +3244,12 @@ __libc_calloc(size_t n, size_t elem_size)
      need to clear. */
 #if MORECORE_CLEARS
   oldtop = top(av);
-  oldtopsize = chunksize(top(av));
+  if (oldtop == initial_top(av))
+  {
+	  oldtopsize = 0;
+  } else {
+     oldtopsize = chunksize(top(av));
+  }
 #if MORECORE_CLEARS < 2
   /* Only newly allocated memory is guaranteed to be cleared.  */
   if (av == &main_arena &&
@@ -3179,6 +3283,9 @@ __libc_calloc(size_t n, size_t elem_size)
   /* Two optional cases in which clearing not necessary */
   if (chunk_is_mmapped (p))
     {
+#ifdef __CHKP__
+      mem =  __bnd_narrow_ptr_bounds(mem, mem, sz);
+#endif
       if (__builtin_expect (perturb_byte, 0))
 	MALLOC_ZERO (mem, sz);
       return mem;
@@ -3221,8 +3328,11 @@ __libc_calloc(size_t n, size_t elem_size)
       }
     }
   }
-
+#ifndef __CHKP__
   return mem;
+#else
+  return __bnd_narrow_ptr_bounds(mem, mem, sz);
+#endif
 }
 
 /*
@@ -3676,7 +3786,11 @@ _int_malloc(mstate av, size_t bytes)
     */
 
     victim = av->top;
-    size = chunksize(victim);
+    if (victim == initial_top(av)) {
+       size = 0;
+    } else {
+       size = chunksize(victim);
+    }
 
     if ((unsigned long)(size) >= (unsigned long)(nb + MINSIZE)) {
       remainder_size = size - nb;
@@ -4051,6 +4165,9 @@ static void malloc_consolidate(mstate av)
       p = atomic_exchange_acq (fb, 0);
       if (p != 0) {
 	do {
+#ifdef __CHKP__
+	  p = __bnd_set_ptr_bounds(p, sizeof (struct malloc_chunk));
+#endif
 	  check_inuse_chunk(av, p);
 	  nextp = p->fd;
 
@@ -4336,8 +4453,8 @@ _int_memalign(mstate av, size_t alignment, size_t bytes)
 
     /* For mmapped chunks, just adjust offset */
     if (chunk_is_mmapped(p)) {
-      newp->prev_size = p->prev_size + leadsize;
       set_head(newp, newsize|IS_MMAPPED);
+      newp->prev_size = p->prev_size + leadsize;
       return chunk2mem(newp);
     }
 
@@ -4350,7 +4467,7 @@ _int_memalign(mstate av, size_t alignment, size_t bytes)
     p = newp;
 
     assert (newsize >= nb &&
-	    (((unsigned long)(chunk2mem(p))) % alignment) == 0);
+	    (((unsigned long)((char *)p + 2 * SIZE_SZ) % alignment) == 0));
   }
 
   /* Also give back spare room at the end */
@@ -4430,7 +4547,7 @@ static int mtrim(mstate av, size_t pad)
 						+ sizeof (struct malloc_chunk)
 						+ psm1) & ~psm1);
 
-		assert ((char *) chunk2mem (p) + 4 * SIZE_SZ <= paligned_mem);
+      assert ((char *) (p) + 6 * SIZE_SZ <= paligned_mem);
 		assert ((char *) p + size > paligned_mem);
 
 		/* This is the size we could potentially free.  */
@@ -4932,7 +5049,6 @@ __posix_memalign (void **memptr, size_t alignment, size_t size)
 }
 weak_alias (__posix_memalign, posix_memalign)
 
-
 int
 malloc_info (int options, FILE *fp)
 {
@@ -5121,7 +5237,6 @@ malloc_info (int options, FILE *fp)
   return 0;
 }
 
-
 strong_alias (__libc_calloc, __calloc) weak_alias (__libc_calloc, calloc)
 strong_alias (__libc_free, __cfree) weak_alias (__libc_free, cfree)
 strong_alias (__libc_free, __free) strong_alias (__libc_free, free)
diff --git a/malloc/obstack.h b/malloc/obstack.h
index d2e056b..5e4b5b9 100644
--- a/malloc/obstack.h
+++ b/malloc/obstack.h
@@ -143,7 +143,7 @@ struct _obstack_chunk		/* Lives at front of each chunk. */
 {
   char  *limit;			/* 1 past end of this chunk */
   struct _obstack_chunk *prev;	/* address of prior chunk or NULL */
-  char	contents[4];		/* objects begin here */
+  char	contents[4] __attribute__((bnd_variable_size));		/* objects begin here */
 };
 
 struct obstack		/* control current object in current chunk */
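
The allocator-side rule the patch applies throughout: internal chunk
pointers get header-sized bounds (see the chunk2mem/mem2chunk rewrites
above), while every pointer handed back to the application is narrowed to
exactly the requested size, so overrunning a malloc'ed block traps even
though the underlying chunk is larger.  A condensed model of the
user-facing half (malloc_bounded is illustrative; the real code narrows
victim inside __libc_malloc as shown above):

#include <stdlib.h>

static void *
malloc_bounded (size_t bytes)
{
  void *mem = malloc (bytes);   /* stands in for _int_malloc */
#ifdef __CHKP__
  if (mem != NULL)
    /* Shrink the bounds to the request: [mem, mem + bytes).  */
    mem = __bnd_narrow_ptr_bounds (mem, mem, bytes);
#endif
  return mem;
}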

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources

