This is the mail archive of the binutils@sourceware.cygnus.com mailing list for the binutils project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

[PATCH] bfd sparc64 fixes and tail call optimizations


Hi!

Attached are three patches. The first one was posted two months ago to this
list but is still not committed; the latter two are new.
The latter two implement tail call optimization in gas/ld for sparc:

The usual way of doing a tail call on SPARC is one of:

call foo
restore ...

or

mov %o7, %g1
call foo
mov %g1, %o7

or

sethi %hi(foo), %g1
jmpl %g1 + %lo(foo), %g0
...

I have no idea how I can safely optimize the last one (perhaps I'll just
teach gcc to always emit the 2nd one), but for the first two, the call can
be replaced by a branch-always (ba) instruction. The advantage of doing so
is that it avoids clobbering the return address stack (RAS) on Ultra
(because of the tail call, everything from that point on would be
mispredicted by the RAS), and I believe that even on other CPUs ba is more
pairable than call.
The optimization can be done if:

- the call's destination is close (at most +-8M, the usual case), and
- the call's delay slot is either a restore instruction which does not use
  %o7 in rs1/rs2, or some other instruction which overwrites %o7 and does
  not use it.
For simplicity I only implemented the optimization for the case where the
delay slot holds a restore or
{add,and,or,xor,sub,andn,orn,xnor}cc *,*,%o7
(and the delay slot obviously does not use %o7 in rs1/rs2).
The only program-visible difference between ba and call is that call sets
%o7 to %pc (unless someone reads instructions as data), so if we're sure the
instruction in the delay slot will kill that value, this optimization is
safe.

	Jakub
2000-03-09  Jakub Jelinek  <jakub@redhat.com>

	* elf64-sparc.c: Add ATTRIBUTE_UNUSED to unused function parameters.
	Remove unused variables.
	(sparc64_elf_relocate_section): Change r_symndx type to unsigned long.
	(sparc64_elf_merge_private_bfd_data): Fix the shared library case of
	the previous fix, so that shared libs really don't influence the
	target's extension mask and memory model.

--- bfd/elf64-sparc.c.jj	Thu Mar  9 10:02:10 2000
+++ bfd/elf64-sparc.c	Thu Mar  9 12:01:37 2000
@@ -212,7 +212,7 @@ static CONST struct elf_reloc_map sparc_
 
 static reloc_howto_type *
 sparc64_elf_reloc_type_lookup (abfd, code)
-     bfd *abfd;
+     bfd *abfd ATTRIBUTE_UNUSED;
      bfd_reloc_code_real_type code;
 {
   unsigned int i;
@@ -226,7 +226,7 @@ sparc64_elf_reloc_type_lookup (abfd, cod
 
 static void
 sparc64_elf_info_to_howto (abfd, cache_ptr, dst)
-     bfd *abfd;
+     bfd *abfd ATTRIBUTE_UNUSED;
      arelent *cache_ptr;
      Elf64_Internal_Rela *dst;
 {
@@ -240,7 +240,7 @@ sparc64_elf_info_to_howto (abfd, cache_p
    
 static long
 sparc64_elf_get_reloc_upper_bound (abfd, sec)
-     bfd *abfd;
+     bfd *abfd ATTRIBUTE_UNUSED;
      asection *sec;
 {
   return (sec->reloc_count * 2 + 1) * sizeof (arelent *);
@@ -266,7 +266,6 @@ sparc64_elf_slurp_one_reloc_table (abfd,
      asymbol **symbols;
      boolean dynamic;
 {
-  struct elf_backend_data * const ebd = get_elf_backend_data (abfd);
   PTR allocated = NULL;
   bfd_byte *native_relocs;
   arelent *relent;
@@ -501,7 +500,6 @@ sparc64_elf_write_relocs (abfd, sec, dat
   for (idx = 0; idx < sec->reloc_count; idx++)
     {
       bfd_vma addr;
-      unsigned int i;
 
       ++count;
 
@@ -723,13 +721,13 @@ sparc_elf_notsup_reloc (abfd,
 			input_section,
 			output_bfd,
 			error_message)
-     bfd *abfd;
-     arelent *reloc_entry;
-     asymbol *symbol;
-     PTR data;
-     asection *input_section;
-     bfd *output_bfd;
-     char **error_message;
+     bfd *abfd ATTRIBUTE_UNUSED;
+     arelent *reloc_entry ATTRIBUTE_UNUSED;
+     asymbol *symbol ATTRIBUTE_UNUSED;
+     PTR data ATTRIBUTE_UNUSED;
+     asection *input_section ATTRIBUTE_UNUSED;
+     bfd *output_bfd ATTRIBUTE_UNUSED;
+     char **error_message ATTRIBUTE_UNUSED;
 {
   return bfd_reloc_notsupported;
 }
@@ -745,7 +743,7 @@ sparc_elf_wdisp16_reloc (abfd, reloc_ent
      PTR data;
      asection *input_section;
      bfd *output_bfd;
-     char **error_message;
+     char **error_message ATTRIBUTE_UNUSED;
 {
   bfd_vma relocation;
   bfd_vma insn;
@@ -783,7 +781,7 @@ sparc_elf_hix22_reloc (abfd,
      PTR data;
      asection *input_section;
      bfd *output_bfd;
-     char **error_message;
+     char **error_message ATTRIBUTE_UNUSED;
 {
   bfd_vma relocation;
   bfd_vma insn;
@@ -820,7 +818,7 @@ sparc_elf_lox10_reloc (abfd,
      PTR data;
      asection *input_section;
      bfd *output_bfd;
-     char **error_message;
+     char **error_message ATTRIBUTE_UNUSED;
 {
   bfd_vma relocation;
   bfd_vma insn;
@@ -1265,9 +1263,9 @@ sparc64_elf_add_symbol_hook (abfd, info,
      struct bfd_link_info *info;
      const Elf_Internal_Sym *sym;
      const char **namep;
-     flagword *flagsp;
-     asection **secp;
-     bfd_vma *valp;
+     flagword *flagsp ATTRIBUTE_UNUSED;
+     asection **secp ATTRIBUTE_UNUSED;
+     bfd_vma *valp ATTRIBUTE_UNUSED;
 {
   static char *stt_types[] = { "NOTYPE", "OBJECT", "FUNCTION" };
 
@@ -1387,7 +1385,7 @@ sparc64_elf_add_symbol_hook (abfd, info,
 
 static boolean
 sparc64_elf_output_arch_syms (output_bfd, info, finfo, func)
-     bfd *output_bfd;
+     bfd *output_bfd ATTRIBUTE_UNUSED;
      struct bfd_link_info *info;
      PTR finfo;
      boolean (*func) PARAMS ((PTR, const char *,
@@ -1460,7 +1458,7 @@ sparc64_elf_get_symbol_type (elf_sym, ty
 
 static void
 sparc64_elf_symbol_processing (abfd, asym)
-     bfd *abfd;
+     bfd *abfd ATTRIBUTE_UNUSED;
      asymbol *asym;
 {
   elf_symbol_type *elfsym;
@@ -1898,7 +1896,7 @@ sparc64_elf_relocate_section (output_bfd
     {
       int r_type;
       reloc_howto_type *howto;
-      long r_symndx;
+      unsigned long r_symndx;
       struct elf_link_hash_entry *h;
       Elf_Internal_Sym *sym;
       asection *sec;
@@ -2806,8 +2804,8 @@ sparc64_elf_merge_private_bfd_data (ibfd
 	  /* We don't want dynamic objects memory ordering and
 	     architecture to have any role. That's what dynamic linker
 	     should do.  */
-	  old_flags &= ~(EF_SPARCV9_MM | EF_SPARC_SUN_US1 | EF_SPARC_HAL_R1);
-	  old_flags |= (new_flags
+	  new_flags &= ~(EF_SPARCV9_MM | EF_SPARC_SUN_US1 | EF_SPARC_HAL_R1);
+	  new_flags |= (old_flags
 			& (EF_SPARCV9_MM
 			   | EF_SPARC_SUN_US1
 			   | EF_SPARC_HAL_R1));
@@ -2860,7 +2858,7 @@ sparc64_elf_merge_private_bfd_data (ibfd
 
 static const char *
 sparc64_elf_print_symbol_all (abfd, filep, symbol)
-     bfd *abfd;
+     bfd *abfd ATTRIBUTE_UNUSED;
      PTR filep;
      asymbol *symbol;
 {
2000-05-18  Jakub Jelinek  <jakub@redhat.com>

	* elf64-sparc.c (sparc64_elf_relocate_section): Optimize tail call
	into branch always if possible.
	* elf32-sparc.c (elf32_sparc_relocate_section): Likewise.

--- bfd/elf64-sparc.c.jj	Thu Mar  9 12:01:37 2000
+++ bfd/elf64-sparc.c	Thu May 18 13:40:46 2000
@@ -2391,6 +2391,8 @@ sparc64_elf_relocate_section (output_bfd
 	  relocation = (splt->output_section->vma
 			+ splt->output_offset
 			+ sparc64_elf_plt_entry_offset (h->plt.offset));
+	  if (r_type == R_SPARC_WPLT30)
+	    goto do_wplt30;
 	  goto do_default;
 
 	case R_SPARC_OLO10:
@@ -2465,6 +2467,53 @@ sparc64_elf_relocate_section (output_bfd
 	    r = bfd_reloc_ok;
 	  }
 	  break;
+
+	case R_SPARC_WDISP30:
+	do_wplt30:
+	  if (rel->r_offset + 4 < input_section->_raw_size)
+	    {
+	      bfd_vma x, y;
+
+	      /* If the instruction is a call with either:
+		 restore
+		 arithmetic instruction with rd == %o7
+		 where rs1 != %o7 and rs2 if it is register != %o7
+		 then we can optimize if the call destination is near
+		 by changing the call into a branch always.  */
+	      x = bfd_get_32 (input_bfd, contents + rel->r_offset);
+	      y = bfd_get_32 (input_bfd, contents + rel->r_offset + 4);
+	      if (((x >> 30) & 3) == 1 && ((y >> 30) & 3) == 2)
+		{
+		  if ((((y >> 19) & 0x3f) == 0x3d /* restore */
+		       || ((y & (0x28 << 19)) == 0 /* arithmetic */
+			   && ((y >> 25) & 0x1f) == 15))
+		      && ((y >> 14) & 0x1f) != 15
+		      && ((y & (1 << 13))
+			  || (y & 0x1f) != 15))
+		    {
+		      bfd_vma reloc;
+
+		      reloc = relocation + rel->r_addend - rel->r_offset;
+		      reloc -= (input_section->output_section->vma
+				+ input_section->output_offset);
+		      if (reloc & 3)
+			goto do_default;
+		      if ((reloc & ~(bfd_vma)0x7fffff)
+			   && ((reloc | 0x7fffff) != MINUS_ONE))
+			goto do_default;
+		      reloc >>= 2;
+		      if ((reloc & 0x3c0000) == 0
+			  || (reloc & 0x3c0000) == 0x3c0000)
+			x = 0x10680000 | (reloc & 0x7ffff); /* ba,pt %xcc */
+		      else
+			x = 0x10800000 | (reloc & 0x3fffff); /* ba */
+		      bfd_put_32 (input_bfd, x, contents + rel->r_offset);
+		      r = bfd_reloc_ok;
+		      break;
+		    }
+		}
+	    }
+	  /* FALLTHROUGH */
 
 	default:
 	do_default:
--- bfd/elf32-sparc.c.jj	Thu Mar  9 10:02:10 2000
+++ bfd/elf32-sparc.c	Thu May 18 14:04:52 2000
@@ -1515,6 +1515,7 @@ elf32_sparc_relocate_section (output_bfd
 	  break;
 	}
 
+      r = bfd_reloc_continue;
       if (r_type == R_SPARC_WDISP16)
 	{
 	  bfd_vma x;
@@ -1546,7 +1547,55 @@ elf32_sparc_relocate_section (output_bfd
 	  bfd_putl32 (/*input_bfd,*/ x, contents + rel->r_offset);
 	  r = bfd_reloc_ok;
 	}
-      else
+      else if (r_type == R_SPARC_WDISP30 || r_type == R_SPARC_WPLT30)
+	{
+	  if (rel->r_offset + 4 < input_section->_raw_size)
+	    {
+	      bfd_vma x, y;
+
+	      /* If the instruction is a call with either:
+		 restore
+		 arithmetic instruction with rd == %o7
+		 where rs1 != %o7 and rs2 if it is register != %o7
+		 then we can optimize if the call destination is near
+		 by changing the call into a branch always.  */
+	      x = bfd_get_32 (input_bfd, contents + rel->r_offset);
+	      y = bfd_get_32 (input_bfd, contents + rel->r_offset + 4);
+	      if (((x >> 30) & 3) == 1 && ((y >> 30) & 3) == 2)
+		{
+		  if ((((y >> 19) & 0x3f) == 0x3d /* restore */
+		       || ((y & (0x28 << 19)) == 0 /* arithmetic */
+			   && ((y >> 25) & 0x1f) == 15))
+		      && ((y >> 14) & 0x1f) != 15
+		      && ((y & (1 << 13))
+			  || (y & 0x1f) != 15))
+		    {
+		      bfd_vma reloc;
+
+		      reloc = relocation + rel->r_addend - rel->r_offset;
+		      reloc -= (input_section->output_section->vma
+				+ input_section->output_offset);
+		      if (reloc & 3)
+			goto do_default;
+		      if ((reloc & ~(bfd_vma)0x7fffff)
+			   && ((reloc | 0x7fffff) != ~(bfd_vma)0))
+			goto do_default;
+		      reloc >>= 2;
+		      if (((reloc & 0x3c0000) == 0
+			   || (reloc & 0x3c0000) == 0x3c0000)
+			  && (elf_elfheader (output_bfd)->e_flags & EF_SPARC_32PLUS))
+			x = 0x10680000 | (reloc & 0x7ffff); /* ba,pt %xcc */
+		      else
+			x = 0x10800000 | (reloc & 0x3fffff); /* ba */
+		      bfd_put_32 (input_bfd, x, contents + rel->r_offset);
+		      r = bfd_reloc_ok;
+		      break;
+		    }
+		}
+	    }
+	
+	}
+      if (r == bfd_reloc_continue)
 	r = _bfd_final_link_relocate (howto, input_bfd, input_section,
 				      contents, rel->r_offset,
 				      relocation, rel->r_addend);
2000-05-18  Jakub Jelinek  <jakub@redhat.com>

	* config/tc-sparc.c (md_apply_fix3): Optimize tail call into branch
	always if possible.

--- gas/config/tc-sparc.c.jj	Thu May 18 13:13:53 2000
+++ gas/config/tc-sparc.c	Thu May 18 15:20:00 2000
@@ -2915,7 +2915,49 @@ md_apply_fix3 (fixP, value, segment)
 	      || fixP->fx_addsy == NULL
 	      || symbol_section_p (fixP->fx_addsy))
 	    ++val;
+
 	  insn |= val & 0x3fffffff;
+
+	  /* See if we have a delay slot */
+	  if (fixP->fx_where + 8 <= fixP->fx_frag->fr_fix)
+	    {
+	      long delay;
+
+	      /* If the instruction is a call with either:
+		 restore
+		 arithmetic instruction with rd == %o7
+		 where rs1 != %o7 and rs2 if it is register != %o7
+		 then we can optimize if the call destination is near
+		 by changing the call into a branch always.  */
+	      if (INSN_BIG_ENDIAN)
+		delay = bfd_getb32 ((unsigned char *) buf + 4);
+	      else
+		delay = bfd_getl32 ((unsigned char *) buf + 4);
+	      if (((insn >> 30) & 3) == 1 && ((delay >> 30) & 3) == 2)
+		{
+		  if ((((delay >> 19) & 0x3f) == 0x3d /* restore */
+		       || ((delay & (0x28 << 19)) == 0 /* arithmetic */
+			   && ((delay >> 25) & 0x1f) == 15))
+		      && ((delay >> 14) & 0x1f) != 15
+		      && ((delay & (1 << 13))
+			  || (delay & 0x1f) != 15))
+		    {
+		      if ((val & 0x3fe00000)
+			   && (val & 0x3fe00000) != 0x3fe00000)
+			break;
+		      if (((val & 0x3c0000) == 0
+			   || (val & 0x3c0000) == 0x3c0000)
+			  && (sparc_arch_size == 64
+			      || current_architecture >= SPARC_OPCODE_ARCH_V9))
+			/* ba,pt %xcc */
+			insn = 0x10680000 | (val & 0x7ffff);
+		      else
+			/* ba */
+			insn = 0x10800000 | (val & 0x3fffff);
+		      break;
+		    }
+		}
+	    }
 	  break;
 
 	case BFD_RELOC_SPARC_11:

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]