Closed Bug 514195 Opened 15 years ago Closed 14 years ago

sunspider perf regression on regex-dna

Categories

(Core :: JavaScript Engine, defect)

x86
macOS
defect
Not set
normal

Tracking

()

RESOLVED WORKSFORME

People

(Reporter: sayrer, Assigned: gal)

References

Details

Attachments

(1 file, 2 obsolete files)

on my machine, a 08/24 TM nightly vs. a 08/28 TM nightly:

                               08/28               08/24

dna:       1.23x as fast       72.6ms +/- 2.6%     58.8ms +/- 6.4%
Attached image tinderbox Windows XP
I believe the second, larger spike is this bug. I filed bug 514200 on the smaller one, where the results first rise above 60.
the date in the screenshot is the approximate time this regressed. 08/27 at 1pm or so.
My tests show it happened at this revision:

changeset:   32072:1e581a31c017
user:        Andreas Gal <gal@mozilla.com>
date:        Thu Aug 27 18:46:45 2009 -0700
summary:     Remove explicitSavedRegs and loop hacks from nanojit (513139, r=dvander).
Note: this may be the same bug as bug 514200, and that one is probably easier to debug.
32071:

==12174== I   refs:      174,915,964
==12174== D   refs:       59,951,168  (40,371,968 rd   + 19,579,200 wr)

32072:

==12176== I   refs:      195,716,432
==12176== D   refs:       80,532,539  (61,355,057 rd   + 19,177,482 wr)

21 million extra insns, and 21 million extra reads.  Looks like time
for the "find the extra load in a tight loop" game.
(In reply to comment #6)
> 21 million extra insns, and 21 million extra reads.  Looks like time
> for the "find the extra load in a tight loop" game.

That should be "extra load or loads".

They're in the generated code ("???:???"), unfortunately I have no more
details.

174,915,964  PROGRAM TOTALS

89,039,922  ???:???
18,478,139  /Users/macuser/MOZ/Rev32071/js/src/BuildR
            /../jsvector.h:js_StringReplaceHelper
 7,586,911  ???:ImageLoaderMachO::doRebase(ImageLoader::LinkContext const&)
 7,106,048  /Users/macuser/MOZ/Rev32071/js/src/BuildR
            /../jsscan.cpp:GetChar(JSTokenStream*)
 5,004,671  ???:lookup
 3,854,349  ???:strcmp
 3,340,686  ???:classHash

vs

195,716,432  PROGRAM TOTALS

109,410,763  ???:???
 18,478,139  /Users/macuser/MOZ/Rev32072/js/src/BuildR
             /../jsvector.h:js_StringReplaceHelper
  7,586,911  ???:ImageLoaderMachO::doRebase(ImageLoader::LinkContext const&)
  7,106,048  /Users/macuser/MOZ/Rev32072/js/src/BuildR
             /../jsscan.cpp:GetChar(JSTokenStream*)
  5,004,671  ???:lookup
  3,854,349  ???:strcmp
  3,340,686  ???:classHash
I filed a bug on this earlier:

  00002e1de6   [label3]               eax(state)
  ## merging registers (intersect) with existing edge
  00002e1de6   mov ebx,-4(ebp)         <= restore state
      sp = ld state[0]
  00002e1de9   mov esi,0(ebx)         ebx(state)
      rp = ld state[4]

We have state in eax, but we spill and reload it from memory. I think this is the cause.
Depends on: 514102
fixing bug 514102 fixes #8 and makes us a bit faster, but doesn't fix the regexp regression. Looking into that now.
Assignee: general → gal
It seems we are spilling ourselves to death in case of nested loops:

for (var i = 0; i < 100000; ++i)
    "bbbbb".match(/b/)

Original:

  00002dffad   [prologue]            
  00002dffad   push edi              
  00002dffae   push esi              
  00002dffaf   push ebx              
  00002dffb0   push ebp              
  00002dffb1   push ebp              
  00002dffb2   mov ebp,esp           
  00002dffb4   [frag entry]          
  00002dffb4   sub esp,8             
  ## patching branch at 00002dffe3 to 00002dffb7
  00002dffb7:
  ## compiling trunk 0x808ef4
      state = iparam 0 ecx
      cpend = iparam 1 edx
      pos = ld state[16]
  00002dffb7   mov ebx,16(ecx)        ecx(state) edx(cpend)
      jf le1 -> label1
  00002dffba   cmp ebx,edx            ecx(state) edx(cpend) ebx(pos)
  00002dffbc   jnle  0x2dffd9         ecx(state) edx(cpend) ebx(pos)
      jf lt1 -> label2
  00002dffbe   cmp ebx,edx            ecx(state) edx(cpend) ebx(pos)
  00002dffc0   jnl   0x2dffdd         ecx(state) edx(cpend) ebx(pos)
      ldcs1 = ldcs pos[0]
  00002dffc2   movsz esi,0(ebx)       ecx(state) edx(cpend) ebx(pos)
      jf eq1 -> label2
  00002dffc5   cmp esi,98             ecx(state) edx(cpend) ebx(pos) esi(ldcs1)
  00002dffc8   jne   0x2dffdd         ecx(state) edx(cpend) ebx(pos)
  00002dffca   mov eax,ecx            ecx(state) ebx(pos)
      add1 = add pos, 2
  00002dffcc   add ebx,2              eax(state) ebx(pos)
      sti state[28] = add1
  00002dffcf   mov 28(eax),ebx        eax(state) ebx(add1)
      1
  00002dffd2   mov eax,1             
      ret 1
  00002dffd7   jmp 0x2dffe8          
      label1:
  00002dffd9   [label1]              
      0
  00002dffd9   xor eax,eax           
      ret 0
  00002dffdb   jmp 0x2dffe8           ecx(state) edx(cpend) ebx(pos)
      label2:
  00002dffdd   [label2]               ecx(state) edx(cpend) ebx(pos)
      add2 = add pos, 2
  00002dffdd   add ebx,2              ecx(state) edx(cpend) ebx(pos)
      sti state[16] = add2
  00002dffe0   mov 16(ecx),ebx        ecx(state) edx(cpend) ebx(add2)
      loop
  00002dffe3:
  00002dffe3   jmp 0x0               
  00002dffe8   [epilogue]            
  00002dffe8   mov esp,ebp           
  00002dffea   pop ebp               
  00002dffeb   pop ebp               
  00002dffec   pop ebx               
  00002dffed   pop esi               
  00002dffee   pop edi               
  00002dffef   ret                   

New:

  00002e4f87   [prologue]            
  00002e4f87   push ebp              
  00002e4f88   mov ebp,esp           
  00002e4f8a   [frag entry]          
  00002e4f8a   sub esp,40            
  ## compiling trunk 0x808ef4
      ebx = iparam 0 ebx
  00002e4f8d   mov -12(ebp),ebx       ebx(ebx)  <= spill ebx
      esi = iparam 1 esi
  00002e4f90   mov -16(ebp),esi       esi(esi)  <= spill esi
      edi = iparam 2 edi
  00002e4f93   mov -20(ebp),edi       edi(edi)  <= spill edi
      state = iparam 0 ecx
  00002e4f96   mov -4(ebp),ecx        ecx(state)  <= spill state
  00002e4f99   mov eax,ecx            ecx(state)
      cpend = iparam 1 edx
  00002e4f9b   mov -8(ebp),edx        eax(state) edx(cpend)  <= spill cpend
  00002e4f9e   mov ecx,edx            eax(state) edx(cpend)
      label1:
  00002e4fa0   [label1]               eax(state) ecx(cpend)
  ## merging registers (intersect) with existing edge
  00002e4fa0   mov edi,-20(ebp)       eax(state) ecx(cpend)  <= restore edi
  00002e4fa3   mov esi,-16(ebp)       eax(state) ecx(cpend) edi(edi)  <= restore esi
  00002e4fa6   mov ebx,-12(ebp)       eax(state) ecx(cpend) esi(esi) edi(edi)  <= restore ebx
      pos = ld state[16]
  00002e4fa9   mov edx,16(eax)        eax(state) ecx(cpend) ebx(ebx) esi(esi) edi(edi)
  00002e4fac   mov -24(ebp),edx       eax(state) ecx(cpend) edx(pos) ebx(ebx) esi(esi) edi(edi)  <= spill pos
      jf le1 -> label2
  00002e4faf   cmp edx,ecx            eax(state) ecx(cpend) edx(pos) ebx(ebx) esi(esi) edi(edi)
  00002e4fb1   jnle  0x2e4fda         eax(state) ecx(cpend) ebx(ebx) esi(esi) edi(edi)
                                                merging registers (union) with existing edge
  00002e4fb3   mov ebx,-24(ebp)       eax(state) ecx(cpend) esi(esi) edi(edi)  <= restore pos
      jf lt1 -> label3
  00002e4fb6   cmp ebx,ecx            eax(state) ecx(cpend) ebx(pos) esi(esi) edi(edi)
  00002e4fb8   jnl   0x2e4fe0         eax(state) ecx(cpend) ebx(pos) esi(esi) edi(edi)
      ldcs1 = ldcs pos[0]
  00002e4fba   movsz edx,0(ebx)       eax(state) ecx(cpend) ebx(pos) esi(esi) edi(edi)
      jf eq1 -> label3
  00002e4fbd   cmp edx,98             eax(state) ecx(cpend) edx(ldcs1) ebx(pos) esi(esi) edi(edi)
  00002e4fc0   jne   0x2e4fe0         eax(state) ecx(cpend) ebx(pos) esi(esi) edi(edi)
                                                merging registers (union) with existing edge
  00002e4fc2   mov ebx,-12(ebp)       esi(esi) edi(edi)  <= restore ebx
  00002e4fc5   mov ecx,-4(ebp)        ebx(ebx) esi(esi) edi(edi)  <= restore state
  00002e4fc8   mov eax,-24(ebp)       ecx(state) ebx(ebx) esi(esi) edi(edi)  <= restore pos
      add1 = add pos, 2
  00002e4fcb   add eax,2              eax(pos) ecx(state) ebx(ebx) esi(esi) edi(edi)
      sti state[28] = add1
  00002e4fce   mov 28(ecx),eax        eax(add1) ecx(state) ebx(ebx) esi(esi) edi(edi)
      1 = int 1
  00002e4fd1   mov eax,1              ebx(ebx) esi(esi) edi(edi)
      ret 1
  00002e4fd6   mov esp,ebp            ebx(ebx) esi(esi) edi(edi)
  00002e4fd8   pop ebp                ebx(ebx) esi(esi) edi(edi)
  00002e4fd9   ret                    ebx(ebx) esi(esi) edi(edi)
      label2:
  00002e4fda   [label2]               ebx(ebx) esi(esi) edi(edi)
      0 = int 0
  00002e4fda   xor eax,eax            ebx(ebx) esi(esi) edi(edi)
      ret 0
  00002e4fdc   mov esp,ebp            eax(state) ecx(cpend) ebx(pos)
  00002e4fde   pop ebp                eax(state) ecx(cpend) ebx(pos)
  00002e4fdf   ret                    eax(state) ecx(cpend) ebx(pos)
      label3:
  00002e4fe0   [label3]               eax(state) ecx(cpend) ebx(pos)
      add2 = add pos, 2
  00002e4fe0   add ebx,2              eax(state) ecx(cpend) ebx(pos)
      sti state[16] = add2
  00002e4fe3   mov 16(eax),ebx        eax(state) ecx(cpend) ebx(add2)
      j -> label1
  00002e4fe6   jmp 0x0                eax(state) ecx(cpend)
      live state
      live cpend
      x1: x  -> pc=0x2000000 imacpc=0x844f48 sp+0 rp+301989888
  00002e4feb   jmp 0x2f4fd3          
  ----------------------------------- ## BEGIN exit block (LIR_xt|LIR_xf)
  00002f4fd3:
  ## merging registers (intersect) with existing edge
  00002f4fd3   mov edi,-20(ebp)        <= restore edi
  00002f4fd6   mov esi,-16(ebp)       edi(edi)  <= restore esi
  00002f4fd9   mov ebx,-12(ebp)       esi(esi) edi(edi)  <= restore ebx
  00002f4fdc   mov edx,-8(ebp)        ebx(ebx) esi(esi) edi(edi)  <= restore cpend
  00002f4fdf   mov ecx,-4(ebp)        edx(cpend) ebx(ebx) esi(esi) edi(edi)  <= restore state
  00002f4fe2   mov eax,8671016       
  00002f4fe7   mov esp,ebp           
  00002f4fe9:
  00002f4fe9   jmp 0x2f4fee          
  00002f4fee   pop ebp               
               ret                   
  ----------------------------------- ## END exit block 0x844f54
Depends on: 514344
Ok, two bugs here. The register allocation hint function was giving us wrong advise on where parameters and spilled register are, causing extra register to register moves. This was a minor effect though.

The main issue was caused by removing the explicitSavedRegs hack from the backend. That patch makes callee-saved registers (EBX, ESI, EDI) explicitly live in LIR. They are initially initialized to hold their own identities and if we decide to use them for something else, we save the original value as we spill. When we exit a fragment, we have to restore the original values of those registers.

What happened here is that the nested loops in our regular expressions contain return statements, which force EBX/ESI/EDI to be restored. Since we assemble bottom up, we hit the returns before we hit the loop edge, and since regexp code tends to have low register pressure (only 3 live values), the return doesn't have to spill. It just makes EBX/ESI/EDI live and expects those values there. At the loop edge we have to satisfy this dependency. Since we decided (bottom up) at the loop tail that we only want state and cpend to be live at the loop entry, EBX/ESI/EDI have to be restored from memory at every loop iteration. That costs.

So long story short, we are running into a bad local minimum in our **** register allocator. This is unlikely to happen for trace code because:

1) We don't use LIR_ret
2) We have usually more register pressure, so we don't make additional things live and EBX/ESI/EDI are spilled when we exit.

However, this doesn't mean the poor register allocation doesn't hurt traces. Trace exits (LIR_x*) are very similar to LIR_ret, and potentially also suffer from this. I will dig around a bit in a separate bug.
Attached patch patch (obsolete) — Splinter Review
This is the code with the patch:

  00002e4f85   [prologue]            
  00002e4f85   push ebp              
  00002e4f86   mov ebp,esp           
  00002e4f88   [frag entry]          
  00002e4f88   sub esp,24            
  ## compiling trunk 0x808ef4
      ebx = iparam 0 ebx
  00002e4f8b   mov -12(ebp),ebx       ebx(ebx)  <= spill ebx
      esi = iparam 1 esi
  00002e4f8e   mov -16(ebp),esi       esi(esi)  <= spill esi
      edi = iparam 2 edi
  00002e4f91   mov -20(ebp),edi       edi(edi)  <= spill edi
      state = iparam 0 ecx
  00002e4f94   mov -4(ebp),ecx        ecx(state)  <= spill state
      cpend = iparam 1 edx
  00002e4f97   mov -8(ebp),edx        ecx(state) edx(cpend)  <= spill cpend
      j -> label3
  00002e4f9a   jmp 0x2e4fbd           ecx(state) edx(cpend)
      label1:
  00002e4f9c   [label1]              
  ## merging registers (intersect) with existing edge
  00002e4f9c   mov edi,-20(ebp)        <= restore edi
  00002e4f9f   mov esi,-16(ebp)       edi(edi)  <= restore esi
  00002e4fa2   mov ebx,-12(ebp)       esi(esi) edi(edi)  <= restore ebx
      0 = int 0
  00002e4fa5   xor eax,eax            ebx(ebx) esi(esi) edi(edi)
      ret 0
  00002e4fa7   mov esp,ebp           
  00002e4fa9   pop ebp               
  00002e4faa   ret                   
      label2:
  00002e4fab   [label2]              
  ## merging registers (intersect) with existing edge
  00002e4fab   mov edi,-20(ebp)        <= restore edi
  00002e4fae   mov esi,-16(ebp)       edi(edi)  <= restore esi
  00002e4fb1   mov ebx,-12(ebp)       esi(esi) edi(edi)  <= restore ebx
      1 = int 1
  00002e4fb4   mov eax,1              ebx(ebx) esi(esi) edi(edi)
      ret 1
  00002e4fb9   mov esp,ebp            ecx(state) edx(cpend)
  00002e4fbb   pop ebp                ecx(state) edx(cpend)
  00002e4fbc   ret                    ecx(state) edx(cpend)
      label3:
  00002e4fbd   [label3]               ecx(state) edx(cpend)
      pos = ld state[16]
  00002e4fbd   mov ebx,16(ecx)        ecx(state) edx(cpend)
      jf le1 -> label4
  00002e4fc0   cmp ebx,edx            ecx(state) edx(cpend) ebx(pos)
  00002e4fc2   jnle  0x2e4fdb         ecx(state) edx(cpend) ebx(pos)
      jf lt1 -> label5
  00002e4fc4   cmp ebx,edx            ecx(state) edx(cpend) ebx(pos)
  00002e4fc6   jnl   0x2e4fe0         ecx(state) edx(cpend) ebx(pos)
      ldcs1 = ldcs pos[0]
  00002e4fc8   movsz esi,0(ebx)       ecx(state) edx(cpend) ebx(pos)
      jf eq1 -> label5
  00002e4fcb   cmp esi,98             ecx(state) edx(cpend) ebx(pos) esi(ldcs1)
  00002e4fce   jne   0x2e4fe0         ecx(state) edx(cpend) ebx(pos)
      add1 = add pos, 2
  00002e4fd0   add ebx,2              ecx(state) ebx(pos)
      sti state[28] = add1
  00002e4fd3   mov 28(ecx),ebx        ecx(state) ebx(add1)
      j -> label2
  00002e4fd6   jmp 0x0               
      label4:
  00002e4fdb   [label4]              
      j -> label1
  00002e4fdb   jmp 0x0               
      label5:
  00002e4fe0   [label5]               ecx(state) edx(cpend) ebx(pos)
      add2 = add pos, 2
  00002e4fe0   add ebx,2              ecx(state) edx(cpend) ebx(pos)
      sti state[16] = add2
  00002e4fe3   mov 16(ecx),ebx        ecx(state) edx(cpend) ebx(add2)
      j -> label3
  00002e4fe6   jmp 0x0                ecx(state) edx(cpend)
      live state
      live cpend
      x1: x  -> pc=0x2000000 imacpc=0x844f74 sp+0 rp+301989888
  00002e4feb   jmp 0x2f4fd3
Attached patch patch (obsolete) — Splinter Review
This is a slightly different approach. It avoids the gross control-flow rearrangement by introducing LIR_spill. LIR_spill is intended flow known slow-paths like exception handlers or fragment exits from within loops. LIR_spill forces all registers to be evicted. This causes all registers in the current path to be re-loaded, but helps any merges further upstream. This incidentally might also solve our control-flow diamond fast path problem (I will try this later).
Code generated with this patch:

  00002e4f91   [prologue]            
  00002e4f91   push ebp              
  00002e4f92   mov ebp,esp           
  00002e4f94   [frag entry]          
  00002e4f94   sub esp,24            
  ## compiling trunk 0x808ef4
      ebx = iparam 0 ebx
  00002e4f97   mov -12(ebp),ebx       ebx(ebx)  <= spill ebx
      esi = iparam 1 esi
  00002e4f9a   mov -16(ebp),esi       esi(esi)  <= spill esi
      edi = iparam 2 edi
  00002e4f9d   mov -20(ebp),edi       edi(edi)  <= spill edi
      state = iparam 0 ecx
  00002e4fa0   mov -4(ebp),ecx        ecx(state)  <= spill state
      cpend = iparam 1 edx
  00002e4fa3   mov -8(ebp),edx        ecx(state) edx(cpend)  <= spill cpend
      label1:
  00002e4fa6   [label1]               ecx(state) edx(cpend)
      pos = ld state[16]
  00002e4fa6   mov ebx,16(ecx)        ecx(state) edx(cpend)
      jf le1 -> label2
  00002e4fa9   cmp ebx,edx            ecx(state) edx(cpend) ebx(pos)
  00002e4fab   jnle  0x2e4fd1         ecx(state) edx(cpend) ebx(pos)
      jf lt1 -> label3
  00002e4fad   cmp ebx,edx            ecx(state) edx(cpend) ebx(pos)
  00002e4faf   jnl   0x2e4fe0         ecx(state) edx(cpend) ebx(pos)
      ldcs1 = ldcs pos[0]
  00002e4fb1   movsz esi,0(ebx)       ecx(state) edx(cpend) ebx(pos)
      jf eq1 -> label3
  00002e4fb4   cmp esi,98             ecx(state) edx(cpend) ebx(pos) esi(ldcs1)
  00002e4fb7   jne   0x2e4fe0         ecx(state) edx(cpend) ebx(pos)
      add1 = add pos, 2
  00002e4fb9   add ebx,2              ecx(state) ebx(pos)
      sti state[28] = add1
  00002e4fbc   mov 28(ecx),ebx        ecx(state) ebx(add1)
      spill
  00002e4fbf   mov edi,-20(ebp)        <= restore edi
  00002e4fc2   mov esi,-16(ebp)       edi(edi)  <= restore esi
  00002e4fc5   mov ebx,-12(ebp)       esi(esi) edi(edi)  <= restore ebx
      1 = int 1
  00002e4fc8   mov eax,1              ebx(ebx) esi(esi) edi(edi)
      ret 1
  00002e4fcd   mov esp,ebp           
  00002e4fcf   pop ebp               
  00002e4fd0   ret                   
      label2:
  00002e4fd1   [label2]              
      spill
  00002e4fd1   mov edi,-20(ebp)        <= restore edi
  00002e4fd4   mov esi,-16(ebp)       edi(edi)  <= restore esi
  00002e4fd7   mov ebx,-12(ebp)       esi(esi) edi(edi)  <= restore ebx
      0 = int 0
  00002e4fda   xor eax,eax            ebx(ebx) esi(esi) edi(edi)
      ret 0
  00002e4fdc   mov esp,ebp            ecx(state) edx(cpend) ebx(pos)
  00002e4fde   pop ebp                ecx(state) edx(cpend) ebx(pos)
  00002e4fdf   ret                    ecx(state) edx(cpend) ebx(pos)
      label3:
  00002e4fe0   [label3]               ecx(state) edx(cpend) ebx(pos)
      add2 = add pos, 2
  00002e4fe0   add ebx,2              ecx(state) edx(cpend) ebx(pos)
      sti state[16] = add2
  00002e4fe3   mov 16(ecx),ebx        ecx(state) edx(cpend) ebx(add2)
      j -> label1
  00002e4fe6   jmp 0x0                ecx(state) edx(cpend)
      live state
      live cpend
      x1: x  -> pc=0x2000000 imacpc=0x844f50 sp+0 rp+301989888
  00002e4feb   jmp 0x2f4fd3          
  ----------------------------------- ## BEGIN exit block (LIR_xt|LIR_xf)
  00002f4fd3:
  ## merging registers (intersect) with existing edge
  00002f4fd3   mov edi,-20(ebp)        <= restore edi
  00002f4fd6   mov esi,-16(ebp)       edi(edi)  <= restore esi
  00002f4fd9   mov ebx,-12(ebp)       esi(esi) edi(edi)  <= restore ebx
  00002f4fdc   mov edx,-8(ebp)        ebx(ebx) esi(esi) edi(edi)  <= restore cpend
  00002f4fdf   mov ecx,-4(ebp)        edx(cpend) ebx(ebx) esi(esi) edi(edi)  <= restore state
  00002f4fe2   mov eax,8671024       
  00002f4fe7   mov esp,ebp           
  00002f4fe9:
  00002f4fe9   jmp 0x2f4fee          
  00002f4fee   pop ebp               
               ret
Attachment #398324 - Flags: review?(rreitmai)
Attachment #398324 - Flags: review?(edwsmith)
Comment on attachment 398324 [details] [diff] [review]
patch

Ed, this solves the slow path spilling problem. If we emit LIR_spill immediately before a call in a slow path, it clears the register allocations in the slow path and the fast paths automatically wins.
Depends on: 514374
Attachment #398317 - Attachment is obsolete: true
Attachment #398324 - Attachment is obsolete: true
Attachment #398324 - Flags: review?(rreitmai)
Attachment #398324 - Flags: review?(edwsmith)
Comment on attachment 398324 [details] [diff] [review]
patch

Moved patch into a separate bug for easier tracking.
Do we still need any of this work, or did it end up in other bugs?  I think it ended up in other bugs, not that I wouldn't take another 20ms on regex-dna!
Status: NEW → RESOLVED
Closed: 14 years ago
Resolution: --- → WORKSFORME
You need to log in before you can comment on or make changes to this bug.

Attachment

General

Creator:
Created:
Updated:
Size: