--- ./camstream/video/video_asm.S.old 2004-04-17 18:22:08.977932552 +0100 +++ ./camstream/video/video_asm.S 2004-04-17 18:23:37.614457752 +0100 @@ -17,52 +17,52 @@ ENTRY(calc_diff128) enter $0, $0 - push %ebx - push %esi - push %edi - - mov Number, %ecx # number of pixels - mov Dst, %edi # destination - mov Src1, %esi # src_new - mov Src2, %ebx # src_old + push %rbx + push %rsi + push %rdi + + mov Number, %rcx # number of pixels + mov Dst, %rdi # destination + mov Src1, %rsi # src_new + mov Src2, %rbx # src_old 0: lodsb # load byte - mov (%ebx), %ah # load second byte + mov (%rbx), %ah # load second byte shr $1, %al # /2 by shift-right both values shr $1, %ah sub %ah, %al # substract old value add $128, %al # add virtual 0-point stosb - inc %ebx + inc %rbx loop 0b -9: pop %edi - pop %esi - pop %ebx +9: pop %rdi + pop %rsi + pop %rbx leave ret ENTRY(calc_diff128_mmx) enter $0, $0 - push %ebx - push %esi - push %edi - - mov Number, %ecx # number of pixels - mov Dst, %edi # destination - mov Src1, %esi # src_new - mov Src2, %ebx # src_old + push %rbx + push %rsi + push %rdi + + mov Number, %rcx # number of pixels + mov Dst, %rdi # destination + mov Src1, %rsi # src_new + mov Src2, %rbx # src_old - shr $2, %ecx # MMX uses 4 bytes in a row; too bad MMX + shr $2, %rcx # MMX uses 4 bytes in a row; too bad MMX # doesnt have carry-bits or shift-right byte, # or we could have grabbed 8 bytes in a row movq mm_128w, %mm7 -0: movd (%esi), %mm0 # 00 00 00 00 b3 b2 b1 b0 +0: movd (%rsi), %mm0 # 00 00 00 00 b3 b2 b1 b0 punpcklbw mm_0, %mm0 # 00 b3 00 b2 00 b1 00 b0 - movd (%ebx), %mm1 # 00 00 00 00 c3 c2 c1 c0 + movd (%rbx), %mm1 # 00 00 00 00 c3 c2 c1 c0 punpcklbw mm_0, %mm1 # 00 c3 00 c2 00 c1 00 c0 psraw $1, %mm0 # shift, then substract; this eliminates a @@ -71,17 +71,17 @@ psubw %mm1, %mm0 # subtract paddw %mm7, %mm0 # add virtual 0-point packuswb mm_0, %mm0 # pack, unsigned saturation 00 00 00 00 d3 d2 d1 d0 - movd %mm0, (%edi) # store + movd %mm0, (%rdi) # store - add $4, %ebx - add $4, %esi - add $4, %edi + add $4, %rbx + add $4, %rsi + add $4, %rdi loop 0b 9: emms # empty MMX state - pop %edi - pop %esi - pop %ebx + pop %rdi + pop %rsi + pop %rbx leave ret @@ -89,24 +89,24 @@ # calc_intg: perform Integrator step, that is, add a buffer to another ENTRY(calc_intg128) enter $0, $0 - push %ebx - push %esi - push %edi - - mov 8(%ebp), %ecx # number of pixels - mov 12(%ebp), %edi # destination - mov 16(%ebp), %esi # 2nd buffer + push %rbx + push %rsi + push %rdi + + mov 8(%rbp), %rcx # number of pixels + mov 12(%rbp), %rdi # destination + mov 16(%rbp), %rsi # 2nd buffer 0: lodsb sub $128, %al # s - 128 shl $1, %al # *2 - add %al, (%edi) # add to destination - inc %edi + add %al, (%rdi) # add to destination + inc %rdi loop 0b -9: pop %edi - pop %esi - pop %ebx +9: pop %rdi + pop %rsi + pop %rbx leave ret @@ -116,36 +116,36 @@ # Perform Integrator step with MMX instructions ENTRY(calc_intg128_smx) enter $0, $0 - push %ebx - push %esi - push %edi - - mov 8(%ebp), %ecx # number of pixels - mov 12(%ebp), %edi # destination - mov 16(%ebp), %esi # 2nd buffer - shr $2, %ecx # divide by 4 (again, we must use word operands) + push %rbx + push %rsi + push %rdi + + mov 8(%rbp), %rcx # number of pixels + mov 12(%rbp), %rdi # destination + mov 16(%rbp), %rsi # 2nd buffer + shr $2, %rcx # divide by 4 (again, we must use word operands) movq mm_128w, %mm7 # load constant # The following instructions are hopefully mixed # to optimize execution -0: movd (%edi), %mm0 # 00 00 00 00 b3 b2 b1 b0 +0: movd (%rdi), %mm0 # 00 00 00 00 b3 b2 b1 b0 punpcklbw mm_0, %mm0 # 00 b3 00 b2 00 b1 00 b0 - movd (%esi), %mm1 # 00 00 00 00 c3 c2 c1 c0 + movd (%rsi), %mm1 # 00 00 00 00 c3 c2 c1 c0 punpcklbw mm_0, %mm1 # 00 c3 00 c2 00 c1 00 c0 - add $4, %esi # keep CPU busy + add $4, %rsi # keep CPU busy psubw %mm7, %mm1 # src - 128 psllw $1, %mm1 # * 2 paddw %mm1, %mm0 # dst += src packuswb mm_0, %mm0 # pack, unsigned saturation 00 00 00 00 d3 d2 d1 d0 - movd %mm0, (%edi) # store back - add $4, %edi + movd %mm0, (%rdi) # store back + add $4, %rdi loop 0b 9: emms # clear MMX state - pop %edi - pop %esi - pop %ebx + pop %rdi + pop %rsi + pop %rbx leave ret --- ./camstream/video/video_asm_add.S.old 2004-04-17 18:22:17.121694512 +0100 +++ ./camstream/video/video_asm_add.S 2004-04-17 18:23:54.272925280 +0100 @@ -4,102 +4,102 @@ ENTRY(calc_add) enter $0, $0 - push %ebx - push %esi - push %edi - - mov Number, %ecx - mov Dst, %edi - mov Src1, %esi + push %rbx + push %rsi + push %rdi + + mov Number, %rcx + mov Dst, %rdi + mov Src1, %rsi 0: lodsb - add %al, (%edi) - inc %edi + add %al, (%rdi) + inc %rdi loop 0b -9: pop %edi - pop %esi - pop %ebx +9: pop %rdi + pop %rsi + pop %rbx leave ret ENTRY(calc_add_mmx) enter $0, $0 - push %ebx - push %esi - push %edi - - mov Number, %ecx - mov Dst, %edi - mov Src1, %esi + push %rbx + push %rsi + push %rdi + + mov Number, %rcx + mov Dst, %rdi + mov Src1, %rsi - shr $3, %ecx # Divide by 8 (!) + shr $3, %rcx # Divide by 8 (!) -0: movq (%esi), %mm0 - add $8, %esi - movq (%edi), %mm1 +0: movq (%rsi), %mm0 + add $8, %rsi + movq (%rdi), %mm1 paddb %mm1, %mm0 # add, with overflow - movq %mm0, (%edi) - add $8, %edi + movq %mm0, (%rdi) + add $8, %rdi loop 0 -9: pop %edi - pop %esi - pop %ebx +9: pop %rdi + pop %rsi + pop %rbx leave ret ENTRY(calc_add_mmxs) enter $0, $0 - push %ebx - push %esi - push %edi - - mov Number, %ecx - mov Dst, %edi - mov Src1, %esi + push %rbx + push %rsi + push %rdi + + mov Number, %rcx + mov Dst, %rdi + mov Src1, %rsi - shr $3, %ecx # Divide by 8 (!) + shr $3, %rcx # Divide by 8 (!) -0: movq (%esi), %mm0 - add $8, %esi - movq (%edi), %mm1 +0: movq (%rsi), %mm0 + add $8, %rsi + movq (%rdi), %mm1 paddsb %mm1, %mm0 # add, with saturation - movq %mm0, (%edi) - add $8, %edi + movq %mm0, (%rdi) + add $8, %rdi loop 0 -9: pop %edi - pop %esi - pop %ebx +9: pop %rdi + pop %rsi + pop %rbx leave ret ENTRY(calc_add128_mmx) enter $0, $0 - push %ebx - push %esi - push %edi - - mov Number, %ecx - mov Dst, %edi - mov Src1, %esi + push %rbx + push %rsi + push %rdi + + mov Number, %rcx + mov Dst, %rdi + mov Src1, %rsi - shr $3, %ecx # Divide by 8 (!) + shr $3, %rcx # Divide by 8 (!) movq mm_128b, %mm7 # load constant -0: movq (%esi), %mm0 - add $8, %esi +0: movq (%rsi), %mm0 + add $8, %rsi psubb %mm7, %mm0 # compensate null-point - movq (%edi), %mm1 + movq (%rdi), %mm1 paddb %mm1, %mm0 # add, with overflow - movq %mm0, (%edi) - add $8, %edi + movq %mm0, (%rdi) + add $8, %rdi loop 0 -9: pop %edi - pop %esi - pop %ebx +9: pop %rdi + pop %rsi + pop %rbx leave ret --- ./camstream/video/video_def.h.old 2004-04-17 18:22:25.357442488 +0100 +++ ./camstream/video/video_def.h 2004-04-17 18:24:14.037920544 +0100 @@ -1,8 +1,8 @@ #define __ASSEMBLY__ #include -#define Number 8(%ebp) -#define Dst 12(%ebp) -#define Src1 16(%ebp) -#define Src2 20(%ebp) +#define Number 8(%rbp) +#define Dst 12(%rbp) +#define Src1 16(%rbp) +#define Src2 20(%rbp)