This repository has been archived by the owner on Jan 18, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 24
/
sumsq_mmx_assist.s
83 lines (73 loc) · 1.66 KB
/
sumsq_mmx_assist.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# MMX assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)
.text
#-----------------------------------------------------------------------
# long long sumsq_mmx_assist(signed short *in,int cnt);
#
# Evaluate the sum of squares of signed 16-bit input samples, 64-bit result.
#
# ABI:   32-bit x86, arguments on the stack (8(%ebp) = in, 12(%ebp) = cnt)
# Out:   edx:eax = sum of in[i]*in[i] over every complete group of 8 samples
# Note:  samples are consumed 8 at a time; if cnt is not a multiple of 8
#        the trailing (cnt % 8) samples are NOT included, and cnt < 8
#        returns 0.  The caller has to add those in itself.
# Saves: esi, ebx (and ecx, which the original code also preserved)
# Uses:  mm0, mm6; executes emms before returning
#-----------------------------------------------------------------------
        .global sumsq_mmx_assist
        .type   sumsq_mmx_assist,@function
        .align  16
sumsq_mmx_assist:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %esi
        pushl   %ecx
        pushl   %ebx

        movl    8(%ebp),%esi            # esi = in
        movl    12(%ebp),%ecx           # ecx = cnt (samples remaining)
        xorl    %eax,%eax
        xorl    %edx,%edx               # edx:eax = 64-bit running sum = 0

        # Each pmaddwd lane holds the sum of two squares, at most
        # 2 * (-32768)**2 = 2**31, which still fits an unsigned 32-bit lane.
        # Adding two such lanes together can reach 2**32 and wrap to 0, so
        # the lanes are never combined with paddd; every lane is folded into
        # the 64-bit accumulator on its own, with carry.
.Lsumsq_loop:
        subl    $8,%ecx                 # is a full group of 8 samples left?
        jl      .Lsumsq_done
        movq    (%esi),%mm0             # S0 S1 S2 S3
        pmaddwd %mm0,%mm0               # (S0^2+S1^2) (S2^2+S3^2)
        movq    8(%esi),%mm6            # S4 S5 S6 S7
        pmaddwd %mm6,%mm6               # (S4^2+S5^2) (S6^2+S7^2)

        movd    %mm0,%ebx               # S0^2+S1^2
        addl    %ebx,%eax
        adcl    $0,%edx
        psrlq   $32,%mm0
        movd    %mm0,%ebx               # S2^2+S3^2
        addl    %ebx,%eax
        adcl    $0,%edx

        movd    %mm6,%ebx               # S4^2+S5^2
        addl    %ebx,%eax
        adcl    $0,%edx
        psrlq   $32,%mm6
        movd    %mm6,%ebx               # S6^2+S7^2
        addl    %ebx,%eax
        adcl    $0,%edx

        addl    $16,%esi                # advance past 8 samples (16 bytes)
        jmp     .Lsumsq_loop

.Lsumsq_done:
        emms                            # leave MMX state before returning
        popl    %ebx
        popl    %ecx
        popl    %esi
        popl    %ebp
        ret
        .size   sumsq_mmx_assist,.-sumsq_mmx_assist
#-----------------------------------------------------------------------
# long sumsq_wd_mmx_assist(signed short *in,int cnt);
#
# Sum of squares of signed 16-bit input samples, 32-bit result.
# Quick version: the partial sums live in two 32-bit MMX lanes that are
# added with wrapping arithmetic, so it is only safe for small numbers
# of small input values.
#
# In:    8(%ebp) = in, 12(%ebp) = cnt
# Out:   eax = sum over every complete group of 8 samples; a trailing
#        partial group (cnt % 8 samples) is not included
# Saves: esi
# Uses:  ecx, edx, mm0, mm1, mm2; executes emms before returning
#-----------------------------------------------------------------------
        .global sumsq_wd_mmx_assist
        .type   sumsq_wd_mmx_assist,@function
        .align  16
sumsq_wd_mmx_assist:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %esi

        movl    12(%ebp),%ecx           # ecx = cnt (samples remaining)
        movl    8(%ebp),%esi            # esi = in
        pxor    %mm2,%mm2               # mm2 = two 32-bit partial sums = 0
        jmp     .Lwd_check              # test the count before the first pass

.Lwd_group:
        movq    (%esi),%mm0             # S0 S1 S2 S3
        movq    8(%esi),%mm1            # S4 S5 S6 S7
        pmaddwd %mm0,%mm0               # (S0*S0+S1*S1) (S2*S2+S3*S3)
        pmaddwd %mm1,%mm1               # (S4*S4+S5*S5) (S6*S6+S7*S7)
        addl    $16,%esi                # step over the 8 samples just loaded
        paddd   %mm0,%mm2               # accumulate both halves lane by lane
        paddd   %mm1,%mm2
.Lwd_check:
        subl    $8,%ecx                 # another full group of 8 available?
        jge     .Lwd_group

        movd    %mm2,%eax               # low-lane sum
        psrlq   $32,%mm2
        movd    %mm2,%edx               # high-lane sum
        addl    %edx,%eax               # eax = total of both lanes
        emms                            # leave MMX state before returning
        popl    %esi
        popl    %ebp
        ret