source: trunk/Packages/Graphics32/GR32_BlendSSE2.pas

Last change on this file was 2, checked in by chronos, 5 years ago
File size: 40.6 KB
Line 
1unit GR32_BlendSSE2;
2
3(* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1 or LGPL 2.1 with linking exception
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Alternatively, the contents of this file may be used under the terms of the
17 * Free Pascal modified version of the GNU Lesser General Public License
18 * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
19 * of this license are applicable instead of those above.
20 * Please see the file LICENSE.txt for additional information concerning this
21 * license.
22 *
23 * The Original Code is Graphics32
24 *
25 * The Initial Developer of the Original Code is
26 * Alex A. Denisov
27 *
28 * Portions created by the Initial Developer are Copyright (C) 2000-2009
29 * the Initial Developer. All Rights Reserved.
30 *
31 * Contributor(s):
32 * Christian-W. Budde
33 * - 2019/04/01 - Refactoring
34 *
35 * ***** END LICENSE BLOCK ***** *)
36
37interface
38
39{$I GR32.inc}
40
41uses
42 GR32;
43
44function BlendReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
45procedure BlendMem_SSE2(F: TColor32; var B: TColor32); {$IFDEF FPC} assembler; {$ENDIF}
46procedure BlendMems_SSE2(F: TColor32; B: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF}
47
48function BlendRegEx_SSE2(F, B, M: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
49procedure BlendMemEx_SSE2(F: TColor32; var B:TColor32; M: TColor32); {$IFDEF FPC} assembler; {$ENDIF}
50
51function BlendRegRGB_SSE2(F, B, W: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
52procedure BlendMemRGB_SSE2(F: TColor32; var B: TColor32; W: TColor32); {$IFDEF FPC} assembler; {$ENDIF}
53
54procedure BlendLine_SSE2(Src, Dst: PColor32; Count: Integer); {$IFDEF FPC} assembler; {$ENDIF}
55procedure BlendLineEx_SSE2(Src, Dst: PColor32; Count: Integer; M: TColor32); {$IFDEF FPC} assembler; {$ENDIF}
56
57function CombineReg_SSE2(X, Y, W: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
58procedure CombineMem_SSE2(F: TColor32; var B: TColor32; W: TColor32); {$IFDEF FPC} assembler; {$ENDIF}
59procedure CombineLine_SSE2(Src, Dst: PColor32; Count: Integer; W: TColor32); {$IFDEF FPC} assembler; {$ENDIF}
60
61function MergeReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
62
63procedure EMMS_SSE2; {$IFDEF FPC} assembler; {$ENDIF}
64
65function LightenReg_SSE2(C: TColor32; Amount: Integer): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
66
67function ColorAdd_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
68function ColorSub_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
69function ColorModulate_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
70function ColorMax_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
71function ColorMin_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
72function ColorDifference_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
73function ColorExclusion_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
74function ColorScale_SSE2(C, W: TColor32): TColor32; {$IFDEF FPC} assembler; {$ENDIF}
75
76implementation
77
78uses
79 GR32_Blend,
80 GR32_LowLevel,
81 GR32_System;
82
83{ SSE2 versions }
84
// Blends foreground colour F over background colour B, weighted by the
// alpha channel of F:
//   Result := Fa * (Fargb - Bargb) + Bargb
// The alpha byte of the returned colour is forced to $FF.
// x86: EAX = F, EDX = B.  x64: ECX = F, EDX = B.  Result is returned in EAX.
function BlendReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        MOVD      XMM0,EAX            // XMM0 low dword <- Fa Fr Fg Fb
        PXOR      XMM3,XMM3           // XMM3 <- 0 (used for widening and packing)
        MOVD      XMM2,EDX            // XMM2 low dword <- Ba Br Bg Bb
        PUNPCKLBW XMM0,XMM3           // F channels widened to 16-bit words
        MOV       ECX,bias_ptr        // ECX <- pointer to the bias constant
        PUNPCKLBW XMM2,XMM3           // B channels widened to 16-bit words
        MOVQ      XMM1,XMM0
        PSHUFLW   XMM1,XMM1,$FF       // XMM1 <- Fa replicated into the four low words
        PSUBW     XMM0,XMM2           // XMM0 <- F - B per channel
        PSLLW     XMM2,8              // XMM2 <- B * 256
        PMULLW    XMM0,XMM1           // XMM0 <- Fa * (F - B)
        PADDW     XMM2,[ECX]          // add bias
        PADDW     XMM2,XMM0           // XMM2 <- B * 256 + bias + Fa * (F - B)
        PSRLW     XMM2,8              // keep the high byte of every word
        PACKUSWB  XMM2,XMM3           // narrow words back to bytes
        MOVD      EAX,XMM2            // EAX <- Za Zr Zg Zb
        OR        EAX,$FF000000       // EAX <- FF Zr Zg Zb
{$ENDIF}

{$IFDEF TARGET_x64}
        MOVD      XMM0,ECX            // XMM0 low dword <- Fa Fr Fg Fb
        PXOR      XMM3,XMM3           // XMM3 <- 0 (used for widening and packing)
        MOVD      XMM2,EDX            // XMM2 low dword <- Ba Br Bg Bb
        PUNPCKLBW XMM0,XMM3           // F channels widened to 16-bit words
{$IFNDEF FPC}
        MOV       RAX,bias_ptr        // RAX <- pointer to the bias constant
{$ELSE}
        MOV       RAX,[RIP+bias_ptr]  // RIP-relative load keeps the code PIC-safe on x64
{$ENDIF}
        PUNPCKLBW XMM2,XMM3           // B channels widened to 16-bit words
        MOVQ      XMM1,XMM0
        PSHUFLW   XMM1,XMM1,$FF       // XMM1 <- Fa replicated into the four low words
        PSUBW     XMM0,XMM2           // XMM0 <- F - B per channel
        PSLLW     XMM2,8              // XMM2 <- B * 256
        PMULLW    XMM0,XMM1           // XMM0 <- Fa * (F - B)
        PADDW     XMM2,[RAX]          // add bias
        PADDW     XMM2,XMM0           // XMM2 <- B * 256 + bias + Fa * (F - B)
        PSRLW     XMM2,8              // keep the high byte of every word
        PACKUSWB  XMM2,XMM3           // narrow words back to bytes
        MOVD      EAX,XMM2            // EAX <- Za Zr Zg Zb
        OR        EAX,$FF000000       // EAX <- FF Zr Zg Zb
{$ENDIF}
end;
137
// Blends foreground colour F into the background pixel B in place, weighted
// by the alpha channel of F:
//   B := Fa * (F - B) + B
// Shortcuts: Fa = 0 leaves B untouched, Fa = $FF copies F into B.
// x86: EAX = F, EDX = @B.  x64: ECX = F, RDX = @B.
procedure BlendMem_SSE2(F: TColor32; var B: TColor32); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        TEST      EAX,$FF000000       // alpha = 0 -> nothing to do
        JZ        @Done
        CMP       EAX,$FF000000       // alpha = $FF -> plain copy
        JNC       @Opaque

        PXOR      XMM3,XMM3           // XMM3 <- 0
        MOVD      XMM0,EAX            // XMM0 <- F
        MOVD      XMM2,[EDX]          // XMM2 <- B
        PUNPCKLBW XMM0,XMM3           // widen F to words
        MOV       ECX,bias_ptr        // ECX <- pointer to the bias constant
        PUNPCKLBW XMM2,XMM3           // widen B to words
        MOVQ      XMM1,XMM0
        PSHUFLW   XMM1,XMM1,$FF       // XMM1 <- Fa in every low word
        PSUBW     XMM0,XMM2           // F - B
        PSLLW     XMM2,8              // B * 256
        PMULLW    XMM0,XMM1           // Fa * (F - B)
        PADDW     XMM2,[ECX]          // add bias
        PADDW     XMM2,XMM0
        PSRLW     XMM2,8
        PACKUSWB  XMM2,XMM3           // back to bytes
        MOVD      [EDX],XMM2          // store blended pixel

@Done:
        RET

@Opaque:
        MOV       [EDX],EAX
{$ENDIF}

{$IFDEF TARGET_x64}
        TEST      ECX,$FF000000       // alpha = 0 -> nothing to do
        JZ        @Done
        CMP       ECX,$FF000000       // alpha = $FF -> plain copy
        JNC       @Opaque

        PXOR      XMM3,XMM3           // XMM3 <- 0
        MOVD      XMM0,ECX            // XMM0 <- F
        MOVD      XMM2,[RDX]          // XMM2 <- B
        PUNPCKLBW XMM0,XMM3           // widen F to words
{$IFNDEF FPC}
        MOV       RAX,bias_ptr        // RAX <- pointer to the bias constant
{$ELSE}
        MOV       RAX,[RIP+bias_ptr]  // RIP-relative load keeps the code PIC-safe on x64
{$ENDIF}
        PUNPCKLBW XMM2,XMM3           // widen B to words
        MOVQ      XMM1,XMM0
        PSHUFLW   XMM1,XMM1,$FF       // XMM1 <- Fa in every low word
        PSUBW     XMM0,XMM2           // F - B
        PSLLW     XMM2,8              // B * 256
        PMULLW    XMM0,XMM1           // Fa * (F - B)
        PADDW     XMM2,[RAX]          // add bias
        PADDW     XMM2,XMM0
        PSRLW     XMM2,8
        PACKUSWB  XMM2,XMM3           // back to bytes
        MOVD      [RDX],XMM2          // store blended pixel

@Done:
        RET

@Opaque:
        MOV       [RDX],ECX
{$ENDIF}
end;
206
// Blends the single foreground colour F into Count consecutive background
// pixels starting at B, weighted by the alpha channel of F.
// Shortcuts: Count <= 0 or Fa = 0 does nothing; Fa = $FF fills the run with F.
// x86: EAX = F, EDX = B, ECX = Count.  x64: ECX = F, RDX = B, R8D = Count.
//
// Changes against the previous revision:
//  * Count is tested with JLE, so a negative count returns immediately
//    instead of running the DEC/JNZ loop until the counter wraps.
//  * The x64 branch loads bias_ptr RIP-relative under FPC, like every other
//    routine in this unit.
procedure BlendMems_SSE2(F: TColor32; B: PColor32; Count: Integer); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        TEST      ECX,ECX
        JLE       @Exit               // nothing to do for Count <= 0

        TEST      EAX,$FF000000
        JZ        @Exit               // fully transparent foreground

        PUSH      EBX

        MOV       EBX,EAX
        SHR       EBX,24              // EBX <- Fa

        CMP       EBX,$FF
        JZ        @Fill               // fully opaque foreground

        MOVD      XMM4,EAX
        PXOR      XMM3,XMM3           // XMM3 <- 0
        PUNPCKLBW XMM4,XMM3           // XMM4 <- F widened to words (loop invariant)
        MOV       EBX,bias_ptr        // EBX <- pointer to the bias constant

@Blend:
        MOVD      XMM2,[EDX]          // XMM2 <- B
        PUNPCKLBW XMM2,XMM3           // widen B to words
        MOVQ      XMM1,XMM4
        PUNPCKLBW XMM1,XMM3           // the next three unpacks leave Fa in
        PUNPCKHWD XMM1,XMM1           //   each of the four low words of XMM1
        MOVQ      XMM0,XMM4
        PSUBW     XMM0,XMM2           // F - B
        PUNPCKHDQ XMM1,XMM1
        PSLLW     XMM2,8              // B * 256
        PMULLW    XMM0,XMM1           // Fa * (F - B)
        PADDW     XMM2,[EBX]          // add bias
        PADDW     XMM2,XMM0
        PSRLW     XMM2,8
        PACKUSWB  XMM2,XMM3           // back to bytes
        MOVD      [EDX],XMM2

        ADD       EDX,4

        DEC       ECX
        JNZ       @Blend

        POP       EBX

@Exit:
        RET

@Fill:
        MOV       [EDX],EAX
        ADD       EDX,4

        DEC       ECX
        JNZ       @Fill

        POP       EBX
{$ENDIF}

{$IFDEF TARGET_x64}
        TEST      R8D,R8D
        JLE       @Exit               // nothing to do for Count <= 0

        TEST      ECX,$FF000000
        JZ        @Exit               // fully transparent foreground

        MOV       RAX,RCX
        SHR       EAX,24              // EAX <- Fa

        CMP       EAX,$FF
        JZ        @Fill               // fully opaque foreground

        MOVD      XMM4,ECX
        PXOR      XMM3,XMM3           // XMM3 <- 0
        PUNPCKLBW XMM4,XMM3           // XMM4 <- F widened to words (loop invariant)
{$IFNDEF FPC}
        MOV       RAX,bias_ptr        // RAX <- pointer to the bias constant
{$ELSE}
        MOV       RAX,[RIP+bias_ptr]  // RIP-relative load keeps the code PIC-safe on x64
{$ENDIF}

@Blend:
        MOVD      XMM2,[RDX]          // XMM2 <- B
        PUNPCKLBW XMM2,XMM3           // widen B to words
        MOVQ      XMM1,XMM4
        PUNPCKLBW XMM1,XMM3           // the next three unpacks leave Fa in
        PUNPCKHWD XMM1,XMM1           //   each of the four low words of XMM1
        MOVQ      XMM0,XMM4
        PSUBW     XMM0,XMM2           // F - B
        PUNPCKHDQ XMM1,XMM1
        PSLLW     XMM2,8              // B * 256
        PMULLW    XMM0,XMM1           // Fa * (F - B)
        PADDW     XMM2,[RAX]          // add bias
        PADDW     XMM2,XMM0
        PSRLW     XMM2,8
        PACKUSWB  XMM2,XMM3           // back to bytes
        MOVD      [RDX],XMM2

        ADD       RDX,4

        DEC       R8D
        JNZ       @Blend

@Exit:
        RET

@Fill:
        MOV       [RDX],ECX
        ADD       RDX,4

        DEC       R8D
        JNZ       @Fill
{$ENDIF}
end;
317
318
// Blends foreground colour F over background colour B using the alpha
// channel of F scaled by the master alpha M:
//   Result := M * Fa * (Fargb - Bargb) + Bargb
// When the combined weight ((M + 1) * Fa) shr 8 is zero, B is returned as is.
// x86: EAX = F, EDX = B, ECX = M.  x64: ECX = F, EDX = B, R8D = M.
function BlendRegEx_SSE2(F, B, M: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        PUSH      EBX
        MOV       EBX,EAX
        SHR       EBX,24              // EBX <- Fa
        INC       ECX                 // 255:256 range bias
        IMUL      ECX,EBX
        SHR       ECX,8               // ECX <- combined weight
        JZ        @Background

        PXOR      XMM0,XMM0           // XMM0 <- 0
        MOVD      XMM1,EAX            // XMM1 <- F
        SHL       ECX,4               // weight * 16 = offset of its table entry
        MOVD      XMM2,EDX            // XMM2 <- B
        PUNPCKLBW XMM1,XMM0           // widen F to words
        PUNPCKLBW XMM2,XMM0           // widen B to words
        ADD       ECX,alpha_ptr       // ECX <- address of the 16-byte entry for this weight
        PSUBW     XMM1,XMM2           // F - B
        PMULLW    XMM1,[ECX]          // multiply by the table entry
        PSLLW     XMM2,8              // B * 256
        MOV       ECX,bias_ptr
        PADDW     XMM2,[ECX]          // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      EAX,XMM1

        POP       EBX
        RET

@Background:
        MOV       EAX,EDX
        POP       EBX
{$ENDIF}

{$IFDEF TARGET_x64}
        MOV       EAX,ECX
        SHR       EAX,24              // EAX <- Fa
        INC       R8D                 // 255:256 range bias
        IMUL      R8D,EAX
        SHR       R8D,8               // R8D <- combined weight
        JZ        @Background

        PXOR      XMM0,XMM0           // XMM0 <- 0
        MOVD      XMM1,ECX            // XMM1 <- F
        SHL       R8D,4               // weight * 16 = offset of its table entry
        MOVD      XMM2,EDX            // XMM2 <- B
        PUNPCKLBW XMM1,XMM0           // widen F to words
        PUNPCKLBW XMM2,XMM0           // widen B to words
{$IFNDEF FPC}
        ADD       R8,alpha_ptr        // R8 <- address of the 16-byte entry for this weight
{$ELSE}
        ADD       R8,[RIP+alpha_ptr]
{$ENDIF}
        PSUBW     XMM1,XMM2           // F - B
        PMULLW    XMM1,[R8]           // multiply by the table entry
        PSLLW     XMM2,8              // B * 256
{$IFNDEF FPC}
        MOV       R8,bias_ptr
{$ELSE}
        MOV       R8,[RIP+bias_ptr]
{$ENDIF}
        PADDW     XMM2,[R8]           // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      EAX,XMM1
        RET

@Background:
        MOV       EAX,EDX
{$ENDIF}
end;
402
// Blends foreground colour F into the background pixel B in place, using
// the alpha channel of F scaled by the master alpha M:
//   B := M * Fa * (Fargb - Bargb) + Bargb
// B is left untouched when Fa = 0 or when the combined weight
// ((M + 1) * Fa) shr 8 is zero.
// x86: EAX = F, EDX = @B, ECX = M.  x64: ECX = F, RDX = @B, R8D = M.
procedure BlendMemEx_SSE2(F: TColor32; var B:TColor32; M: TColor32); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        TEST      EAX,$FF000000
        JZ        @Exit               // fully transparent foreground

        PUSH      EBX
        MOV       EBX,EAX
        SHR       EBX,24              // EBX <- Fa
        INC       ECX                 // 255:256 range bias
        IMUL      ECX,EBX
        SHR       ECX,8               // ECX <- combined weight
        JZ        @Restore

        PXOR      XMM0,XMM0           // XMM0 <- 0
        MOVD      XMM1,EAX            // XMM1 <- F
        SHL       ECX,4               // weight * 16 = offset of its table entry
        MOVD      XMM2,[EDX]          // XMM2 <- B
        PUNPCKLBW XMM1,XMM0           // widen F to words
        PUNPCKLBW XMM2,XMM0           // widen B to words
        ADD       ECX,alpha_ptr       // ECX <- address of the 16-byte entry for this weight
        PSUBW     XMM1,XMM2           // F - B
        PMULLW    XMM1,[ECX]          // multiply by the table entry
        PSLLW     XMM2,8              // B * 256
        MOV       ECX,bias_ptr
        PADDW     XMM2,[ECX]          // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      [EDX],XMM1

@Restore:
        POP       EBX

@Exit:
{$ENDIF}

{$IFDEF TARGET_x64}
        TEST      ECX,$FF000000
        JZ        @Exit               // fully transparent foreground

        MOV       R9D,ECX
        SHR       R9D,24              // R9D <- Fa
        INC       R8D                 // 255:256 range bias
        IMUL      R8D,R9D
        SHR       R8D,8               // R8D <- combined weight
        JZ        @Exit

        PXOR      XMM0,XMM0           // XMM0 <- 0
        MOVD      XMM1,ECX            // XMM1 <- F
        SHL       R8D,4               // weight * 16 = offset of its table entry
        MOVD      XMM2,[RDX]          // XMM2 <- B
        PUNPCKLBW XMM1,XMM0           // widen F to words
        PUNPCKLBW XMM2,XMM0           // widen B to words
{$IFNDEF FPC}
        ADD       R8,alpha_ptr        // R8 <- address of the 16-byte entry for this weight
{$ELSE}
        ADD       R8,[RIP+alpha_ptr]
{$ENDIF}
        PSUBW     XMM1,XMM2           // F - B
        PMULLW    XMM1,[R8]           // multiply by the table entry
        PSLLW     XMM2,8              // B * 256
{$IFNDEF FPC}
        MOV       R8,bias_ptr
{$ELSE}
        MOV       R8,[RIP+bias_ptr]
{$ENDIF}
        PADDW     XMM2,[R8]           // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      DWORD PTR [RDX],XMM1
@Exit:
{$ENDIF}
end;
491
// Blends F over B with a separate weight per channel taken from the bytes
// of W:
//   Result := W * (F - B) + B   (evaluated per channel)
// W is byte-swapped before use, so its lowest byte weights the highest
// channel and vice versa.
// NOTE(review): the channel layout expected in W is defined by the callers - confirm.
// x86: EAX = F, EDX = B, ECX = W.  x64: ECX = F, EDX = B, R8D = W.
function BlendRegRGB_SSE2(F, B, W: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        PXOR      XMM2,XMM2           // XMM2 <- 0
        MOVD      XMM0,EAX            // XMM0 <- F
        PUNPCKLBW XMM0,XMM2           // widen F to words
        MOVD      XMM1,EDX            // XMM1 <- B
        PUNPCKLBW XMM1,XMM2           // widen B to words
        BSWAP     ECX                 // reverse the byte order of W
        PSUBW     XMM0,XMM1           // F - B
        MOVD      XMM3,ECX
        PUNPCKLBW XMM3,XMM2           // widen the weights to words
        PMULLW    XMM0,XMM3           // W * (F - B)
        MOV       EAX,bias_ptr
        PSLLW     XMM1,8              // B * 256
        PADDW     XMM1,[EAX]          // add bias
        PADDW     XMM1,XMM0
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM2           // back to bytes
        MOVD      EAX,XMM1
{$ENDIF}

{$IFDEF TARGET_x64}
        PXOR      XMM2,XMM2           // XMM2 <- 0
        MOVD      XMM0,ECX            // XMM0 <- F
        PUNPCKLBW XMM0,XMM2           // widen F to words
        MOVD      XMM1,EDX            // XMM1 <- B
        PUNPCKLBW XMM1,XMM2           // widen B to words
        BSWAP     R8D                 // reverse the byte order of W
        PSUBW     XMM0,XMM1           // F - B
        MOVD      XMM3,R8D
        PUNPCKLBW XMM3,XMM2           // widen the weights to words
        PMULLW    XMM0,XMM3           // W * (F - B)
{$IFNDEF FPC}
        MOV       RAX,bias_ptr
{$ELSE}
        MOV       RAX,[RIP+bias_ptr]  // RIP-relative load keeps the code PIC-safe on x64
{$ENDIF}
        PSLLW     XMM1,8              // B * 256
        PADDW     XMM1,[RAX]          // add bias
        PADDW     XMM1,XMM0
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM2           // back to bytes
        MOVD      EAX,XMM1
{$ENDIF}
end;
538
// In-place variant of the per-channel weighted blend: the pixel B is
// replaced by F blended over B, with the bytes of W acting as one weight
// per channel (in reversed byte order).
// The x86 branch evaluates  B := (W * (F - B) + B * 256 + bias) shr 8,
// the x64 branch evaluates  B := wF + B - wB  with each product rounded
// separately.
// x86: EAX = F, EDX = @B, ECX = W.  x64: ECX = F, RDX = @B, R8D = W.
procedure BlendMemRGB_SSE2(F: TColor32; var B: TColor32; W: TColor32); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        PXOR      XMM2,XMM2           // XMM2 <- 0
        MOVD      XMM0,EAX            // XMM0 <- F
        PUNPCKLBW XMM0,XMM2           // widen F to words
        MOVD      XMM1,[EDX]          // XMM1 <- B
        PUNPCKLBW XMM1,XMM2           // widen B to words
        BSWAP     ECX                 // reverse the byte order of W
        PSUBW     XMM0,XMM1           // F - B
        MOVD      XMM3,ECX
        PUNPCKLBW XMM3,XMM2           // widen the weights to words
        PMULLW    XMM0,XMM3           // W * (F - B)
        MOV       EAX,bias_ptr
        PSLLW     XMM1,8              // B * 256
        PADDW     XMM1,[EAX]          // add bias
        PADDW     XMM1,XMM0
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM2           // back to bytes
        MOVD      [EDX],XMM1
{$ENDIF}
{$IFDEF TARGET_x64}
        MOVD      XMM1,R8D            // XMM1 <- W

        PXOR      XMM4,XMM4           // XMM4 <- 0
{$IFNDEF FPC}
        MOV       RAX,bias_ptr
{$ELSE}
        MOV       RAX,[RIP+bias_ptr]  // RIP-relative load keeps the code PIC-safe on x64
{$ENDIF}
        MOVQ      XMM5,[RAX]          // XMM5 <- bias
        MOVD      XMM0,ECX            // XMM0 <- F
        MOVD      XMM2,[RDX]          // XMM2 <- B

        PUNPCKLBW XMM0,XMM4           // widen F, W and B to words
        PUNPCKLBW XMM1,XMM4
        PUNPCKLBW XMM2,XMM4

        PSHUFLW   XMM1,XMM1,$1B       // reverse the order of the four weight words

        // C = wF + B - wB
        PMULLW    XMM0,XMM1           // w * F
        PADDW     XMM0,XMM5
        PSRLW     XMM0,8

        PADDW     XMM0,XMM2           // + B

        PMULLW    XMM2,XMM1           // w * B
        PADDW     XMM2,XMM5
        PSRLW     XMM2,8

        PSUBW     XMM0,XMM2           // - wB

        PACKUSWB  XMM0,XMM4           // back to bytes

        MOVD      [RDX],XMM0
{$ENDIF}
end;
597
{$IFDEF TEST_BLENDMEMRGB128SSE4}
// Experimental two-pixel variant of BlendMemRGB, only compiled when
// TEST_BLENDMEMRGB128SSE4 is defined.  F is duplicated into two lanes and
// blended over the two consecutive pixels at B, using the eight weight
// bytes of W (word order reversed within each 64-bit half):
//   C = wF + B - wB
// Uses PINSRD, which is an SSE4.1 instruction.
procedure BlendMemRGB128_SSE4(F: TColor32; var B: TColor32; W: UInt64); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_x86}
        MOVQ      XMM1,W              // XMM1 <- eight weight bytes

        PXOR      XMM4,XMM4           // XMM4 <- 0
        MOV       ECX,[bias_ptr]
        MOVDQA    XMM5,[ECX]          // XMM5 <- bias

        MOVD      XMM0,EAX
        PINSRD    XMM0,EAX,1          // XMM0 <- F, F
        MOVQ      XMM2,[EDX].QWORD    // XMM2 <- two background pixels

        PUNPCKLBW XMM0,XMM4           // widen everything to words
        PUNPCKLBW XMM1,XMM4
        PUNPCKLBW XMM2,XMM4

        PSHUFLW   XMM1,XMM1,$1B       // reverse the weight words of the low half
        PSHUFHW   XMM1,XMM1,$1B       // ... and of the high half

        // C = wF + B - wB
        PMULLW    XMM0,XMM1           // w * F
        PADDW     XMM0,XMM5
        PSRLW     XMM0,8

        PADDW     XMM0,XMM2           // + B

        PMULLW    XMM2,XMM1           // w * B
        PADDW     XMM2,XMM5
        PSRLW     XMM2,8

        PSUBW     XMM0,XMM2           // - wB

        PACKUSWB  XMM0,XMM4           // back to bytes

        MOVQ      [EDX].QWORD,XMM0
{$ENDIF}
{$IFDEF TARGET_x64}
        MOVQ      XMM1,R8             // XMM1 <- eight weight bytes

        PXOR      XMM4,XMM4           // XMM4 <- 0
        MOV       RAX,[RIP+bias_ptr]
        MOVDQA    XMM5,[RAX]          // XMM5 <- bias

        MOVD      XMM0,ECX
        PINSRD    XMM0,ECX,1          // XMM0 <- F, F
        MOVQ      XMM2,[RDX].QWORD    // XMM2 <- two background pixels

        PUNPCKLBW XMM0,XMM4           // widen everything to words
        PUNPCKLBW XMM1,XMM4
        PUNPCKLBW XMM2,XMM4

        PSHUFLW   XMM1,XMM1,$1B       // reverse the weight words of the low half
        PSHUFHW   XMM1,XMM1,$1B       // ... and of the high half

        // C = wF + B - wB
        PMULLW    XMM0,XMM1           // w * F
        PADDW     XMM0,XMM5
        PSRLW     XMM0,8

        PADDW     XMM0,XMM2           // + B

        PMULLW    XMM2,XMM1           // w * B
        PADDW     XMM2,XMM5
        PSRLW     XMM2,8

        PSUBW     XMM0,XMM2           // - wB

        PACKUSWB  XMM0,XMM4           // back to bytes

        MOVQ      [RDX].QWORD,XMM0
{$ENDIF}
end;
{$ENDIF}
673
// Blends Count source pixels at Src over the destination pixels at Dst,
// each pixel weighted by its own alpha.  Pixels are processed two at a
// time; an odd count is handled by a single-pixel step (first pixel on
// x86, last pixel on x64).  Per pixel:
//   A' = source colour premultiplied by its alpha (alpha word kept as is)
//   C' = A' + B' - a * B'
// Count <= 0 does nothing.
// x86: EAX = Src, EDX = Dst, ECX = Count.
// x64: RCX = Src, RDX = Dst, R8D = Count.
procedure BlendLine_SSE2(Src, Dst: PColor32; Count: Integer); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
{$IFDEF FPC}
const
  COpaque: QWORD = $FF000000FF000000;
{$ENDIF}
asm
{$IFDEF TARGET_X86}
        TEST      ECX,ECX
        JLE       @Done

        PUSH      EBX
        PXOR      XMM4,XMM4           // XMM4 <- 0
        MOV       EBX,[bias_ptr]
        MOVDQA    XMM5,[EBX]          // XMM5 <- bias
        POP       EBX

        TEST      ECX, 1
        JZ        @Pairs              // even count: no leading single pixel

        // odd count: blend the first pixel on its own
        MOVD      XMM0,[EAX]
        MOVD      XMM2,[EDX]

        PUNPCKLBW XMM0,XMM4           // widen source and destination to words
        PUNPCKLBW XMM2,XMM4

        PSHUFLW   XMM1,XMM0,$FF       // XMM1 <- source alpha in every low word

        // premultiply source pixel by its alpha
        MOVQ      XMM3,XMM1
        PSRLQ     XMM3,16             // clear the multiplier of the alpha word
        PMULLW    XMM0,XMM3
        PADDW     XMM0,XMM5
        PSRLW     XMM0,8
        PSLLQ     XMM3,48             // move alpha back into the alpha word
        POR       XMM0,XMM3

        // C' = A' + B' - aB'
        PMULLW    XMM1,XMM2
        PADDW     XMM1,XMM5
        PSRLW     XMM1,8
        PADDW     XMM0,XMM2
        PSUBW     XMM0,XMM1

        PACKUSWB  XMM0,XMM4
        MOVD      [EDX], XMM0

@Pairs:
        LEA       EAX, [EAX + ECX * 4] // EAX/EDX <- one past the last pixel
        LEA       EDX, [EDX + ECX * 4]

        SHR       ECX,1               // number of pixel pairs
        JZ        @Done
        NEG       ECX                 // count upwards from -pairs to 0

@PairLoop:
        MOVQ      XMM0,[EAX + ECX * 8].QWORD
        MOVQ      XMM2,[EDX + ECX * 8].QWORD

        PUNPCKLBW XMM0,XMM4           // widen both pixels to words
        PUNPCKLBW XMM2,XMM4

        PSHUFLW   XMM1,XMM0,$FF       // alpha of pixel 0 in the low words
        PSHUFHW   XMM1,XMM1,$FF       // alpha of pixel 1 in the high words

        // premultiply source pixel by its alpha
        MOVDQA    XMM3,XMM1
        PSRLQ     XMM3,16
        PMULLW    XMM0,XMM3
        PADDW     XMM0,XMM5
        PSRLW     XMM0,8
        PSLLQ     XMM3,48
        POR       XMM0,XMM3

        // C' = A' + B' - aB'
        PMULLW    XMM1,XMM2
        PADDW     XMM1,XMM5
        PSRLW     XMM1,8
        PADDW     XMM0,XMM2
        PSUBW     XMM0,XMM1

        PACKUSWB  XMM0,XMM4
        MOVQ      [EDX + ECX * 8].QWORD,XMM0

        ADD       ECX,1
        JS        @PairLoop
@Done:

{$ENDIF}

{$IFDEF TARGET_X64}
        TEST      R8D,R8D
        JLE       @Done

        PXOR      XMM4,XMM4           // XMM4 <- 0
{$IFNDEF FPC}
        MOV       RAX,bias_ptr
{$ELSE}
        MOV       RAX,[RIP+bias_ptr]  // RIP-relative load keeps the code PIC-safe on x64
{$ENDIF}
        MOVDQA    XMM5,[RAX]          // XMM5 <- bias

        MOV       R9D, R8D
        SHR       R9D, 1              // R9D <- number of pixel pairs
        TEST      R9D, R9D
        JZ        @Tail

@PairLoop:
        MOVQ      XMM0,[RCX].QWORD
        MOVQ      RAX,XMM0
{$IFDEF FPC}
        AND       RAX,[RIP+COpaque]   // isolate both alpha bytes
        JZ        @NextPair           // both transparent: leave destination alone
        CMP       RAX,[RIP+COpaque]
        JZ        @StorePair          // both opaque: copy source straight through
{$ENDIF}

        MOVQ      XMM2,[RDX].QWORD

        PUNPCKLBW XMM0,XMM4           // widen both pixels to words
        PUNPCKLBW XMM2,XMM4

        PSHUFLW   XMM1,XMM0,$FF       // alpha of pixel 0 in the low words
        PSHUFHW   XMM1,XMM1,$FF       // alpha of pixel 1 in the high words

        // premultiply source pixel by its alpha
        MOVDQA    XMM3,XMM1
        PSRLQ     XMM3,16
        PMULLW    XMM0,XMM3
        PADDW     XMM0,XMM5
        PSRLW     XMM0,8
        PSLLQ     XMM3,48
        POR       XMM0,XMM3

        // C' = A' + B' - aB'
        PMULLW    XMM1,XMM2
        PADDW     XMM1,XMM5
        PSRLW     XMM1,8
        PADDW     XMM0,XMM2
        PSUBW     XMM0,XMM1

        PACKUSWB  XMM0,XMM4
@StorePair:
        MOVQ      [RDX].QWORD,XMM0

@NextPair:
        ADD       RCX,8
        ADD       RDX,8

        SUB       R9D,1
        JNZ       @PairLoop

@Tail:
        AND       R8D, 1
        JZ        @Done               // even count: no trailing single pixel

        MOVD      XMM0,[RCX]
        MOVD      XMM2,[RDX]

        PUNPCKLBW XMM0,XMM4           // widen source and destination to words
        PUNPCKLBW XMM2,XMM4

        PSHUFLW   XMM1,XMM0,$FF       // XMM1 <- source alpha in every low word

        // premultiply source pixel by its alpha
        MOVQ      XMM3,XMM1
        PSRLQ     XMM3,16
        PMULLW    XMM0,XMM3
        PADDW     XMM0,XMM5
        PSRLW     XMM0,8
        PSLLQ     XMM3,48
        POR       XMM0,XMM3

        // C' = A' + B' - aB'
        PMULLW    XMM1,XMM2
        PADDW     XMM1,XMM5
        PSRLW     XMM1,8
        PADDW     XMM0,XMM2
        PSUBW     XMM0,XMM1

        PACKUSWB  XMM0,XMM4
        MOVD      [RDX], XMM0
@Done:
{$ENDIF}
end;
859
860
// Blends Count source pixels at Src over the destination pixels at Dst,
// each weighted by its own alpha scaled with the master alpha M.
// A pixel is skipped when its alpha is zero or when the combined weight
// ((Fa + 1) * M) shr 8 is zero.
// x86: EAX = Src, EDX = Dst, ECX = Count, M on the stack.
// x64: RCX = Src, RDX = Dst, R8D = Count, R9D = M.
//
// Fix against the previous revision: the entry guard used JS, which only
// rejects negative counts.  Count = 0 therefore entered the loop, touched
// memory and - because DEC wrapped the counter - kept iterating.  The guard
// now uses JLE, which rejects zero as well as negative counts, as the
// original comment already promised.
procedure BlendLineEx_SSE2(Src, Dst: PColor32; Count: Integer; M: TColor32); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        // test the counter for zero or negativity
        TEST      ECX,ECX
        JLE       @Exit

        PUSH      ESI
        PUSH      EDI
        PUSH      EBX

        MOV       ESI,EAX             // ESI <- Src
        MOV       EDI,EDX             // EDI <- Dst
        MOV       EDX,M               // EDX <- Master Alpha

        // loop start
@Loop:
        MOV       EAX,[ESI]
        TEST      EAX,$FF000000
        JZ        @Next               // complete transparency, proceed to next point
        MOV       EBX,EAX
        SHR       EBX,24              // EBX <- Fa
        INC       EBX                 // 255:256 range bias
        IMUL      EBX,EDX
        SHR       EBX,8               // EBX <- combined weight
        JZ        @Next               // complete transparency, proceed to next point

        // blend
        PXOR      XMM0,XMM0           // XMM0 <- 0
        MOVD      XMM1,EAX            // XMM1 <- source pixel
        SHL       EBX,4               // weight * 16 = offset of its table entry
        MOVD      XMM2,[EDI]          // XMM2 <- destination pixel
        PUNPCKLBW XMM1,XMM0           // widen both to words
        PUNPCKLBW XMM2,XMM0
        ADD       EBX,alpha_ptr       // EBX <- address of the 16-byte entry for this weight
        PSUBW     XMM1,XMM2           // F - B
        PMULLW    XMM1,[EBX]          // multiply by the table entry
        PSLLW     XMM2,8              // B * 256
        MOV       EBX,bias_ptr
        PADDW     XMM2,[EBX]          // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      EAX,XMM1

        MOV       [EDI],EAX

@Next:
        ADD       ESI,4
        ADD       EDI,4

        // loop end
        DEC       ECX
        JNZ       @Loop

        POP       EBX
        POP       EDI
        POP       ESI
@Exit:
{$ENDIF}

{$IFDEF TARGET_X64}
        // test the counter for zero or negativity
        TEST      R8D,R8D
        JLE       @Exit
        TEST      R9D,R9D
        JZ        @Exit               // master alpha of zero: nothing is visible

        MOV       R10,RCX             // R10 <- Src

        // loop start
@Loop:
        MOV       ECX,[R10]
        TEST      ECX,$FF000000
        JZ        @Next               // complete transparency, proceed to next point
        MOV       EAX,ECX
        SHR       EAX,24              // EAX <- Fa
        INC       EAX                 // 255:256 range bias
        IMUL      EAX,R9D
        SHR       EAX,8               // EAX <- combined weight
        JZ        @Next               // complete transparency, proceed to next point

        // blend
        PXOR      XMM0,XMM0           // XMM0 <- 0
        MOVD      XMM1,ECX            // XMM1 <- source pixel
        SHL       EAX,4               // weight * 16 = offset of its table entry
        MOVD      XMM2,[RDX]          // XMM2 <- destination pixel
        PUNPCKLBW XMM1,XMM0           // widen both to words
        PUNPCKLBW XMM2,XMM0
{$IFNDEF FPC}
        ADD       RAX,alpha_ptr       // RAX <- address of the 16-byte entry for this weight
{$ELSE}
        ADD       RAX,[RIP+alpha_ptr]
{$ENDIF}
        PSUBW     XMM1,XMM2           // F - B
        PMULLW    XMM1,[RAX]          // multiply by the table entry
        PSLLW     XMM2,8              // B * 256
{$IFNDEF FPC}
        MOV       RAX,bias_ptr
{$ELSE}
        MOV       RAX,[RIP+bias_ptr]  // RIP-relative load keeps the code PIC-safe on x64
{$ENDIF}
        PADDW     XMM2,[RAX]          // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      ECX,XMM1

        MOV       [RDX],ECX

@Next:
        ADD       R10,4
        ADD       RDX,4

        // loop end
        DEC       R8D
        JNZ       @Loop
@Exit:
{$ENDIF}
end;
986
// Linear interpolation between two colours:
//   Result := W * (X - Y) + Y,  W = weight of X in [0..255]
// W selects a 16-byte entry of the table behind alpha_ptr that is used as
// the per-channel multiplier.
// x86: EAX = X, EDX = Y, ECX = W.  x64: ECX = X, EDX = Y, R8D = W.
function CombineReg_SSE2(X, Y, W: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        MOVD      XMM1,EAX            // XMM1 <- X
        PXOR      XMM0,XMM0           // XMM0 <- 0
        SHL       ECX,4               // W * 16 = offset of its table entry

        MOVD      XMM2,EDX            // XMM2 <- Y
        PUNPCKLBW XMM1,XMM0           // widen X to words
        PUNPCKLBW XMM2,XMM0           // widen Y to words

        ADD       ECX,alpha_ptr       // ECX <- address of the table entry

        PSUBW     XMM1,XMM2           // X - Y
        PMULLW    XMM1,[ECX]          // W * (X - Y)
        PSLLW     XMM2,8              // Y * 256

        MOV       ECX,bias_ptr

        PADDW     XMM2,[ECX]          // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      EAX,XMM1
{$ENDIF}

{$IFDEF TARGET_X64}
        MOVD      XMM1,ECX            // XMM1 <- X
        PXOR      XMM0,XMM0           // XMM0 <- 0
        SHL       R8D,4               // W * 16 = offset of its table entry

        MOVD      XMM2,EDX            // XMM2 <- Y
        PUNPCKLBW XMM1,XMM0           // widen X to words
        PUNPCKLBW XMM2,XMM0           // widen Y to words

{$IFNDEF FPC}
        ADD       R8,alpha_ptr        // R8 <- address of the table entry
{$ELSE}
        ADD       R8,[RIP+alpha_ptr]
{$ENDIF}

        PSUBW     XMM1,XMM2           // X - Y
        PMULLW    XMM1,[R8]           // W * (X - Y)
        PSLLW     XMM2,8              // Y * 256

{$IFNDEF FPC}
        MOV       R8,bias_ptr
{$ELSE}
        MOV       R8,[RIP+bias_ptr]
{$ENDIF}

        PADDW     XMM2,[R8]           // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      EAX,XMM1
{$ENDIF}
end;
1055
// In-place linear interpolation:
//   B := W * (F - B) + B,  W = weight of F in [0..255]
// Shortcuts: W = 0 leaves B untouched, W = $FF copies F into B.
// x86: EAX = F, EDX = @B, ECX = W.  x64: ECX = F, RDX = @B, R8D = W.
//
// Change against the previous revision: the x86 zero test used JCXZ, which
// inspects only the 16-bit CX register and needs an address-size prefix in
// 32-bit code.  It now uses TEST/JZ on the full ECX register, matching the
// x64 branch.  For weights in the documented range the behaviour is the same.
procedure CombineMem_SSE2(F: TColor32; var B: TColor32; W: TColor32); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        TEST      ECX,ECX
        JZ        @Done               // W = 0: destination stays as it is

        CMP       ECX,$FF
        JZ        @Copy               // W = $FF: plain copy

        MOVD      XMM1,EAX            // XMM1 <- F
        PXOR      XMM0,XMM0           // XMM0 <- 0

        SHL       ECX,4               // W * 16 = offset of its table entry

        MOVD      XMM2,[EDX]          // XMM2 <- B
        PUNPCKLBW XMM1,XMM0           // widen F to words
        PUNPCKLBW XMM2,XMM0           // widen B to words

        ADD       ECX,alpha_ptr       // ECX <- address of the table entry

        PSUBW     XMM1,XMM2           // F - B
        PMULLW    XMM1,[ECX]          // W * (F - B)
        PSLLW     XMM2,8              // B * 256

        MOV       ECX,bias_ptr

        PADDW     XMM2,[ECX]          // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      [EDX],XMM1

@Done:
        RET

@Copy:
        MOV       [EDX],EAX
{$ENDIF}

{$IFDEF TARGET_X64}
        TEST      R8D,R8D
        JZ        @Done               // W = 0: destination stays as it is
        CMP       R8D,$FF
        JZ        @Copy               // W = $FF: plain copy

        MOVD      XMM1,ECX            // XMM1 <- F
        PXOR      XMM0,XMM0           // XMM0 <- 0

        SHL       R8D,4               // W * 16 = offset of its table entry

        MOVD      XMM2,[RDX]          // XMM2 <- B
        PUNPCKLBW XMM1,XMM0           // widen F to words
        PUNPCKLBW XMM2,XMM0           // widen B to words

{$IFNDEF FPC}
        ADD       R8,alpha_ptr        // R8 <- address of the table entry
{$ELSE}
        ADD       R8,[RIP+alpha_ptr]
{$ENDIF}

        PSUBW     XMM1,XMM2           // F - B
        PMULLW    XMM1,[R8]           // W * (F - B)
        PSLLW     XMM2,8              // B * 256

{$IFNDEF FPC}
        MOV       RAX,bias_ptr
{$ELSE}
        MOV       RAX,[RIP+bias_ptr]  // RIP-relative load keeps the code PIC-safe on x64
{$ENDIF}

        PADDW     XMM2,[RAX]          // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      [RDX],XMM1

@Done:
        RET

@Copy:
        MOV       [RDX],ECX
{$ENDIF}
end;
1144
1145
// Interpolates a run of pixels in place:
//   Dst[i] := W * (Src[i] - Dst[i]) + Dst[i]   for i in 0 .. Count - 1
// Shortcuts: Count <= 0 or W = 0 does nothing; W = $FF copies Src to Dst
// through Move.
// x86: EAX = Src, EDX = Dst, ECX = Count, W on the stack.
// x64: RCX = Src, RDX = Dst, R8D = Count, R9D = W.
//
// Fixes against the previous revision (x86 branch):
//  * The exit paths were not balanced.  Count = 0 jumped straight to a
//    hand-written "RET $0004", while W = 0 executed "POP EBX / POP EBP"
//    first.  Whether or not the compiler emits a frame, one of the two
//    paths returned with a corrupted stack.  All paths now undo only what
//    this routine pushed (EBX) and leave through the end of the asm block,
//    as the W = $FF path and the other routines in this unit already do.
//  * Count is tested with JLE so a negative count is rejected instead of
//    running the DEC/JNZ loop until the counter wraps (both branches).
procedure CombineLine_SSE2(Src, Dst: PColor32; Count: Integer; W: TColor32); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        TEST      ECX,ECX
        JLE       @Exit               // nothing to do for Count <= 0

        PUSH      EBX
        MOV       EBX,W

        TEST      EBX,EBX
        JZ        @Restore            // W = 0: destination stays as it is

        CMP       EBX,$FF
        JZ        @Copy               // W = $FF: plain copy

        SHL       EBX,4               // W * 16 = offset of its table entry
        ADD       EBX,alpha_ptr
        MOVQ      XMM3,[EBX]          // XMM3 <- multiplier for this weight
        MOV       EBX,bias_ptr
        MOVQ      XMM4,[EBX]          // XMM4 <- bias
        PXOR      XMM0,XMM0           // XMM0 <- 0

@Loop:
        MOVD      XMM1,[EAX]          // XMM1 <- source pixel
        MOVD      XMM2,[EDX]          // XMM2 <- destination pixel
        PUNPCKLBW XMM1,XMM0           // widen both to words
        PUNPCKLBW XMM2,XMM0

        PSUBW     XMM1,XMM2           // Src - Dst
        PMULLW    XMM1,XMM3           // W * (Src - Dst)
        PSLLW     XMM2,8              // Dst * 256

        PADDW     XMM2,XMM4           // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      [EDX],XMM1

        ADD       EAX,4
        ADD       EDX,4

        DEC       ECX
        JNZ       @Loop
        JMP       @Restore

@Copy:
        SHL       ECX,2               // byte count for Move
        CALL      Move                // EAX = source, EDX = dest, ECX = bytes

@Restore:
        POP       EBX

@Exit:
{$ENDIF}

{$IFDEF TARGET_X64}
        TEST      R8D,R8D
        JLE       @Exit               // nothing to do for Count <= 0

        TEST      R9D,R9D
        JZ        @Exit               // W = 0: destination stays as it is

        CMP       R9D,$FF
        JZ        @Copy               // W = $FF: plain copy

        SHL       R9D,4               // W * 16 = offset of its table entry
{$IFNDEF FPC}
        ADD       R9,alpha_ptr
{$ELSE}
        ADD       R9,[RIP+alpha_ptr]
{$ENDIF}
        MOVQ      XMM3,[R9]           // XMM3 <- multiplier for this weight
{$IFNDEF FPC}
        MOV       R9,bias_ptr
{$ELSE}
        MOV       R9,[RIP+bias_ptr]   // RIP-relative load keeps the code PIC-safe on x64
{$ENDIF}
        MOVQ      XMM4,[R9]           // XMM4 <- bias
        PXOR      XMM0,XMM0           // XMM0 <- 0

@Loop:
        MOVD      XMM1,[RCX]          // XMM1 <- source pixel
        MOVD      XMM2,[RDX]          // XMM2 <- destination pixel
        PUNPCKLBW XMM1,XMM0           // widen both to words
        PUNPCKLBW XMM2,XMM0

        PSUBW     XMM1,XMM2           // Src - Dst
        PMULLW    XMM1,XMM3           // W * (Src - Dst)
        PSLLW     XMM2,8              // Dst * 256

        PADDW     XMM2,XMM4           // add bias
        PADDW     XMM1,XMM2
        PSRLW     XMM1,8
        PACKUSWB  XMM1,XMM0           // back to bytes
        MOVD      [RDX],XMM1

        ADD       RCX,4
        ADD       RDX,4

        DEC       R8D
        JNZ       @Loop

@Exit:
        RET

@Copy:
        SHL       R8D,2               // byte count for Move
        // NOTE(review): this CALL is issued without reserving shadow space or
        // re-aligning RSP here - confirm the compiler provides a suitable frame.
        CALL      Move
{$ENDIF}
end;
1263
// Merges foreground colour F with background colour B, taking the alpha of
// both into account (see the formula below), and returns the merged colour.
// Shortcuts: Fa = 0 returns B; Fa = $FF or Ba = 0 returns F.
// x86: EAX = F, EDX = B.  x64: ECX = F, EDX = B.  Result in EAX.
//
// Fix against the previous revision: the x64 branch used XMM7 as its zero
// register without saving it.  XMM6..XMM15 are callee-saved in the Microsoft
// x64 calling convention, which this branch targets (arguments in ECX/EDX),
// so the caller's XMM7 was silently destroyed.  The x64 branch now uses the
// volatile register XMM2 instead; the computation itself is unchanged.
function MergeReg_SSE2(F, B: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
  { This is an implementation of the merge formula, as described
    in a paper by Bruce Wallace in 1981. Merging is associative,
    that is, A over (B over C) = (A over B) over C. The formula is,

      Ra = Fa + Ba * (1 - Fa)
      Rc = (Fa * (Fc - Bc * Ba) + Bc * Ba) / Ra

    where

      Rc is the resultant color,
      Ra is the resultant alpha,
      Fc is the foreground color,
      Fa is the foreground alpha,
      Bc is the background color,
      Ba is the background alpha.

    Implementation:

      Ra := 1 - (1 - Fa) * (1 - Ba);
      Wa := Fa / Ra;
      Rc := Bc + Wa * (Fc - Bc);

      (1 - Fa) * (1 - Ba) = 1 - Fa - Ba + Fa * Ba = (1 - Ra)
  }

{$IFDEF TARGET_X86}
        TEST      EAX,$FF000000       // foreground completely transparent =>
        JZ        @Background         // result = background
        CMP       EAX,$FF000000       // foreground completely opaque =>
        JNC       @Exit               // result = foreground
        TEST      EDX,$FF000000       // background completely transparent =>
        JZ        @Exit               // result = foreground

        PXOR      XMM7,XMM7           // XMM7 <- 00
        MOVD      XMM0,EAX            // XMM0 <- Fa Fr Fg Fb
        SHR       EAX,24              // EAX  <- Fa
        ROR       EDX,24
        MOVZX     ECX,DL              // ECX  <- Ba
        PUNPCKLBW XMM0,XMM7           // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
        SUB       EAX,$FF             // EAX  <- (Fa - 1)
        XOR       ECX,$FF             // ECX  <- (1 - Ba)
        IMUL      ECX,EAX             // ECX  <- (Fa - 1) * (1 - Ba) = Ra - 1
        IMUL      ECX,$8081           // ECX  <- Xa 00 00 00
        ADD       ECX,$8081*$FF*$FF
        SHR       ECX,15              // ECX  <- Ra
        MOV       DL,CH               // EDX  <- Br Bg Bb Ra
        ROR       EDX,8               // EDX  <- Ra Br Bg Bb
        MOVD      XMM1,EDX            // XMM1 <- Ra Br Bg Bb
        PUNPCKLBW XMM1,XMM7           // XMM1 <- 00 Ra 00 Br 00 Bg 00 Bb
        SHL       EAX,20              // EAX  <- Fa 00 00
        PSUBW     XMM0,XMM1           // XMM0 <- ** Da ** Dr ** Dg ** Db
        ADD       EAX,$0FF01000
        PSLLW     XMM0,4
        XOR       EDX,EDX             // EDX  <- 00 (high half of the dividend)
        DIV       ECX                 // EAX  <- Fa / Ra = Wa
        MOVD      XMM4,EAX            // XMM4 <- Wa
        PSHUFLW   XMM4,XMM4,$C0       // XMM4 <- 00 00 ** Wa ** Wa ** Wa
        PMULHW    XMM0,XMM4           // XMM0 <- 00 00 ** Pr ** Pg ** Pb
        PADDW     XMM0,XMM1           // XMM0 <- 00 Ra 00 Rr 00 Rg 00 Rb
        PACKUSWB  XMM0,XMM7           // XMM0 <- Ra Rr Rg Rb
        MOVD      EAX,XMM0

        RET
@Background:
        MOV       EAX,EDX
@Exit:
{$ENDIF}

{$IFDEF TARGET_X64}
        TEST      ECX,$FF000000       // foreground completely transparent =>
        JZ        @Background         // result = background
        MOV       EAX,ECX             // EAX  <- F
        CMP       EAX,$FF000000       // foreground completely opaque =>
        JNC       @Exit               // result = foreground
        TEST      EDX,$FF000000       // background completely transparent =>
        JZ        @Exit               // result = foreground

        PXOR      XMM2,XMM2           // XMM2 <- 00 (volatile register, see header)
        MOVD      XMM0,EAX            // XMM0 <- Fa Fr Fg Fb
        SHR       EAX,24              // EAX  <- Fa
        ROR       EDX,24
        MOVZX     ECX,DL              // ECX  <- Ba
        PUNPCKLBW XMM0,XMM2           // XMM0 <- 00 Fa 00 Fr 00 Fg 00 Fb
        SUB       EAX,$FF             // EAX  <- (Fa - 1)
        XOR       ECX,$FF             // ECX  <- (1 - Ba)
        IMUL      ECX,EAX             // ECX  <- (Fa - 1) * (1 - Ba) = Ra - 1
        IMUL      ECX,$8081           // ECX  <- Xa 00 00 00
        ADD       ECX,$8081*$FF*$FF
        SHR       ECX,15              // ECX  <- Ra
        MOV       DL,CH               // EDX  <- Br Bg Bb Ra
        ROR       EDX,8               // EDX  <- Ra Br Bg Bb
        MOVD      XMM1,EDX            // XMM1 <- Ra Br Bg Bb
        PUNPCKLBW XMM1,XMM2           // XMM1 <- 00 Ra 00 Br 00 Bg 00 Bb
        SHL       EAX,20              // EAX  <- Fa 00 00
        PSUBW     XMM0,XMM1           // XMM0 <- ** Da ** Dr ** Dg ** Db
        ADD       EAX,$0FF01000
        PSLLW     XMM0,4
        XOR       EDX,EDX             // EDX  <- 00 (high half of the dividend)
        DIV       ECX                 // EAX  <- Fa / Ra = Wa
        MOVD      XMM4,EAX            // XMM4 <- Wa
        PSHUFLW   XMM4,XMM4,$C0       // XMM4 <- 00 00 ** Wa ** Wa ** Wa
        PMULHW    XMM0,XMM4           // XMM0 <- 00 00 ** Pr ** Pg ** Pb
        PADDW     XMM0,XMM1           // XMM0 <- 00 Ra 00 Rr 00 Rg 00 Rb
        PACKUSWB  XMM0,XMM2           // XMM0 <- Ra Rr Rg Rb
        MOVD      EAX,XMM0

        RET
@Background:
        MOV       EAX,EDX
@Exit:
{$ENDIF}
end;
1376
// Intentionally empty: this routine contains no instructions.  The routines
// in this unit only use XMM registers, so there is no MMX state to clear.
procedure EMMS_SSE2; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
end;
1380
1381
// Lightens (Amount >= 0) or darkens (Amount < 0) colour C by adding or
// subtracting |Amount| to/from the three low bytes with unsigned
// saturation; the highest byte is left unchanged.
// Amount is replicated into the three low bytes by multiplying with $010101.
// x86: EAX = C, EDX = Amount.  x64: ECX = C, EDX = Amount.  Result in EAX.
function LightenReg_SSE2(C: TColor32; Amount: Integer): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        MOVD      XMM0,EAX            // XMM0 <- C
        TEST      EDX,EDX
        JL        @Darken             // negative amount
        IMUL      EDX,$010101         // replicate Amount into the three low bytes
        MOVD      XMM1,EDX
        PADDUSB   XMM0,XMM1           // saturated add
        MOVD      EAX,XMM0
        RET
@Darken:
        NEG       EDX                 // EDX <- |Amount|
        IMUL      EDX,$010101
        MOVD      XMM1,EDX
        PSUBUSB   XMM0,XMM1           // saturated subtract
        MOVD      EAX,XMM0
{$ENDIF}

{$IFDEF TARGET_X64}
        MOVD      XMM0,ECX            // XMM0 <- C
        TEST      EDX,EDX
        JL        @Darken             // negative amount
        IMUL      EDX,$010101         // replicate Amount into the three low bytes
        MOVD      XMM1,EDX
        PADDUSB   XMM0,XMM1           // saturated add
        MOVD      EAX,XMM0
        RET
@Darken:
        NEG       EDX                 // EDX <- |Amount|
        IMUL      EDX,$010101
        MOVD      XMM1,EDX
        PSUBUSB   XMM0,XMM1           // saturated subtract
        MOVD      EAX,XMM0
{$ENDIF}
end;
1416
1417
1418{ SSE2 Color algebra}
1419
// Per-byte sum of C1 and C2 with unsigned saturation (each channel clamps
// at $FF).
// x86: EAX = C1, EDX = C2.  x64: ECX = C1, EDX = C2.  Result in EAX.
function ColorAdd_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        MOVD      XMM0,EAX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        PADDUSB   XMM0,XMM1           // saturated byte-wise add
        MOVD      EAX,XMM0
{$ENDIF}

{$IFDEF TARGET_X64}
        MOVD      XMM0,ECX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        PADDUSB   XMM0,XMM1           // saturated byte-wise add
        MOVD      EAX,XMM0
{$ENDIF}
end;
1436
// Per-byte difference C1 - C2 with unsigned saturation (each channel clamps
// at 0).
// x86: EAX = C1, EDX = C2.  x64: ECX = C1, EDX = C2.  Result in EAX.
function ColorSub_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        MOVD      XMM0,EAX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        PSUBUSB   XMM0,XMM1           // saturated byte-wise subtract
        MOVD      EAX,XMM0
{$ENDIF}

{$IFDEF TARGET_X64}
        MOVD      XMM0,ECX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        PSUBUSB   XMM0,XMM1           // saturated byte-wise subtract
        MOVD      EAX,XMM0
{$ENDIF}
end;
1453
// Per-channel product of C1 and C2, scaled back to a byte by keeping the
// high byte of each 16-bit product: Result := (C1 * C2) shr 8 per channel.
// x86: EAX = C1, EDX = C2.  x64: ECX = C1, EDX = C2.  Result in EAX.
function ColorModulate_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        PXOR      XMM2,XMM2           // XMM2 <- 0
        MOVD      XMM0,EAX            // XMM0 <- C1
        PUNPCKLBW XMM0,XMM2           // widen C1 to words
        MOVD      XMM1,EDX            // XMM1 <- C2
        PUNPCKLBW XMM1,XMM2           // widen C2 to words
        PMULLW    XMM0,XMM1           // C1 * C2 per channel
        PSRLW     XMM0,8              // keep the high byte
        PACKUSWB  XMM0,XMM2           // back to bytes
        MOVD      EAX,XMM0
{$ENDIF}

{$IFDEF TARGET_X64}
        PXOR      XMM2,XMM2           // XMM2 <- 0
        MOVD      XMM0,ECX            // XMM0 <- C1
        PUNPCKLBW XMM0,XMM2           // widen C1 to words
        MOVD      XMM1,EDX            // XMM1 <- C2
        PUNPCKLBW XMM1,XMM2           // widen C2 to words
        PMULLW    XMM0,XMM1           // C1 * C2 per channel
        PSRLW     XMM0,8              // keep the high byte
        PACKUSWB  XMM0,XMM2           // back to bytes
        MOVD      EAX,XMM0
{$ENDIF}
end;
1480
// Per-byte unsigned maximum of C1 and C2.
// x86: EAX = C1, EDX = C2.  x64: ECX = C1, EDX = C2.  Result in EAX.
function ColorMax_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        MOVD      XMM0,EAX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        PMAXUB    XMM0,XMM1           // byte-wise unsigned maximum
        MOVD      EAX,XMM0
{$ENDIF}

{$IFDEF TARGET_X64}
        MOVD      XMM0,ECX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        PMAXUB    XMM0,XMM1           // byte-wise unsigned maximum
        MOVD      EAX,XMM0
{$ENDIF}
end;
1497
// Per-byte unsigned minimum of C1 and C2.
// x86: EAX = C1, EDX = C2.  x64: ECX = C1, EDX = C2.  Result in EAX.
function ColorMin_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        MOVD      XMM0,EAX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        PMINUB    XMM0,XMM1           // byte-wise unsigned minimum
        MOVD      EAX,XMM0
{$ENDIF}

{$IFDEF TARGET_X64}
        MOVD      XMM0,ECX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        PMINUB    XMM0,XMM1           // byte-wise unsigned minimum
        MOVD      EAX,XMM0
{$ENDIF}
end;
1514
// Per-byte absolute difference |C1 - C2|.  Both saturated differences are
// computed; one of them is always zero for a given byte, so OR-ing them
// yields the absolute value.
// x86: EAX = C1, EDX = C2.  x64: ECX = C1, EDX = C2.  Result in EAX.
function ColorDifference_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        MOVD      XMM0,EAX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        MOVQ      XMM2,XMM0           // XMM2 <- copy of C1
        PSUBUSB   XMM0,XMM1           // XMM0 <- max(C1 - C2, 0)
        PSUBUSB   XMM1,XMM2           // XMM1 <- max(C2 - C1, 0)
        POR       XMM0,XMM1           // combine both halves
        MOVD      EAX,XMM0
{$ENDIF}

{$IFDEF TARGET_X64}
        MOVD      XMM0,ECX            // XMM0 <- C1
        MOVD      XMM1,EDX            // XMM1 <- C2
        MOVQ      XMM2,XMM0           // XMM2 <- copy of C1
        PSUBUSB   XMM0,XMM1           // XMM0 <- max(C1 - C2, 0)
        PSUBUSB   XMM1,XMM2           // XMM1 <- max(C2 - C1, 0)
        POR       XMM0,XMM1           // combine both halves
        MOVD      EAX,XMM0
{$ENDIF}
end;
1537
// Per-channel exclusion:
//   Result := C1 + C2 - (C1 * C2) shr 7
// with the subtraction saturating at 0 and the final pack saturating at $FF.
// x86: EAX = C1, EDX = C2.  x64: ECX = C1, EDX = C2.  Result in EAX.
function ColorExclusion_SSE2(C1, C2: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        PXOR      XMM2,XMM2           // XMM2 <- 0
        MOVD      XMM0,EAX            // XMM0 <- C1
        PUNPCKLBW XMM0,XMM2           // widen C1 to words
        MOVD      XMM1,EDX            // XMM1 <- C2
        PUNPCKLBW XMM1,XMM2           // widen C2 to words
        MOVQ      XMM3,XMM0           // XMM3 <- copy of C1
        PADDW     XMM0,XMM1           // C1 + C2
        PMULLW    XMM1,XMM3           // C1 * C2
        PSRLW     XMM1,7              // (C1 * C2) shr 7
        PSUBUSW   XMM0,XMM1           // saturated subtract
        PACKUSWB  XMM0,XMM2           // back to bytes
        MOVD      EAX,XMM0
{$ENDIF}

{$IFDEF TARGET_X64}
        PXOR      XMM2,XMM2           // XMM2 <- 0
        MOVD      XMM0,ECX            // XMM0 <- C1
        PUNPCKLBW XMM0,XMM2           // widen C1 to words
        MOVD      XMM1,EDX            // XMM1 <- C2
        PUNPCKLBW XMM1,XMM2           // widen C2 to words
        MOVQ      XMM3,XMM0           // XMM3 <- copy of C1
        PADDW     XMM0,XMM1           // C1 + C2
        PMULLW    XMM1,XMM3           // C1 * C2
        PSRLW     XMM1,7              // (C1 * C2) shr 7
        PSUBUSW   XMM0,XMM1           // saturated subtract
        PACKUSWB  XMM0,XMM2           // back to bytes
        MOVD      EAX,XMM0
{$ENDIF}
end;
1570
// Scales every channel of C by the weight W: the channels are multiplied by
// the 16-byte entry that W selects in the table behind alpha_ptr, and the
// high byte of each product is kept.
// x86: EAX = C, EDX = W.  x64: ECX = C, EDX = W.  Result in EAX.
//
// Fix against the previous revision: the x64 branch shifted the full RDX
// register.  W is a 32-bit argument, so the upper half of RDX is not
// guaranteed to be zero on entry and could end up in the table address.
// Shifting EDX zero-extends into RDX, which is what the sibling routines
// (e.g. "SHL R8D,4" in CombineReg_SSE2) already rely on.
function ColorScale_SSE2(C, W: TColor32): TColor32; {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
asm
{$IFDEF TARGET_X86}
        PXOR      XMM2,XMM2           // XMM2 <- 0
        SHL       EDX,4               // W * 16 = offset of its table entry
        MOVD      XMM0,EAX            // XMM0 <- C
        PUNPCKLBW XMM0,XMM2           // widen C to words
        ADD       EDX,alpha_ptr       // EDX <- address of the table entry
        PMULLW    XMM0,[EDX]          // multiply by the table entry
        PSRLW     XMM0,8              // keep the high byte
        PACKUSWB  XMM0,XMM2           // back to bytes
        MOVD      EAX,XMM0
{$ENDIF}

{$IFDEF TARGET_X64}
        PXOR      XMM2,XMM2           // XMM2 <- 0
        SHL       EDX,4               // W * 16; the 32-bit write clears the upper half of RDX
        MOVD      XMM0,ECX            // XMM0 <- C
        PUNPCKLBW XMM0,XMM2           // widen C to words
{$IFNDEF FPC}
        ADD       RDX,alpha_ptr       // RDX <- address of the table entry
{$ELSE}
        ADD       RDX,[RIP+alpha_ptr]
{$ENDIF}
        PMULLW    XMM0,[RDX]          // multiply by the table entry
        PSRLW     XMM0,8              // keep the high byte
        PACKUSWB  XMM0,XMM2           // back to bytes
        MOVD      EAX,XMM0
{$ENDIF}
end;
1601
1602end.
Note: See TracBrowser for help on using the repository browser.