source: trunk/Packages/Graphics32/GR32_Math.pas

Last change on this file was 2, checked in by chronos, 5 years ago
File size: 30.5 KB
Line 
1unit GR32_Math;
2
3(* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1 or LGPL 2.1 with linking exception
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Alternatively, the contents of this file may be used under the terms of the
17 * Free Pascal modified version of the GNU Lesser General Public License
18 * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
19 * of this license are applicable instead of those above.
20 * Please see the file LICENSE.txt for additional information concerning this
21 * license.
22 *
23 * The Original Code is Additional Math Routines for Graphics32
24 *
25 * The Initial Developer of the Original Code is
26 * Mattias Andersson <mattias@centaurix.com>
27 * (parts of this unit were moved here from GR32_System.pas and GR32.pas by Alex A. Denisov)
28 *
29 * Portions created by the Initial Developer are Copyright (C) 2005-2009
30 * the Initial Developer. All Rights Reserved.
31 *
32 * Contributor(s):
33 * Michael Hansen <dyster_tid@hotmail.com>
34 *
35 * ***** END LICENSE BLOCK ***** *)
36
37interface
38
39{$I GR32.inc}
40
41uses GR32;
42
43{ Fixed point math routines }
44function FixedFloor(A: TFixed): Integer;
45function FixedCeil(A: TFixed): Integer;
46function FixedMul(A, B: TFixed): TFixed;
47function FixedDiv(A, B: TFixed): TFixed;
48function OneOver(Value: TFixed): TFixed;
49function FixedRound(A: TFixed): Integer;
50function FixedSqr(Value: TFixed): TFixed;
51function FixedSqrtLP(Value: TFixed): TFixed; // 8-bit precision
52function FixedSqrtHP(Value: TFixed): TFixed; // 16-bit precision
53// Fixed point interpolation
54function FixedCombine(W, X, Y: TFixed): TFixed;
55
56
57{ Trigonometric routines }
58
59procedure SinCos(const Theta: TFloat; out Sin, Cos: TFloat); overload;
60procedure SinCos(const Theta, Radius: Single; out Sin, Cos: Single); overload;
61procedure SinCos(const Theta, ScaleX, ScaleY: TFloat; out Sin, Cos: Single); overload;
62function Hypot(const X, Y: TFloat): TFloat; overload;
63function Hypot(const X, Y: Integer): Integer; overload;
64function FastSqrt(const Value: TFloat): TFloat;
65function FastSqrtBab1(const Value: TFloat): TFloat;
66function FastSqrtBab2(const Value: TFloat): TFloat;
67function FastInvSqrt(const Value: Single): Single; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF} overload;
68
69
70{ Misc. Routines }
71
72{ MulDiv a faster implementation of Windows.MulDiv funtion }
73function MulDiv(Multiplicand, Multiplier, Divisor: Integer): Integer;
74
75// tells if X is a power of 2, returns true when X = 1,2,4,8,16 etc.
76function IsPowerOf2(Value: Integer): Boolean; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
77// returns X rounded down to the nearest power of two
78function PrevPowerOf2(Value: Integer): Integer;
79// returns X rounded down to the nearest power of two, i.e. 5 -> 8, 7 -> 8, 15 -> 16
80function NextPowerOf2(Value: Integer): Integer;
81
82// fast average without overflow, useful for e.g. fixed point math
83function Average(A, B: Integer): Integer;
84// fast sign function
85function Sign(Value: Integer): Integer;
86
87function FloatMod(x, y: Double): Double; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
88
89function DivMod(Dividend, Divisor: Integer; var Remainder: Integer): Integer;
90
91
92{$IFDEF FPC}
93{$IFDEF TARGET_X64}
94(*
95 FPC has no similar {$EXCESSPRECISION OFF} directive,
96 but we can easily emulate that by overriding some internal math functions
97*)
98function PI: Single; [internproc: fpc_in_pi_real];
99//function Abs(D: Single): Single; [internproc: fpc_in_abs_real];
100//function Sqr(D: Single): Single; [internproc: fpc_in_sqr_real];
101function Sqrt(D: Single): Single; [internproc: fpc_in_sqrt_real];
102function ArcTan(D: Single): Single; [internproc: fpc_in_arctan_real];
103function Ln(D: Single): Single; [internproc: fpc_in_ln_real];
104function Sin(D: Single): Single; [internproc: fpc_in_sin_real];
105function Cos(D: Single): Single; [internproc: fpc_in_cos_real];
106function Exp(D: Single): Single; [internproc: fpc_in_exp_real];
107function Round(D: Single): Int64; [internproc: fpc_in_round_real];
108function Frac(D: Single): Single; [internproc: fpc_in_frac_real];
109function Int(D: Single): Single; [internproc: fpc_in_int_real];
110function Trunc(D: Single): Int64; [internproc: fpc_in_trunc_real];
111
112function Ceil(X: Single): Integer; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
113function Floor(X: Single): Integer; {$IFDEF INLININGSUPPORTED} inline; {$ENDIF}
114{$ENDIF}
115{$ENDIF}
116
117type
118 TCumSumProc = procedure(Values: PSingleArray; Count: Integer);
119
120var
121 CumSum: TCumSumProc;
122
123implementation
124
125uses
126 Math, GR32_System;
127
128{$IFDEF PUREPASCAL}
129const
130 FixedOneS: Single = 65536;
131{$ENDIF}
132
133
134{$IFDEF FPC}
135{$IFDEF TARGET_X64}
136function Ceil(X: Single): Integer;
137begin
138 Result := Trunc(X);
139 if (X - Result) > 0 then
140 Inc(Result);
141end;
142
143function Floor(X: Single): Integer;
144begin
145 Result := Trunc(X);
146 if (X - Result) < 0 then
147 Dec(Result);
148end;
149{$ENDIF}
150{$ENDIF}
151
152
153{ Fixed-point math }
154
155function FixedFloor(A: TFixed): Integer;
156{$IFDEF PUREPASCAL}
157begin
158 Result := A div FIXEDONE;
159{$ELSE}
160{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
161asm
162{$IFDEF TARGET_x86}
163 SAR EAX, 16
164{$ENDIF}
165{$IFDEF TARGET_x64}
166 MOV EAX, ECX
167 SAR EAX, 16
168{$ENDIF}
169{$ENDIF}
170end;
171
172function FixedCeil(A: TFixed): Integer;
173{$IFDEF PUREPASCAL}
174begin
175 Result := (A + $FFFF) div FIXEDONE;
176{$ELSE}
177{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
178asm
179{$IFDEF TARGET_x86}
180 ADD EAX, $0000FFFF
181 SAR EAX, 16
182{$ENDIF}
183{$IFDEF TARGET_x64}
184 MOV EAX, ECX
185 ADD EAX, $0000FFFF
186 SAR EAX, 16
187{$ENDIF}
188{$ENDIF}
189end;
190
191function FixedRound(A: TFixed): Integer;
192{$IFDEF PUREPASCAL}
193begin
194 Result := (A + $7FFF) div FIXEDONE;
195{$ELSE}
196{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
197asm
198{$IFDEF TARGET_x86}
199 ADD EAX, $00007FFF
200 SAR EAX, 16
201{$ENDIF}
202{$IFDEF TARGET_x64}
203 MOV EAX, ECX
204 ADD EAX, $00007FFF
205 SAR EAX, 16
206{$ENDIF}
207{$ENDIF}
208end;
209
210function FixedMul(A, B: TFixed): TFixed;
211{$IFDEF PUREPASCAL}
212begin
213 Result := Round(A * FixedToFloat * B);
214{$ELSE}
215{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
216asm
217{$IFDEF TARGET_x86}
218 IMUL EDX
219 SHRD EAX, EDX, 16
220{$ENDIF}
221{$IFDEF TARGET_x64}
222 MOV EAX, ECX
223 IMUL EDX
224 SHRD EAX, EDX, 16
225{$ENDIF}
226{$ENDIF}
227end;
228
229function FixedDiv(A, B: TFixed): TFixed;
230{$IFDEF PUREPASCAL}
231begin
232 Result := Round(A / B * FixedOne);
233{$ELSE}
234{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
235asm
236{$IFDEF TARGET_x86}
237 MOV ECX, B
238 CDQ
239 SHLD EDX, EAX, 16
240 SHL EAX, 16
241 IDIV ECX
242{$ENDIF}
243{$IFDEF TARGET_x64}
244 MOV EAX, ECX
245 MOV ECX, EDX
246 CDQ
247 SHLD EDX, EAX, 16
248 SHL EAX, 16
249 IDIV ECX
250{$ENDIF}
251{$ENDIF}
252end;
253
254function OneOver(Value: TFixed): TFixed;
255{$IFDEF PUREPASCAL}
256const
257 Dividend: Single = 4294967296; // FixedOne * FixedOne
258begin
259 Result := Round(Dividend / Value);
260{$ELSE}
261{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
262asm
263{$IFDEF TARGET_x86}
264 MOV ECX, Value
265 XOR EAX, EAX
266 MOV EDX, 1
267 IDIV ECX
268{$ENDIF}
269{$IFDEF TARGET_x64}
270 XOR EAX, EAX
271 MOV EDX, 1
272 IDIV ECX
273{$ENDIF}
274{$ENDIF}
275end;
276
277function FixedSqr(Value: TFixed): TFixed;
278{$IFDEF PUREPASCAL}
279begin
280 Result := Round(Value * FixedToFloat * Value);
281{$ELSE}
282{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
283asm
284{$IFDEF TARGET_x86}
285 IMUL EAX
286 SHRD EAX, EDX, 16
287{$ENDIF}
288{$IFDEF TARGET_x64}
289 MOV EAX, Value
290 IMUL EAX
291 SHRD EAX, EDX, 16
292{$ENDIF}
293{$ENDIF}
294end;
295
296function FixedSqrtLP(Value: TFixed): TFixed;
297{$IFDEF PUREPASCAL}
298begin
299 Result := Round(Sqrt(Value * FixedOneS));
300{$ELSE}
301{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
302asm
303{$IFDEF TARGET_x86}
304 PUSH EBX
305 MOV ECX, EAX
306 XOR EAX, EAX
307 MOV EBX, $40000000
308@SqrtLP1:
309 MOV EDX, ECX
310 SUB EDX, EBX
311 JL @SqrtLP2
312 SUB EDX, EAX
313 JL @SqrtLP2
314 MOV ECX,EDX
315 SHR EAX, 1
316 OR EAX, EBX
317 SHR EBX, 2
318 JNZ @SqrtLP1
319 SHL EAX, 8
320 JMP @SqrtLP3
321@SqrtLP2:
322 SHR EAX, 1
323 SHR EBX, 2
324 JNZ @SqrtLP1
325 SHL EAX, 8
326@SqrtLP3:
327 POP EBX
328{$ENDIF}
329{$IFDEF TARGET_x64}
330 PUSH RBX
331 XOR EAX, EAX
332 MOV EBX, $40000000
333@SqrtLP1:
334 MOV EDX, ECX
335 SUB EDX, EBX
336 JL @SqrtLP2
337 SUB EDX, EAX
338 JL @SqrtLP2
339 MOV ECX,EDX
340 SHR EAX, 1
341 OR EAX, EBX
342 SHR EBX, 2
343 JNZ @SqrtLP1
344 SHL EAX, 8
345 JMP @SqrtLP3
346@SqrtLP2:
347 SHR EAX, 1
348 SHR EBX, 2
349 JNZ @SqrtLP1
350 SHL EAX, 8
351@SqrtLP3:
352 POP RBX
353{$ENDIF}
354{$ENDIF}
355end;
356
357function FixedSqrtHP(Value: TFixed): TFixed;
358{$IFDEF PUREPASCAL}
359begin
360 Result := Round(Sqrt(Value * FixedOneS));
361{$ELSE}
362{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
363asm
364{$IFDEF TARGET_x86}
365 PUSH EBX
366 MOV ECX, EAX
367 XOR EAX, EAX
368 MOV EBX, $40000000
369@SqrtHP1:
370 MOV EDX, ECX
371 SUB EDX, EBX
372 jb @SqrtHP2
373 SUB EDX, EAX
374 jb @SqrtHP2
375 MOV ECX,EDX
376 SHR EAX, 1
377 OR EAX, EBX
378 SHR EBX, 2
379 JNZ @SqrtHP1
380 JZ @SqrtHP5
381@SqrtHP2:
382 SHR EAX, 1
383 SHR EBX, 2
384 JNZ @SqrtHP1
385@SqrtHP5:
386 MOV EBX, $00004000
387 SHL EAX, 16
388 SHL ECX, 16
389@SqrtHP3:
390 MOV EDX, ECX
391 SUB EDX, EBX
392 jb @SqrtHP4
393 SUB EDX, EAX
394 jb @SqrtHP4
395 MOV ECX, EDX
396 SHR EAX, 1
397 OR EAX, EBX
398 SHR EBX, 2
399 JNZ @SqrtHP3
400 JMP @SqrtHP6
401@SqrtHP4:
402 SHR EAX, 1
403 SHR EBX, 2
404 JNZ @SqrtHP3
405@SqrtHP6:
406 POP EBX
407{$ENDIF}
408{$IFDEF TARGET_x64}
409 PUSH RBX
410 XOR EAX, EAX
411 MOV EBX, $40000000
412@SqrtHP1:
413 MOV EDX, ECX
414 SUB EDX, EBX
415 jb @SqrtHP2
416 SUB EDX, EAX
417 jb @SqrtHP2
418 MOV ECX,EDX
419 SHR EAX, 1
420 OR EAX, EBX
421 SHR EBX, 2
422 JNZ @SqrtHP1
423 JZ @SqrtHP5
424@SqrtHP2:
425 SHR EAX, 1
426 SHR EBX, 2
427 JNZ @SqrtHP1
428@SqrtHP5:
429 MOV EBX, $00004000
430 SHL EAX, 16
431 SHL ECX, 16
432@SqrtHP3:
433 MOV EDX, ECX
434 SUB EDX, EBX
435 jb @SqrtHP4
436 SUB EDX, EAX
437 jb @SqrtHP4
438 MOV ECX, EDX
439 SHR EAX, 1
440 OR EAX, EBX
441 SHR EBX, 2
442 JNZ @SqrtHP3
443 JMP @SqrtHP6
444@SqrtHP4:
445 SHR EAX, 1
446 SHR EBX, 2
447 JNZ @SqrtHP3
448@SqrtHP6:
449 POP RBX
450{$ENDIF}
451{$ENDIF}
452end;
453
454function FixedCombine(W, X, Y: TFixed): TFixed;
455// EAX <- W, EDX <- X, ECX <- Y
456// combine fixed value X and fixed value Y with the weight of X given in W
457// Result Z = W * X + (1 - W) * Y = Y + (X - Y) * W
458// Fixed Point Version: Result Z = Y + (X - Y) * W / 65536
459{$IFDEF PUREPASCAL}
460begin
461 Result := Round(Y + (X - Y) * FixedToFloat * W);
462{$ELSE}
463{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
464asm
465{$IFDEF TARGET_x86}
466 SUB EDX, ECX
467 IMUL EDX
468 SHRD EAX, EDX, 16
469 ADD EAX, ECX
470{$ENDIF}
471{$IFDEF TARGET_x64}
472 MOV EAX, ECX
473 SUB EDX, R8D
474 IMUL EDX
475 SHRD EAX, EDX, 16
476 ADD EAX, R8D
477{$ENDIF}
478{$ENDIF}
479end;
480
481{ Trigonometry }
482
483procedure SinCos(const Theta: TFloat; out Sin, Cos: TFloat);
484{$IFDEF NATIVE_SINCOS}
485var
486 S, C: Extended;
487begin
488 Math.SinCos(Theta, S, C);
489 Sin := S;
490 Cos := C;
491{$ELSE}
492{$IFDEF TARGET_x64}
493var
494 Temp: TFloat;
495{$ENDIF}
496asm
497{$IFDEF TARGET_x86}
498 FLD Theta
499 FSINCOS
500 FSTP DWORD PTR [EDX] // cosine
501 FSTP DWORD PTR [EAX] // sine
502{$ENDIF}
503{$IFDEF TARGET_x64}
504 MOVD Temp, Theta
505 FLD Temp
506 FSINCOS
507 FSTP [Sin] // cosine
508 FSTP [Cos] // sine
509{$ENDIF}
510{$ENDIF}
511end;
512
513procedure SinCos(const Theta, Radius: TFloat; out Sin, Cos: TFloat);
514{$IFDEF NATIVE_SINCOS}
515var
516 S, C: Extended;
517begin
518 Math.SinCos(Theta, S, C);
519 Sin := S * Radius;
520 Cos := C * Radius;
521{$ELSE}
522{$IFDEF TARGET_x64}
523var
524 Temp: TFloat;
525{$ENDIF}
526asm
527{$IFDEF TARGET_x86}
528 FLD Theta
529 FSINCOS
530 FMUL Radius
531 FSTP DWORD PTR [EDX] // cosine
532 FMUL Radius
533 FSTP DWORD PTR [EAX] // sine
534{$ENDIF}
535{$IFDEF TARGET_x64}
536 MOVD Temp, Theta
537 FLD Temp
538 MOVD Temp, Radius
539 FSINCOS
540 FMUL Temp
541 FSTP [Cos]
542 FMUL Temp
543 FSTP [Sin]
544{$ENDIF}
545{$ENDIF}
546end;
547
548procedure SinCos(const Theta, ScaleX, ScaleY: TFloat; out Sin, Cos: Single); overload;
549{$IFDEF NATIVE_SINCOS}
550var
551 S, C: Extended;
552begin
553 Math.SinCos(Theta, S, C);
554 Sin := S * ScaleX;
555 Cos := C * ScaleY;
556{$ELSE}
557{$IFDEF TARGET_x64}
558var
559 Temp: TFloat;
560{$ENDIF}
561asm
562{$IFDEF TARGET_x86}
563 FLD Theta
564 FSINCOS
565 FMUL ScaleX
566 FSTP DWORD PTR [EDX] // cosine
567 FMUL ScaleY
568 FSTP DWORD PTR [EAX] // sine
569{$ENDIF}
570{$IFDEF TARGET_x64}
571 MOVD Temp, Theta
572 FLD Temp
573 FSINCOS
574 MOVD Temp, ScaleX
575 FMUL Temp
576 FSTP [Cos]
577 MOVD Temp, ScaleY
578 FMUL Temp
579 FSTP [Sin]
580{$ENDIF}
581{$ENDIF}
582end;
583
584function Hypot(const X, Y: TFloat): TFloat;
585{$IFDEF PUREPASCAL}
586begin
587 Result := Sqrt(Sqr(X) + Sqr(Y));
588{$ELSE}
589{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
590asm
591{$IFDEF TARGET_x86}
592 FLD X
593 FMUL ST,ST
594 FLD Y
595 FMUL ST,ST
596 FADDP ST(1),ST
597 FSQRT
598 FWAIT
599{$ENDIF}
600{$IFDEF TARGET_x64}
601 MULSS XMM0, XMM0
602 MULSS XMM1, XMM1
603 ADDSS XMM0, XMM1
604 SQRTSS XMM0, XMM0
605{$ENDIF}
606{$ENDIF}
607end;
608
609function Hypot(const X, Y: Integer): Integer;
610//{$IFDEF PUREPASCAL}
611begin
612 Result := Round(Math.Hypot(X, Y));
613(*
614{$ELSE}
615asm
616{$IFDEF TARGET_x64}
617 IMUL RAX, RCX, RDX
618{$ELSE}
619 FLD X
620 FMUL ST,ST
621 FLD Y
622 FMUL ST,ST
623 FADDP ST(1),ST
624 FSQRT
625 FISTP [ESP - 4]
626 MOV EAX, [ESP - 4]
627 FWAIT
628{$ENDIF}
629{$ENDIF}
630*)
631end;
632
633function FastSqrt(const Value: TFloat): TFloat;
634// see http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Approximations_that_depend_on_IEEE_representation
635{$IFDEF PUREPASCAL}
636var
637 I: Integer absolute Value;
638 J: Integer absolute Result;
639begin
640 J := (I - $3F800000) div 2 + $3F800000;
641{$ELSE}
642{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
643asm
644{$IFDEF TARGET_x86}
645 MOV EAX, DWORD PTR Value
646 SUB EAX, $3F800000
647 SAR EAX, 1
648 ADD EAX, $3F800000
649 MOV DWORD PTR [ESP - 4], EAX
650 FLD DWORD PTR [ESP - 4]
651{$ENDIF}
652{$IFDEF TARGET_x64}
653 SQRTSS XMM0, XMM0
654{$ENDIF}
655{$ENDIF}
656end;
657
658function FastSqrtBab1(const Value: TFloat): TFloat;
659// see http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Approximations_that_depend_on_IEEE_representation
660// additionally one babylonian step added
661{$IFNDEF PUREPASCAL}
662{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
663{$ENDIF}
664const
665 CHalf : TFloat = 0.5;
666{$IFDEF PUREPASCAL}
667var
668 I: Integer absolute Value;
669 J: Integer absolute Result;
670begin
671 J := (I - $3F800000) div 2 + $3F800000;
672 Result := CHalf * (Result + Value / Result);
673{$ELSE}
674asm
675{$IFDEF TARGET_x86}
676 MOV EAX, Value
677 SUB EAX, $3F800000
678 SAR EAX, 1
679 ADD EAX, $3F800000
680 MOV DWORD PTR [ESP - 4], EAX
681 FLD Value
682 FDIV DWORD PTR [ESP - 4]
683 FADD DWORD PTR [ESP - 4]
684 FMUL CHalf
685{$ENDIF}
686{$IFDEF TARGET_x64}
687 SQRTSS XMM0, XMM0
688{$ENDIF}
689{$ENDIF}
690end;
691
692function FastSqrtBab2(const Value: TFloat): TFloat;
693// see http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Approximations_that_depend_on_IEEE_representation
694// additionally two babylonian steps added
695{$IFDEF PUREPASCAL}
696const
697 CQuarter : TFloat = 0.25;
698var
699 J: Integer absolute Result;
700begin
701 Result := Value;
702 J := ((J - (1 shl 23)) shr 1) + (1 shl 29);
703 Result := Result + Value / Result;
704 Result := CQuarter * Result + Value / Result;
705{$ELSE}
706{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
707const
708 CHalf : TFloat = 0.5;
709asm
710{$IFDEF TARGET_x86}
711 MOV EAX, Value
712 SUB EAX, $3F800000
713 SAR EAX, 1
714 ADD EAX, $3F800000
715 MOV DWORD PTR [ESP - 4], EAX
716 FLD Value
717 FDIV DWORD PTR [ESP - 4]
718 FADD DWORD PTR [ESP - 4]
719 FMUL CHalf
720{$ENDIF}
721{$IFDEF TARGET_x64}
722 MOVD EAX, Value
723 SUB EAX, $3F800000
724 SAR EAX, 1
725 ADD EAX, $3F800000
726 MOVD XMM1, EAX
727 DIVSS XMM0, XMM1
728 ADDSS XMM0, XMM1
729 MOVD XMM1, [RIP + CHalf]
730 MULSS XMM0, XMM1
731{$ENDIF}
732{$ENDIF}
733end;
734
735function FastInvSqrt(const Value: Single): Single;
736var
737 IntCst : Cardinal absolute result;
738begin
739 Result := Value;
740 IntCst := ($BE6EB50C - IntCst) shr 1;
741 Result := 0.5 * Result * (3 - Value * Sqr(Result));
742end;
743
744{ Misc. }
745
746function MulDiv(Multiplicand, Multiplier, Divisor: Integer): Integer;
747{$IFDEF PUREPASCAL}
748begin
749 Result := Int64(Multiplicand) * Int64(Multiplier) div Divisor;
750{$ELSE}
751{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
752asm
753{$IFDEF TARGET_x86}
754 PUSH EBX // Imperative save
755 PUSH ESI // of EBX and ESI
756
757 MOV EBX, EAX // Result will be negative or positive so set rounding direction
758 XOR EBX, EDX // Negative: substract 1 in case of rounding
759 XOR EBX, ECX // Positive: add 1
760
761 OR EAX, EAX // Make all operands positive, ready for unsigned operations
762 JNS @m1Ok // minimizing branching
763 NEG EAX
764@m1Ok:
765 OR EDX, EDX
766 JNS @m2Ok
767 NEG EDX
768@m2Ok:
769 OR ECX, ECX
770 JNS @DivOk
771 NEG ECX
772@DivOK:
773 MUL EDX // Unsigned multiply (Multiplicand*Multiplier)
774
775 MOV ESI, EDX // Check for overflow, by comparing
776 SHL ESI, 1 // 2 times the high-order 32 bits of the product (EDX)
777 CMP ESI, ECX // with the Divisor.
778 JAE @Overfl // If equal or greater than overflow with division anticipated
779
780 DIV ECX // Unsigned divide of product by Divisor
781
782 SUB ECX, EDX // Check if the result must be adjusted by adding or substracting
783 CMP ECX, EDX // 1 (*.5 -> nearest integer), by comparing the difference of
784 JA @NoAdd // Divisor and remainder with the remainder. If it is greater then
785 INC EAX // no rounding needed; add 1 to result otherwise
786@NoAdd:
787 OR EBX, EDX // From unsigned operations back the to original sign of the result
788 JNS @Exit // must be positive
789 NEG EAX // must be negative
790 JMP @Exit
791@Overfl:
792 OR EAX, -1 // 3 bytes alternative for MOV EAX,-1. Windows.MulDiv "overflow"
793 // and "zero-divide" return value
794@Exit:
795 POP ESI // Restore
796 POP EBX // esi and EBX
797{$ENDIF}
798{$IFDEF TARGET_x64}
799 MOV EAX, ECX // Result will be negative or positive so set rounding direction
800 XOR ECX, EDX // Negative: substract 1 in case of rounding
801 XOR ECX, R8D // Positive: add 1
802
803 OR EAX, EAX // Make all operands positive, ready for unsigned operations
804 JNS @m1Ok // minimizing branching
805 NEG EAX
806@m1Ok:
807 OR EDX, EDX
808 JNS @m2Ok
809 NEG EDX
810@m2Ok:
811 OR R8D, R8D
812 JNS @DivOk
813 NEG R8D
814@DivOK:
815 MUL EDX // Unsigned multiply (Multiplicand*Multiplier)
816
817 MOV R9D, EDX // Check for overflow, by comparing
818 SHL R9D, 1 // 2 times the high-order 32 bits of the product (EDX)
819 CMP R9D, R8D // with the Divisor.
820 JAE @Overfl // If equal or greater than overflow with division anticipated
821
822 DIV R8D // Unsigned divide of product by Divisor
823
824 SUB R8D, EDX // Check if the result must be adjusted by adding or substracting
825 CMP R8D, EDX // 1 (*.5 -> nearest integer), by comparing the difference of
826 JA @NoAdd // Divisor and remainder with the remainder. If it is greater then
827 INC EAX // no rounding needed; add 1 to result otherwise
828@NoAdd:
829 OR ECX, EDX // From unsigned operations back the to original sign of the result
830 JNS @Exit // must be positive
831 NEG EAX // must be negative
832 JMP @Exit
833@Overfl:
834 OR EAX, -1 // 3 bytes alternative for MOV EAX,-1. Windows.MulDiv "overflow"
835 // and "zero-divide" return value
836@Exit:
837{$ENDIF}
838{$ENDIF}
839end;
840
841function IsPowerOf2(Value: Integer): Boolean;
842//returns true when X = 1,2,4,8,16 etc.
843begin
844 Result := Value and (Value - 1) = 0;
845end;
846
847function PrevPowerOf2(Value: Integer): Integer;
848//returns X rounded down to the power of two
849{$IFDEF PUREPASCAL}
850begin
851 Result := 1;
852 while Value shr 1 > 0 do
853 Result := Result shl 1;
854{$ELSE}
855{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
856asm
857{$IFDEF TARGET_x86}
858 BSR ECX, EAX
859 SHR EAX, CL
860 SHL EAX, CL
861{$ENDIF}
862{$IFDEF TARGET_x64}
863 MOV EAX, Value
864 BSR ECX, EAX
865 SHR EAX, CL
866 SHL EAX, CL
867{$ENDIF}
868{$ENDIF}
869end;
870
871function NextPowerOf2(Value: Integer): Integer;
872//returns X rounded up to the power of two, i.e. 5 -> 8, 7 -> 8, 15 -> 16
873{$IFDEF PUREPASCAL}
874begin
875 Result := 2;
876 while Value shr 1 > 0 do
877 Result := Result shl 1;
878{$ELSE}
879{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
880asm
881{$IFDEF TARGET_x86}
882 DEC EAX
883 JLE @1
884 BSR ECX, EAX
885 MOV EAX, 2
886 SHL EAX, CL
887 RET
888@1:
889 MOV EAX, 1
890{$ENDIF}
891{$IFDEF TARGET_x64}
892 MOV EAX, Value
893 DEC EAX
894 JLE @1
895 BSR ECX, EAX
896 MOV EAX, 2
897 SHL EAX, CL
898 RET
899@1:
900 MOV EAX, 1
901{$ENDIF}
902{$ENDIF}
903end;
904
905function Average(A, B: Integer): Integer;
906//fast average without overflow, useful e.g. for fixed point math
907//(A + B)/2 = (A and B) + (A xor B)/2
908{$IFDEF PUREPASCAL}
909begin
910 Result := (A and B) + (A xor B) div 2;
911{$ELSE}
912{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
913asm
914{$IFDEF TARGET_x86}
915 MOV ECX, EDX
916 XOR EDX, EAX
917 SAR EDX, 1
918 AND EAX, ECX
919 ADD EAX, EDX
920{$ENDIF}
921{$IFDEF TARGET_x64}
922 MOV EAX, A
923 MOV ECX, EDX
924 XOR EDX, EAX
925 SAR EDX, 1
926 AND EAX, ECX
927 ADD EAX, EDX
928{$ENDIF}
929{$ENDIF}
930end;
931
932function Sign(Value: Integer): Integer;
933{$IFDEF PUREPASCAL}
934begin
935 //Assumes 32 bit integer
936 Result := (- Value) shr 31 - (Value shr 31);
937{$ELSE}
938{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
939asm
940{$IFDEF TARGET_x64}
941 MOV EAX, Value
942{$ENDIF}
943 CDQ
944 NEG EAX
945 ADC EDX, EDX
946 MOV EAX, EDX
947{$ENDIF}
948end;
949
950function FloatMod(x, y: Double): Double;
951begin
952 if (y = 0) then
953 Result := X
954 else
955 Result := x - y * Floor(x / y);
956end;
957
958function DivMod(Dividend, Divisor: Integer; var Remainder: Integer): Integer;
959{$IFDEF PUREPASCAL}
960begin
961 Result := Dividend div Divisor;
962 Remainder := Dividend mod Divisor;
963{$ELSE}
964{$IFDEF FPC} assembler; nostackframe; {$ENDIF}
965asm
966{$IFDEF TARGET_x86}
967 PUSH EDX
968 CDQ
969 IDIV DWORD PTR [ESP]
970 ADD ESP, $04
971 MOV DWORD PTR [ECX], edx
972{$ENDIF}
973{$IFDEF TARGET_x64}
974 MOV RAX, RCX
975 MOV R9, RDX
976 CDQ
977 IDIV R9
978 MOV DWORD PTR [R8], EDX
979{$ENDIF}
980{$ENDIF}
981end;
982
983procedure CumSum_Pas(Values: PSingleArray; Count: Integer);
984var
985 I: Integer;
986 V: TFloat;
987begin
988 V := Values[0];
989 for I := 1 to Count - 1 do
990 begin
991 if PInteger(@Values[I])^ <> 0 then
992 V := V + Values[I];
993 Values[I] := V;
994 end;
995end;
996
997{$IFNDEF PUREPASCAL}
998// Aligned SSE2 version -- Credits: Sanyin <prevodilac@hotmail.com>
999procedure CumSum_SSE2(Values: PSingleArray; Count: Integer); {$IFDEF FPC} assembler; nostackframe; {$ENDIF}
1000asm
1001{$IFDEF TARGET_x86}
1002 MOV ECX,EDX
1003 CMP ECX,2 // if count < 2, exit
1004 JL @END
1005 CMP ECX,32 // if count < 32, avoid SSE2 overhead
1006 JL @SMALL
1007
1008{--- align memory ---}
1009 PUSH EBX
1010 PXOR XMM4,XMM4
1011 MOV EBX,EAX
1012 AND EBX,15 // get aligned count
1013 JZ @ENDALIGNING // already aligned
1014 ADD EBX,-16
1015 NEG EBX // get bytes to advance
1016 JZ @ENDALIGNING // already aligned
1017
1018 MOV ECX,EBX
1019 SAR ECX,2 // div with 4 to get cnt
1020 SUB EDX,ECX
1021
1022 ADD EAX,4
1023 DEC ECX
1024 JZ @SETUPLAST // one element
1025
1026@ALIGNINGLOOP:
1027 FLD DWORD PTR [EAX-4]
1028 FADD DWORD PTR [EAX]
1029 FSTP DWORD PTR [EAX]
1030 ADD EAX,4
1031 DEC ECX
1032 JNZ @ALIGNINGLOOP
1033
1034@SETUPLAST:
1035 MOVUPS XMM4,[EAX-4]
1036 PSLLDQ XMM4,12
1037 PSRLDQ XMM4,12
1038
1039@ENDALIGNING:
1040 POP EBX
1041 PUSH EBX
1042 MOV ECX,EDX
1043 SAR ECX,2
1044@LOOP:
1045 MOVAPS XMM0,[EAX]
1046 PXOR XMM5,XMM5
1047 PCMPEQD XMM5,XMM0
1048 PMOVMSKB EBX,XMM5
1049 CMP EBX,$0000FFFF
1050 JNE @NORMAL
1051 PSHUFD XMM0,XMM4,0
1052 JMP @SKIP
1053
1054@NORMAL:
1055 ADDPS XMM0,XMM4
1056 PSHUFD XMM1,XMM0,$e4
1057 PSLLDQ XMM1,4
1058 PSHUFD XMM2,XMM1,$90
1059 PSHUFD XMM3,XMM1,$40
1060 ADDPS XMM2,XMM3
1061 ADDPS XMM1,XMM2
1062 ADDPS XMM0,XMM1
1063
1064 PSHUFLW XMM4,XMM0,$E4
1065 PSRLDQ XMM4,12
1066
1067@SKIP:
1068 PREFETCHNTA [eax+16*16*2]
1069 MOVAPS [EAX],XMM0
1070 ADD EAX,16
1071 SUB ECX,1
1072 JNZ @LOOP
1073 POP EBX
1074 MOV ECX,EDX
1075 SAR ECX,2
1076 SHL ECX,2
1077 SUB EDX,ECX
1078 MOV ECX,EDX
1079 JZ @END
1080
1081@LOOP2:
1082 FLD DWORD PTR [EAX-4]
1083 FADD DWORD PTR [EAX]
1084 FSTP DWORD PTR [EAX]
1085 ADD EAX,4
1086 DEC ECX
1087 JNZ @LOOP2
1088 JMP @END
1089
1090@SMALL:
1091 MOV ECX,EDX
1092 ADD EAX,4
1093 DEC ECX
1094@LOOP3:
1095 FLD DWORD PTR [EAX-4]
1096 FADD DWORD PTR [EAX]
1097 FSTP DWORD PTR [EAX]
1098 ADD EAX,4
1099 DEC ECX
1100 JNZ @LOOP3
1101{$ENDIF}
1102{$IFDEF TARGET_x64}
1103 CMP EDX,2 // if count < 2, exit
1104 JL @END
1105
1106 MOV RAX,RCX
1107 MOV ECX,EDX
1108
1109 CMP ECX,32 // if count < 32, avoid SSE2 overhead
1110 JL @SMALL
1111
1112{--- align memory ---}
1113 PXOR XMM4,XMM4
1114 MOV R8D,EAX
1115 AND R8D,15 // get aligned count
1116 JZ @ENDALIGNING // already aligned
1117 ADD R8D,-16
1118 NEG R8D // get bytes to advance
1119 JZ @ENDALIGNING // already aligned
1120
1121 MOV ECX,R8D
1122 SAR ECX,2 // div with 4 to get cnt
1123 SUB EDX,ECX
1124
1125 ADD RAX,4
1126 DEC ECX
1127 JZ @SETUPLAST // one element
1128
1129@ALIGNINGLOOP:
1130 FLD DWORD PTR [RAX - 4]
1131 FADD DWORD PTR [RAX]
1132 FSTP DWORD PTR [RAX]
1133 ADD RAX,4
1134 DEC ECX
1135 JNZ @ALIGNINGLOOP
1136
1137@SETUPLAST:
1138 MOVUPS XMM4,[RAX - 4]
1139 PSLLDQ XMM4,12
1140 PSRLDQ XMM4,12
1141
1142@ENDALIGNING:
1143 MOV ECX,EDX
1144 SAR ECX,2
1145@LOOP:
1146 MOVAPS XMM0,[RAX]
1147 PXOR XMM5,XMM5
1148 PCMPEQD XMM5,XMM0
1149 PMOVMSKB R8D,XMM5
1150 CMP R8D,$0000FFFF
1151 JNE @NORMAL
1152 PSHUFD XMM0,XMM4,0
1153 JMP @SKIP
1154
1155@NORMAL:
1156 ADDPS XMM0,XMM4
1157 PSHUFD XMM1,XMM0,$e4
1158 PSLLDQ XMM1,4
1159 PSHUFD XMM2,XMM1,$90
1160 PSHUFD XMM3,XMM1,$40
1161 ADDPS XMM2,XMM3
1162 ADDPS XMM1,XMM2
1163 ADDPS XMM0,XMM1
1164
1165 PSHUFLW XMM4,XMM0,$E4
1166 PSRLDQ XMM4,12
1167
1168@SKIP:
1169 PREFETCHNTA [RAX + 32 * 2]
1170 MOVAPS [RAX],XMM0
1171 ADD RAX,16
1172 SUB ECX,1
1173 JNZ @LOOP
1174 MOV ECX,EDX
1175 SAR ECX,2
1176 SHL ECX,2
1177 SUB EDX,ECX
1178 MOV ECX,EDX
1179 JZ @END
1180
1181@LOOP2:
1182 FLD DWORD PTR [RAX - 4]
1183 FADD DWORD PTR [RAX]
1184 FSTP DWORD PTR [RAX]
1185 ADD RAX,4
1186 DEC ECX
1187 JNZ @LOOP2
1188 JMP @END
1189
1190@SMALL:
1191 ADD RAX,4
1192 DEC ECX
1193@LOOP3:
1194 FLD DWORD PTR [RAX - 4]
1195 FADD DWORD PTR [RAX]
1196 FSTP DWORD PTR [RAX]
1197 ADD RAX,4
1198 DEC ECX
1199 JNZ @LOOP3
1200{$ENDIF}
1201@END:
1202end;
1203{$ENDIF}
1204
1205
1206initialization
1207{$IFNDEF PUREPASCAL}
1208 if HasInstructionSet(ciSSE2) then
1209 CumSum := CumSum_SSE2
1210 else
1211{$ENDIF}
1212 CumSum := CumSum_Pas;
1213
1214end.
Note: See TracBrowser for help on using the repository browser.