Mozzi  version v1.1.0
sound synthesis library for Arduino
cogl_sqrti.h
1 /**
2  * cogl_sqrti:
3  * @x: integer value
4  *
5  * Very fast fixed point implementation of square root for integers.
6  *
7  * This function is at least 6x faster than clib sqrt() on x86, and (this is
8  * not a typo!) about 500x faster on ARM without FPU. It's error is < 5%
9  * for arguments < %COGL_SQRTI_ARG_5_PERCENT and < 10% for arguments <
10  * %COGL_SQRTI_ARG_10_PERCENT. The maximum argument that can be passed to
11  * this function is COGL_SQRTI_ARG_MAX.
12  *
13  * Return value: integer square root.
14  *
15  *
16  * Since: 0.2
17  */
18 #include "mozzi_fixmath.h"
19 
20 /** NOTE: This header-file used to be entirely broken for many years (due to double definition of isqrt16 and isqrt32, now removed), with no apparent complaints.
21  * The algos in here look sort of cool, but I don't think they should be part of official API to maintain. Hence the deliberate error below. */
22 #error This file is just a scratchpad for integer sqrt algorithms. It is not currently meant to an official part of the Mozzi API.
23 
24 /*-- isqrt -----------------------------------------------------------------*/
25 
26 unsigned long isqrt(unsigned long n) {
27  unsigned long s, t;
28 
29 #define sqrtBit(k)
30  t = s+(1UL<<(k-1)); t <<= k+1; if (n >= t) { n -= t; s |= 1UL<<k; }
31 
32  s = 0UL;
33 #ifdef __alpha
34  if (n >= 1UL<<62) { n -= 1UL<<62; s = 1UL<<31; }
35  sqrtBit(30); sqrtBit(29); sqrtBit(28); sqrtBit(27); sqrtBit(26);
36  sqrtBit(25); sqrtBit(24); sqrtBit(23); sqrtBit(22); sqrtBit(21);
37  sqrtBit(20); sqrtBit(19); sqrtBit(18); sqrtBit(17); sqrtBit(16);
38  sqrtBit(15);
39 #else
40  if (n >= 1UL<<30) { n -= 1UL<<30; s = 1UL<<15; }
41 #endif
42  sqrtBit(14); sqrtBit(13); sqrtBit(12); sqrtBit(11); sqrtBit(10);
43  sqrtBit(9); sqrtBit(8); sqrtBit(7); sqrtBit(6); sqrtBit(5);
44  sqrtBit(4); sqrtBit(3); sqrtBit(2); sqrtBit(1);
45  if (n > s<<1) s |= 1UL;
46 
47 #undef sqrtBit
48 
49  return s;
50 } /* end isqrt */
51 
52 
53 /**
54 http://stackoverflow.com/questions/1100090/looking-for-an-efficient-integer-square-root-algorithm-for-arm-thumb2
55 It's essentially from the Wikipedia article on square-root computing methods.
56 But it has been changed to use stdint.h types uint32_t etc. Strictly speaking
57 the return type could be changed to uint16_t.
58 
59 * \brief Fast Square root algorithm
60  *
61  * Fractional parts of the answer are discarded. That is:
62  * - SquareRoot(3) --> 1
63  * - SquareRoot(4) --> 2
64  * - SquareRoot(5) --> 2
65  * - SquareRoot(8) --> 2
66  * - SquareRoot(9) --> 3
67  *
68  * \param[in] a_nInput - unsigned integer for which to find the square root
69  *
70  * \return Integer square root of the input value.
71  */
72 uint16_t SquareRoot(uint32_t a_nInput)
73 {
74  uint32_t op = a_nInput;
75  uint32_t res = 0;
76  uint32_t one = 1uL << 30; // The second-to-top bit is set: use 1u << 14 for uint16_t type; use 1uL<<30 for uint32_t type
77 
78 
79  // "one" starts at the highest power of four <= than the argument.
80  while (one > op)
81  {
82  one >>= 2;
83  }
84 
85  while (one != 0)
86  {
87  if (op >= res + one)
88  {
89  op = op - (res + one);
90  res = res + 2 * one;
91  }
92  res >>= 1;
93  one >>= 2;
94  }
95  return res;
96 }
97 
98 
99 
100 
101 int cogl_sqrti (int number)
102 {
103 
104  /* This is a fixed point implementation of the Quake III sqrt algorithm,
105  * described, for example, at
106  * http://www.codemaestro.com/reviews/review00000105.html
107  *
108  * While the original QIII is extremely fast, the use of floating division
109  * and multiplication makes it perform very on arm processors without FPU.
110  *
111  * The key to successfully replacing the floating point operations with
112  * fixed point is in the choice of the fixed point format. The QIII
113  * algorithm does not calculate the square root, but its reciprocal ('y'
114  * below), which is only at the end turned to the inverse value. In order
115  * for the algorithm to produce satisfactory results, the reciprocal value
116  * must be represented with sufficient precission; the 16.16 we use
117  * elsewhere in clutter is not good enough, and 10.22 is used instead.
118  */
119  //CoglFixed x;
120  long x;
121  uint32_t y_1; /* 10.22 fixed point */
122  uint32_t f = 0x600000; /* '1.5' as 10.22 fixed */
123 
124  union
125  {
126  float f;
127  uint32_t i;
128  } flt, flt2;
129 
130  flt.f = number;
131 
132  x = Q15n0_to_Q15n16(number) / 2;
133 
134  /* The QIII initial estimate */
135  flt.i = 0x5f3759df - ( flt.i >> 1 );
136 
137  /* Now, we convert the float to 10.22 fixed. We exploit the mechanism
138  * described at http://www.d6.com/users/checker/pdfs/gdmfp.pdf.
139  *
140  * We want 22 bit fraction; a single precission float uses 23 bit
141  * mantisa, so we only need to add 2^(23-22) (no need for the 1.5
142  * multiplier as we are only dealing with positive numbers).
143  *
144  * Note: we have to use two separate variables here -- for some reason,
145  * if we try to use just the flt variable, gcc on ARM optimises the whole
146  * addition out, and it all goes pear shape, since without it, the bits
147  * in the float will not be correctly aligned.
148  */
149  flt2.f = flt.f + 2.0;
150  flt2.i &= 0x7FFFFF;
151 
152  /* Now we correct the estimate */
153  y_1 = (flt2.i >> 11) * (flt2.i >> 11);
154  y_1 = (y_1 >> 8) * (x >> 8);
155 
156  y_1 = f - y_1;
157  flt2.i = (flt2.i >> 11) * (y_1 >> 11);
158 
159  /* If the original argument is less than 342, we do another
160  * iteration to improve precission (for arguments >= 342, the single
161  * iteration produces generally better results).
162  */
163  if (x < 171)
164  {
165  y_1 = (flt2.i >> 11) * (flt2.i >> 11);
166  y_1 = (y_1 >> 8) * (x >> 8);
167 
168  y_1 = f - y_1;
169  flt2.i = (flt2.i >> 11) * (y_1 >> 11);
170  }
171 
172  /* Invert, round and convert from 10.22 to an integer
173  * 0x1e3c68 is a magical rounding constant that produces slightly
174  * better results than 0x200000.
175  */
176  return (number * flt2.i + 0x1e3c68) >> 22;
177 
178 }
Q15n16 Q15n0_to_Q15n16(Q15n0 a)
Convert Q15n0 int16_t to Q15n16 fix.
#define sqrtBit(k)