Open3D (C++ API)  0.18.0+252c867
SVD3x3.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2023 www.open3d.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 /**************************************************************************
8 **
9 ** svd3
10 **
11 ** Quick singular value decomposition as described by:
12 ** A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis,
13 ** Computing the Singular Value Decomposition of 3x3 matrices
14 ** with minimal branching and elementary floating point operations,
15 ** University of Wisconsin - Madison technical report TR1690, May 2011
16 **
17 ** Implemented by: Kui Wu
18 ** kwu@cs.utah.edu
19 ** Modified by: Wei Dong and Rishabh Singh
20 **
21 ** May 2018
22 **
23 **************************************************************************/
24 
25 #pragma once
26 
27 #include <cmath>
28 
29 #include "open3d/core/CUDAUtils.h"
31 
32 #if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
33 #include <cuda.h>
34 #endif
35 
36 #include "math.h"
38 
39 #define gone 1065353216
40 #define gsine_pi_over_eight 1053028117
41 
42 #define gcosine_pi_over_eight 1064076127
43 #define gtiny_number 1.e-20
44 #define gfour_gamma_squared 5.8284273147583007813
45 
46 #ifndef __CUDACC__
47 using std::abs;
48 using std::max;
49 using std::sqrt;
50 #endif
51 
52 #if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
53 #define __fadd_rn(x, y) __fadd_rn(x, y)
54 #define __fsub_rn(x, y) __fsub_rn(x, y)
55 #define __frsqrt_rn(x) __frsqrt_rn(x)
56 
57 #define __dadd_rn(x, y) __dadd_rn(x, y)
58 #define __dsub_rn(x, y) __dsub_rn(x, y)
59 #define __drsqrt_rn(x) __drcp_rn(__dsqrt_rn(x))
60 #else
61 
62 #define __fadd_rn(x, y) (x + y)
63 #define __fsub_rn(x, y) (x - y)
64 #define __frsqrt_rn(x) (1.0 / sqrt(x))
65 
66 #define __dadd_rn(x, y) (x + y)
67 #define __dsub_rn(x, y) (x - y)
68 #define __drsqrt_rn(x) (1.0 / sqrt(x))
69 
70 #define __add_rn(x, y) (x + y)
71 #define __sub_rn(x, y) (x - y)
72 #define __rsqrt_rn(x) (1.0 / sqrt(x))
73 #endif
74 
75 namespace open3d {
76 namespace core {
77 namespace linalg {
78 namespace kernel {
79 
// Type-punning helper used throughout the SVD kernels: the same storage can
// be viewed either as a floating-point value (.f) or as raw bits (.ui), so
// sign selection and conditional moves can be done with branchless bitwise
// masks (AND/OR/XOR with 0xffffffff / 0).
// NOTE(review): .ui is a 32-bit unsigned int, so for scalar_t == double it
// aliases only part of the 8-byte value; the mask tricks in this file appear
// to rely on that partial aliasing — confirm against the upstream svd3
// implementation before changing the member types.
template <typename scalar_t>
union un {
    scalar_t f;       // floating-point view
    unsigned int ui;  // raw-bit view (low 32 bits when scalar_t is double)
};
85 
/// \brief Singular value decomposition of a 3x3 matrix, A = U * diag(S) * V^T,
/// computed with the minimal-branching Jacobi/Givens scheme of McAdams et al.
/// (UW-Madison technical report TR1690, May 2011). Implemented only via the
/// explicit specializations below; there is no generic definition.
///
/// \param A_3x3 Input 3x3 matrix, row-major (9 contiguous elements).
/// \param U_3x3 Output 3x3 matrix of left singular vectors, row-major.
/// \param S_3x1 Output vector of the 3 singular values.
/// \param V_3x3 Output 3x3 matrix of right singular vectors, row-major.
///        (The implementation permutes/negates columns so V stays a rotation;
///        singular values are sorted — presumably by decreasing magnitude,
///        verify against the specialization's output section.)
template <typename scalar_t>
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3(const scalar_t *A_3x3,
                                              scalar_t *U_3x3,
                                              scalar_t *S_3x1,
                                              scalar_t *V_3x3);
91 
92 template <>
94  double *U_3x3,
95  double *S_3x1,
96  double *V_3x3) {
97  double gsmall_number = 1.e-12;
98 
99  un<double> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
100  un<double> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
101  un<double> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
102  un<double> Sc, Ss, Sch, Ssh;
103  un<double> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
104  un<double> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
105  un<double> Sqvs, Sqvvx, Sqvvy, Sqvvz;
106 
107  Sa11.f = A_3x3[0];
108  Sa12.f = A_3x3[1];
109  Sa13.f = A_3x3[2];
110  Sa21.f = A_3x3[3];
111  Sa22.f = A_3x3[4];
112  Sa23.f = A_3x3[5];
113  Sa31.f = A_3x3[6];
114  Sa32.f = A_3x3[7];
115  Sa33.f = A_3x3[8];
116 
117  //###########################################################
118  // Compute normal equations matrix
119  //###########################################################
120 
121  Ss11.f = Sa11.f * Sa11.f;
122  Stmp1.f = Sa21.f * Sa21.f;
123  Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);
124  Stmp1.f = Sa31.f * Sa31.f;
125  Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);
126 
127  Ss21.f = Sa12.f * Sa11.f;
128  Stmp1.f = Sa22.f * Sa21.f;
129  Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);
130  Stmp1.f = Sa32.f * Sa31.f;
131  Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);
132 
133  Ss31.f = Sa13.f * Sa11.f;
134  Stmp1.f = Sa23.f * Sa21.f;
135  Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);
136  Stmp1.f = Sa33.f * Sa31.f;
137  Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);
138 
139  Ss22.f = Sa12.f * Sa12.f;
140  Stmp1.f = Sa22.f * Sa22.f;
141  Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);
142  Stmp1.f = Sa32.f * Sa32.f;
143  Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);
144 
145  Ss32.f = Sa13.f * Sa12.f;
146  Stmp1.f = Sa23.f * Sa22.f;
147  Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);
148  Stmp1.f = Sa33.f * Sa32.f;
149  Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);
150 
151  Ss33.f = Sa13.f * Sa13.f;
152  Stmp1.f = Sa23.f * Sa23.f;
153  Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);
154  Stmp1.f = Sa33.f * Sa33.f;
155  Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);
156 
157  Sqvs.f = 1.f;
158  Sqvvx.f = 0.f;
159  Sqvvy.f = 0.f;
160  Sqvvz.f = 0.f;
161 
162  //###########################################################
163  // Solve symmetric eigenproblem using Jacobi iteration
164  //###########################################################
165  for (int i = 0; i < 4; i++) {
166  Ssh.f = Ss21.f * 0.5f;
167  Stmp5.f = __dsub_rn(Ss11.f, Ss22.f);
168 
169  Stmp2.f = Ssh.f * Ssh.f;
170  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
171  Ssh.ui = Stmp1.ui & Ssh.ui;
172  Sch.ui = Stmp1.ui & Stmp5.ui;
173  Stmp2.ui = ~Stmp1.ui & gone;
174  Sch.ui = Sch.ui | Stmp2.ui;
175 
176  Stmp1.f = Ssh.f * Ssh.f;
177  Stmp2.f = Sch.f * Sch.f;
178  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
179  Stmp4.f = __drsqrt_rn(Stmp3.f);
180 
181  Ssh.f = Stmp4.f * Ssh.f;
182  Sch.f = Stmp4.f * Sch.f;
183  Stmp1.f = gfour_gamma_squared * Stmp1.f;
184  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
185 
186  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
187  Ssh.ui = ~Stmp1.ui & Ssh.ui;
188  Ssh.ui = Ssh.ui | Stmp2.ui;
189  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
190  Sch.ui = ~Stmp1.ui & Sch.ui;
191  Sch.ui = Sch.ui | Stmp2.ui;
192 
193  Stmp1.f = Ssh.f * Ssh.f;
194  Stmp2.f = Sch.f * Sch.f;
195  Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
196  Ss.f = Sch.f * Ssh.f;
197  Ss.f = __dadd_rn(Ss.f, Ss.f);
198 
199 #ifdef DEBUG_JACOBI_CONJUGATE
200  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
201  Sch.f);
202 #endif
203  //###########################################################
204  // Perform the actual Givens conjugation
205  //###########################################################
206 
207  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
208  Ss33.f = Ss33.f * Stmp3.f;
209  Ss31.f = Ss31.f * Stmp3.f;
210  Ss32.f = Ss32.f * Stmp3.f;
211  Ss33.f = Ss33.f * Stmp3.f;
212 
213  Stmp1.f = Ss.f * Ss31.f;
214  Stmp2.f = Ss.f * Ss32.f;
215  Ss31.f = Sc.f * Ss31.f;
216  Ss32.f = Sc.f * Ss32.f;
217  Ss31.f = __dadd_rn(Stmp2.f, Ss31.f);
218  Ss32.f = __dsub_rn(Ss32.f, Stmp1.f);
219 
220  Stmp2.f = Ss.f * Ss.f;
221  Stmp1.f = Ss22.f * Stmp2.f;
222  Stmp3.f = Ss11.f * Stmp2.f;
223  Stmp4.f = Sc.f * Sc.f;
224  Ss11.f = Ss11.f * Stmp4.f;
225  Ss22.f = Ss22.f * Stmp4.f;
226  Ss11.f = __dadd_rn(Ss11.f, Stmp1.f);
227  Ss22.f = __dadd_rn(Ss22.f, Stmp3.f);
228  Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
229  Stmp2.f = __dadd_rn(Ss21.f, Ss21.f);
230  Ss21.f = Ss21.f * Stmp4.f;
231  Stmp4.f = Sc.f * Ss.f;
232  Stmp2.f = Stmp2.f * Stmp4.f;
233  Stmp5.f = Stmp5.f * Stmp4.f;
234  Ss11.f = __dadd_rn(Ss11.f, Stmp2.f);
235  Ss21.f = __dsub_rn(Ss21.f, Stmp5.f);
236  Ss22.f = __dsub_rn(Ss22.f, Stmp2.f);
237 
238 #ifdef DEBUG_JACOBI_CONJUGATE
239  printf("%.20g\n", Ss11.f);
240  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
241  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
242 #endif
243 
244  //###########################################################
245  // Compute the cumulative rotation, in quaternion form
246  //###########################################################
247 
248  Stmp1.f = Ssh.f * Sqvvx.f;
249  Stmp2.f = Ssh.f * Sqvvy.f;
250  Stmp3.f = Ssh.f * Sqvvz.f;
251  Ssh.f = Ssh.f * Sqvs.f;
252 
253  Sqvs.f = Sch.f * Sqvs.f;
254  Sqvvx.f = Sch.f * Sqvvx.f;
255  Sqvvy.f = Sch.f * Sqvvy.f;
256  Sqvvz.f = Sch.f * Sqvvz.f;
257 
258  Sqvvz.f = __dadd_rn(Sqvvz.f, Ssh.f);
259  Sqvs.f = __dsub_rn(Sqvs.f, Stmp3.f);
260  Sqvvx.f = __dadd_rn(Sqvvx.f, Stmp2.f);
261  Sqvvy.f = __dsub_rn(Sqvvy.f, Stmp1.f);
262 
263 #ifdef DEBUG_JACOBI_CONJUGATE
264  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
265  Sqvs.f);
266 #endif
267 
269  // (1->3)
271  Ssh.f = Ss32.f * 0.5f;
272  Stmp5.f = __dsub_rn(Ss22.f, Ss33.f);
273 
274  Stmp2.f = Ssh.f * Ssh.f;
275  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
276  Ssh.ui = Stmp1.ui & Ssh.ui;
277  Sch.ui = Stmp1.ui & Stmp5.ui;
278  Stmp2.ui = ~Stmp1.ui & gone;
279  Sch.ui = Sch.ui | Stmp2.ui;
280 
281  Stmp1.f = Ssh.f * Ssh.f;
282  Stmp2.f = Sch.f * Sch.f;
283  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
284  Stmp4.f = __drsqrt_rn(Stmp3.f);
285 
286  Ssh.f = Stmp4.f * Ssh.f;
287  Sch.f = Stmp4.f * Sch.f;
288  Stmp1.f = gfour_gamma_squared * Stmp1.f;
289  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
290 
291  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
292  Ssh.ui = ~Stmp1.ui & Ssh.ui;
293  Ssh.ui = Ssh.ui | Stmp2.ui;
294  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
295  Sch.ui = ~Stmp1.ui & Sch.ui;
296  Sch.ui = Sch.ui | Stmp2.ui;
297 
298  Stmp1.f = Ssh.f * Ssh.f;
299  Stmp2.f = Sch.f * Sch.f;
300  Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
301  Ss.f = Sch.f * Ssh.f;
302  Ss.f = __dadd_rn(Ss.f, Ss.f);
303 
304 #ifdef DEBUG_JACOBI_CONJUGATE
305  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
306  Sch.f);
307 #endif
308 
309  //###########################################################
310  // Perform the actual Givens conjugation
311  //###########################################################
312 
313  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
314  Ss11.f = Ss11.f * Stmp3.f;
315  Ss21.f = Ss21.f * Stmp3.f;
316  Ss31.f = Ss31.f * Stmp3.f;
317  Ss11.f = Ss11.f * Stmp3.f;
318 
319  Stmp1.f = Ss.f * Ss21.f;
320  Stmp2.f = Ss.f * Ss31.f;
321  Ss21.f = Sc.f * Ss21.f;
322  Ss31.f = Sc.f * Ss31.f;
323  Ss21.f = __dadd_rn(Stmp2.f, Ss21.f);
324  Ss31.f = __dsub_rn(Ss31.f, Stmp1.f);
325 
326  Stmp2.f = Ss.f * Ss.f;
327  Stmp1.f = Ss33.f * Stmp2.f;
328  Stmp3.f = Ss22.f * Stmp2.f;
329  Stmp4.f = Sc.f * Sc.f;
330  Ss22.f = Ss22.f * Stmp4.f;
331  Ss33.f = Ss33.f * Stmp4.f;
332  Ss22.f = __dadd_rn(Ss22.f, Stmp1.f);
333  Ss33.f = __dadd_rn(Ss33.f, Stmp3.f);
334  Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
335  Stmp2.f = __dadd_rn(Ss32.f, Ss32.f);
336  Ss32.f = Ss32.f * Stmp4.f;
337  Stmp4.f = Sc.f * Ss.f;
338  Stmp2.f = Stmp2.f * Stmp4.f;
339  Stmp5.f = Stmp5.f * Stmp4.f;
340  Ss22.f = __dadd_rn(Ss22.f, Stmp2.f);
341  Ss32.f = __dsub_rn(Ss32.f, Stmp5.f);
342  Ss33.f = __dsub_rn(Ss33.f, Stmp2.f);
343 
344 #ifdef DEBUG_JACOBI_CONJUGATE
345  printf("%.20g\n", Ss11.f);
346  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
347  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
348 #endif
349 
350  //###########################################################
351  // Compute the cumulative rotation, in quaternion form
352  //###########################################################
353 
354  Stmp1.f = Ssh.f * Sqvvx.f;
355  Stmp2.f = Ssh.f * Sqvvy.f;
356  Stmp3.f = Ssh.f * Sqvvz.f;
357  Ssh.f = Ssh.f * Sqvs.f;
358 
359  Sqvs.f = Sch.f * Sqvs.f;
360  Sqvvx.f = Sch.f * Sqvvx.f;
361  Sqvvy.f = Sch.f * Sqvvy.f;
362  Sqvvz.f = Sch.f * Sqvvz.f;
363 
364  Sqvvx.f = __dadd_rn(Sqvvx.f, Ssh.f);
365  Sqvs.f = __dsub_rn(Sqvs.f, Stmp1.f);
366  Sqvvy.f = __dadd_rn(Sqvvy.f, Stmp3.f);
367  Sqvvz.f = __dsub_rn(Sqvvz.f, Stmp2.f);
368 
369 #ifdef DEBUG_JACOBI_CONJUGATE
370  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
371  Sqvs.f);
372 #endif
373 #if 1
375  // 1 -> 2
377 
378  Ssh.f = Ss31.f * 0.5f;
379  Stmp5.f = __dsub_rn(Ss33.f, Ss11.f);
380 
381  Stmp2.f = Ssh.f * Ssh.f;
382  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
383  Ssh.ui = Stmp1.ui & Ssh.ui;
384  Sch.ui = Stmp1.ui & Stmp5.ui;
385  Stmp2.ui = ~Stmp1.ui & gone;
386  Sch.ui = Sch.ui | Stmp2.ui;
387 
388  Stmp1.f = Ssh.f * Ssh.f;
389  Stmp2.f = Sch.f * Sch.f;
390  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
391  Stmp4.f = __drsqrt_rn(Stmp3.f);
392 
393  Ssh.f = Stmp4.f * Ssh.f;
394  Sch.f = Stmp4.f * Sch.f;
395  Stmp1.f = gfour_gamma_squared * Stmp1.f;
396  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
397 
398  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
399  Ssh.ui = ~Stmp1.ui & Ssh.ui;
400  Ssh.ui = Ssh.ui | Stmp2.ui;
401  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
402  Sch.ui = ~Stmp1.ui & Sch.ui;
403  Sch.ui = Sch.ui | Stmp2.ui;
404 
405  Stmp1.f = Ssh.f * Ssh.f;
406  Stmp2.f = Sch.f * Sch.f;
407  Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
408  Ss.f = Sch.f * Ssh.f;
409  Ss.f = __dadd_rn(Ss.f, Ss.f);
410 
411 #ifdef DEBUG_JACOBI_CONJUGATE
412  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
413  Sch.f);
414 #endif
415 
416  //###########################################################
417  // Perform the actual Givens conjugation
418  //###########################################################
419 
420  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
421  Ss22.f = Ss22.f * Stmp3.f;
422  Ss32.f = Ss32.f * Stmp3.f;
423  Ss21.f = Ss21.f * Stmp3.f;
424  Ss22.f = Ss22.f * Stmp3.f;
425 
426  Stmp1.f = Ss.f * Ss32.f;
427  Stmp2.f = Ss.f * Ss21.f;
428  Ss32.f = Sc.f * Ss32.f;
429  Ss21.f = Sc.f * Ss21.f;
430  Ss32.f = __dadd_rn(Stmp2.f, Ss32.f);
431  Ss21.f = __dsub_rn(Ss21.f, Stmp1.f);
432 
433  Stmp2.f = Ss.f * Ss.f;
434  Stmp1.f = Ss11.f * Stmp2.f;
435  Stmp3.f = Ss33.f * Stmp2.f;
436  Stmp4.f = Sc.f * Sc.f;
437  Ss33.f = Ss33.f * Stmp4.f;
438  Ss11.f = Ss11.f * Stmp4.f;
439  Ss33.f = __dadd_rn(Ss33.f, Stmp1.f);
440  Ss11.f = __dadd_rn(Ss11.f, Stmp3.f);
441  Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
442  Stmp2.f = __dadd_rn(Ss31.f, Ss31.f);
443  Ss31.f = Ss31.f * Stmp4.f;
444  Stmp4.f = Sc.f * Ss.f;
445  Stmp2.f = Stmp2.f * Stmp4.f;
446  Stmp5.f = Stmp5.f * Stmp4.f;
447  Ss33.f = __dadd_rn(Ss33.f, Stmp2.f);
448  Ss31.f = __dsub_rn(Ss31.f, Stmp5.f);
449  Ss11.f = __dsub_rn(Ss11.f, Stmp2.f);
450 
451 #ifdef DEBUG_JACOBI_CONJUGATE
452  printf("%.20g\n", Ss11.f);
453  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
454  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
455 #endif
456 
457  //###########################################################
458  // Compute the cumulative rotation, in quaternion form
459  //###########################################################
460 
461  Stmp1.f = Ssh.f * Sqvvx.f;
462  Stmp2.f = Ssh.f * Sqvvy.f;
463  Stmp3.f = Ssh.f * Sqvvz.f;
464  Ssh.f = Ssh.f * Sqvs.f;
465 
466  Sqvs.f = Sch.f * Sqvs.f;
467  Sqvvx.f = Sch.f * Sqvvx.f;
468  Sqvvy.f = Sch.f * Sqvvy.f;
469  Sqvvz.f = Sch.f * Sqvvz.f;
470 
471  Sqvvy.f = __dadd_rn(Sqvvy.f, Ssh.f);
472  Sqvs.f = __dsub_rn(Sqvs.f, Stmp2.f);
473  Sqvvz.f = __dadd_rn(Sqvvz.f, Stmp1.f);
474  Sqvvx.f = __dsub_rn(Sqvvx.f, Stmp3.f);
475 #endif
476  }
477 
478  //###########################################################
479  // Normalize quaternion for matrix V
480  //###########################################################
481 
482  Stmp2.f = Sqvs.f * Sqvs.f;
483  Stmp1.f = Sqvvx.f * Sqvvx.f;
484  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
485  Stmp1.f = Sqvvy.f * Sqvvy.f;
486  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
487  Stmp1.f = Sqvvz.f * Sqvvz.f;
488  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
489 
490  Stmp1.f = __drsqrt_rn(Stmp2.f);
491  Stmp4.f = Stmp1.f * 0.5f;
492  Stmp3.f = Stmp1.f * Stmp4.f;
493  Stmp3.f = Stmp1.f * Stmp3.f;
494  Stmp3.f = Stmp2.f * Stmp3.f;
495  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
496  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
497 
498  Sqvs.f = Sqvs.f * Stmp1.f;
499  Sqvvx.f = Sqvvx.f * Stmp1.f;
500  Sqvvy.f = Sqvvy.f * Stmp1.f;
501  Sqvvz.f = Sqvvz.f * Stmp1.f;
502 
503  //###########################################################
504  // Transform quaternion to matrix V
505  //###########################################################
506 
507  Stmp1.f = Sqvvx.f * Sqvvx.f;
508  Stmp2.f = Sqvvy.f * Sqvvy.f;
509  Stmp3.f = Sqvvz.f * Sqvvz.f;
510  Sv11.f = Sqvs.f * Sqvs.f;
511  Sv22.f = __dsub_rn(Sv11.f, Stmp1.f);
512  Sv33.f = __dsub_rn(Sv22.f, Stmp2.f);
513  Sv33.f = __dadd_rn(Sv33.f, Stmp3.f);
514  Sv22.f = __dadd_rn(Sv22.f, Stmp2.f);
515  Sv22.f = __dsub_rn(Sv22.f, Stmp3.f);
516  Sv11.f = __dadd_rn(Sv11.f, Stmp1.f);
517  Sv11.f = __dsub_rn(Sv11.f, Stmp2.f);
518  Sv11.f = __dsub_rn(Sv11.f, Stmp3.f);
519  Stmp1.f = __dadd_rn(Sqvvx.f, Sqvvx.f);
520  Stmp2.f = __dadd_rn(Sqvvy.f, Sqvvy.f);
521  Stmp3.f = __dadd_rn(Sqvvz.f, Sqvvz.f);
522  Sv32.f = Sqvs.f * Stmp1.f;
523  Sv13.f = Sqvs.f * Stmp2.f;
524  Sv21.f = Sqvs.f * Stmp3.f;
525  Stmp1.f = Sqvvy.f * Stmp1.f;
526  Stmp2.f = Sqvvz.f * Stmp2.f;
527  Stmp3.f = Sqvvx.f * Stmp3.f;
528  Sv12.f = __dsub_rn(Stmp1.f, Sv21.f);
529  Sv23.f = __dsub_rn(Stmp2.f, Sv32.f);
530  Sv31.f = __dsub_rn(Stmp3.f, Sv13.f);
531  Sv21.f = __dadd_rn(Stmp1.f, Sv21.f);
532  Sv32.f = __dadd_rn(Stmp2.f, Sv32.f);
533  Sv13.f = __dadd_rn(Stmp3.f, Sv13.f);
534 
536  // Multiply (from the right) with V
537  //###########################################################
538 
539  Stmp2.f = Sa12.f;
540  Stmp3.f = Sa13.f;
541  Sa12.f = Sv12.f * Sa11.f;
542  Sa13.f = Sv13.f * Sa11.f;
543  Sa11.f = Sv11.f * Sa11.f;
544  Stmp1.f = Sv21.f * Stmp2.f;
545  Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
546  Stmp1.f = Sv31.f * Stmp3.f;
547  Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
548  Stmp1.f = Sv22.f * Stmp2.f;
549  Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
550  Stmp1.f = Sv32.f * Stmp3.f;
551  Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
552  Stmp1.f = Sv23.f * Stmp2.f;
553  Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);
554  Stmp1.f = Sv33.f * Stmp3.f;
555  Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);
556 
557  Stmp2.f = Sa22.f;
558  Stmp3.f = Sa23.f;
559  Sa22.f = Sv12.f * Sa21.f;
560  Sa23.f = Sv13.f * Sa21.f;
561  Sa21.f = Sv11.f * Sa21.f;
562  Stmp1.f = Sv21.f * Stmp2.f;
563  Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
564  Stmp1.f = Sv31.f * Stmp3.f;
565  Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
566  Stmp1.f = Sv22.f * Stmp2.f;
567  Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
568  Stmp1.f = Sv32.f * Stmp3.f;
569  Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
570  Stmp1.f = Sv23.f * Stmp2.f;
571  Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);
572  Stmp1.f = Sv33.f * Stmp3.f;
573  Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);
574 
575  Stmp2.f = Sa32.f;
576  Stmp3.f = Sa33.f;
577  Sa32.f = Sv12.f * Sa31.f;
578  Sa33.f = Sv13.f * Sa31.f;
579  Sa31.f = Sv11.f * Sa31.f;
580  Stmp1.f = Sv21.f * Stmp2.f;
581  Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
582  Stmp1.f = Sv31.f * Stmp3.f;
583  Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
584  Stmp1.f = Sv22.f * Stmp2.f;
585  Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
586  Stmp1.f = Sv32.f * Stmp3.f;
587  Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
588  Stmp1.f = Sv23.f * Stmp2.f;
589  Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);
590  Stmp1.f = Sv33.f * Stmp3.f;
591  Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);
592 
593  //###########################################################
594  // Permute columns such that the singular values are sorted
595  //###########################################################
596 
597  Stmp1.f = Sa11.f * Sa11.f;
598  Stmp4.f = Sa21.f * Sa21.f;
599  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
600  Stmp4.f = Sa31.f * Sa31.f;
601  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
602 
603  Stmp2.f = Sa12.f * Sa12.f;
604  Stmp4.f = Sa22.f * Sa22.f;
605  Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);
606  Stmp4.f = Sa32.f * Sa32.f;
607  Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);
608 
609  Stmp3.f = Sa13.f * Sa13.f;
610  Stmp4.f = Sa23.f * Sa23.f;
611  Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);
612  Stmp4.f = Sa33.f * Sa33.f;
613  Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);
614 
615  // Swap columns 1-2 if necessary
616 
617  Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
618  Stmp5.ui = Sa11.ui ^ Sa12.ui;
619  Stmp5.ui = Stmp5.ui & Stmp4.ui;
620  Sa11.ui = Sa11.ui ^ Stmp5.ui;
621  Sa12.ui = Sa12.ui ^ Stmp5.ui;
622 
623  Stmp5.ui = Sa21.ui ^ Sa22.ui;
624  Stmp5.ui = Stmp5.ui & Stmp4.ui;
625  Sa21.ui = Sa21.ui ^ Stmp5.ui;
626  Sa22.ui = Sa22.ui ^ Stmp5.ui;
627 
628  Stmp5.ui = Sa31.ui ^ Sa32.ui;
629  Stmp5.ui = Stmp5.ui & Stmp4.ui;
630  Sa31.ui = Sa31.ui ^ Stmp5.ui;
631  Sa32.ui = Sa32.ui ^ Stmp5.ui;
632 
633  Stmp5.ui = Sv11.ui ^ Sv12.ui;
634  Stmp5.ui = Stmp5.ui & Stmp4.ui;
635  Sv11.ui = Sv11.ui ^ Stmp5.ui;
636  Sv12.ui = Sv12.ui ^ Stmp5.ui;
637 
638  Stmp5.ui = Sv21.ui ^ Sv22.ui;
639  Stmp5.ui = Stmp5.ui & Stmp4.ui;
640  Sv21.ui = Sv21.ui ^ Stmp5.ui;
641  Sv22.ui = Sv22.ui ^ Stmp5.ui;
642 
643  Stmp5.ui = Sv31.ui ^ Sv32.ui;
644  Stmp5.ui = Stmp5.ui & Stmp4.ui;
645  Sv31.ui = Sv31.ui ^ Stmp5.ui;
646  Sv32.ui = Sv32.ui ^ Stmp5.ui;
647 
648  Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
649  Stmp5.ui = Stmp5.ui & Stmp4.ui;
650  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
651  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
652 
653  // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
654  // is still a rotation
655 
656  Stmp5.f = -2.f;
657  Stmp5.ui = Stmp5.ui & Stmp4.ui;
658  Stmp4.f = 1.f;
659  Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
660 
661  Sa12.f = Sa12.f * Stmp4.f;
662  Sa22.f = Sa22.f * Stmp4.f;
663  Sa32.f = Sa32.f * Stmp4.f;
664 
665  Sv12.f = Sv12.f * Stmp4.f;
666  Sv22.f = Sv22.f * Stmp4.f;
667  Sv32.f = Sv32.f * Stmp4.f;
668 
669  // Swap columns 1-3 if necessary
670 
671  Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
672  Stmp5.ui = Sa11.ui ^ Sa13.ui;
673  Stmp5.ui = Stmp5.ui & Stmp4.ui;
674  Sa11.ui = Sa11.ui ^ Stmp5.ui;
675  Sa13.ui = Sa13.ui ^ Stmp5.ui;
676 
677  Stmp5.ui = Sa21.ui ^ Sa23.ui;
678  Stmp5.ui = Stmp5.ui & Stmp4.ui;
679  Sa21.ui = Sa21.ui ^ Stmp5.ui;
680  Sa23.ui = Sa23.ui ^ Stmp5.ui;
681 
682  Stmp5.ui = Sa31.ui ^ Sa33.ui;
683  Stmp5.ui = Stmp5.ui & Stmp4.ui;
684  Sa31.ui = Sa31.ui ^ Stmp5.ui;
685  Sa33.ui = Sa33.ui ^ Stmp5.ui;
686 
687  Stmp5.ui = Sv11.ui ^ Sv13.ui;
688  Stmp5.ui = Stmp5.ui & Stmp4.ui;
689  Sv11.ui = Sv11.ui ^ Stmp5.ui;
690  Sv13.ui = Sv13.ui ^ Stmp5.ui;
691 
692  Stmp5.ui = Sv21.ui ^ Sv23.ui;
693  Stmp5.ui = Stmp5.ui & Stmp4.ui;
694  Sv21.ui = Sv21.ui ^ Stmp5.ui;
695  Sv23.ui = Sv23.ui ^ Stmp5.ui;
696 
697  Stmp5.ui = Sv31.ui ^ Sv33.ui;
698  Stmp5.ui = Stmp5.ui & Stmp4.ui;
699  Sv31.ui = Sv31.ui ^ Stmp5.ui;
700  Sv33.ui = Sv33.ui ^ Stmp5.ui;
701 
702  Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
703  Stmp5.ui = Stmp5.ui & Stmp4.ui;
704  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
705  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
706 
707  // If columns 1-3 have been swapped, negate 1st column of A and V so that V
708  // is still a rotation
709 
710  Stmp5.f = -2.f;
711  Stmp5.ui = Stmp5.ui & Stmp4.ui;
712  Stmp4.f = 1.f;
713  Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
714 
715  Sa11.f = Sa11.f * Stmp4.f;
716  Sa21.f = Sa21.f * Stmp4.f;
717  Sa31.f = Sa31.f * Stmp4.f;
718 
719  Sv11.f = Sv11.f * Stmp4.f;
720  Sv21.f = Sv21.f * Stmp4.f;
721  Sv31.f = Sv31.f * Stmp4.f;
722 
723  // Swap columns 2-3 if necessary
724 
725  Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
726  Stmp5.ui = Sa12.ui ^ Sa13.ui;
727  Stmp5.ui = Stmp5.ui & Stmp4.ui;
728  Sa12.ui = Sa12.ui ^ Stmp5.ui;
729  Sa13.ui = Sa13.ui ^ Stmp5.ui;
730 
731  Stmp5.ui = Sa22.ui ^ Sa23.ui;
732  Stmp5.ui = Stmp5.ui & Stmp4.ui;
733  Sa22.ui = Sa22.ui ^ Stmp5.ui;
734  Sa23.ui = Sa23.ui ^ Stmp5.ui;
735 
736  Stmp5.ui = Sa32.ui ^ Sa33.ui;
737  Stmp5.ui = Stmp5.ui & Stmp4.ui;
738  Sa32.ui = Sa32.ui ^ Stmp5.ui;
739  Sa33.ui = Sa33.ui ^ Stmp5.ui;
740 
741  Stmp5.ui = Sv12.ui ^ Sv13.ui;
742  Stmp5.ui = Stmp5.ui & Stmp4.ui;
743  Sv12.ui = Sv12.ui ^ Stmp5.ui;
744  Sv13.ui = Sv13.ui ^ Stmp5.ui;
745 
746  Stmp5.ui = Sv22.ui ^ Sv23.ui;
747  Stmp5.ui = Stmp5.ui & Stmp4.ui;
748  Sv22.ui = Sv22.ui ^ Stmp5.ui;
749  Sv23.ui = Sv23.ui ^ Stmp5.ui;
750 
751  Stmp5.ui = Sv32.ui ^ Sv33.ui;
752  Stmp5.ui = Stmp5.ui & Stmp4.ui;
753  Sv32.ui = Sv32.ui ^ Stmp5.ui;
754  Sv33.ui = Sv33.ui ^ Stmp5.ui;
755 
756  Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
757  Stmp5.ui = Stmp5.ui & Stmp4.ui;
758  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
759  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
760 
761  // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
762  // is still a rotation
763 
764  Stmp5.f = -2.f;
765  Stmp5.ui = Stmp5.ui & Stmp4.ui;
766  Stmp4.f = 1.f;
767  Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
768 
769  Sa13.f = Sa13.f * Stmp4.f;
770  Sa23.f = Sa23.f * Stmp4.f;
771  Sa33.f = Sa33.f * Stmp4.f;
772 
773  Sv13.f = Sv13.f * Stmp4.f;
774  Sv23.f = Sv23.f * Stmp4.f;
775  Sv33.f = Sv33.f * Stmp4.f;
776 
777  //###########################################################
778  // Construct QR factorization of A*V (=U*D) using Givens rotations
779  //###########################################################
780 
781  Su11.f = 1.f;
782  Su12.f = 0.f;
783  Su13.f = 0.f;
784  Su21.f = 0.f;
785  Su22.f = 1.f;
786  Su23.f = 0.f;
787  Su31.f = 0.f;
788  Su32.f = 0.f;
789  Su33.f = 1.f;
790 
791  Ssh.f = Sa21.f * Sa21.f;
792  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
793  Ssh.ui = Ssh.ui & Sa21.ui;
794 
795  Stmp5.f = 0.f;
796  Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
797  Sch.f = max(Sch.f, Sa11.f);
798  Sch.f = max(Sch.f, gsmall_number);
799  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
800 
801  Stmp1.f = Sch.f * Sch.f;
802  Stmp2.f = Ssh.f * Ssh.f;
803  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
804  Stmp1.f = __drsqrt_rn(Stmp2.f);
805 
806  Stmp4.f = Stmp1.f * 0.5f;
807  Stmp3.f = Stmp1.f * Stmp4.f;
808  Stmp3.f = Stmp1.f * Stmp3.f;
809  Stmp3.f = Stmp2.f * Stmp3.f;
810  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
811  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
812  Stmp1.f = Stmp1.f * Stmp2.f;
813 
814  Sch.f = __dadd_rn(Sch.f, Stmp1.f);
815 
816  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
817  Stmp2.ui = ~Stmp5.ui & Sch.ui;
818  Sch.ui = Stmp5.ui & Sch.ui;
819  Ssh.ui = Stmp5.ui & Ssh.ui;
820  Sch.ui = Sch.ui | Stmp1.ui;
821  Ssh.ui = Ssh.ui | Stmp2.ui;
822 
823  Stmp1.f = Sch.f * Sch.f;
824  Stmp2.f = Ssh.f * Ssh.f;
825  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
826  Stmp1.f = __drsqrt_rn(Stmp2.f);
827 
828  Stmp4.f = Stmp1.f * 0.5f;
829  Stmp3.f = Stmp1.f * Stmp4.f;
830  Stmp3.f = Stmp1.f * Stmp3.f;
831  Stmp3.f = Stmp2.f * Stmp3.f;
832  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
833  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
834 
835  Sch.f = Sch.f * Stmp1.f;
836  Ssh.f = Ssh.f * Stmp1.f;
837 
838  Sc.f = Sch.f * Sch.f;
839  Ss.f = Ssh.f * Ssh.f;
840  Sc.f = __dsub_rn(Sc.f, Ss.f);
841  Ss.f = Ssh.f * Sch.f;
842  Ss.f = __dadd_rn(Ss.f, Ss.f);
843 
844  //###########################################################
845  // Rotate matrix A
846  //###########################################################
847 
848  Stmp1.f = Ss.f * Sa11.f;
849  Stmp2.f = Ss.f * Sa21.f;
850  Sa11.f = Sc.f * Sa11.f;
851  Sa21.f = Sc.f * Sa21.f;
852  Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
853  Sa21.f = __dsub_rn(Sa21.f, Stmp1.f);
854 
855  Stmp1.f = Ss.f * Sa12.f;
856  Stmp2.f = Ss.f * Sa22.f;
857  Sa12.f = Sc.f * Sa12.f;
858  Sa22.f = Sc.f * Sa22.f;
859  Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
860  Sa22.f = __dsub_rn(Sa22.f, Stmp1.f);
861 
862  Stmp1.f = Ss.f * Sa13.f;
863  Stmp2.f = Ss.f * Sa23.f;
864  Sa13.f = Sc.f * Sa13.f;
865  Sa23.f = Sc.f * Sa23.f;
866  Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
867  Sa23.f = __dsub_rn(Sa23.f, Stmp1.f);
868 
869  //###########################################################
870  // Update matrix U
871  //###########################################################
872 
873  Stmp1.f = Ss.f * Su11.f;
874  Stmp2.f = Ss.f * Su12.f;
875  Su11.f = Sc.f * Su11.f;
876  Su12.f = Sc.f * Su12.f;
877  Su11.f = __dadd_rn(Su11.f, Stmp2.f);
878  Su12.f = __dsub_rn(Su12.f, Stmp1.f);
879 
880  Stmp1.f = Ss.f * Su21.f;
881  Stmp2.f = Ss.f * Su22.f;
882  Su21.f = Sc.f * Su21.f;
883  Su22.f = Sc.f * Su22.f;
884  Su21.f = __dadd_rn(Su21.f, Stmp2.f);
885  Su22.f = __dsub_rn(Su22.f, Stmp1.f);
886 
887  Stmp1.f = Ss.f * Su31.f;
888  Stmp2.f = Ss.f * Su32.f;
889  Su31.f = Sc.f * Su31.f;
890  Su32.f = Sc.f * Su32.f;
891  Su31.f = __dadd_rn(Su31.f, Stmp2.f);
892  Su32.f = __dsub_rn(Su32.f, Stmp1.f);
893 
894  // Second Givens rotation
895 
896  Ssh.f = Sa31.f * Sa31.f;
897  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
898  Ssh.ui = Ssh.ui & Sa31.ui;
899 
900  Stmp5.f = 0.f;
901  Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
902  Sch.f = max(Sch.f, Sa11.f);
903  Sch.f = max(Sch.f, gsmall_number);
904  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
905 
906  Stmp1.f = Sch.f * Sch.f;
907  Stmp2.f = Ssh.f * Ssh.f;
908  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
909  Stmp1.f = __drsqrt_rn(Stmp2.f);
910 
911  Stmp4.f = Stmp1.f * 0.5;
912  Stmp3.f = Stmp1.f * Stmp4.f;
913  Stmp3.f = Stmp1.f * Stmp3.f;
914  Stmp3.f = Stmp2.f * Stmp3.f;
915  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
916  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
917  Stmp1.f = Stmp1.f * Stmp2.f;
918 
919  Sch.f = __dadd_rn(Sch.f, Stmp1.f);
920 
921  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
922  Stmp2.ui = ~Stmp5.ui & Sch.ui;
923  Sch.ui = Stmp5.ui & Sch.ui;
924  Ssh.ui = Stmp5.ui & Ssh.ui;
925  Sch.ui = Sch.ui | Stmp1.ui;
926  Ssh.ui = Ssh.ui | Stmp2.ui;
927 
928  Stmp1.f = Sch.f * Sch.f;
929  Stmp2.f = Ssh.f * Ssh.f;
930  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
931  Stmp1.f = __drsqrt_rn(Stmp2.f);
932 
933  Stmp4.f = Stmp1.f * 0.5f;
934  Stmp3.f = Stmp1.f * Stmp4.f;
935  Stmp3.f = Stmp1.f * Stmp3.f;
936  Stmp3.f = Stmp2.f * Stmp3.f;
937  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
938  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
939 
940  Sch.f = Sch.f * Stmp1.f;
941  Ssh.f = Ssh.f * Stmp1.f;
942 
943  Sc.f = Sch.f * Sch.f;
944  Ss.f = Ssh.f * Ssh.f;
945  Sc.f = __dsub_rn(Sc.f, Ss.f);
946  Ss.f = Ssh.f * Sch.f;
947  Ss.f = __dadd_rn(Ss.f, Ss.f);
948 
949  //###########################################################
950  // Rotate matrix A
951  //###########################################################
952 
953  Stmp1.f = Ss.f * Sa11.f;
954  Stmp2.f = Ss.f * Sa31.f;
955  Sa11.f = Sc.f * Sa11.f;
956  Sa31.f = Sc.f * Sa31.f;
957  Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
958  Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);
959 
960  Stmp1.f = Ss.f * Sa12.f;
961  Stmp2.f = Ss.f * Sa32.f;
962  Sa12.f = Sc.f * Sa12.f;
963  Sa32.f = Sc.f * Sa32.f;
964  Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
965  Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);
966 
967  Stmp1.f = Ss.f * Sa13.f;
968  Stmp2.f = Ss.f * Sa33.f;
969  Sa13.f = Sc.f * Sa13.f;
970  Sa33.f = Sc.f * Sa33.f;
971  Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
972  Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);
973 
974  //###########################################################
975  // Update matrix U
976  //###########################################################
977 
978  Stmp1.f = Ss.f * Su11.f;
979  Stmp2.f = Ss.f * Su13.f;
980  Su11.f = Sc.f * Su11.f;
981  Su13.f = Sc.f * Su13.f;
982  Su11.f = __dadd_rn(Su11.f, Stmp2.f);
983  Su13.f = __dsub_rn(Su13.f, Stmp1.f);
984 
985  Stmp1.f = Ss.f * Su21.f;
986  Stmp2.f = Ss.f * Su23.f;
987  Su21.f = Sc.f * Su21.f;
988  Su23.f = Sc.f * Su23.f;
989  Su21.f = __dadd_rn(Su21.f, Stmp2.f);
990  Su23.f = __dsub_rn(Su23.f, Stmp1.f);
991 
992  Stmp1.f = Ss.f * Su31.f;
993  Stmp2.f = Ss.f * Su33.f;
994  Su31.f = Sc.f * Su31.f;
995  Su33.f = Sc.f * Su33.f;
996  Su31.f = __dadd_rn(Su31.f, Stmp2.f);
997  Su33.f = __dsub_rn(Su33.f, Stmp1.f);
998 
999  // Third Givens Rotation
1000 
1001  Ssh.f = Sa32.f * Sa32.f;
1002  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1003  Ssh.ui = Ssh.ui & Sa32.ui;
1004 
1005  Stmp5.f = 0.f;
1006  Sch.f = __dsub_rn(Stmp5.f, Sa22.f);
1007  Sch.f = max(Sch.f, Sa22.f);
1008  Sch.f = max(Sch.f, gsmall_number);
1009  Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;
1010 
1011  Stmp1.f = Sch.f * Sch.f;
1012  Stmp2.f = Ssh.f * Ssh.f;
1013  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
1014  Stmp1.f = __drsqrt_rn(Stmp2.f);
1015 
1016  Stmp4.f = Stmp1.f * 0.5f;
1017  Stmp3.f = Stmp1.f * Stmp4.f;
1018  Stmp3.f = Stmp1.f * Stmp3.f;
1019  Stmp3.f = Stmp2.f * Stmp3.f;
1020  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
1021  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
1022  Stmp1.f = Stmp1.f * Stmp2.f;
1023 
1024  Sch.f = __dadd_rn(Sch.f, Stmp1.f);
1025 
1026  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1027  Stmp2.ui = ~Stmp5.ui & Sch.ui;
1028  Sch.ui = Stmp5.ui & Sch.ui;
1029  Ssh.ui = Stmp5.ui & Ssh.ui;
1030  Sch.ui = Sch.ui | Stmp1.ui;
1031  Ssh.ui = Ssh.ui | Stmp2.ui;
1032 
1033  Stmp1.f = Sch.f * Sch.f;
1034  Stmp2.f = Ssh.f * Ssh.f;
1035  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
1036  Stmp1.f = __drsqrt_rn(Stmp2.f);
1037 
1038  Stmp4.f = Stmp1.f * 0.5f;
1039  Stmp3.f = Stmp1.f * Stmp4.f;
1040  Stmp3.f = Stmp1.f * Stmp3.f;
1041  Stmp3.f = Stmp2.f * Stmp3.f;
1042  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
1043  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
1044 
1045  Sch.f = Sch.f * Stmp1.f;
1046  Ssh.f = Ssh.f * Stmp1.f;
1047 
1048  Sc.f = Sch.f * Sch.f;
1049  Ss.f = Ssh.f * Ssh.f;
1050  Sc.f = __dsub_rn(Sc.f, Ss.f);
1051  Ss.f = Ssh.f * Sch.f;
1052  Ss.f = __dadd_rn(Ss.f, Ss.f);
1053 
1054  //###########################################################
1055  // Rotate matrix A
1056  //###########################################################
1057 
1058  Stmp1.f = Ss.f * Sa21.f;
1059  Stmp2.f = Ss.f * Sa31.f;
1060  Sa21.f = Sc.f * Sa21.f;
1061  Sa31.f = Sc.f * Sa31.f;
1062  Sa21.f = __dadd_rn(Sa21.f, Stmp2.f);
1063  Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);
1064 
1065  Stmp1.f = Ss.f * Sa22.f;
1066  Stmp2.f = Ss.f * Sa32.f;
1067  Sa22.f = Sc.f * Sa22.f;
1068  Sa32.f = Sc.f * Sa32.f;
1069  Sa22.f = __dadd_rn(Sa22.f, Stmp2.f);
1070  Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);
1071 
1072  Stmp1.f = Ss.f * Sa23.f;
1073  Stmp2.f = Ss.f * Sa33.f;
1074  Sa23.f = Sc.f * Sa23.f;
1075  Sa33.f = Sc.f * Sa33.f;
1076  Sa23.f = __dadd_rn(Sa23.f, Stmp2.f);
1077  Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);
1078 
1079  //###########################################################
1080  // Update matrix U
1081  //###########################################################
1082 
1083  Stmp1.f = Ss.f * Su12.f;
1084  Stmp2.f = Ss.f * Su13.f;
1085  Su12.f = Sc.f * Su12.f;
1086  Su13.f = Sc.f * Su13.f;
1087  Su12.f = __dadd_rn(Su12.f, Stmp2.f);
1088  Su13.f = __dsub_rn(Su13.f, Stmp1.f);
1089 
1090  Stmp1.f = Ss.f * Su22.f;
1091  Stmp2.f = Ss.f * Su23.f;
1092  Su22.f = Sc.f * Su22.f;
1093  Su23.f = Sc.f * Su23.f;
1094  Su22.f = __dadd_rn(Su22.f, Stmp2.f);
1095  Su23.f = __dsub_rn(Su23.f, Stmp1.f);
1096 
1097  Stmp1.f = Ss.f * Su32.f;
1098  Stmp2.f = Ss.f * Su33.f;
1099  Su32.f = Sc.f * Su32.f;
1100  Su33.f = Sc.f * Su33.f;
1101  Su32.f = __dadd_rn(Su32.f, Stmp2.f);
1102  Su33.f = __dsub_rn(Su33.f, Stmp1.f);
1103 
1104  V_3x3[0] = Sv11.f;
1105  V_3x3[1] = Sv12.f;
1106  V_3x3[2] = Sv13.f;
1107  V_3x3[3] = Sv21.f;
1108  V_3x3[4] = Sv22.f;
1109  V_3x3[5] = Sv23.f;
1110  V_3x3[6] = Sv31.f;
1111  V_3x3[7] = Sv32.f;
1112  V_3x3[8] = Sv33.f;
1113 
1114  U_3x3[0] = Su11.f;
1115  U_3x3[1] = Su12.f;
1116  U_3x3[2] = Su13.f;
1117  U_3x3[3] = Su21.f;
1118  U_3x3[4] = Su22.f;
1119  U_3x3[5] = Su23.f;
1120  U_3x3[6] = Su31.f;
1121  U_3x3[7] = Su32.f;
1122  U_3x3[8] = Su33.f;
1123 
1124  S_3x1[0] = Sa11.f;
1125  // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
1126  S_3x1[1] = Sa22.f;
1127  // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
1128  S_3x1[2] = Sa33.f;
1129 }
1130 
1131 template <>
1133  float *U_3x3,
1134  float *S_3x1,
1135  float *V_3x3) {
1136  float gsmall_number = 1.e-12;
1137 
1138  un<float> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
1139  un<float> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
1140  un<float> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
1141  un<float> Sc, Ss, Sch, Ssh;
1142  un<float> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
1143  un<float> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
1144  un<float> Sqvs, Sqvvx, Sqvvy, Sqvvz;
1145 
1146  Sa11.f = A_3x3[0];
1147  Sa12.f = A_3x3[1];
1148  Sa13.f = A_3x3[2];
1149  Sa21.f = A_3x3[3];
1150  Sa22.f = A_3x3[4];
1151  Sa23.f = A_3x3[5];
1152  Sa31.f = A_3x3[6];
1153  Sa32.f = A_3x3[7];
1154  Sa33.f = A_3x3[8];
1155 
1156  //###########################################################
1157  // Compute normal equations matrix
1158  //###########################################################
1159 
1160  Ss11.f = Sa11.f * Sa11.f;
1161  Stmp1.f = Sa21.f * Sa21.f;
1162  Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
1163  Stmp1.f = Sa31.f * Sa31.f;
1164  Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
1165 
1166  Ss21.f = Sa12.f * Sa11.f;
1167  Stmp1.f = Sa22.f * Sa21.f;
1168  Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
1169  Stmp1.f = Sa32.f * Sa31.f;
1170  Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
1171 
1172  Ss31.f = Sa13.f * Sa11.f;
1173  Stmp1.f = Sa23.f * Sa21.f;
1174  Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
1175  Stmp1.f = Sa33.f * Sa31.f;
1176  Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
1177 
1178  Ss22.f = Sa12.f * Sa12.f;
1179  Stmp1.f = Sa22.f * Sa22.f;
1180  Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
1181  Stmp1.f = Sa32.f * Sa32.f;
1182  Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
1183 
1184  Ss32.f = Sa13.f * Sa12.f;
1185  Stmp1.f = Sa23.f * Sa22.f;
1186  Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
1187  Stmp1.f = Sa33.f * Sa32.f;
1188  Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
1189 
1190  Ss33.f = Sa13.f * Sa13.f;
1191  Stmp1.f = Sa23.f * Sa23.f;
1192  Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
1193  Stmp1.f = Sa33.f * Sa33.f;
1194  Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
1195 
1196  Sqvs.f = 1.f;
1197  Sqvvx.f = 0.f;
1198  Sqvvy.f = 0.f;
1199  Sqvvz.f = 0.f;
1200 
1201  //###########################################################
1202  // Solve symmetric eigenproblem using Jacobi iteration
1203  //###########################################################
1204  for (int i = 0; i < 4; i++) {
1205  Ssh.f = Ss21.f * 0.5f;
1206  Stmp5.f = __fsub_rn(Ss11.f, Ss22.f);
1207 
1208  Stmp2.f = Ssh.f * Ssh.f;
1209  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1210  Ssh.ui = Stmp1.ui & Ssh.ui;
1211  Sch.ui = Stmp1.ui & Stmp5.ui;
1212  Stmp2.ui = ~Stmp1.ui & gone;
1213  Sch.ui = Sch.ui | Stmp2.ui;
1214 
1215  Stmp1.f = Ssh.f * Ssh.f;
1216  Stmp2.f = Sch.f * Sch.f;
1217  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1218  Stmp4.f = __frsqrt_rn(Stmp3.f);
1219 
1220  Ssh.f = Stmp4.f * Ssh.f;
1221  Sch.f = Stmp4.f * Sch.f;
1222  Stmp1.f = gfour_gamma_squared * Stmp1.f;
1223  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1224 
1225  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1226  Ssh.ui = ~Stmp1.ui & Ssh.ui;
1227  Ssh.ui = Ssh.ui | Stmp2.ui;
1228  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1229  Sch.ui = ~Stmp1.ui & Sch.ui;
1230  Sch.ui = Sch.ui | Stmp2.ui;
1231 
1232  Stmp1.f = Ssh.f * Ssh.f;
1233  Stmp2.f = Sch.f * Sch.f;
1234  Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1235  Ss.f = Sch.f * Ssh.f;
1236  Ss.f = __fadd_rn(Ss.f, Ss.f);
1237 
1238 #ifdef DEBUG_JACOBI_CONJUGATE
1239  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1240  Sch.f);
1241 #endif
1242  //###########################################################
1243  // Perform the actual Givens conjugation
1244  //###########################################################
1245 
1246  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1247  Ss33.f = Ss33.f * Stmp3.f;
1248  Ss31.f = Ss31.f * Stmp3.f;
1249  Ss32.f = Ss32.f * Stmp3.f;
1250  Ss33.f = Ss33.f * Stmp3.f;
1251 
1252  Stmp1.f = Ss.f * Ss31.f;
1253  Stmp2.f = Ss.f * Ss32.f;
1254  Ss31.f = Sc.f * Ss31.f;
1255  Ss32.f = Sc.f * Ss32.f;
1256  Ss31.f = __fadd_rn(Stmp2.f, Ss31.f);
1257  Ss32.f = __fsub_rn(Ss32.f, Stmp1.f);
1258 
1259  Stmp2.f = Ss.f * Ss.f;
1260  Stmp1.f = Ss22.f * Stmp2.f;
1261  Stmp3.f = Ss11.f * Stmp2.f;
1262  Stmp4.f = Sc.f * Sc.f;
1263  Ss11.f = Ss11.f * Stmp4.f;
1264  Ss22.f = Ss22.f * Stmp4.f;
1265  Ss11.f = __fadd_rn(Ss11.f, Stmp1.f);
1266  Ss22.f = __fadd_rn(Ss22.f, Stmp3.f);
1267  Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1268  Stmp2.f = __fadd_rn(Ss21.f, Ss21.f);
1269  Ss21.f = Ss21.f * Stmp4.f;
1270  Stmp4.f = Sc.f * Ss.f;
1271  Stmp2.f = Stmp2.f * Stmp4.f;
1272  Stmp5.f = Stmp5.f * Stmp4.f;
1273  Ss11.f = __fadd_rn(Ss11.f, Stmp2.f);
1274  Ss21.f = __fsub_rn(Ss21.f, Stmp5.f);
1275  Ss22.f = __fsub_rn(Ss22.f, Stmp2.f);
1276 
1277 #ifdef DEBUG_JACOBI_CONJUGATE
1278  printf("%.20g\n", Ss11.f);
1279  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1280  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1281 #endif
1282 
1283  //###########################################################
1284  // Compute the cumulative rotation, in quaternion form
1285  //###########################################################
1286 
1287  Stmp1.f = Ssh.f * Sqvvx.f;
1288  Stmp2.f = Ssh.f * Sqvvy.f;
1289  Stmp3.f = Ssh.f * Sqvvz.f;
1290  Ssh.f = Ssh.f * Sqvs.f;
1291 
1292  Sqvs.f = Sch.f * Sqvs.f;
1293  Sqvvx.f = Sch.f * Sqvvx.f;
1294  Sqvvy.f = Sch.f * Sqvvy.f;
1295  Sqvvz.f = Sch.f * Sqvvz.f;
1296 
1297  Sqvvz.f = __fadd_rn(Sqvvz.f, Ssh.f);
1298  Sqvs.f = __fsub_rn(Sqvs.f, Stmp3.f);
1299  Sqvvx.f = __fadd_rn(Sqvvx.f, Stmp2.f);
1300  Sqvvy.f = __fsub_rn(Sqvvy.f, Stmp1.f);
1301 
1302 #ifdef DEBUG_JACOBI_CONJUGATE
1303  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
1304  Sqvs.f);
1305 #endif
1306 
1308  // (1->3)
1310  Ssh.f = Ss32.f * 0.5f;
1311  Stmp5.f = __fsub_rn(Ss22.f, Ss33.f);
1312 
1313  Stmp2.f = Ssh.f * Ssh.f;
1314  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1315  Ssh.ui = Stmp1.ui & Ssh.ui;
1316  Sch.ui = Stmp1.ui & Stmp5.ui;
1317  Stmp2.ui = ~Stmp1.ui & gone;
1318  Sch.ui = Sch.ui | Stmp2.ui;
1319 
1320  Stmp1.f = Ssh.f * Ssh.f;
1321  Stmp2.f = Sch.f * Sch.f;
1322  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1323  Stmp4.f = __frsqrt_rn(Stmp3.f);
1324 
1325  Ssh.f = Stmp4.f * Ssh.f;
1326  Sch.f = Stmp4.f * Sch.f;
1327  Stmp1.f = gfour_gamma_squared * Stmp1.f;
1328  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1329 
1330  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1331  Ssh.ui = ~Stmp1.ui & Ssh.ui;
1332  Ssh.ui = Ssh.ui | Stmp2.ui;
1333  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1334  Sch.ui = ~Stmp1.ui & Sch.ui;
1335  Sch.ui = Sch.ui | Stmp2.ui;
1336 
1337  Stmp1.f = Ssh.f * Ssh.f;
1338  Stmp2.f = Sch.f * Sch.f;
1339  Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1340  Ss.f = Sch.f * Ssh.f;
1341  Ss.f = __fadd_rn(Ss.f, Ss.f);
1342 
1343 #ifdef DEBUG_JACOBI_CONJUGATE
1344  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1345  Sch.f);
1346 #endif
1347 
1348  //###########################################################
1349  // Perform the actual Givens conjugation
1350  //###########################################################
1351 
1352  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1353  Ss11.f = Ss11.f * Stmp3.f;
1354  Ss21.f = Ss21.f * Stmp3.f;
1355  Ss31.f = Ss31.f * Stmp3.f;
1356  Ss11.f = Ss11.f * Stmp3.f;
1357 
1358  Stmp1.f = Ss.f * Ss21.f;
1359  Stmp2.f = Ss.f * Ss31.f;
1360  Ss21.f = Sc.f * Ss21.f;
1361  Ss31.f = Sc.f * Ss31.f;
1362  Ss21.f = __fadd_rn(Stmp2.f, Ss21.f);
1363  Ss31.f = __fsub_rn(Ss31.f, Stmp1.f);
1364 
1365  Stmp2.f = Ss.f * Ss.f;
1366  Stmp1.f = Ss33.f * Stmp2.f;
1367  Stmp3.f = Ss22.f * Stmp2.f;
1368  Stmp4.f = Sc.f * Sc.f;
1369  Ss22.f = Ss22.f * Stmp4.f;
1370  Ss33.f = Ss33.f * Stmp4.f;
1371  Ss22.f = __fadd_rn(Ss22.f, Stmp1.f);
1372  Ss33.f = __fadd_rn(Ss33.f, Stmp3.f);
1373  Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1374  Stmp2.f = __fadd_rn(Ss32.f, Ss32.f);
1375  Ss32.f = Ss32.f * Stmp4.f;
1376  Stmp4.f = Sc.f * Ss.f;
1377  Stmp2.f = Stmp2.f * Stmp4.f;
1378  Stmp5.f = Stmp5.f * Stmp4.f;
1379  Ss22.f = __fadd_rn(Ss22.f, Stmp2.f);
1380  Ss32.f = __fsub_rn(Ss32.f, Stmp5.f);
1381  Ss33.f = __fsub_rn(Ss33.f, Stmp2.f);
1382 
1383 #ifdef DEBUG_JACOBI_CONJUGATE
1384  printf("%.20g\n", Ss11.f);
1385  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1386  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1387 #endif
1388 
1389  //###########################################################
1390  // Compute the cumulative rotation, in quaternion form
1391  //###########################################################
1392 
1393  Stmp1.f = Ssh.f * Sqvvx.f;
1394  Stmp2.f = Ssh.f * Sqvvy.f;
1395  Stmp3.f = Ssh.f * Sqvvz.f;
1396  Ssh.f = Ssh.f * Sqvs.f;
1397 
1398  Sqvs.f = Sch.f * Sqvs.f;
1399  Sqvvx.f = Sch.f * Sqvvx.f;
1400  Sqvvy.f = Sch.f * Sqvvy.f;
1401  Sqvvz.f = Sch.f * Sqvvz.f;
1402 
1403  Sqvvx.f = __fadd_rn(Sqvvx.f, Ssh.f);
1404  Sqvs.f = __fsub_rn(Sqvs.f, Stmp1.f);
1405  Sqvvy.f = __fadd_rn(Sqvvy.f, Stmp3.f);
1406  Sqvvz.f = __fsub_rn(Sqvvz.f, Stmp2.f);
1407 
1408 #ifdef DEBUG_JACOBI_CONJUGATE
1409  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
1410  Sqvs.f);
1411 #endif
1412 #if 1
1414  // 1 -> 2
1416 
1417  Ssh.f = Ss31.f * 0.5f;
1418  Stmp5.f = __fsub_rn(Ss33.f, Ss11.f);
1419 
1420  Stmp2.f = Ssh.f * Ssh.f;
1421  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1422  Ssh.ui = Stmp1.ui & Ssh.ui;
1423  Sch.ui = Stmp1.ui & Stmp5.ui;
1424  Stmp2.ui = ~Stmp1.ui & gone;
1425  Sch.ui = Sch.ui | Stmp2.ui;
1426 
1427  Stmp1.f = Ssh.f * Ssh.f;
1428  Stmp2.f = Sch.f * Sch.f;
1429  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1430  Stmp4.f = __frsqrt_rn(Stmp3.f);
1431 
1432  Ssh.f = Stmp4.f * Ssh.f;
1433  Sch.f = Stmp4.f * Sch.f;
1434  Stmp1.f = gfour_gamma_squared * Stmp1.f;
1435  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1436 
1437  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1438  Ssh.ui = ~Stmp1.ui & Ssh.ui;
1439  Ssh.ui = Ssh.ui | Stmp2.ui;
1440  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1441  Sch.ui = ~Stmp1.ui & Sch.ui;
1442  Sch.ui = Sch.ui | Stmp2.ui;
1443 
1444  Stmp1.f = Ssh.f * Ssh.f;
1445  Stmp2.f = Sch.f * Sch.f;
1446  Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1447  Ss.f = Sch.f * Ssh.f;
1448  Ss.f = __fadd_rn(Ss.f, Ss.f);
1449 
1450 #ifdef DEBUG_JACOBI_CONJUGATE
1451  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1452  Sch.f);
1453 #endif
1454 
1455  //###########################################################
1456  // Perform the actual Givens conjugation
1457  //###########################################################
1458 
1459  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1460  Ss22.f = Ss22.f * Stmp3.f;
1461  Ss32.f = Ss32.f * Stmp3.f;
1462  Ss21.f = Ss21.f * Stmp3.f;
1463  Ss22.f = Ss22.f * Stmp3.f;
1464 
1465  Stmp1.f = Ss.f * Ss32.f;
1466  Stmp2.f = Ss.f * Ss21.f;
1467  Ss32.f = Sc.f * Ss32.f;
1468  Ss21.f = Sc.f * Ss21.f;
1469  Ss32.f = __fadd_rn(Stmp2.f, Ss32.f);
1470  Ss21.f = __fsub_rn(Ss21.f, Stmp1.f);
1471 
1472  Stmp2.f = Ss.f * Ss.f;
1473  Stmp1.f = Ss11.f * Stmp2.f;
1474  Stmp3.f = Ss33.f * Stmp2.f;
1475  Stmp4.f = Sc.f * Sc.f;
1476  Ss33.f = Ss33.f * Stmp4.f;
1477  Ss11.f = Ss11.f * Stmp4.f;
1478  Ss33.f = __fadd_rn(Ss33.f, Stmp1.f);
1479  Ss11.f = __fadd_rn(Ss11.f, Stmp3.f);
1480  Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1481  Stmp2.f = __fadd_rn(Ss31.f, Ss31.f);
1482  Ss31.f = Ss31.f * Stmp4.f;
1483  Stmp4.f = Sc.f * Ss.f;
1484  Stmp2.f = Stmp2.f * Stmp4.f;
1485  Stmp5.f = Stmp5.f * Stmp4.f;
1486  Ss33.f = __fadd_rn(Ss33.f, Stmp2.f);
1487  Ss31.f = __fsub_rn(Ss31.f, Stmp5.f);
1488  Ss11.f = __fsub_rn(Ss11.f, Stmp2.f);
1489 
1490 #ifdef DEBUG_JACOBI_CONJUGATE
1491  printf("%.20g\n", Ss11.f);
1492  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1493  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1494 #endif
1495 
1496  //###########################################################
1497  // Compute the cumulative rotation, in quaternion form
1498  //###########################################################
1499 
1500  Stmp1.f = Ssh.f * Sqvvx.f;
1501  Stmp2.f = Ssh.f * Sqvvy.f;
1502  Stmp3.f = Ssh.f * Sqvvz.f;
1503  Ssh.f = Ssh.f * Sqvs.f;
1504 
1505  Sqvs.f = Sch.f * Sqvs.f;
1506  Sqvvx.f = Sch.f * Sqvvx.f;
1507  Sqvvy.f = Sch.f * Sqvvy.f;
1508  Sqvvz.f = Sch.f * Sqvvz.f;
1509 
1510  Sqvvy.f = __fadd_rn(Sqvvy.f, Ssh.f);
1511  Sqvs.f = __fsub_rn(Sqvs.f, Stmp2.f);
1512  Sqvvz.f = __fadd_rn(Sqvvz.f, Stmp1.f);
1513  Sqvvx.f = __fsub_rn(Sqvvx.f, Stmp3.f);
1514 #endif
1515  }
1516 
1517  //###########################################################
1518  // Normalize quaternion for matrix V
1519  //###########################################################
1520 
1521  Stmp2.f = Sqvs.f * Sqvs.f;
1522  Stmp1.f = Sqvvx.f * Sqvvx.f;
1523  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1524  Stmp1.f = Sqvvy.f * Sqvvy.f;
1525  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1526  Stmp1.f = Sqvvz.f * Sqvvz.f;
1527  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1528 
1529  Stmp1.f = __frsqrt_rn(Stmp2.f);
1530  Stmp4.f = Stmp1.f * 0.5f;
1531  Stmp3.f = Stmp1.f * Stmp4.f;
1532  Stmp3.f = Stmp1.f * Stmp3.f;
1533  Stmp3.f = Stmp2.f * Stmp3.f;
1534  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1535  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1536 
1537  Sqvs.f = Sqvs.f * Stmp1.f;
1538  Sqvvx.f = Sqvvx.f * Stmp1.f;
1539  Sqvvy.f = Sqvvy.f * Stmp1.f;
1540  Sqvvz.f = Sqvvz.f * Stmp1.f;
1541 
1542  //###########################################################
1543  // Transform quaternion to matrix V
1544  //###########################################################
1545 
1546  Stmp1.f = Sqvvx.f * Sqvvx.f;
1547  Stmp2.f = Sqvvy.f * Sqvvy.f;
1548  Stmp3.f = Sqvvz.f * Sqvvz.f;
1549  Sv11.f = Sqvs.f * Sqvs.f;
1550  Sv22.f = __fsub_rn(Sv11.f, Stmp1.f);
1551  Sv33.f = __fsub_rn(Sv22.f, Stmp2.f);
1552  Sv33.f = __fadd_rn(Sv33.f, Stmp3.f);
1553  Sv22.f = __fadd_rn(Sv22.f, Stmp2.f);
1554  Sv22.f = __fsub_rn(Sv22.f, Stmp3.f);
1555  Sv11.f = __fadd_rn(Sv11.f, Stmp1.f);
1556  Sv11.f = __fsub_rn(Sv11.f, Stmp2.f);
1557  Sv11.f = __fsub_rn(Sv11.f, Stmp3.f);
1558  Stmp1.f = __fadd_rn(Sqvvx.f, Sqvvx.f);
1559  Stmp2.f = __fadd_rn(Sqvvy.f, Sqvvy.f);
1560  Stmp3.f = __fadd_rn(Sqvvz.f, Sqvvz.f);
1561  Sv32.f = Sqvs.f * Stmp1.f;
1562  Sv13.f = Sqvs.f * Stmp2.f;
1563  Sv21.f = Sqvs.f * Stmp3.f;
1564  Stmp1.f = Sqvvy.f * Stmp1.f;
1565  Stmp2.f = Sqvvz.f * Stmp2.f;
1566  Stmp3.f = Sqvvx.f * Stmp3.f;
1567  Sv12.f = __fsub_rn(Stmp1.f, Sv21.f);
1568  Sv23.f = __fsub_rn(Stmp2.f, Sv32.f);
1569  Sv31.f = __fsub_rn(Stmp3.f, Sv13.f);
1570  Sv21.f = __fadd_rn(Stmp1.f, Sv21.f);
1571  Sv32.f = __fadd_rn(Stmp2.f, Sv32.f);
1572  Sv13.f = __fadd_rn(Stmp3.f, Sv13.f);
1573 
1575  // Multiply (from the right) with V
1576  //###########################################################
1577 
1578  Stmp2.f = Sa12.f;
1579  Stmp3.f = Sa13.f;
1580  Sa12.f = Sv12.f * Sa11.f;
1581  Sa13.f = Sv13.f * Sa11.f;
1582  Sa11.f = Sv11.f * Sa11.f;
1583  Stmp1.f = Sv21.f * Stmp2.f;
1584  Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1585  Stmp1.f = Sv31.f * Stmp3.f;
1586  Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1587  Stmp1.f = Sv22.f * Stmp2.f;
1588  Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1589  Stmp1.f = Sv32.f * Stmp3.f;
1590  Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1591  Stmp1.f = Sv23.f * Stmp2.f;
1592  Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1593  Stmp1.f = Sv33.f * Stmp3.f;
1594  Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1595 
1596  Stmp2.f = Sa22.f;
1597  Stmp3.f = Sa23.f;
1598  Sa22.f = Sv12.f * Sa21.f;
1599  Sa23.f = Sv13.f * Sa21.f;
1600  Sa21.f = Sv11.f * Sa21.f;
1601  Stmp1.f = Sv21.f * Stmp2.f;
1602  Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1603  Stmp1.f = Sv31.f * Stmp3.f;
1604  Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1605  Stmp1.f = Sv22.f * Stmp2.f;
1606  Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1607  Stmp1.f = Sv32.f * Stmp3.f;
1608  Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1609  Stmp1.f = Sv23.f * Stmp2.f;
1610  Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1611  Stmp1.f = Sv33.f * Stmp3.f;
1612  Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1613 
1614  Stmp2.f = Sa32.f;
1615  Stmp3.f = Sa33.f;
1616  Sa32.f = Sv12.f * Sa31.f;
1617  Sa33.f = Sv13.f * Sa31.f;
1618  Sa31.f = Sv11.f * Sa31.f;
1619  Stmp1.f = Sv21.f * Stmp2.f;
1620  Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1621  Stmp1.f = Sv31.f * Stmp3.f;
1622  Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1623  Stmp1.f = Sv22.f * Stmp2.f;
1624  Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1625  Stmp1.f = Sv32.f * Stmp3.f;
1626  Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1627  Stmp1.f = Sv23.f * Stmp2.f;
1628  Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1629  Stmp1.f = Sv33.f * Stmp3.f;
1630  Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1631 
1632  //###########################################################
1633  // Permute columns such that the singular values are sorted
1634  //###########################################################
1635 
1636  Stmp1.f = Sa11.f * Sa11.f;
1637  Stmp4.f = Sa21.f * Sa21.f;
1638  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1639  Stmp4.f = Sa31.f * Sa31.f;
1640  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1641 
1642  Stmp2.f = Sa12.f * Sa12.f;
1643  Stmp4.f = Sa22.f * Sa22.f;
1644  Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1645  Stmp4.f = Sa32.f * Sa32.f;
1646  Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1647 
1648  Stmp3.f = Sa13.f * Sa13.f;
1649  Stmp4.f = Sa23.f * Sa23.f;
1650  Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1651  Stmp4.f = Sa33.f * Sa33.f;
1652  Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1653 
1654  // Swap columns 1-2 if necessary
1655 
1656  Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
1657  Stmp5.ui = Sa11.ui ^ Sa12.ui;
1658  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1659  Sa11.ui = Sa11.ui ^ Stmp5.ui;
1660  Sa12.ui = Sa12.ui ^ Stmp5.ui;
1661 
1662  Stmp5.ui = Sa21.ui ^ Sa22.ui;
1663  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1664  Sa21.ui = Sa21.ui ^ Stmp5.ui;
1665  Sa22.ui = Sa22.ui ^ Stmp5.ui;
1666 
1667  Stmp5.ui = Sa31.ui ^ Sa32.ui;
1668  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1669  Sa31.ui = Sa31.ui ^ Stmp5.ui;
1670  Sa32.ui = Sa32.ui ^ Stmp5.ui;
1671 
1672  Stmp5.ui = Sv11.ui ^ Sv12.ui;
1673  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1674  Sv11.ui = Sv11.ui ^ Stmp5.ui;
1675  Sv12.ui = Sv12.ui ^ Stmp5.ui;
1676 
1677  Stmp5.ui = Sv21.ui ^ Sv22.ui;
1678  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1679  Sv21.ui = Sv21.ui ^ Stmp5.ui;
1680  Sv22.ui = Sv22.ui ^ Stmp5.ui;
1681 
1682  Stmp5.ui = Sv31.ui ^ Sv32.ui;
1683  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1684  Sv31.ui = Sv31.ui ^ Stmp5.ui;
1685  Sv32.ui = Sv32.ui ^ Stmp5.ui;
1686 
1687  Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
1688  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1689  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1690  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1691 
1692  // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
1693  // is still a rotation
1694 
1695  Stmp5.f = -2.f;
1696  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1697  Stmp4.f = 1.f;
1698  Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1699 
1700  Sa12.f = Sa12.f * Stmp4.f;
1701  Sa22.f = Sa22.f * Stmp4.f;
1702  Sa32.f = Sa32.f * Stmp4.f;
1703 
1704  Sv12.f = Sv12.f * Stmp4.f;
1705  Sv22.f = Sv22.f * Stmp4.f;
1706  Sv32.f = Sv32.f * Stmp4.f;
1707 
1708  // Swap columns 1-3 if necessary
1709 
1710  Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
1711  Stmp5.ui = Sa11.ui ^ Sa13.ui;
1712  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1713  Sa11.ui = Sa11.ui ^ Stmp5.ui;
1714  Sa13.ui = Sa13.ui ^ Stmp5.ui;
1715 
1716  Stmp5.ui = Sa21.ui ^ Sa23.ui;
1717  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1718  Sa21.ui = Sa21.ui ^ Stmp5.ui;
1719  Sa23.ui = Sa23.ui ^ Stmp5.ui;
1720 
1721  Stmp5.ui = Sa31.ui ^ Sa33.ui;
1722  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1723  Sa31.ui = Sa31.ui ^ Stmp5.ui;
1724  Sa33.ui = Sa33.ui ^ Stmp5.ui;
1725 
1726  Stmp5.ui = Sv11.ui ^ Sv13.ui;
1727  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1728  Sv11.ui = Sv11.ui ^ Stmp5.ui;
1729  Sv13.ui = Sv13.ui ^ Stmp5.ui;
1730 
1731  Stmp5.ui = Sv21.ui ^ Sv23.ui;
1732  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1733  Sv21.ui = Sv21.ui ^ Stmp5.ui;
1734  Sv23.ui = Sv23.ui ^ Stmp5.ui;
1735 
1736  Stmp5.ui = Sv31.ui ^ Sv33.ui;
1737  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1738  Sv31.ui = Sv31.ui ^ Stmp5.ui;
1739  Sv33.ui = Sv33.ui ^ Stmp5.ui;
1740 
1741  Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
1742  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1743  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1744  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1745 
1746  // If columns 1-3 have been swapped, negate 1st column of A and V so that V
1747  // is still a rotation
1748 
1749  Stmp5.f = -2.f;
1750  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1751  Stmp4.f = 1.f;
1752  Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1753 
1754  Sa11.f = Sa11.f * Stmp4.f;
1755  Sa21.f = Sa21.f * Stmp4.f;
1756  Sa31.f = Sa31.f * Stmp4.f;
1757 
1758  Sv11.f = Sv11.f * Stmp4.f;
1759  Sv21.f = Sv21.f * Stmp4.f;
1760  Sv31.f = Sv31.f * Stmp4.f;
1761 
1762  // Swap columns 2-3 if necessary
1763 
1764  Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
1765  Stmp5.ui = Sa12.ui ^ Sa13.ui;
1766  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1767  Sa12.ui = Sa12.ui ^ Stmp5.ui;
1768  Sa13.ui = Sa13.ui ^ Stmp5.ui;
1769 
1770  Stmp5.ui = Sa22.ui ^ Sa23.ui;
1771  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1772  Sa22.ui = Sa22.ui ^ Stmp5.ui;
1773  Sa23.ui = Sa23.ui ^ Stmp5.ui;
1774 
1775  Stmp5.ui = Sa32.ui ^ Sa33.ui;
1776  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1777  Sa32.ui = Sa32.ui ^ Stmp5.ui;
1778  Sa33.ui = Sa33.ui ^ Stmp5.ui;
1779 
1780  Stmp5.ui = Sv12.ui ^ Sv13.ui;
1781  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1782  Sv12.ui = Sv12.ui ^ Stmp5.ui;
1783  Sv13.ui = Sv13.ui ^ Stmp5.ui;
1784 
1785  Stmp5.ui = Sv22.ui ^ Sv23.ui;
1786  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1787  Sv22.ui = Sv22.ui ^ Stmp5.ui;
1788  Sv23.ui = Sv23.ui ^ Stmp5.ui;
1789 
1790  Stmp5.ui = Sv32.ui ^ Sv33.ui;
1791  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1792  Sv32.ui = Sv32.ui ^ Stmp5.ui;
1793  Sv33.ui = Sv33.ui ^ Stmp5.ui;
1794 
1795  Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
1796  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1797  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1798  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1799 
1800  // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
1801  // is still a rotation
1802 
1803  Stmp5.f = -2.f;
1804  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1805  Stmp4.f = 1.f;
1806  Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1807 
1808  Sa13.f = Sa13.f * Stmp4.f;
1809  Sa23.f = Sa23.f * Stmp4.f;
1810  Sa33.f = Sa33.f * Stmp4.f;
1811 
1812  Sv13.f = Sv13.f * Stmp4.f;
1813  Sv23.f = Sv23.f * Stmp4.f;
1814  Sv33.f = Sv33.f * Stmp4.f;
1815 
1816  //###########################################################
1817  // Construct QR factorization of A*V (=U*D) using Givens rotations
1818  //###########################################################
1819 
1820  Su11.f = 1.f;
1821  Su12.f = 0.f;
1822  Su13.f = 0.f;
1823  Su21.f = 0.f;
1824  Su22.f = 1.f;
1825  Su23.f = 0.f;
1826  Su31.f = 0.f;
1827  Su32.f = 0.f;
1828  Su33.f = 1.f;
1829 
1830  Ssh.f = Sa21.f * Sa21.f;
1831  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1832  Ssh.ui = Ssh.ui & Sa21.ui;
1833 
1834  Stmp5.f = 0.f;
1835  Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1836  Sch.f = max(Sch.f, Sa11.f);
1837  Sch.f = max(Sch.f, gsmall_number);
1838  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1839 
1840  Stmp1.f = Sch.f * Sch.f;
1841  Stmp2.f = Ssh.f * Ssh.f;
1842  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1843  Stmp1.f = __frsqrt_rn(Stmp2.f);
1844 
1845  Stmp4.f = Stmp1.f * 0.5f;
1846  Stmp3.f = Stmp1.f * Stmp4.f;
1847  Stmp3.f = Stmp1.f * Stmp3.f;
1848  Stmp3.f = Stmp2.f * Stmp3.f;
1849  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1850  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1851  Stmp1.f = Stmp1.f * Stmp2.f;
1852 
1853  Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1854 
1855  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1856  Stmp2.ui = ~Stmp5.ui & Sch.ui;
1857  Sch.ui = Stmp5.ui & Sch.ui;
1858  Ssh.ui = Stmp5.ui & Ssh.ui;
1859  Sch.ui = Sch.ui | Stmp1.ui;
1860  Ssh.ui = Ssh.ui | Stmp2.ui;
1861 
1862  Stmp1.f = Sch.f * Sch.f;
1863  Stmp2.f = Ssh.f * Ssh.f;
1864  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1865  Stmp1.f = __frsqrt_rn(Stmp2.f);
1866 
1867  Stmp4.f = Stmp1.f * 0.5f;
1868  Stmp3.f = Stmp1.f * Stmp4.f;
1869  Stmp3.f = Stmp1.f * Stmp3.f;
1870  Stmp3.f = Stmp2.f * Stmp3.f;
1871  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1872  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1873 
1874  Sch.f = Sch.f * Stmp1.f;
1875  Ssh.f = Ssh.f * Stmp1.f;
1876 
1877  Sc.f = Sch.f * Sch.f;
1878  Ss.f = Ssh.f * Ssh.f;
1879  Sc.f = __fsub_rn(Sc.f, Ss.f);
1880  Ss.f = Ssh.f * Sch.f;
1881  Ss.f = __fadd_rn(Ss.f, Ss.f);
1882 
1883  //###########################################################
1884  // Rotate matrix A
1885  //###########################################################
1886 
1887  Stmp1.f = Ss.f * Sa11.f;
1888  Stmp2.f = Ss.f * Sa21.f;
1889  Sa11.f = Sc.f * Sa11.f;
1890  Sa21.f = Sc.f * Sa21.f;
1891  Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
1892  Sa21.f = __fsub_rn(Sa21.f, Stmp1.f);
1893 
1894  Stmp1.f = Ss.f * Sa12.f;
1895  Stmp2.f = Ss.f * Sa22.f;
1896  Sa12.f = Sc.f * Sa12.f;
1897  Sa22.f = Sc.f * Sa22.f;
1898  Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
1899  Sa22.f = __fsub_rn(Sa22.f, Stmp1.f);
1900 
1901  Stmp1.f = Ss.f * Sa13.f;
1902  Stmp2.f = Ss.f * Sa23.f;
1903  Sa13.f = Sc.f * Sa13.f;
1904  Sa23.f = Sc.f * Sa23.f;
1905  Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
1906  Sa23.f = __fsub_rn(Sa23.f, Stmp1.f);
1907 
1908  //###########################################################
1909  // Update matrix U
1910  //###########################################################
1911 
1912  Stmp1.f = Ss.f * Su11.f;
1913  Stmp2.f = Ss.f * Su12.f;
1914  Su11.f = Sc.f * Su11.f;
1915  Su12.f = Sc.f * Su12.f;
1916  Su11.f = __fadd_rn(Su11.f, Stmp2.f);
1917  Su12.f = __fsub_rn(Su12.f, Stmp1.f);
1918 
1919  Stmp1.f = Ss.f * Su21.f;
1920  Stmp2.f = Ss.f * Su22.f;
1921  Su21.f = Sc.f * Su21.f;
1922  Su22.f = Sc.f * Su22.f;
1923  Su21.f = __fadd_rn(Su21.f, Stmp2.f);
1924  Su22.f = __fsub_rn(Su22.f, Stmp1.f);
1925 
1926  Stmp1.f = Ss.f * Su31.f;
1927  Stmp2.f = Ss.f * Su32.f;
1928  Su31.f = Sc.f * Su31.f;
1929  Su32.f = Sc.f * Su32.f;
1930  Su31.f = __fadd_rn(Su31.f, Stmp2.f);
1931  Su32.f = __fsub_rn(Su32.f, Stmp1.f);
1932 
1933  // Second Givens rotation
1934 
1935  Ssh.f = Sa31.f * Sa31.f;
1936  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1937  Ssh.ui = Ssh.ui & Sa31.ui;
1938 
1939  Stmp5.f = 0.f;
1940  Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1941  Sch.f = max(Sch.f, Sa11.f);
1942  Sch.f = max(Sch.f, gsmall_number);
1943  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1944 
1945  Stmp1.f = Sch.f * Sch.f;
1946  Stmp2.f = Ssh.f * Ssh.f;
1947  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1948  Stmp1.f = __frsqrt_rn(Stmp2.f);
1949 
1950  Stmp4.f = Stmp1.f * 0.5;
1951  Stmp3.f = Stmp1.f * Stmp4.f;
1952  Stmp3.f = Stmp1.f * Stmp3.f;
1953  Stmp3.f = Stmp2.f * Stmp3.f;
1954  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1955  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1956  Stmp1.f = Stmp1.f * Stmp2.f;
1957 
1958  Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1959 
1960  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1961  Stmp2.ui = ~Stmp5.ui & Sch.ui;
1962  Sch.ui = Stmp5.ui & Sch.ui;
1963  Ssh.ui = Stmp5.ui & Ssh.ui;
1964  Sch.ui = Sch.ui | Stmp1.ui;
1965  Ssh.ui = Ssh.ui | Stmp2.ui;
1966 
1967  Stmp1.f = Sch.f * Sch.f;
1968  Stmp2.f = Ssh.f * Ssh.f;
1969  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1970  Stmp1.f = __frsqrt_rn(Stmp2.f);
1971 
1972  Stmp4.f = Stmp1.f * 0.5f;
1973  Stmp3.f = Stmp1.f * Stmp4.f;
1974  Stmp3.f = Stmp1.f * Stmp3.f;
1975  Stmp3.f = Stmp2.f * Stmp3.f;
1976  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1977  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1978 
1979  Sch.f = Sch.f * Stmp1.f;
1980  Ssh.f = Ssh.f * Stmp1.f;
1981 
1982  Sc.f = Sch.f * Sch.f;
1983  Ss.f = Ssh.f * Ssh.f;
1984  Sc.f = __fsub_rn(Sc.f, Ss.f);
1985  Ss.f = Ssh.f * Sch.f;
1986  Ss.f = __fadd_rn(Ss.f, Ss.f);
1987 
1988  //###########################################################
1989  // Rotate matrix A
1990  //###########################################################
1991 
1992  Stmp1.f = Ss.f * Sa11.f;
1993  Stmp2.f = Ss.f * Sa31.f;
1994  Sa11.f = Sc.f * Sa11.f;
1995  Sa31.f = Sc.f * Sa31.f;
1996  Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
1997  Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
1998 
1999  Stmp1.f = Ss.f * Sa12.f;
2000  Stmp2.f = Ss.f * Sa32.f;
2001  Sa12.f = Sc.f * Sa12.f;
2002  Sa32.f = Sc.f * Sa32.f;
2003  Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
2004  Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2005 
2006  Stmp1.f = Ss.f * Sa13.f;
2007  Stmp2.f = Ss.f * Sa33.f;
2008  Sa13.f = Sc.f * Sa13.f;
2009  Sa33.f = Sc.f * Sa33.f;
2010  Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
2011  Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2012 
2013  //###########################################################
2014  // Update matrix U
2015  //###########################################################
2016 
2017  Stmp1.f = Ss.f * Su11.f;
2018  Stmp2.f = Ss.f * Su13.f;
2019  Su11.f = Sc.f * Su11.f;
2020  Su13.f = Sc.f * Su13.f;
2021  Su11.f = __fadd_rn(Su11.f, Stmp2.f);
2022  Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2023 
2024  Stmp1.f = Ss.f * Su21.f;
2025  Stmp2.f = Ss.f * Su23.f;
2026  Su21.f = Sc.f * Su21.f;
2027  Su23.f = Sc.f * Su23.f;
2028  Su21.f = __fadd_rn(Su21.f, Stmp2.f);
2029  Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2030 
2031  Stmp1.f = Ss.f * Su31.f;
2032  Stmp2.f = Ss.f * Su33.f;
2033  Su31.f = Sc.f * Su31.f;
2034  Su33.f = Sc.f * Su33.f;
2035  Su31.f = __fadd_rn(Su31.f, Stmp2.f);
2036  Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2037 
2038  // Third Givens Rotation
2039 
2040  Ssh.f = Sa32.f * Sa32.f;
2041  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
2042  Ssh.ui = Ssh.ui & Sa32.ui;
2043 
2044  Stmp5.f = 0.f;
2045  Sch.f = __fsub_rn(Stmp5.f, Sa22.f);
2046  Sch.f = max(Sch.f, Sa22.f);
2047  Sch.f = max(Sch.f, gsmall_number);
2048  Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;
2049 
2050  Stmp1.f = Sch.f * Sch.f;
2051  Stmp2.f = Ssh.f * Ssh.f;
2052  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2053  Stmp1.f = __frsqrt_rn(Stmp2.f);
2054 
2055  Stmp4.f = Stmp1.f * 0.5f;
2056  Stmp3.f = Stmp1.f * Stmp4.f;
2057  Stmp3.f = Stmp1.f * Stmp3.f;
2058  Stmp3.f = Stmp2.f * Stmp3.f;
2059  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2060  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2061  Stmp1.f = Stmp1.f * Stmp2.f;
2062 
2063  Sch.f = __fadd_rn(Sch.f, Stmp1.f);
2064 
2065  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
2066  Stmp2.ui = ~Stmp5.ui & Sch.ui;
2067  Sch.ui = Stmp5.ui & Sch.ui;
2068  Ssh.ui = Stmp5.ui & Ssh.ui;
2069  Sch.ui = Sch.ui | Stmp1.ui;
2070  Ssh.ui = Ssh.ui | Stmp2.ui;
2071 
2072  Stmp1.f = Sch.f * Sch.f;
2073  Stmp2.f = Ssh.f * Ssh.f;
2074  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2075  Stmp1.f = __frsqrt_rn(Stmp2.f);
2076 
2077  Stmp4.f = Stmp1.f * 0.5f;
2078  Stmp3.f = Stmp1.f * Stmp4.f;
2079  Stmp3.f = Stmp1.f * Stmp3.f;
2080  Stmp3.f = Stmp2.f * Stmp3.f;
2081  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2082  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2083 
2084  Sch.f = Sch.f * Stmp1.f;
2085  Ssh.f = Ssh.f * Stmp1.f;
2086 
2087  Sc.f = Sch.f * Sch.f;
2088  Ss.f = Ssh.f * Ssh.f;
2089  Sc.f = __fsub_rn(Sc.f, Ss.f);
2090  Ss.f = Ssh.f * Sch.f;
2091  Ss.f = __fadd_rn(Ss.f, Ss.f);
2092 
2093  //###########################################################
2094  // Rotate matrix A
2095  //###########################################################
2096 
2097  Stmp1.f = Ss.f * Sa21.f;
2098  Stmp2.f = Ss.f * Sa31.f;
2099  Sa21.f = Sc.f * Sa21.f;
2100  Sa31.f = Sc.f * Sa31.f;
2101  Sa21.f = __fadd_rn(Sa21.f, Stmp2.f);
2102  Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
2103 
2104  Stmp1.f = Ss.f * Sa22.f;
2105  Stmp2.f = Ss.f * Sa32.f;
2106  Sa22.f = Sc.f * Sa22.f;
2107  Sa32.f = Sc.f * Sa32.f;
2108  Sa22.f = __fadd_rn(Sa22.f, Stmp2.f);
2109  Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2110 
2111  Stmp1.f = Ss.f * Sa23.f;
2112  Stmp2.f = Ss.f * Sa33.f;
2113  Sa23.f = Sc.f * Sa23.f;
2114  Sa33.f = Sc.f * Sa33.f;
2115  Sa23.f = __fadd_rn(Sa23.f, Stmp2.f);
2116  Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2117 
2118  //###########################################################
2119  // Update matrix U
2120  //###########################################################
2121 
2122  Stmp1.f = Ss.f * Su12.f;
2123  Stmp2.f = Ss.f * Su13.f;
2124  Su12.f = Sc.f * Su12.f;
2125  Su13.f = Sc.f * Su13.f;
2126  Su12.f = __fadd_rn(Su12.f, Stmp2.f);
2127  Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2128 
2129  Stmp1.f = Ss.f * Su22.f;
2130  Stmp2.f = Ss.f * Su23.f;
2131  Su22.f = Sc.f * Su22.f;
2132  Su23.f = Sc.f * Su23.f;
2133  Su22.f = __fadd_rn(Su22.f, Stmp2.f);
2134  Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2135 
2136  Stmp1.f = Ss.f * Su32.f;
2137  Stmp2.f = Ss.f * Su33.f;
2138  Su32.f = Sc.f * Su32.f;
2139  Su33.f = Sc.f * Su33.f;
2140  Su32.f = __fadd_rn(Su32.f, Stmp2.f);
2141  Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2142 
2143  V_3x3[0] = Sv11.f;
2144  V_3x3[1] = Sv12.f;
2145  V_3x3[2] = Sv13.f;
2146  V_3x3[3] = Sv21.f;
2147  V_3x3[4] = Sv22.f;
2148  V_3x3[5] = Sv23.f;
2149  V_3x3[6] = Sv31.f;
2150  V_3x3[7] = Sv32.f;
2151  V_3x3[8] = Sv33.f;
2152 
2153  U_3x3[0] = Su11.f;
2154  U_3x3[1] = Su12.f;
2155  U_3x3[2] = Su13.f;
2156  U_3x3[3] = Su21.f;
2157  U_3x3[4] = Su22.f;
2158  U_3x3[5] = Su23.f;
2159  U_3x3[6] = Su31.f;
2160  U_3x3[7] = Su32.f;
2161  U_3x3[8] = Su33.f;
2162 
2163  S_3x1[0] = Sa11.f;
2164  // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
2165  S_3x1[1] = Sa22.f;
2166  // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
2167  S_3x1[2] = Sa33.f;
2168 }
2169 
// Solves the 3x3 linear system A * x = b through the Moore-Penrose
// pseudo-inverse built from the SVD of A:
//     A = U * Sigma * V^T   =>   A^+ = V * Sigma^+ * U^T,  x = A^+ * b.
// Robust to (near-)singular A: singular values with magnitude below
// `epsilon` are zeroed in Sigma^+ instead of inverted, which yields the
// minimum-norm least-squares solution for rank-deficient systems.
//
// \param A_3x3  Input matrix A, row-major {3,3}.
// \param B_3x1  Input right-hand side b, {3,1}.
// \param X_3x1  Output solution x, {3,1}.
template <typename scalar_t>
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void solve_svd3x3(
        const scalar_t *A_3x3,  // input A {3,3}
        const scalar_t *B_3x1,  // input b {3,1}
        scalar_t *X_3x1)        // output x {3,1}
{
    scalar_t U[9];
    scalar_t V[9];
    scalar_t S[3];
    svd3x3(A_3x3, U, S, V);

    //###########################################################
    // Sigma^+ : invert significant singular values, zero the rest.
    // Typed as scalar_t so the float instantiation stays in
    // single precision (no silent promotion to double on device).
    //###########################################################
    const scalar_t epsilon = static_cast<scalar_t>(1e-10);
    const scalar_t one = static_cast<scalar_t>(1.0);
    S[0] = abs(S[0]) < epsilon ? 0 : one / S[0];
    S[1] = abs(S[1]) < epsilon ? 0 : one / S[1];
    S[2] = abs(S[2]) < epsilon ? 0 : one / S[2];

    //###########################################################
    // (Sigma^+) * UT: scale the columns of U by the inverted
    // singular values while transposing (row-major layout).
    //###########################################################
    scalar_t S_UT[9];

    S_UT[0] = U[0] * S[0];
    S_UT[1] = U[3] * S[0];
    S_UT[2] = U[6] * S[0];
    S_UT[3] = U[1] * S[1];
    S_UT[4] = U[4] * S[1];
    S_UT[5] = U[7] * S[1];
    S_UT[6] = U[2] * S[2];
    S_UT[7] = U[5] * S[2];
    S_UT[8] = U[8] * S[2];

    //###########################################################
    // Ainv = V * [(Sigma^+) * UT]
    //###########################################################
    scalar_t Ainv[9] = {0};
    matmul3x3_3x3(V, S_UT, Ainv);

    //###########################################################
    // x = Ainv * b
    //###########################################################
    matmul3x3_3x1(Ainv, B_3x1, X_3x1);
}
2216 
2217 } // namespace kernel
2218 } // namespace linalg
2219 } // namespace core
2220 } // namespace open3d
Common CUDA utilities.
#define OPEN3D_DEVICE
Definition: CUDAUtils.h:45
#define OPEN3D_FORCE_INLINE
Definition: CUDAUtils.h:43
#define __fsub_rn(x, y)
Definition: SVD3x3.h:63
#define __fadd_rn(x, y)
Definition: SVD3x3.h:62
#define __drsqrt_rn(x)
Definition: SVD3x3.h:68
#define gone
Definition: SVD3x3.h:39
#define __frsqrt_rn(x)
Definition: SVD3x3.h:64
#define gcosine_pi_over_eight
Definition: SVD3x3.h:42
#define __dadd_rn(x, y)
Definition: SVD3x3.h:66
#define gsine_pi_over_eight
Definition: SVD3x3.h:40
#define gfour_gamma_squared
Definition: SVD3x3.h:44
#define __dsub_rn(x, y)
Definition: SVD3x3.h:67
#define gtiny_number
Definition: SVD3x3.h:43
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3< double >(const double *A_3x3, double *U_3x3, double *S_3x1, double *V_3x3)
Definition: SVD3x3.h:93
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void matmul3x3_3x3(const scalar_t *A_3x3, const scalar_t *B_3x3, scalar_t *C_3x3)
Definition: Matrix.h:48
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3< float >(const float *A_3x3, float *U_3x3, float *S_3x1, float *V_3x3)
Definition: SVD3x3.h:1132
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3(const scalar_t *A_3x3, scalar_t *U_3x3, scalar_t *S_3x1, scalar_t *V_3x3)
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void solve_svd3x3(const scalar_t *A_3x3, const scalar_t *B_3x1, scalar_t *X_3x1)
Definition: SVD3x3.h:2171
OPEN3D_HOST_DEVICE OPEN3D_FORCE_INLINE void matmul3x3_3x1(const scalar_t *A_3x3, const scalar_t *B_3x1, scalar_t *C_3x1)
Definition: Matrix.h:39
Definition: PinholeCameraIntrinsic.cpp:16
Definition: SVD3x3.h:81
unsigned int ui
Definition: SVD3x3.h:83
scalar_t f
Definition: SVD3x3.h:82