1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18 
19 #define _USE_MATH_DEFINES
20 #include <math.h>
21 #include <cmath>
22 #undef _USE_MATH_DEFINES
23 
24 #include <string.h>
25 #include <algorithm>
26 #include <type_traits>
27 
28 #include <tensorflow/core/lib/bfloat16/bfloat16.h>
29 #include <unsupported/Eigen/CXX11/Tensor>
30 
31 #include "philox_random.h"
32 
33 namespace tensorflow {
34 namespace random {
35 
36 // Helper function to convert a 16-bit integer to a half between [0..1).
37 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
38 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
39 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
40 // Helper function to convert a 32-bit integer to a float between [0..1).
41 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
42 // Helper function to convert two 32-bit integers to a double between [0..1).
43 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
44 
45 // Computes a + b. Requires that the result is representable in the destination
46 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
47 // need *not* be representable in that type. (The condition on b excludes the
48 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
49 // compute.)
50 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)51 PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b) {
52     // Implementation note: both b_div_2 and b - b_div_2 are positive and
53     // representatble as Int.
54     auto b_div_2 = b >> 1;
55     return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
56 }
57 
58 // A class that generates uniform distribution random numbers from the
59 // underlying random integer generator.
60 // Arguments:
61 //   Generator: a generator type that returns a number of uint32 upon each
62 //              invocation. It needs to define kResultElementCount for the
63 //              sample count for each invocation, and ResultType for the
64 //              actual returned sample type.
65 //   RealType: the data type of the real numbers that will be returned by the
66 //             distribution. This could be either float or double for now.
67 // This class is meant to be implemented through specialization. The default
68 // is not defined by design.
69 template <class Generator, typename RealType>
70 class UniformDistribution;
71 
72 template <class Generator>
73 class UniformDistribution<Generator, Eigen::half> {
74    public:
75     // The number of elements that will be returned.
76     static const int kResultElementCount = Generator::kResultElementCount;
77     // Cost of generation of a single element (in cycles).
78     static const int kElementCost = 3;
79     // Indicate that this distribution may take variable number of samples
80     // during the runtime.
81     static const bool kVariableSamplesPerOutput = false;
82     typedef Array<Eigen::half, kResultElementCount> ResultType;
83     typedef Eigen::half ResultElementType;
84 
85     PHILOX_DEVICE_INLINE
operator()86     ResultType operator()(Generator* gen) {
87         typename Generator::ResultType sample = (*gen)();
88         ResultType result;
89         for (int i = 0; i < kResultElementCount; ++i) {
90             result[i] = Uint16ToHalf(sample[i]);  // Truncate the upper 16 bits.
91         }
92         return result;
93     }
94 };
95 
96 template <class Generator>
97 class UniformDistribution<Generator, bfloat16> {
98    public:
99     // The number of elements that will be returned.
100     static const int kResultElementCount = Generator::kResultElementCount;
101     // Cost of generation of a single element (in cycles).
102     static const int kElementCost = 3;
103     // Indicate that this distribution may take variable number of samples
104     // during the runtime.
105     static const bool kVariableSamplesPerOutput = false;
106     typedef Array<bfloat16, kResultElementCount> ResultType;
107     typedef bfloat16 ResultElementType;
108 
109     PHILOX_DEVICE_INLINE
operator()110     ResultType operator()(Generator* gen) {
111         typename Generator::ResultType sample = (*gen)();
112         ResultType result;
113         for (int i = 0; i < kResultElementCount; ++i) {
114             result[i] = Uint16ToGfloat16(sample[i]);
115         }
116         return result;
117     }
118 };
119 
120 template <class Generator>
121 class UniformDistribution<Generator, float> {
122    public:
123     // The number of elements that will be returned.
124     static const int kResultElementCount = Generator::kResultElementCount;
125     // Cost of generation of a single element (in cycles).
126     static const int kElementCost = 3;
127     // Indicate that this distribution may take variable number of samples
128     // during the runtime.
129     static const bool kVariableSamplesPerOutput = false;
130     typedef Array<float, kResultElementCount> ResultType;
131     typedef float ResultElementType;
132 
133     PHILOX_DEVICE_INLINE
operator()134     ResultType operator()(Generator* gen) {
135         typename Generator::ResultType sample = (*gen)();
136         ResultType result;
137         for (int i = 0; i < kResultElementCount; ++i) {
138             result[i] = Uint32ToFloat(sample[i]);
139         }
140         return result;
141     }
142 };
143 
144 template <class Generator>
145 class UniformDistribution<Generator, double> {
146    public:
147     // The number of elements that will be returned.
148     static const int kResultElementCount = Generator::kResultElementCount / 2;
149     // Cost of generation of a single element (in cycles).
150     static const int kElementCost = 3;
151     // Indicate that this distribution may take variable number of samples
152     // during the runtime.
153     static const bool kVariableSamplesPerOutput = false;
154     typedef Array<double, kResultElementCount> ResultType;
155     typedef double ResultElementType;
156 
157     PHILOX_DEVICE_INLINE
operator()158     ResultType operator()(Generator* gen) {
159         typename Generator::ResultType sample = (*gen)();
160         ResultType result;
161         for (int i = 0; i < kResultElementCount; ++i) {
162             result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
163         }
164         return result;
165     }
166 };
167 
168 template <class Generator>
169 class UniformDistribution<Generator, int32> {
170    public:
171     // The number of elements that will be returned.
172     static const int kResultElementCount = Generator::kResultElementCount;
173     // Cost of generation of a single element (in cycles).
174     static const int kElementCost = 3;
175     // Indicate that this distribution may take variable number of samples
176     // during the runtime.
177     static const bool kVariableSamplesPerOutput = false;
178     typedef Array<int32, kResultElementCount> ResultType;
179     typedef int32 ResultElementType;
180 
181     // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)182     UniformDistribution(int32 lo, int32 hi)
183         : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
184 
185     PHILOX_DEVICE_INLINE
operator()186     ResultType operator()(Generator* gen) {
187         typename Generator::ResultType sample = (*gen)();
188         ResultType result;
189         for (int i = 0; i < kResultElementCount; ++i) {
190             result[i] = SignedAdd(lo_, sample[i] % range_);
191         }
192         return result;
193     }
194 
195    private:
196     // Note that lo_ is intentionally signed while range_ is intentionally
197     // unsigned.  This is because hi - lo can overflow signed integers if
198     // lo < 0 < hi, but always fits in unsigned.
199     int32 lo_;
200     uint32 range_;
201 };
202 
203 template <class Generator>
204 class UniformDistribution<Generator, int64> {
205    public:
206     // The number of elements that will be returned.
207     static const int kResultElementCount = Generator::kResultElementCount / 2;
208     // Cost of generation of a single element (in cycles).
209     static const int kElementCost = 3;
210     // Indicate that this distribution may take variable number of samples
211     // during the runtime.
212     static const bool kVariableSamplesPerOutput = false;
213     typedef Array<int64, kResultElementCount> ResultType;
214     typedef int64 ResultElementType;
215 
216     // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)217     UniformDistribution(int64 lo, int64 hi)
218         : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
219 
220     PHILOX_DEVICE_INLINE
operator()221     ResultType operator()(Generator* gen) {
222         typename Generator::ResultType sample = (*gen)();
223         ResultType result;
224         for (int i = 0; i < kResultElementCount; ++i) {
225             auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
226             result[i] = SignedAdd(lo_, bits % range_);
227         }
228         return result;
229     }
230 
231    private:
232     // Note that lo_ is intentionally signed while range_ is intentionally
233     // unsigned.  This is because hi - lo can overflow signed integers if
234     // lo < 0 < hi, but always fits in unsigned.
235     int64 lo_;
236     uint64 range_;
237 };
238 
239 // A class that adapts the underlying native multiple samples to return a single
240 // sample at a time.
241 template <class Generator>
242 class SingleSampleAdapter {
243    public:
244     // The number of elements that will be returned.
245     static const int kResultElementCount = 1;
246     // The number of elements that will be returned by the underlying generator.
247     static const int kNativeElementCount = Generator::kResultElementCount;
248     typedef typename Generator::ResultElementType ResultType;
249     typedef typename Generator::ResultElementType ResultElementType;
250 
251     PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)252     explicit SingleSampleAdapter(Generator* gen)
253         : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
254 
255     PHILOX_DEVICE_INLINE
operator()256     ResultType operator()() {
257         if (used_result_index_ == Generator::kResultElementCount) {
258             unused_results_ = (*generator_)();
259             used_result_index_ = 0;
260         }
261 
262         return unused_results_[used_result_index_++];
263     }
264 
265     PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)266     void Skip(uint64 num_skips) {
267         if (!num_skips) {
268             return;
269         }
270         int num_unused_results = kNativeElementCount - used_result_index_;
271         if (num_skips <= num_unused_results) {
272             used_result_index_ += num_skips;
273             return;
274         }
275         num_skips -= num_unused_results;
276         used_result_index_ = kNativeElementCount;
277         SkipFromGenerator(num_skips / kNativeElementCount);
278         num_skips = num_skips % kNativeElementCount;
279         if (num_skips) {
280             unused_results_ = (*generator_)();
281             used_result_index_ = num_skips;
282         }
283     }
284 
285    private:
286     // This implementation iteratively skips over `num_skips` samples
287     // from `generator_`. There is an O(1) implementation for PhiloxRandom
288     // in random_distributions.cc.
289     PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)290     void SkipFromGenerator(uint64 num_skips) {
291         while (num_skips--) {
292             (*generator_)();
293         }
294     }
295 
296     Generator* generator_;
297     typename Generator::ResultType unused_results_;
298     int used_result_index_;
299 };
300 
301 // A class that generates unit normal distribution random numbers from the
302 // underlying random integer generator.
303 // Arguments:
304 //   Generator: a generator type that returns a number of uint32 upon each
305 //              each invocation. It needs to define kResultElementCount for the
306 //              sample count for each invocation, and ResultType for actual
307 //              returned sample type.
308 //   RealType: the data type of the real numbers that will be returned by the
309 //             distribution. This could be either float or double for now.
310 // This class is meant to be implemented through specialization. The default
311 // is not defined by design.
312 template <class Generator, typename RealType>
313 class NormalDistribution;
314 
315 PHILOX_DEVICE_INLINE
316 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
317 
318 PHILOX_DEVICE_INLINE
319 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1);
320 
321 // Exactly like the float version, except that we convert to half afterwards;
322 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
323 // gain from working in half internally.
324 template <class Generator>
325 class NormalDistribution<Generator, Eigen::half> {
326    public:
327     // The number of elements that will be returned.
328     static const int kResultElementCount = Generator::kResultElementCount;
329     // Cost of generation of a single element (in cycles).
330     static const int kElementCost = 70;
331     // Indicate that this distribution may take variable number of samples
332     // during the runtime.
333     static const bool kVariableSamplesPerOutput = false;
334     typedef Array<Eigen::half, kResultElementCount> ResultType;
335     typedef Eigen::half ResultElementType;
336 
337     PHILOX_DEVICE_INLINE
operator()338     ResultType operator()(Generator* gen) {
339         typename Generator::ResultType sample = (*gen)();
340         ResultType result;
341         for (int i = 0; i < kResultElementCount; i += 2) {
342             float f[2];
343             BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
344             result[i] = Eigen::half(f[0]);
345             result[i + 1] = Eigen::half(f[1]);
346         }
347         return result;
348     }
349 };
350 
351 template <class Generator>
352 class NormalDistribution<Generator, bfloat16> {
353    public:
354     // The number of elements that will be returned.
355     static const int kResultElementCount = Generator::kResultElementCount;
356     // Cost of generation of a single element (in cycles).
357     static const int kElementCost = 70;
358     // Indicate that this distribution may take variable number of samples
359     // during the runtime.
360     static const bool kVariableSamplesPerOutput = false;
361     typedef Array<bfloat16, kResultElementCount> ResultType;
362     typedef bfloat16 ResultElementType;
363 
364     PHILOX_DEVICE_INLINE
operator()365     ResultType operator()(Generator* gen) {
366         typename Generator::ResultType sample = (*gen)();
367         ResultType result;
368         static_assert(kResultElementCount % 2 == 0, "kResultElementCount should be an even number");
369         for (int i = 0; i < kResultElementCount; i += 2) {
370             float f[2];
371             // Box-Muller transform requires processing 2 elements at a time.
372             BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
373             result[i] = bfloat16(f[0]);
374             result[i + 1] = bfloat16(f[1]);
375         }
376         return result;
377     }
378 };
379 
380 template <class Generator>
381 class NormalDistribution<Generator, float> {
382    public:
383     // The number of elements that will be returned.
384     static const int kResultElementCount = Generator::kResultElementCount;
385     // Cost of generation of a single element (in cycles).
386     static const int kElementCost = 70;
387     // Indicate that this distribution may take variable number of samples
388     // during the runtime.
389     static const bool kVariableSamplesPerOutput = false;
390     typedef Array<float, kResultElementCount> ResultType;
391     typedef float ResultElementType;
392 
393     PHILOX_DEVICE_INLINE
operator()394     ResultType operator()(Generator* gen) {
395         typename Generator::ResultType sample = (*gen)();
396         ResultType result;
397         for (int i = 0; i < kResultElementCount; i += 2) {
398             BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
399         }
400         return result;
401     }
402 };
403 
404 template <class Generator>
405 class NormalDistribution<Generator, double> {
406    public:
407     // The number of elements that will be returned.
408     static const int kResultElementCount = Generator::kResultElementCount / 2;
409     // Cost of generation of a single element (in cycles).
410     static const int kElementCost = 70;
411     // Indicate that this distribution may take variable number of samples
412     // during the runtime.
413     static const bool kVariableSamplesPerOutput = false;
414     typedef Array<double, kResultElementCount> ResultType;
415     typedef double ResultElementType;
416 
417     PHILOX_DEVICE_INLINE
operator()418     ResultType operator()(Generator* gen) {
419         typename Generator::ResultType sample = (*gen)();
420         ResultType result;
421         for (int i = 0; i < kResultElementCount; i += 2) {
422             const int i2 = 2 * i;
423             BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
424                             &result[i + 1]);
425         }
426         return result;
427     }
428 };
429 
430 // A class that returns standard normal distribution between
431 // [-kTruncateValue, kTruncateValue].
432 // Arguments:
433 //   Generator: a generator type that returns a number of uint32 upon each
434 //              each invocation. It needs to define kResultElementCount for the
435 //              sample count for each invocation, and ResultType for actual
436 //              returned sample type.
437 //   RealType: the data type of the real numbers that will be returned by the
438 //             distribution. This could be either float or double for now.
439 // This class is meant to be implemented through specialization. The default
440 // is not defined by design.
441 template <class SingleSampleGenerator, typename RealType>
442 class TruncatedNormalDistribution;
443 
444 // Exactly like the float version, except that we convert to half afterwards;
445 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
446 // gain from working in half internally.
447 template <class SingleSampleGenerator>
448 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
449    public:
450     // The number of elements that will be returned.
451     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
452     // Cost of generation of a single element (in cycles).
453     static const int kElementCost = 90;
454     // Indicate that this distribution may take variable number of samples
455     // during the runtime.
456     static const bool kVariableSamplesPerOutput = true;
457     // The threshold where the normal distribution is truncated.
458     const float kTruncateValue = 2.0f;
459 
460     typedef Array<Eigen::half, kResultElementCount> ResultType;
461     typedef Eigen::half ResultElementType;
462 
463     PHILOX_DEVICE_INLINE
operator()464     ResultType operator()(SingleSampleGenerator* gen) {
465         ResultType results;
466         int index = 0;
467         while (true) {
468             // Repeatedly take samples from the normal distribution, until we have
469             // the desired number of elements that fall within the pre-defined cutoff
470             // threshold.
471             const uint32 x0 = (*gen)();
472             const uint32 x1 = (*gen)();
473             float f[2];
474             BoxMullerFloat(x0, x1, &f[0], &f[1]);
475 
476             for (int i = 0; i < 2; ++i) {
477                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
478                     results[index++] = Eigen::half(f[i]);
479                     if (index >= kResultElementCount) {
480                         return results;
481                     }
482                 }
483             }
484         }
485     }
486 };
487 
488 template <class SingleSampleGenerator>
489 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
490    public:
491     // The number of elements that will be returned.
492     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
493     // Cost of generation of a single element (in cycles).
494     static const int kElementCost = 90;
495     // Indicate that this distribution may take variable number of samples
496     // during the runtime.
497     static const bool kVariableSamplesPerOutput = true;
498     // The threshold where the normal distribution is truncated.
499     const float kTruncateValue = 2.0f;
500 
501     typedef Array<bfloat16, kResultElementCount> ResultType;
502     typedef bfloat16 ResultElementType;
503 
504     PHILOX_DEVICE_INLINE
operator()505     ResultType operator()(SingleSampleGenerator* gen) {
506         ResultType results;
507         int index = 0;
508         while (true) {
509             // Repeatedly take samples from the normal distribution, until we have
510             // the desired number of elements that fall within the pre-defined cutoff
511             // threshold.
512             const uint32 x0 = (*gen)();
513             const uint32 x1 = (*gen)();
514             float f[2];
515             BoxMullerFloat(x0, x1, &f[0], &f[1]);
516 
517             for (int i = 0; i < 2; ++i) {
518                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
519                     results[index++] = bfloat16(f[i]);
520                     if (index >= kResultElementCount) {
521                         return results;
522                     }
523                 }
524             }
525         }
526     }
527 };
528 
529 // Partial specialization for float.
530 template <class SingleSampleGenerator>
531 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
532    public:
533     // The number of elements that will be returned.
534     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
535     // Cost of generation of a single element (in cycles).
536     static const int kElementCost = 90;
537     // Indicate that this distribution may take variable number of samples
538     // during the runtime.
539     static const bool kVariableSamplesPerOutput = true;
540     // The threshold where the normal distribution is truncated.
541     const float kTruncateValue = 2.0f;
542 
543     typedef Array<float, kResultElementCount> ResultType;
544     typedef float ResultElementType;
545 
546     PHILOX_DEVICE_INLINE
operator()547     ResultType operator()(SingleSampleGenerator* gen) {
548         ResultType results;
549         int index = 0;
550         while (true) {
551             // Repeatedly take samples from the normal distribution, until we have
552             // the desired number of elements that fall within the pre-defined cutoff
553             // threshold.
554             const uint32 x0 = (*gen)();
555             const uint32 x1 = (*gen)();
556             float f[2];
557             BoxMullerFloat(x0, x1, &f[0], &f[1]);
558 
559             for (int i = 0; i < 2; ++i) {
560                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
561                     results[index++] = f[i];
562                     if (index >= kResultElementCount) {
563                         return results;
564                     }
565                 }
566             }
567         }
568     }
569 };
570 
571 // Partial specialization for double.
572 template <class SingleSampleGenerator>
573 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
574    public:
575     // The number of elements that will be returned.
576     static const int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
577                                                    ? SingleSampleGenerator::kNativeElementCount / 2
578                                                    : 1;
579     // Cost of generation of a single element (in cycles).
580     static const int kElementCost = 90;
581     // Indicate that this distribution may take variable number of samples
582     // during the runtime.
583     static const bool kVariableSamplesPerOutput = true;
584     typedef Array<double, kResultElementCount> ResultType;
585     typedef double ResultElementType;
586     const double kTruncateValue = 2.0;
587 
588     PHILOX_DEVICE_INLINE
operator()589     ResultType operator()(SingleSampleGenerator* gen) {
590         ResultType results;
591         int index = 0;
592         while (1) {
593             const uint32 x0 = (*gen)();
594             const uint32 x1 = (*gen)();
595             const uint32 x2 = (*gen)();
596             const uint32 x3 = (*gen)();
597             double d[2];
598             BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
599 
600             for (int i = 0; i < 2; ++i) {
601                 if (Eigen::numext::abs(d[i]) < kTruncateValue) {
602                     results[index++] = d[i];
603                     if (index >= kResultElementCount) {
604                         return results;
605                     }
606                 }
607             }
608         }
609     }
610 };
611 
612 // Helper function to convert two 32-bit uniform integers to two floats
613 // under the unit normal distribution.
614 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)615 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
616     // This function implements the Box-Muller transform:
617     // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
618     // Do not send a really small number to log().
619     // We cannot mark "epsilon" as "static const" because NVCC would complain
620     const float epsilon = 1.0e-7f;
621     float u1 = Uint32ToFloat(x0);
622     if (u1 < epsilon) {
623         u1 = epsilon;
624     }
625     const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
626     const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
627 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
628     *f0 = Eigen::numext::sin(v1);
629     *f1 = Eigen::numext::cos(v1);
630 #else
631     sincosf(v1, f0, f1);
632 #endif
633     *f0 *= u2;
634     *f1 *= u2;
635 }
636 
637 // Helper function to convert four 32-bit uniform integers to two doubles
638 // under the unit normal distribution.
639 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)640 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1) {
641     // This function implements the Box-Muller transform:
642     // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
643     // Do not send a really small number to log().
644     // We cannot mark "epsilon" as "static const" because NVCC would complain
645     const double epsilon = 1.0e-7;
646     double u1 = Uint64ToDouble(x0, x1);
647     if (u1 < epsilon) {
648         u1 = epsilon;
649     }
650     const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
651     const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
652 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
653     *d0 = Eigen::numext::sin(v1);
654     *d1 = Eigen::numext::cos(v1);
655 #else
656     sincos(v1, d0, d1);
657 #endif
658     *d0 *= u2;
659     *d1 *= u2;
660 }
661 
662 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)663 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
664     // IEEE754 halfs are formatted as follows (MSB first):
665     //    sign(1) exponent(5) mantissa(10)
666     // Conceptually construct the following:
667     //    sign == 0
668     //    exponent == 15  -- an excess 15 representation of a zero exponent
669     //    mantissa == 10 random bits
670     const uint16 man = x & 0x3ffu;  // 10 bit mantissa
671     const uint16 exp = static_cast<uint16>(15);
672     const uint16 val = (exp << 10) | man;
673 
674     Eigen::half result;
675     result.x = val;
676     return result - Eigen::half(1.0);
677 }
678 
679 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
680 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)681 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
682     // bfloat are formatted as follows (MSB first):
683     //    sign(1) exponent(8) mantissa(7)
684     // Conceptually construct the following:
685     //    sign == 0
686     //    exponent == 127  -- an excess 127 representation of a zero exponent
687     //    mantissa == 7 random bits
688     const uint16 man = x & 0x7fu;  // 7 bit mantissa
689     const uint16 exp = static_cast<uint16>(127);
690     const uint16 val = (exp << 7) | man;
691 
692     bfloat16 result;
693     memcpy(&result, &val, sizeof(val));
694     // The mantissa has an implicit leading 1, so the above code creates a value
695     // in [1, 2). The minus will not cause a rounding that makes the result 1.
696     // Instead it will just be close to 1.
697     return result - bfloat16(1.0);
698 }
699 
700 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)701 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
702     // IEEE754 floats are formatted as follows (MSB first):
703     //    sign(1) exponent(8) mantissa(23)
704     // Conceptually construct the following:
705     //    sign == 0
706     //    exponent == 127  -- an excess 127 representation of a zero exponent
707     //    mantissa == 23 random bits
708     const uint32 man = x & 0x7fffffu;  // 23 bit mantissa
709     const uint32 exp = static_cast<uint32>(127);
710     const uint32 val = (exp << 23) | man;
711 
712     // Assumes that endian-ness is same for float and uint32.
713     float result;
714     memcpy(&result, &val, sizeof(val));
715     return result - 1.0f;
716 }
717 
718 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)719 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
720     // IEEE754 doubles are formatted as follows (MSB first):
721     //    sign(1) exponent(11) mantissa(52)
722     // Conceptually construct the following:
723     //    sign == 0
724     //    exponent == 1023  -- an excess 1023 representation of a zero exponent
725     //    mantissa == 52 random bits
726     const uint32 mhi = x0 & 0xfffffu;                           // upper 20 bits of mantissa
727     const uint32 mlo = x1;                                      // lower 32 bits of mantissa
728     const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo;  // mantissa
729     const uint64 exp = static_cast<uint64>(1023);
730     const uint64 val = (exp << 52) | man;
731     // Assumes that endian-ness is same for double and uint64.
732     double result;
733     memcpy(&result, &val, sizeof(val));
734     return result - 1.0;
735 }
736 
737 }  // namespace random
738 }  // namespace tensorflow
739 
740 #endif  // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
741