1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18
19 #define _USE_MATH_DEFINES
20 #include <math.h>
21 #include <cmath>
22 #undef _USE_MATH_DEFINES
23
24 #include <string.h>
25 #include <algorithm>
26 #include <type_traits>
27
28 #include <tensorflow/core/lib/bfloat16/bfloat16.h>
29 #include <unsupported/Eigen/CXX11/Tensor>
30
31 #include "philox_random.h"
32
33 namespace tensorflow {
34 namespace random {
35
36 // Helper function to convert a 16-bit integer to a half between [0..1).
37 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
38 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
39 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
40 // Helper function to convert a 32-bit integer to a float between [0..1).
41 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
42 // Helper function to convert two 32-bit integers to a double between [0..1).
43 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
44
45 // Computes a + b. Requires that the result is representable in the destination
46 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
47 // need *not* be representable in that type. (The condition on b excludes the
48 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
49 // compute.)
50 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)51 PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b) {
52 // Implementation note: both b_div_2 and b - b_div_2 are positive and
53 // representatble as Int.
54 auto b_div_2 = b >> 1;
55 return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
56 }
57
58 // A class that generates uniform distribution random numbers from the
59 // underlying random integer generator.
60 // Arguments:
61 // Generator: a generator type that returns a number of uint32 upon each
62 // invocation. It needs to define kResultElementCount for the
63 // sample count for each invocation, and ResultType for the
64 // actual returned sample type.
65 // RealType: the data type of the real numbers that will be returned by the
66 // distribution. This could be either float or double for now.
67 // This class is meant to be implemented through specialization. The default
68 // is not defined by design.
69 template <class Generator, typename RealType>
70 class UniformDistribution;
71
72 template <class Generator>
73 class UniformDistribution<Generator, Eigen::half> {
74 public:
75 // The number of elements that will be returned.
76 static const int kResultElementCount = Generator::kResultElementCount;
77 // Cost of generation of a single element (in cycles).
78 static const int kElementCost = 3;
79 // Indicate that this distribution may take variable number of samples
80 // during the runtime.
81 static const bool kVariableSamplesPerOutput = false;
82 typedef Array<Eigen::half, kResultElementCount> ResultType;
83 typedef Eigen::half ResultElementType;
84
85 PHILOX_DEVICE_INLINE
operator()86 ResultType operator()(Generator* gen) {
87 typename Generator::ResultType sample = (*gen)();
88 ResultType result;
89 for (int i = 0; i < kResultElementCount; ++i) {
90 result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
91 }
92 return result;
93 }
94 };
95
96 template <class Generator>
97 class UniformDistribution<Generator, bfloat16> {
98 public:
99 // The number of elements that will be returned.
100 static const int kResultElementCount = Generator::kResultElementCount;
101 // Cost of generation of a single element (in cycles).
102 static const int kElementCost = 3;
103 // Indicate that this distribution may take variable number of samples
104 // during the runtime.
105 static const bool kVariableSamplesPerOutput = false;
106 typedef Array<bfloat16, kResultElementCount> ResultType;
107 typedef bfloat16 ResultElementType;
108
109 PHILOX_DEVICE_INLINE
operator()110 ResultType operator()(Generator* gen) {
111 typename Generator::ResultType sample = (*gen)();
112 ResultType result;
113 for (int i = 0; i < kResultElementCount; ++i) {
114 result[i] = Uint16ToGfloat16(sample[i]);
115 }
116 return result;
117 }
118 };
119
120 template <class Generator>
121 class UniformDistribution<Generator, float> {
122 public:
123 // The number of elements that will be returned.
124 static const int kResultElementCount = Generator::kResultElementCount;
125 // Cost of generation of a single element (in cycles).
126 static const int kElementCost = 3;
127 // Indicate that this distribution may take variable number of samples
128 // during the runtime.
129 static const bool kVariableSamplesPerOutput = false;
130 typedef Array<float, kResultElementCount> ResultType;
131 typedef float ResultElementType;
132
133 PHILOX_DEVICE_INLINE
operator()134 ResultType operator()(Generator* gen) {
135 typename Generator::ResultType sample = (*gen)();
136 ResultType result;
137 for (int i = 0; i < kResultElementCount; ++i) {
138 result[i] = Uint32ToFloat(sample[i]);
139 }
140 return result;
141 }
142 };
143
144 template <class Generator>
145 class UniformDistribution<Generator, double> {
146 public:
147 // The number of elements that will be returned.
148 static const int kResultElementCount = Generator::kResultElementCount / 2;
149 // Cost of generation of a single element (in cycles).
150 static const int kElementCost = 3;
151 // Indicate that this distribution may take variable number of samples
152 // during the runtime.
153 static const bool kVariableSamplesPerOutput = false;
154 typedef Array<double, kResultElementCount> ResultType;
155 typedef double ResultElementType;
156
157 PHILOX_DEVICE_INLINE
operator()158 ResultType operator()(Generator* gen) {
159 typename Generator::ResultType sample = (*gen)();
160 ResultType result;
161 for (int i = 0; i < kResultElementCount; ++i) {
162 result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
163 }
164 return result;
165 }
166 };
167
168 template <class Generator>
169 class UniformDistribution<Generator, int32> {
170 public:
171 // The number of elements that will be returned.
172 static const int kResultElementCount = Generator::kResultElementCount;
173 // Cost of generation of a single element (in cycles).
174 static const int kElementCost = 3;
175 // Indicate that this distribution may take variable number of samples
176 // during the runtime.
177 static const bool kVariableSamplesPerOutput = false;
178 typedef Array<int32, kResultElementCount> ResultType;
179 typedef int32 ResultElementType;
180
181 // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)182 UniformDistribution(int32 lo, int32 hi)
183 : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
184
185 PHILOX_DEVICE_INLINE
operator()186 ResultType operator()(Generator* gen) {
187 typename Generator::ResultType sample = (*gen)();
188 ResultType result;
189 for (int i = 0; i < kResultElementCount; ++i) {
190 result[i] = SignedAdd(lo_, sample[i] % range_);
191 }
192 return result;
193 }
194
195 private:
196 // Note that lo_ is intentionally signed while range_ is intentionally
197 // unsigned. This is because hi - lo can overflow signed integers if
198 // lo < 0 < hi, but always fits in unsigned.
199 int32 lo_;
200 uint32 range_;
201 };
202
203 template <class Generator>
204 class UniformDistribution<Generator, int64> {
205 public:
206 // The number of elements that will be returned.
207 static const int kResultElementCount = Generator::kResultElementCount / 2;
208 // Cost of generation of a single element (in cycles).
209 static const int kElementCost = 3;
210 // Indicate that this distribution may take variable number of samples
211 // during the runtime.
212 static const bool kVariableSamplesPerOutput = false;
213 typedef Array<int64, kResultElementCount> ResultType;
214 typedef int64 ResultElementType;
215
216 // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)217 UniformDistribution(int64 lo, int64 hi)
218 : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
219
220 PHILOX_DEVICE_INLINE
operator()221 ResultType operator()(Generator* gen) {
222 typename Generator::ResultType sample = (*gen)();
223 ResultType result;
224 for (int i = 0; i < kResultElementCount; ++i) {
225 auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
226 result[i] = SignedAdd(lo_, bits % range_);
227 }
228 return result;
229 }
230
231 private:
232 // Note that lo_ is intentionally signed while range_ is intentionally
233 // unsigned. This is because hi - lo can overflow signed integers if
234 // lo < 0 < hi, but always fits in unsigned.
235 int64 lo_;
236 uint64 range_;
237 };
238
239 // A class that adapts the underlying native multiple samples to return a single
240 // sample at a time.
241 template <class Generator>
242 class SingleSampleAdapter {
243 public:
244 // The number of elements that will be returned.
245 static const int kResultElementCount = 1;
246 // The number of elements that will be returned by the underlying generator.
247 static const int kNativeElementCount = Generator::kResultElementCount;
248 typedef typename Generator::ResultElementType ResultType;
249 typedef typename Generator::ResultElementType ResultElementType;
250
251 PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)252 explicit SingleSampleAdapter(Generator* gen)
253 : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
254
255 PHILOX_DEVICE_INLINE
operator()256 ResultType operator()() {
257 if (used_result_index_ == Generator::kResultElementCount) {
258 unused_results_ = (*generator_)();
259 used_result_index_ = 0;
260 }
261
262 return unused_results_[used_result_index_++];
263 }
264
265 PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)266 void Skip(uint64 num_skips) {
267 if (!num_skips) {
268 return;
269 }
270 int num_unused_results = kNativeElementCount - used_result_index_;
271 if (num_skips <= num_unused_results) {
272 used_result_index_ += num_skips;
273 return;
274 }
275 num_skips -= num_unused_results;
276 used_result_index_ = kNativeElementCount;
277 SkipFromGenerator(num_skips / kNativeElementCount);
278 num_skips = num_skips % kNativeElementCount;
279 if (num_skips) {
280 unused_results_ = (*generator_)();
281 used_result_index_ = num_skips;
282 }
283 }
284
285 private:
286 // This implementation iteratively skips over `num_skips` samples
287 // from `generator_`. There is an O(1) implementation for PhiloxRandom
288 // in random_distributions.cc.
289 PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)290 void SkipFromGenerator(uint64 num_skips) {
291 while (num_skips--) {
292 (*generator_)();
293 }
294 }
295
296 Generator* generator_;
297 typename Generator::ResultType unused_results_;
298 int used_result_index_;
299 };
300
301 // A class that generates unit normal distribution random numbers from the
302 // underlying random integer generator.
303 // Arguments:
304 // Generator: a generator type that returns a number of uint32 upon each
305 // each invocation. It needs to define kResultElementCount for the
306 // sample count for each invocation, and ResultType for actual
307 // returned sample type.
308 // RealType: the data type of the real numbers that will be returned by the
309 // distribution. This could be either float or double for now.
310 // This class is meant to be implemented through specialization. The default
311 // is not defined by design.
312 template <class Generator, typename RealType>
313 class NormalDistribution;
314
315 PHILOX_DEVICE_INLINE
316 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
317
318 PHILOX_DEVICE_INLINE
319 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1);
320
321 // Exactly like the float version, except that we convert to half afterwards;
322 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
323 // gain from working in half internally.
324 template <class Generator>
325 class NormalDistribution<Generator, Eigen::half> {
326 public:
327 // The number of elements that will be returned.
328 static const int kResultElementCount = Generator::kResultElementCount;
329 // Cost of generation of a single element (in cycles).
330 static const int kElementCost = 70;
331 // Indicate that this distribution may take variable number of samples
332 // during the runtime.
333 static const bool kVariableSamplesPerOutput = false;
334 typedef Array<Eigen::half, kResultElementCount> ResultType;
335 typedef Eigen::half ResultElementType;
336
337 PHILOX_DEVICE_INLINE
operator()338 ResultType operator()(Generator* gen) {
339 typename Generator::ResultType sample = (*gen)();
340 ResultType result;
341 for (int i = 0; i < kResultElementCount; i += 2) {
342 float f[2];
343 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
344 result[i] = Eigen::half(f[0]);
345 result[i + 1] = Eigen::half(f[1]);
346 }
347 return result;
348 }
349 };
350
351 template <class Generator>
352 class NormalDistribution<Generator, bfloat16> {
353 public:
354 // The number of elements that will be returned.
355 static const int kResultElementCount = Generator::kResultElementCount;
356 // Cost of generation of a single element (in cycles).
357 static const int kElementCost = 70;
358 // Indicate that this distribution may take variable number of samples
359 // during the runtime.
360 static const bool kVariableSamplesPerOutput = false;
361 typedef Array<bfloat16, kResultElementCount> ResultType;
362 typedef bfloat16 ResultElementType;
363
364 PHILOX_DEVICE_INLINE
operator()365 ResultType operator()(Generator* gen) {
366 typename Generator::ResultType sample = (*gen)();
367 ResultType result;
368 static_assert(kResultElementCount % 2 == 0, "kResultElementCount should be an even number");
369 for (int i = 0; i < kResultElementCount; i += 2) {
370 float f[2];
371 // Box-Muller transform requires processing 2 elements at a time.
372 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
373 result[i] = bfloat16(f[0]);
374 result[i + 1] = bfloat16(f[1]);
375 }
376 return result;
377 }
378 };
379
380 template <class Generator>
381 class NormalDistribution<Generator, float> {
382 public:
383 // The number of elements that will be returned.
384 static const int kResultElementCount = Generator::kResultElementCount;
385 // Cost of generation of a single element (in cycles).
386 static const int kElementCost = 70;
387 // Indicate that this distribution may take variable number of samples
388 // during the runtime.
389 static const bool kVariableSamplesPerOutput = false;
390 typedef Array<float, kResultElementCount> ResultType;
391 typedef float ResultElementType;
392
393 PHILOX_DEVICE_INLINE
operator()394 ResultType operator()(Generator* gen) {
395 typename Generator::ResultType sample = (*gen)();
396 ResultType result;
397 for (int i = 0; i < kResultElementCount; i += 2) {
398 BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
399 }
400 return result;
401 }
402 };
403
404 template <class Generator>
405 class NormalDistribution<Generator, double> {
406 public:
407 // The number of elements that will be returned.
408 static const int kResultElementCount = Generator::kResultElementCount / 2;
409 // Cost of generation of a single element (in cycles).
410 static const int kElementCost = 70;
411 // Indicate that this distribution may take variable number of samples
412 // during the runtime.
413 static const bool kVariableSamplesPerOutput = false;
414 typedef Array<double, kResultElementCount> ResultType;
415 typedef double ResultElementType;
416
417 PHILOX_DEVICE_INLINE
operator()418 ResultType operator()(Generator* gen) {
419 typename Generator::ResultType sample = (*gen)();
420 ResultType result;
421 for (int i = 0; i < kResultElementCount; i += 2) {
422 const int i2 = 2 * i;
423 BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
424 &result[i + 1]);
425 }
426 return result;
427 }
428 };
429
430 // A class that returns standard normal distribution between
431 // [-kTruncateValue, kTruncateValue].
432 // Arguments:
433 // Generator: a generator type that returns a number of uint32 upon each
434 // each invocation. It needs to define kResultElementCount for the
435 // sample count for each invocation, and ResultType for actual
436 // returned sample type.
437 // RealType: the data type of the real numbers that will be returned by the
438 // distribution. This could be either float or double for now.
439 // This class is meant to be implemented through specialization. The default
440 // is not defined by design.
441 template <class SingleSampleGenerator, typename RealType>
442 class TruncatedNormalDistribution;
443
444 // Exactly like the float version, except that we convert to half afterwards;
445 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
446 // gain from working in half internally.
447 template <class SingleSampleGenerator>
448 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
449 public:
450 // The number of elements that will be returned.
451 static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
452 // Cost of generation of a single element (in cycles).
453 static const int kElementCost = 90;
454 // Indicate that this distribution may take variable number of samples
455 // during the runtime.
456 static const bool kVariableSamplesPerOutput = true;
457 // The threshold where the normal distribution is truncated.
458 const float kTruncateValue = 2.0f;
459
460 typedef Array<Eigen::half, kResultElementCount> ResultType;
461 typedef Eigen::half ResultElementType;
462
463 PHILOX_DEVICE_INLINE
operator()464 ResultType operator()(SingleSampleGenerator* gen) {
465 ResultType results;
466 int index = 0;
467 while (true) {
468 // Repeatedly take samples from the normal distribution, until we have
469 // the desired number of elements that fall within the pre-defined cutoff
470 // threshold.
471 const uint32 x0 = (*gen)();
472 const uint32 x1 = (*gen)();
473 float f[2];
474 BoxMullerFloat(x0, x1, &f[0], &f[1]);
475
476 for (int i = 0; i < 2; ++i) {
477 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
478 results[index++] = Eigen::half(f[i]);
479 if (index >= kResultElementCount) {
480 return results;
481 }
482 }
483 }
484 }
485 }
486 };
487
488 template <class SingleSampleGenerator>
489 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
490 public:
491 // The number of elements that will be returned.
492 static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
493 // Cost of generation of a single element (in cycles).
494 static const int kElementCost = 90;
495 // Indicate that this distribution may take variable number of samples
496 // during the runtime.
497 static const bool kVariableSamplesPerOutput = true;
498 // The threshold where the normal distribution is truncated.
499 const float kTruncateValue = 2.0f;
500
501 typedef Array<bfloat16, kResultElementCount> ResultType;
502 typedef bfloat16 ResultElementType;
503
504 PHILOX_DEVICE_INLINE
operator()505 ResultType operator()(SingleSampleGenerator* gen) {
506 ResultType results;
507 int index = 0;
508 while (true) {
509 // Repeatedly take samples from the normal distribution, until we have
510 // the desired number of elements that fall within the pre-defined cutoff
511 // threshold.
512 const uint32 x0 = (*gen)();
513 const uint32 x1 = (*gen)();
514 float f[2];
515 BoxMullerFloat(x0, x1, &f[0], &f[1]);
516
517 for (int i = 0; i < 2; ++i) {
518 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
519 results[index++] = bfloat16(f[i]);
520 if (index >= kResultElementCount) {
521 return results;
522 }
523 }
524 }
525 }
526 }
527 };
528
529 // Partial specialization for float.
530 template <class SingleSampleGenerator>
531 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
532 public:
533 // The number of elements that will be returned.
534 static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
535 // Cost of generation of a single element (in cycles).
536 static const int kElementCost = 90;
537 // Indicate that this distribution may take variable number of samples
538 // during the runtime.
539 static const bool kVariableSamplesPerOutput = true;
540 // The threshold where the normal distribution is truncated.
541 const float kTruncateValue = 2.0f;
542
543 typedef Array<float, kResultElementCount> ResultType;
544 typedef float ResultElementType;
545
546 PHILOX_DEVICE_INLINE
operator()547 ResultType operator()(SingleSampleGenerator* gen) {
548 ResultType results;
549 int index = 0;
550 while (true) {
551 // Repeatedly take samples from the normal distribution, until we have
552 // the desired number of elements that fall within the pre-defined cutoff
553 // threshold.
554 const uint32 x0 = (*gen)();
555 const uint32 x1 = (*gen)();
556 float f[2];
557 BoxMullerFloat(x0, x1, &f[0], &f[1]);
558
559 for (int i = 0; i < 2; ++i) {
560 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
561 results[index++] = f[i];
562 if (index >= kResultElementCount) {
563 return results;
564 }
565 }
566 }
567 }
568 }
569 };
570
571 // Partial specialization for double.
572 template <class SingleSampleGenerator>
573 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
574 public:
575 // The number of elements that will be returned.
576 static const int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
577 ? SingleSampleGenerator::kNativeElementCount / 2
578 : 1;
579 // Cost of generation of a single element (in cycles).
580 static const int kElementCost = 90;
581 // Indicate that this distribution may take variable number of samples
582 // during the runtime.
583 static const bool kVariableSamplesPerOutput = true;
584 typedef Array<double, kResultElementCount> ResultType;
585 typedef double ResultElementType;
586 const double kTruncateValue = 2.0;
587
588 PHILOX_DEVICE_INLINE
operator()589 ResultType operator()(SingleSampleGenerator* gen) {
590 ResultType results;
591 int index = 0;
592 while (1) {
593 const uint32 x0 = (*gen)();
594 const uint32 x1 = (*gen)();
595 const uint32 x2 = (*gen)();
596 const uint32 x3 = (*gen)();
597 double d[2];
598 BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
599
600 for (int i = 0; i < 2; ++i) {
601 if (Eigen::numext::abs(d[i]) < kTruncateValue) {
602 results[index++] = d[i];
603 if (index >= kResultElementCount) {
604 return results;
605 }
606 }
607 }
608 }
609 }
610 };
611
612 // Helper function to convert two 32-bit uniform integers to two floats
613 // under the unit normal distribution.
614 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)615 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
616 // This function implements the Box-Muller transform:
617 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
618 // Do not send a really small number to log().
619 // We cannot mark "epsilon" as "static const" because NVCC would complain
620 const float epsilon = 1.0e-7f;
621 float u1 = Uint32ToFloat(x0);
622 if (u1 < epsilon) {
623 u1 = epsilon;
624 }
625 const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
626 const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
627 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
628 *f0 = Eigen::numext::sin(v1);
629 *f1 = Eigen::numext::cos(v1);
630 #else
631 sincosf(v1, f0, f1);
632 #endif
633 *f0 *= u2;
634 *f1 *= u2;
635 }
636
637 // Helper function to convert four 32-bit uniform integers to two doubles
638 // under the unit normal distribution.
639 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)640 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1) {
641 // This function implements the Box-Muller transform:
642 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
643 // Do not send a really small number to log().
644 // We cannot mark "epsilon" as "static const" because NVCC would complain
645 const double epsilon = 1.0e-7;
646 double u1 = Uint64ToDouble(x0, x1);
647 if (u1 < epsilon) {
648 u1 = epsilon;
649 }
650 const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
651 const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
652 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
653 *d0 = Eigen::numext::sin(v1);
654 *d1 = Eigen::numext::cos(v1);
655 #else
656 sincos(v1, d0, d1);
657 #endif
658 *d0 *= u2;
659 *d1 *= u2;
660 }
661
662 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)663 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
664 // IEEE754 halfs are formatted as follows (MSB first):
665 // sign(1) exponent(5) mantissa(10)
666 // Conceptually construct the following:
667 // sign == 0
668 // exponent == 15 -- an excess 15 representation of a zero exponent
669 // mantissa == 10 random bits
670 const uint16 man = x & 0x3ffu; // 10 bit mantissa
671 const uint16 exp = static_cast<uint16>(15);
672 const uint16 val = (exp << 10) | man;
673
674 Eigen::half result;
675 result.x = val;
676 return result - Eigen::half(1.0);
677 }
678
679 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
680 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)681 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
682 // bfloat are formatted as follows (MSB first):
683 // sign(1) exponent(8) mantissa(7)
684 // Conceptually construct the following:
685 // sign == 0
686 // exponent == 127 -- an excess 127 representation of a zero exponent
687 // mantissa == 7 random bits
688 const uint16 man = x & 0x7fu; // 7 bit mantissa
689 const uint16 exp = static_cast<uint16>(127);
690 const uint16 val = (exp << 7) | man;
691
692 bfloat16 result;
693 memcpy(&result, &val, sizeof(val));
694 // The mantissa has an implicit leading 1, so the above code creates a value
695 // in [1, 2). The minus will not cause a rounding that makes the result 1.
696 // Instead it will just be close to 1.
697 return result - bfloat16(1.0);
698 }
699
700 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)701 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
702 // IEEE754 floats are formatted as follows (MSB first):
703 // sign(1) exponent(8) mantissa(23)
704 // Conceptually construct the following:
705 // sign == 0
706 // exponent == 127 -- an excess 127 representation of a zero exponent
707 // mantissa == 23 random bits
708 const uint32 man = x & 0x7fffffu; // 23 bit mantissa
709 const uint32 exp = static_cast<uint32>(127);
710 const uint32 val = (exp << 23) | man;
711
712 // Assumes that endian-ness is same for float and uint32.
713 float result;
714 memcpy(&result, &val, sizeof(val));
715 return result - 1.0f;
716 }
717
718 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)719 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
720 // IEEE754 doubles are formatted as follows (MSB first):
721 // sign(1) exponent(11) mantissa(52)
722 // Conceptually construct the following:
723 // sign == 0
724 // exponent == 1023 -- an excess 1023 representation of a zero exponent
725 // mantissa == 52 random bits
726 const uint32 mhi = x0 & 0xfffffu; // upper 20 bits of mantissa
727 const uint32 mlo = x1; // lower 32 bits of mantissa
728 const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo; // mantissa
729 const uint64 exp = static_cast<uint64>(1023);
730 const uint64 val = (exp << 52) | man;
731 // Assumes that endian-ness is same for double and uint64.
732 double result;
733 memcpy(&result, &val, sizeof(val));
734 return result - 1.0;
735 }
736
737 } // namespace random
738 } // namespace tensorflow
739
740 #endif // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
741