1 /*
2  * Copyright (C) 2019 The Android Open Source Project
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *  * Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  *  * Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in
12  *    the documentation and/or other materials provided with the
13  *    distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "private/bionic_elf_tls.h"
30 
31 #include <async_safe/log.h>
32 #include <string.h>
33 #include <sys/param.h>
34 #include <unistd.h>
35 
36 #include "private/ScopedRWLock.h"
37 #include "private/ScopedSignalBlocker.h"
38 #include "private/bionic_globals.h"
39 #include "platform/bionic/macros.h"
40 #include "private/bionic_tls.h"
41 #include "pthread_internal.h"
42 
43 // Every call to __tls_get_addr needs to check the generation counter, so
44 // accesses to the counter need to be as fast as possible. Keep a copy of it in
45 // a hidden variable, which can be accessed without using the GOT. The linker
46 // will update this variable when it updates its counter.
47 //
48 // To allow the linker to update this variable, libc.so's constructor passes its
49 // address to the linker. To accommodate a possible __tls_get_addr call before
50 // libc.so's constructor, this local copy is initialized to SIZE_MAX, forcing
51 // __tls_get_addr to initially use the slow path.
52 __LIBC_HIDDEN__ _Atomic(size_t) __libc_tls_generation_copy = SIZE_MAX;
53 
54 // Search for a TLS segment in the given phdr table. Returns true if it has a
55 // TLS segment and false otherwise.
__bionic_get_tls_segment(const ElfW (Phdr)* phdr_table,size_t phdr_count,ElfW (Addr)load_bias,TlsSegment * out)56 bool __bionic_get_tls_segment(const ElfW(Phdr)* phdr_table, size_t phdr_count,
57                               ElfW(Addr) load_bias, TlsSegment* out) {
58   for (size_t i = 0; i < phdr_count; ++i) {
59     const ElfW(Phdr)& phdr = phdr_table[i];
60     if (phdr.p_type == PT_TLS) {
61       *out = TlsSegment {
62         phdr.p_memsz,
63         phdr.p_align,
64         reinterpret_cast<void*>(load_bias + phdr.p_vaddr),
65         phdr.p_filesz,
66       };
67       return true;
68     }
69   }
70   return false;
71 }
72 
73 // Return true if the alignment of a TLS segment is a valid power-of-two. Also
74 // cap the alignment if it's too high.
__bionic_check_tls_alignment(size_t * alignment)75 bool __bionic_check_tls_alignment(size_t* alignment) {
76   // N.B. The size does not need to be a multiple of the alignment. With
77   // ld.bfd (or after using binutils' strip), the TLS segment's size isn't
78   // rounded up.
79   if (*alignment == 0 || !powerof2(*alignment)) {
80     return false;
81   }
82   // Bionic only respects TLS alignment up to one page.
83   *alignment = MIN(*alignment, PAGE_SIZE);
84   return true;
85 }
86 
offset_thread_pointer() const87 size_t StaticTlsLayout::offset_thread_pointer() const {
88   return offset_bionic_tcb_ + (-MIN_TLS_SLOT * sizeof(void*));
89 }
90 
91 // Reserves space for the Bionic TCB and the executable's TLS segment. Returns
92 // the offset of the executable's TLS segment.
reserve_exe_segment_and_tcb(const TlsSegment * exe_segment,const char * progname)93 size_t StaticTlsLayout::reserve_exe_segment_and_tcb(const TlsSegment* exe_segment,
94                                                     const char* progname __attribute__((unused))) {
95   // Special case: if the executable has no TLS segment, then just allocate a
96   // TCB and skip the minimum alignment check on ARM.
97   if (exe_segment == nullptr) {
98     offset_bionic_tcb_ = reserve_type<bionic_tcb>();
99     return 0;
100   }
101 
102 #if defined(__arm__) || defined(__aarch64__)
103 
104   // First reserve enough space for the TCB before the executable segment.
105   reserve(sizeof(bionic_tcb), 1);
106 
107   // Then reserve the segment itself.
108   const size_t result = reserve(exe_segment->size, exe_segment->alignment);
109 
110   // The variant 1 ABI that ARM linkers follow specifies a 2-word TCB between
111   // the thread pointer and the start of the executable's TLS segment, but both
112   // the thread pointer and the TLS segment are aligned appropriately for the
113   // TLS segment. Calculate the distance between the thread pointer and the
114   // EXE's segment.
115   const size_t exe_tpoff = __BIONIC_ALIGN(sizeof(void*) * 2, exe_segment->alignment);
116 
117   const size_t min_bionic_alignment = BIONIC_ROUND_UP_POWER_OF_2(MAX_TLS_SLOT) * sizeof(void*);
118   if (exe_tpoff < min_bionic_alignment) {
119     async_safe_fatal("error: \"%s\": executable's TLS segment is underaligned: "
120                      "alignment is %zu, needs to be at least %zu for %s Bionic",
121                      progname, exe_segment->alignment, min_bionic_alignment,
122                      (sizeof(void*) == 4 ? "ARM" : "ARM64"));
123   }
124 
125   offset_bionic_tcb_ = result - exe_tpoff - (-MIN_TLS_SLOT * sizeof(void*));
126   return result;
127 
128 #elif defined(__i386__) || defined(__x86_64__)
129 
130   // x86 uses variant 2 TLS layout. The executable's segment is located just
131   // before the TCB.
132   static_assert(MIN_TLS_SLOT == 0, "First slot of bionic_tcb must be slot #0 on x86");
133   const size_t exe_size = round_up_with_overflow_check(exe_segment->size, exe_segment->alignment);
134   reserve(exe_size, 1);
135   const size_t max_align = MAX(alignof(bionic_tcb), exe_segment->alignment);
136   offset_bionic_tcb_ = reserve(sizeof(bionic_tcb), max_align);
137   return offset_bionic_tcb_ - exe_size;
138 
139 #else
140 #error "Unrecognized architecture"
141 #endif
142 }
143 
reserve_bionic_tls()144 void StaticTlsLayout::reserve_bionic_tls() {
145   offset_bionic_tls_ = reserve_type<bionic_tls>();
146 }
147 
finish_layout()148 void StaticTlsLayout::finish_layout() {
149   // Round the offset up to the alignment.
150   offset_ = round_up_with_overflow_check(offset_, alignment_);
151 
152   if (overflowed_) {
153     async_safe_fatal("error: TLS segments in static TLS overflowed");
154   }
155 }
156 
157 // The size is not required to be a multiple of the alignment. The alignment
158 // must be a positive power-of-two.
reserve(size_t size,size_t alignment)159 size_t StaticTlsLayout::reserve(size_t size, size_t alignment) {
160   offset_ = round_up_with_overflow_check(offset_, alignment);
161   const size_t result = offset_;
162   if (__builtin_add_overflow(offset_, size, &offset_)) overflowed_ = true;
163   alignment_ = MAX(alignment_, alignment);
164   return result;
165 }
166 
round_up_with_overflow_check(size_t value,size_t alignment)167 size_t StaticTlsLayout::round_up_with_overflow_check(size_t value, size_t alignment) {
168   const size_t old_value = value;
169   value = __BIONIC_ALIGN(value, alignment);
170   if (value < old_value) overflowed_ = true;
171   return value;
172 }
173 
174 // Copy each TLS module's initialization image into a newly-allocated block of
175 // static TLS memory. To reduce dirty pages, this function only writes to pages
176 // within the static TLS that need initialization. The memory should already be
177 // zero-initialized on entry.
__init_static_tls(void * static_tls)178 void __init_static_tls(void* static_tls) {
179   // The part of the table we care about (i.e. static TLS modules) never changes
180   // after startup, but we still need the mutex because the table could grow,
181   // moving the initial part. If this locking is too slow, we can duplicate the
182   // static part of the table.
183   TlsModules& modules = __libc_shared_globals()->tls_modules;
184   ScopedSignalBlocker ssb;
185   ScopedReadLock locker(&modules.rwlock);
186 
187   for (size_t i = 0; i < modules.module_count; ++i) {
188     TlsModule& module = modules.module_table[i];
189     if (module.static_offset == SIZE_MAX) {
190       // All of the static modules come before all of the dynamic modules, so
191       // once we see the first dynamic module, we're done.
192       break;
193     }
194     if (module.segment.init_size == 0) {
195       // Skip the memcpy call for TLS segments with no initializer, which is
196       // common.
197       continue;
198     }
199     memcpy(static_cast<char*>(static_tls) + module.static_offset,
200            module.segment.init_ptr,
201            module.segment.init_size);
202   }
203 }
204 
dtv_size_in_bytes(size_t module_count)205 static inline size_t dtv_size_in_bytes(size_t module_count) {
206   return sizeof(TlsDtv) + module_count * sizeof(void*);
207 }
208 
209 // Calculates the number of module slots to allocate in a new DTV. For small
210 // objects (up to 1KiB), the TLS allocator allocates memory in power-of-2 sizes,
211 // so for better space usage, ensure that the DTV size (header + slots) is a
212 // power of 2.
213 //
214 // The lock on TlsModules must be held.
calculate_new_dtv_count()215 static size_t calculate_new_dtv_count() {
216   size_t loaded_cnt = __libc_shared_globals()->tls_modules.module_count;
217   size_t bytes = dtv_size_in_bytes(MAX(1, loaded_cnt));
218   if (!powerof2(bytes)) {
219     bytes = BIONIC_ROUND_UP_POWER_OF_2(bytes);
220   }
221   return (bytes - sizeof(TlsDtv)) / sizeof(void*);
222 }
223 
224 // This function must be called with signals blocked and a write lock on
225 // TlsModules held.
update_tls_dtv(bionic_tcb * tcb)226 static void update_tls_dtv(bionic_tcb* tcb) {
227   const TlsModules& modules = __libc_shared_globals()->tls_modules;
228   BionicAllocator& allocator = __libc_shared_globals()->tls_allocator;
229 
230   // Use the generation counter from the shared globals instead of the local
231   // copy, which won't be initialized yet if __tls_get_addr is called before
232   // libc.so's constructor.
233   if (__get_tcb_dtv(tcb)->generation == atomic_load(&modules.generation)) {
234     return;
235   }
236 
237   const size_t old_cnt = __get_tcb_dtv(tcb)->count;
238 
239   // If the DTV isn't large enough, allocate a larger one. Because a signal
240   // handler could interrupt the fast path of __tls_get_addr, we don't free the
241   // old DTV. Instead, we add the old DTV to a list, then free all of a thread's
242   // DTVs at thread-exit. Each time the DTV is reallocated, its size at least
243   // doubles.
244   if (modules.module_count > old_cnt) {
245     size_t new_cnt = calculate_new_dtv_count();
246     TlsDtv* const old_dtv = __get_tcb_dtv(tcb);
247     TlsDtv* const new_dtv = static_cast<TlsDtv*>(allocator.alloc(dtv_size_in_bytes(new_cnt)));
248     memcpy(new_dtv, old_dtv, dtv_size_in_bytes(old_cnt));
249     new_dtv->count = new_cnt;
250     new_dtv->next = old_dtv;
251     __set_tcb_dtv(tcb, new_dtv);
252   }
253 
254   TlsDtv* const dtv = __get_tcb_dtv(tcb);
255 
256   const StaticTlsLayout& layout = __libc_shared_globals()->static_tls_layout;
257   char* static_tls = reinterpret_cast<char*>(tcb) - layout.offset_bionic_tcb();
258 
259   // Initialize static TLS modules and free unloaded modules.
260   for (size_t i = 0; i < dtv->count; ++i) {
261     if (i < modules.module_count) {
262       const TlsModule& mod = modules.module_table[i];
263       if (mod.static_offset != SIZE_MAX) {
264         dtv->modules[i] = static_tls + mod.static_offset;
265         continue;
266       }
267       if (mod.first_generation != kTlsGenerationNone &&
268           mod.first_generation <= dtv->generation) {
269         continue;
270       }
271     }
272     allocator.free(dtv->modules[i]);
273     dtv->modules[i] = nullptr;
274   }
275 
276   dtv->generation = atomic_load(&modules.generation);
277 }
278 
tls_get_addr_slow_path(const TlsIndex * ti)279 __attribute__((noinline)) static void* tls_get_addr_slow_path(const TlsIndex* ti) {
280   TlsModules& modules = __libc_shared_globals()->tls_modules;
281   bionic_tcb* tcb = __get_bionic_tcb();
282 
283   // Block signals and lock TlsModules. We may need the allocator, so take
284   // a write lock.
285   ScopedSignalBlocker ssb;
286   ScopedWriteLock locker(&modules.rwlock);
287 
288   update_tls_dtv(tcb);
289 
290   TlsDtv* dtv = __get_tcb_dtv(tcb);
291   const size_t module_idx = __tls_module_id_to_idx(ti->module_id);
292   void* mod_ptr = dtv->modules[module_idx];
293   if (mod_ptr == nullptr) {
294     const TlsSegment& segment = modules.module_table[module_idx].segment;
295     mod_ptr = __libc_shared_globals()->tls_allocator.memalign(segment.alignment, segment.size);
296     if (segment.init_size > 0) {
297       memcpy(mod_ptr, segment.init_ptr, segment.init_size);
298     }
299     dtv->modules[module_idx] = mod_ptr;
300   }
301 
302   return static_cast<char*>(mod_ptr) + ti->offset;
303 }
304 
305 // Returns the address of a thread's TLS memory given a module ID and an offset
306 // into that module's TLS segment. This function is called on every access to a
307 // dynamic TLS variable on targets that don't use TLSDESC. arm64 uses TLSDESC,
308 // so it only calls this function on a thread's first access to a module's TLS
309 // segment.
310 //
311 // On most targets, this accessor function is __tls_get_addr and
312 // TLS_GET_ADDR_CCONV is unset. 32-bit x86 uses ___tls_get_addr instead and a
313 // regparm() calling convention.
TLS_GET_ADDR(const TlsIndex * ti)314 extern "C" void* TLS_GET_ADDR(const TlsIndex* ti) TLS_GET_ADDR_CCONV {
315   TlsDtv* dtv = __get_tcb_dtv(__get_bionic_tcb());
316 
317   // TODO: See if we can use a relaxed memory ordering here instead.
318   size_t generation = atomic_load(&__libc_tls_generation_copy);
319   if (__predict_true(generation == dtv->generation)) {
320     void* mod_ptr = dtv->modules[__tls_module_id_to_idx(ti->module_id)];
321     if (__predict_true(mod_ptr != nullptr)) {
322       return static_cast<char*>(mod_ptr) + ti->offset;
323     }
324   }
325 
326   return tls_get_addr_slow_path(ti);
327 }
328 
329 // This function frees:
330 //  - TLS modules referenced by the current DTV.
331 //  - The list of DTV objects associated with the current thread.
332 //
333 // The caller must have already blocked signals.
__free_dynamic_tls(bionic_tcb * tcb)334 void __free_dynamic_tls(bionic_tcb* tcb) {
335   TlsModules& modules = __libc_shared_globals()->tls_modules;
336   BionicAllocator& allocator = __libc_shared_globals()->tls_allocator;
337 
338   // If we didn't allocate any dynamic memory, skip out early without taking
339   // the lock.
340   TlsDtv* dtv = __get_tcb_dtv(tcb);
341   if (dtv->generation == kTlsGenerationNone) {
342     return;
343   }
344 
345   // We need the write lock to use the allocator.
346   ScopedWriteLock locker(&modules.rwlock);
347 
348   // First free everything in the current DTV.
349   for (size_t i = 0; i < dtv->count; ++i) {
350     if (i < modules.module_count && modules.module_table[i].static_offset != SIZE_MAX) {
351       // This module's TLS memory is allocated statically, so don't free it here.
352       continue;
353     }
354     allocator.free(dtv->modules[i]);
355   }
356 
357   // Now free the thread's list of DTVs.
358   while (dtv->generation != kTlsGenerationNone) {
359     TlsDtv* next = dtv->next;
360     allocator.free(dtv);
361     dtv = next;
362   }
363 
364   // Clear the DTV slot. The DTV must not be used again with this thread.
365   tcb->tls_slot(TLS_SLOT_DTV) = nullptr;
366 }
367