LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43  KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48  "5.0 (201611)";
49 #elif OMP_45_ENABLED
50  "4.5 (201511)";
51 #elif OMP_40_ENABLED
52  "4.0 (201307)";
53 #else
54  "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59  KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73  int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75  kmp_internal_control_t *new_icvs,
76  ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79  int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85  kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nWish, int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
99 /* Calculate the identifier of the current thread */
100 /* fast (and somewhat portable) way to get unique identifier of executing
101  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
102 int __kmp_get_global_thread_id() {
103  int i;
104  kmp_info_t **other_threads;
105  size_t stack_data;
106  char *stack_addr;
107  size_t stack_size;
108  char *stack_base;
109 
110  KA_TRACE(
111  1000,
112  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
113  __kmp_nth, __kmp_all_nth));
114 
115  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
116  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
117  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
118  __kmp_init_gtid for this to work. */
119 
120  if (!TCR_4(__kmp_init_gtid))
121  return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124  if (TCR_4(__kmp_gtid_mode) >= 3) {
125  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126  return __kmp_gtid;
127  }
128 #endif
129  if (TCR_4(__kmp_gtid_mode) >= 2) {
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131  return __kmp_gtid_get_specific();
132  }
133  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135  stack_addr = (char *)&stack_data;
136  other_threads = __kmp_threads;
137 
138  /* ATT: The code below is a source of potential bugs due to unsynchronized
139  access to __kmp_threads array. For example:
140  1. Current thread loads other_threads[i] to thr and checks it, it is
141  non-NULL.
142  2. Current thread is suspended by OS.
143  3. Another thread unregisters and finishes (debug versions of free()
144  may fill memory with something like 0xEF).
145  4. Current thread is resumed.
146  5. Current thread reads junk from *thr.
147  TODO: Fix it. --ln */
148 
149  for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152  if (!thr)
153  continue;
154 
155  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158  /* stack grows down -- search through all of the active threads */
159 
160  if (stack_addr <= stack_base) {
161  size_t stack_diff = stack_base - stack_addr;
162 
163  if (stack_diff <= stack_size) {
164  /* The only way we can be closer than the allocated */
165  /* stack size is if we are running on this thread. */
166  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167  return i;
168  }
169  }
170  }
171 
172  /* get specific to try and determine our gtid */
173  KA_TRACE(1000,
174  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175  "thread, using TLS\n"));
176  i = __kmp_gtid_get_specific();
177 
178  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
179 
180  /* if we haven't been assigned a gtid, return that (negative) code */
181  if (i < 0)
182  return i;
183 
184  /* dynamically updated stack window for uber threads to avoid get_specific
185  call */
186  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187  KMP_FATAL(StackOverflow, i);
188  }
189 
190  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
191  if (stack_addr > stack_base) {
192  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195  stack_base);
196  } else {
197  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198  stack_base - stack_addr);
199  }
200 
201  /* Reprint stack bounds for ubermaster since they have been refined */
202  if (__kmp_storage_map) {
203  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206  other_threads[i]->th.th_info.ds.ds_stacksize,
207  "th_%d stack (refinement)", i);
208  }
209  return i;
210 }
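/* [Editorial note: illustrative sketch, not part of the original source.]
   When neither TDATA nor keyed TLS is available (gtid_mode < 2), the lookup
   above maps the address of a local variable to a thread via the interval
   test
       stack_base - stack_size <= stack_addr <= stack_base
   (the stack grows down). For example, with stack_base = 0x7fff00000000 and
   stack_size = 0x800000 (8 MiB), any automatic variable whose address lies
   in [0x7ffeff800000, 0x7fff00000000] belongs to that thread's stack. */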
211 
212 int __kmp_get_global_thread_id_reg() {
213  int gtid;
214 
215  if (!__kmp_init_serial) {
216  gtid = KMP_GTID_DNE;
217  } else
218 #ifdef KMP_TDATA_GTID
219  if (TCR_4(__kmp_gtid_mode) >= 3) {
220  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221  gtid = __kmp_gtid;
222  } else
223 #endif
224  if (TCR_4(__kmp_gtid_mode) >= 2) {
225  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226  gtid = __kmp_gtid_get_specific();
227  } else {
228  KA_TRACE(1000,
229  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230  gtid = __kmp_get_global_thread_id();
231  }
232 
233  /* we must be a new uber master sibling thread */
234  if (gtid == KMP_GTID_DNE) {
235  KA_TRACE(10,
236  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237  "Registering a new gtid.\n"));
238  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239  if (!__kmp_init_serial) {
240  __kmp_do_serial_initialize();
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  gtid = __kmp_register_root(FALSE);
244  }
245  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247  }
248 
249  KMP_DEBUG_ASSERT(gtid >= 0);
250 
251  return gtid;
252 }
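/* [Editorial note: illustrative sketch, not part of the original source.]
   The KMP_GTID_DNE branch above is a double-checked initialization: the
   cheap "!__kmp_init_serial" test is repeated after acquiring
   __kmp_initz_lock, so only one thread performs __kmp_do_serial_initialize()
   while any other late-arriving (foreign) thread simply registers itself as
   a new root via __kmp_register_root(FALSE). */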
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256  int f;
257  char *stack_beg = NULL;
258  char *stack_end = NULL;
259  int gtid;
260 
261  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262  if (__kmp_storage_map) {
263  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266  gtid = __kmp_gtid_from_thread(th);
267 
268  if (gtid == KMP_GTID_MONITOR) {
269  __kmp_print_storage_map_gtid(
270  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271  "th_%s stack (%s)", "mon",
272  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273  } else {
274  __kmp_print_storage_map_gtid(
275  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276  "th_%d stack (%s)", gtid,
277  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278  }
279  }
280 
281  /* No point in checking ubermaster threads since they use refinement and
282  * cannot overlap */
283  gtid = __kmp_gtid_from_thread(th);
284  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285  KA_TRACE(10,
286  ("__kmp_check_stack_overlap: performing extensive checking\n"));
287  if (stack_beg == NULL) {
288  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290  }
291 
292  for (f = 0; f < __kmp_threads_capacity; f++) {
293  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295  if (f_th && f_th != th) {
296  char *other_stack_end =
297  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298  char *other_stack_beg =
299  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
300  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303  /* Print the other stack values before the abort */
304  if (__kmp_storage_map)
305  __kmp_print_storage_map_gtid(
306  -1, other_stack_beg, other_stack_end,
307  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311  __kmp_msg_null);
312  }
313  }
314  }
315  }
316  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
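/* [Editorial note: illustrative sketch, not part of the original source.]
   The overlap test above reports a conflict when either boundary of this
   thread's stack lies strictly inside another thread's [beg, end) range.
   For example, stacks [0x1000, 0x2000) and [0x1800, 0x2800) are flagged
   because 0x2000 falls between 0x1800 and 0x2800. */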
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322  static int done = FALSE;
323 
324  while (!done) {
325  KMP_YIELD(1);
326  }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332  char const *format, ...) {
333  char buffer[MAX_MESSAGE];
334  va_list ap;
335 
336  va_start(ap, format);
337  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338  p2, (unsigned long)size, format);
339  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340  __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342  int node;
343  if (gtid >= 0) {
344  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345  if (__kmp_storage_map_verbose) {
346  node = __kmp_get_host_node(p1);
347  if (node < 0) /* doesn't work, so don't try this next time */
348  __kmp_storage_map_verbose = FALSE;
349  else {
350  char *last;
351  int lastNode;
352  int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354  const int page_size = KMP_GET_PAGE_SIZE();
355 
356  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358  if (localProc >= 0)
359  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
360  localProc >> 1);
361  else
362  __kmp_printf_no_lock(" GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364  /* The more elaborate format is disabled for now because of the prctl
365  * hanging bug. */
366  do {
367  last = p1;
368  lastNode = node;
369  /* This loop collates adjacent pages with the same host node. */
370  do {
371  p1 = (char *)p1 + page_size;
372  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
374  lastNode);
375  } while (p1 <= p2);
376 #else
377  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
378  (char *)p1 + (page_size - 1),
379  __kmp_get_host_node(p1));
380  if (p1 < p2) {
381  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
382  (char *)p2 + (page_size - 1),
383  __kmp_get_host_node(p2));
384  }
385 #endif
386  }
387  }
388  } else
389  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
390  }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
392  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
393 }
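/* [Editorial note: illustrative sketch, not part of the original source.]
   Note the two-stage formatting above: KMP_SNPRINTF embeds the caller's
   format string verbatim after the "OMP storage map: %p %p%8lu" prefix, and
   __kmp_vprintf then expands the caller's varargs against the combined
   string. So a call such as
       __kmp_print_storage_map_gtid(0, beg, end, 64, "th_%d stack", 0);
   would print a line of the form
       OMP storage map: <beg> <end>      64 th_0 stack
   to kmp_err (beg/end are hypothetical pointers used for illustration). */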
394 
395 void __kmp_warn(char const *format, ...) {
396  char buffer[MAX_MESSAGE];
397  va_list ap;
398 
399  if (__kmp_generate_warnings == kmp_warnings_off) {
400  return;
401  }
402 
403  va_start(ap, format);
404 
405  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407  __kmp_vprintf(kmp_err, buffer, ap);
408  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410  va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414  // Later threads may stall here, but that's ok because abort() will kill them.
415  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417  if (__kmp_debug_buf) {
418  __kmp_dump_debug_buffer();
419  }
420 
421  if (KMP_OS_WINDOWS) {
422  // Let other threads know of abnormal termination and prevent deadlock
423  // if abort happened during library initialization or shutdown
424  __kmp_global.g.g_abort = SIGABRT;
425 
426  /* On Windows* OS, abort() by default raises a pop-up error box, which
427  stalls nightly testing. Unfortunately, we cannot reliably suppress pop-up
428  error boxes. _set_abort_behavior() works well, but this function is not
429  available in VS7 (this is not a problem for the DLL, but it is a problem
430  for the static OpenMP RTL). SetErrorMode (and so the timelimit utility)
431  does not help, at least in some versions of the MS C RTL.
432 
433  The following sequence seems to be the only way to simulate abort() and
434  avoid the pop-up error box. */
435  raise(SIGABRT);
436  _exit(3); // Just in case, if signal ignored, exit anyway.
437  } else {
438  abort();
439  }
440 
441  __kmp_infinite_loop();
442  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
447  // TODO: Eliminate g_abort global variable and this function.
448  // In case of abort just call abort(), it will kill all the threads.
449  __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453  that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457  gtid);
458 
459  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463  sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465  __kmp_print_storage_map_gtid(
466  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470  &thr->th.th_bar[bs_plain_barrier + 1],
471  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472  gtid);
473 
474  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475  &thr->th.th_bar[bs_forkjoin_barrier + 1],
476  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477  gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481  &thr->th.th_bar[bs_reduction_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483  gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488  that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491  int team_id, int num_thr) {
492  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494  header, team_id);
495 
496  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497  &team->t.t_bar[bs_last_barrier],
498  sizeof(kmp_balign_team_t) * bs_last_barrier,
499  "%s_%d.t_bar", header, team_id);
500 
501  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502  &team->t.t_bar[bs_plain_barrier + 1],
503  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504  header, team_id);
505 
506  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507  &team->t.t_bar[bs_forkjoin_barrier + 1],
508  sizeof(kmp_balign_team_t),
509  "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513  &team->t.t_bar[bs_reduction_barrier + 1],
514  sizeof(kmp_balign_team_t),
515  "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518  __kmp_print_storage_map_gtid(
519  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522  __kmp_print_storage_map_gtid(
523  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527  &team->t.t_disp_buffer[num_disp_buff],
528  sizeof(dispatch_shared_info_t) * num_disp_buff,
529  "%s_%d.t_disp_buffer", header, team_id);
530 
531  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533  team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545  // TODO: Change to __kmp_break_bootstrap_lock().
546  __kmp_init_bootstrap_lock(lck); // make the lock released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550  int i;
551  int thread_count;
552 
553  // PROCESS_DETACH is expected to be called by a thread that executes
554  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
555  // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
556  // __kmp_threads[] without taking the forkjoin_lock. In fact, however, some
557  // threads may still be alive here, although they are about to be terminated.
558  // The threads in the array with ds_thread==0 are the most suspicious, so it
559  // may actually be unsafe to access __kmp_threads[].
560 
561  // TODO: does it make sense to check __kmp_roots[] ?
562 
563  // Let's check that there are no other alive threads registered with the OMP
564  // lib.
565  while (1) {
566  thread_count = 0;
567  for (i = 0; i < __kmp_threads_capacity; ++i) {
568  if (!__kmp_threads)
569  continue;
570  kmp_info_t *th = __kmp_threads[i];
571  if (th == NULL)
572  continue;
573  int gtid = th->th.th_info.ds.ds_gtid;
574  if (gtid == gtid_req)
575  continue;
576  if (gtid < 0)
577  continue;
578  DWORD exit_val;
579  int alive = __kmp_is_thread_alive(th, &exit_val);
580  if (alive) {
581  ++thread_count;
582  }
583  }
584  if (thread_count == 0)
585  break; // success
586  }
587 
588  // Assume that I'm alone. Now it might be safe to check and reset locks.
589  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590  __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592  __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599  switch (fdwReason) {
600 
601  case DLL_PROCESS_ATTACH:
602  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604  return TRUE;
605 
606  case DLL_PROCESS_DETACH:
607  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609  if (lpReserved != NULL) {
610  // lpReserved is used for telling the difference:
611  // lpReserved == NULL when FreeLibrary() was called,
612  // lpReserved != NULL when the process terminates.
613  // When FreeLibrary() is called, worker threads remain alive. So they will
614  // release the forkjoin lock by themselves. When the process terminates,
615  // worker threads disappear triggering the problem of unreleased forkjoin
616  // lock as described below.
617 
618  // A worker thread can take the forkjoin lock. The problem comes up if
619  // that worker thread dies before it releases the forkjoin lock. The
620  // forkjoin lock then remains taken, while the thread executing
621  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
622  // to take the forkjoin lock and will always fail, so that the application
623  // will never finish [normally]. This scenario is possible if
624  // __kmpc_end() has not been executed. This is not a corner case; it
625  // covers several common situations:
626  // - the main function was compiled by an alternative compiler;
627  // - the main function was compiled by icl but without /Qopenmp
628  // (application with plugins);
629  // - application terminates by calling C exit(), Fortran CALL EXIT() or
630  // Fortran STOP.
631  // - alive foreign thread prevented __kmpc_end from doing cleanup.
632  //
633  // This is a hack to work around the problem.
634  // TODO: !!! figure out something better.
635  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636  }
637 
638  __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640  return TRUE;
641 
642  case DLL_THREAD_ATTACH:
643  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
645  /* if we want to register new siblings all the time, call
646  * __kmp_get_gtid() here */
647  return TRUE;
648 
649  case DLL_THREAD_DETACH:
650  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652  __kmp_internal_end_thread(__kmp_gtid_get_specific());
653  return TRUE;
654  }
655 
656  return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int __kmp_change_library(int status) {
665  int old_status;
666 
667  old_status = __kmp_yield_init &
668  1; // check whether KMP_LIBRARY=throughput (even init count)
669 
670  if (status) {
671  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672  } else {
673  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674  }
675 
676  return old_status; // return previous setting of whether
677  // KMP_LIBRARY=throughput
678 }
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682  int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684  kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687  if (__kmp_env_consistency_check) {
688  if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694  }
695 #ifdef BUILD_PARALLEL_ORDERED
696  if (!team->t.t_serialized) {
697  KMP_MB();
698  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699  KMP_EQ, NULL);
700  KMP_MB();
701  }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707  int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709  int tid = __kmp_tid_from_gtid(gtid);
710  kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713  if (__kmp_env_consistency_check) {
714  if (__kmp_threads[gtid]->th.th_root->r.r_active)
715  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716  }
717 #ifdef BUILD_PARALLEL_ORDERED
718  if (!team->t.t_serialized) {
719  KMP_MB(); /* Flush all pending memory write invalidates. */
720 
721  /* use the tid of the next thread in this team */
722  /* TODO replace with general release procedure */
723  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725  KMP_MB(); /* Flush all pending memory write invalidates. */
726  }
727 #endif /* BUILD_PARALLEL_ORDERED */
728 }
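/* [Editorial note: illustrative sketch, not part of the original source.]
   Together, __kmp_parallel_deo and __kmp_parallel_dxo implement a
   round-robin ticket for ORDERED sections: deo spins until
   team->t.t_ordered.dt.t_value equals this thread's tid, and dxo passes the
   ticket to (tid + 1) % nproc. With 4 threads the ordered sections are
   therefore entered in the sequence 0, 1, 2, 3, 0, 1, ... */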
729 
730 /* ------------------------------------------------------------------------ */
731 /* The BARRIER for a SINGLE process section is always explicit */
732 
733 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
734  int status;
735  kmp_info_t *th;
736  kmp_team_t *team;
737 
738  if (!TCR_4(__kmp_init_parallel))
739  __kmp_parallel_initialize();
740 
741  th = __kmp_threads[gtid];
742  team = th->th.th_team;
743  status = 0;
744 
745  th->th.th_ident = id_ref;
746 
747  if (team->t.t_serialized) {
748  status = 1;
749  } else {
750  kmp_int32 old_this = th->th.th_local.this_construct;
751 
752  ++th->th.th_local.this_construct;
753  /* try to set team count to thread count--success means thread got the
754  single block */
755  /* TODO: Should this be acquire or release? */
756  if (team->t.t_construct == old_this) {
757  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
758  th->th.th_local.this_construct);
759  }
760 #if USE_ITT_BUILD
761  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
762  KMP_MASTER_GTID(gtid) &&
763 #if OMP_40_ENABLED
764  th->th.th_teams_microtask == NULL &&
765 #endif
766  team->t.t_active_level ==
767  1) { // Only report metadata by master of active team at level 1
768  __kmp_itt_metadata_single(id_ref);
769  }
770 #endif /* USE_ITT_BUILD */
771  }
772 
773  if (__kmp_env_consistency_check) {
774  if (status && push_ws) {
775  __kmp_push_workshare(gtid, ct_psingle, id_ref);
776  } else {
777  __kmp_check_workshare(gtid, ct_psingle, id_ref);
778  }
779  }
780 #if USE_ITT_BUILD
781  if (status) {
782  __kmp_itt_single_start(gtid);
783  }
784 #endif /* USE_ITT_BUILD */
785  return status;
786 }
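/* [Editorial note: illustrative sketch, not part of the original source.]
   The SINGLE winner above is chosen by a compare-and-swap race: every thread
   advances its private this_construct counter, and the thread whose CAS
   moves the shared team->t.t_construct forward from the old value wins. In
   generic C++ terms the pattern is roughly:

     int mine = ++my_this_construct;                  // thread-private
     bool won = team_construct.compare_exchange_strong(old_value, mine);

   where team_construct and old_value are hypothetical stand-ins for the
   team fields. Only the winner executes the single block; the rest skip it. */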
787 
788 void __kmp_exit_single(int gtid) {
789 #if USE_ITT_BUILD
790  __kmp_itt_single_end(gtid);
791 #endif /* USE_ITT_BUILD */
792  if (__kmp_env_consistency_check)
793  __kmp_pop_workshare(gtid, ct_psingle, NULL);
794 }
795 
796 /* determine if we can go parallel or must use a serialized parallel region and
797  * how many threads we can use
798  * set_nthreads is the number of threads requested for the team
799  * returns 1 if we should serialize or use only one thread,
800  * otherwise the number of threads to use
801  * The forkjoin lock is held by the caller. */
802 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
803  int master_tid, int set_nthreads
804 #if OMP_40_ENABLED
805  ,
806  int enter_teams
807 #endif /* OMP_40_ENABLED */
808  ) {
809  int capacity;
810  int new_nthreads;
811  KMP_DEBUG_ASSERT(__kmp_init_serial);
812  KMP_DEBUG_ASSERT(root && parent_team);
813 
814  // If dyn-var is set, dynamically adjust the number of desired threads,
815  // according to the method specified by dynamic_mode.
816  new_nthreads = set_nthreads;
817  if (!get__dynamic_2(parent_team, master_tid)) {
818  ;
819  }
820 #ifdef USE_LOAD_BALANCE
821  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823  if (new_nthreads == 1) {
824  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825  "reservation to 1 thread\n",
826  master_tid));
827  return 1;
828  }
829  if (new_nthreads < set_nthreads) {
830  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831  "reservation to %d threads\n",
832  master_tid, new_nthreads));
833  }
834  }
835 #endif /* USE_LOAD_BALANCE */
836  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
837  new_nthreads = __kmp_avail_proc - __kmp_nth +
838  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839  if (new_nthreads <= 1) {
840  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841  "reservation to 1 thread\n",
842  master_tid));
843  return 1;
844  }
845  if (new_nthreads < set_nthreads) {
846  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847  "reservation to %d threads\n",
848  master_tid, new_nthreads));
849  } else {
850  new_nthreads = set_nthreads;
851  }
852  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853  if (set_nthreads > 2) {
854  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
855  new_nthreads = (new_nthreads % set_nthreads) + 1;
856  if (new_nthreads == 1) {
857  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858  "reservation to 1 thread\n",
859  master_tid));
860  return 1;
861  }
862  if (new_nthreads < set_nthreads) {
863  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864  "reservation to %d threads\n",
865  master_tid, new_nthreads));
866  }
867  }
868  } else {
869  KMP_ASSERT(0);
870  }
871 
872  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
873  if (__kmp_nth + new_nthreads -
874  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875  __kmp_max_nth) {
876  int tl_nthreads = __kmp_max_nth - __kmp_nth +
877  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878  if (tl_nthreads <= 0) {
879  tl_nthreads = 1;
880  }
881 
882  // If dyn-var is false, emit a 1-time warning.
883  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884  __kmp_reserve_warn = 1;
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888  }
889  if (tl_nthreads == 1) {
890  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891  "reduced reservation to 1 thread\n",
892  master_tid));
893  return 1;
894  }
895  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896  "reservation to %d threads\n",
897  master_tid, tl_nthreads));
898  new_nthreads = tl_nthreads;
899  }
900 
901  // Respect OMP_THREAD_LIMIT
902  if (root->r.r_cg_nthreads + new_nthreads -
903  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
904  __kmp_cg_max_nth) {
905  int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
906  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
907  if (tl_nthreads <= 0) {
908  tl_nthreads = 1;
909  }
910 
911  // If dyn-var is false, emit a 1-time warning.
912  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
913  __kmp_reserve_warn = 1;
914  __kmp_msg(kmp_ms_warning,
915  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
916  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
917  }
918  if (tl_nthreads == 1) {
919  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
920  "reduced reservation to 1 thread\n",
921  master_tid));
922  return 1;
923  }
924  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
925  "reservation to %d threads\n",
926  master_tid, tl_nthreads));
927  new_nthreads = tl_nthreads;
928  }
929 
930  // Check if the threads array is large enough, or needs expanding.
931  // See comment in __kmp_register_root() about the adjustment if
932  // __kmp_threads[0] == NULL.
933  capacity = __kmp_threads_capacity;
934  if (TCR_PTR(__kmp_threads[0]) == NULL) {
935  --capacity;
936  }
937  if (__kmp_nth + new_nthreads -
938  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
939  capacity) {
940  // Expand the threads array.
941  int slotsRequired = __kmp_nth + new_nthreads -
942  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
943  capacity;
944  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
945  if (slotsAdded < slotsRequired) {
946  // The threads array was not expanded enough.
947  new_nthreads -= (slotsRequired - slotsAdded);
948  KMP_ASSERT(new_nthreads >= 1);
949 
950  // If dyn-var is false, emit a 1-time warning.
951  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
952  __kmp_reserve_warn = 1;
953  if (__kmp_tp_cached) {
954  __kmp_msg(kmp_ms_warning,
955  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
956  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
957  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
958  } else {
959  __kmp_msg(kmp_ms_warning,
960  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
961  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
962  }
963  }
964  }
965  }
966 
967 #ifdef KMP_DEBUG
968  if (new_nthreads == 1) {
969  KC_TRACE(10,
970  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
971  "dead roots and rechecking; requested %d threads\n",
972  __kmp_get_gtid(), set_nthreads));
973  } else {
974  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
975  " %d threads\n",
976  __kmp_get_gtid(), new_nthreads, set_nthreads));
977  }
978 #endif // KMP_DEBUG
979  return new_nthreads;
980 }
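/* [Editorial note: illustrative worked example, not part of the original
   source.] The KMP_DEVICE_THREAD_LIMIT clamp above keeps the library-wide
   thread count within __kmp_max_nth. Assuming, for instance,
   __kmp_max_nth = 8, __kmp_nth = 5, an inactive root whose hot team has 4
   threads, and a request for 8 threads:
       tl_nthreads = 8 - 5 + 4 = 7
   so the reservation is trimmed from 8 to 7 threads, with a one-time
   CantFormThrTeam warning if dyn-var is false. */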
981 
982 /* Allocate threads from the thread pool and assign them to the new team. We are
983  assured that there are enough threads available, because we checked on that
984  earlier within the forkjoin critical section. */
985 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
986  kmp_info_t *master_th, int master_gtid) {
987  int i;
988  int use_hot_team;
989 
990  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
991  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
992  KMP_MB();
993 
994  /* first, let's set up the master thread */
995  master_th->th.th_info.ds.ds_tid = 0;
996  master_th->th.th_team = team;
997  master_th->th.th_team_nproc = team->t.t_nproc;
998  master_th->th.th_team_master = master_th;
999  master_th->th.th_team_serialized = FALSE;
1000  master_th->th.th_dispatch = &team->t.t_dispatch[0];
1001 
1002 /* make sure we are not the optimized hot team */
1003 #if KMP_NESTED_HOT_TEAMS
1004  use_hot_team = 0;
1005  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1006  if (hot_teams) { // hot teams array is not allocated if
1007  // KMP_HOT_TEAMS_MAX_LEVEL=0
1008  int level = team->t.t_active_level - 1; // index in array of hot teams
1009  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1010  if (master_th->th.th_teams_size.nteams > 1) {
1011  ++level; // level was not increased in teams construct for
1012  // team_of_masters
1013  }
1014  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1015  master_th->th.th_teams_level == team->t.t_level) {
1016  ++level; // level was not increased in teams construct for
1017  // team_of_workers before the parallel
1018  } // team->t.t_level will be increased inside parallel
1019  }
1020  if (level < __kmp_hot_teams_max_level) {
1021  if (hot_teams[level].hot_team) {
1022  // hot team has already been allocated for given level
1023  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1024  use_hot_team = 1; // the team is ready to use
1025  } else {
1026  use_hot_team = 0; // AC: threads are not allocated yet
1027  hot_teams[level].hot_team = team; // remember new hot team
1028  hot_teams[level].hot_team_nth = team->t.t_nproc;
1029  }
1030  } else {
1031  use_hot_team = 0;
1032  }
1033  }
1034 #else
1035  use_hot_team = team == root->r.r_hot_team;
1036 #endif
1037  if (!use_hot_team) {
1038 
1039  /* install the master thread */
1040  team->t.t_threads[0] = master_th;
1041  __kmp_initialize_info(master_th, team, 0, master_gtid);
1042 
1043  /* now, install the worker threads */
1044  for (i = 1; i < team->t.t_nproc; i++) {
1045 
1046  /* fork or reallocate a new thread and install it in team */
1047  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1048  team->t.t_threads[i] = thr;
1049  KMP_DEBUG_ASSERT(thr);
1050  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1051  /* align team and thread arrived states */
1052  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1053  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1054  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1055  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1056  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1057  team->t.t_bar[bs_plain_barrier].b_arrived));
1058 #if OMP_40_ENABLED
1059  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1060  thr->th.th_teams_level = master_th->th.th_teams_level;
1061  thr->th.th_teams_size = master_th->th.th_teams_size;
1062 #endif
1063  { // Initialize threads' barrier data.
1064  int b;
1065  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1066  for (b = 0; b < bs_last_barrier; ++b) {
1067  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1068  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1069 #if USE_DEBUGGER
1070  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1071 #endif
1072  }
1073  }
1074  }
1075 
1076 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1077  __kmp_partition_places(team);
1078 #endif
1079  }
1080 
1081  KMP_MB();
1082 }
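/* [Editorial note: illustrative sketch, not part of the original source.]
   With KMP_NESTED_HOT_TEAMS, hot_teams[] is indexed by active nesting level
   (level = t_active_level - 1, adjusted upward inside a teams construct). A
   team already recorded at hot_teams[level] is reused as-is ("hot"), so its
   workers and barrier state survive across consecutive parallel regions at
   that level instead of being re-forked each time. */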
1083 
1084 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1085 // Propagate any changes to the floating point control registers out to the team
1086 // We try to avoid unnecessary writes to the relevant cache line in the team
1087 // structure, so we don't make changes unless they are needed.
1088 inline static void propagateFPControl(kmp_team_t *team) {
1089  if (__kmp_inherit_fp_control) {
1090  kmp_int16 x87_fpu_control_word;
1091  kmp_uint32 mxcsr;
1092 
1093  // Get master values of FPU control flags (both X87 and vector)
1094  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1095  __kmp_store_mxcsr(&mxcsr);
1096  mxcsr &= KMP_X86_MXCSR_MASK;
1097 
1098  // There is no point looking at t_fp_control_saved here.
1099  // If it is TRUE, we still have to update the values if they are different
1100  // from those we now have. If it is FALSE we didn't save anything yet, but
1101  // our objective is the same. We have to ensure that the values in the team
1102  // are the same as those we have.
1103  // So, this code achieves what we need whether or not t_fp_control_saved is
1104  // true. By checking whether the value needs updating we avoid unnecessary
1105  // writes that would put the cache-line into a written state, causing all
1106  // threads in the team to have to read it again.
1107  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1108  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1109  // Although we don't use this value, other code in the runtime wants to know
1110  // whether it should restore them. So we must ensure it is correct.
1111  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1112  } else {
1113  // Similarly here. Don't write to this cache-line in the team structure
1114  // unless we have to.
1115  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1116  }
1117 }
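/* [Editorial note: illustrative sketch, not part of the original source.]
   KMP_CHECK_UPDATE is the usual "store only if different" idiom, roughly
       if (dst != val) dst = val;
   Skipping the store when nothing changed keeps the team's cache line in a
   shared (unmodified) state, so worker threads reading these fields do not
   all have to re-fetch the line. */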
1118 
1119 // Do the opposite, setting the hardware registers to the updated values from
1120 // the team.
1121 inline static void updateHWFPControl(kmp_team_t *team) {
1122  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1123  // Only reset the fp control regs if they have been changed in
1124  // the parallel region that we are exiting.
1125  kmp_int16 x87_fpu_control_word;
1126  kmp_uint32 mxcsr;
1127  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128  __kmp_store_mxcsr(&mxcsr);
1129  mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1132  __kmp_clear_x87_fpu_status_word();
1133  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1134  }
1135 
1136  if (team->t.t_mxcsr != mxcsr) {
1137  __kmp_load_mxcsr(&team->t.t_mxcsr);
1138  }
1139  }
1140 }
1141 #else
1142 #define propagateFPControl(x) ((void)0)
1143 #define updateHWFPControl(x) ((void)0)
1144 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1145 
1146 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1147  int realloc); // forward declaration
1148 
1149 /* Run a parallel region that has been serialized, so it runs only in a team of
1150  the single master thread. */
1151 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1152  kmp_info_t *this_thr;
1153  kmp_team_t *serial_team;
1154 
1155  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1156 
1157  /* Skip all this code for autopar serialized loops since it results in
1158  unacceptable overhead */
1159  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1160  return;
1161 
1162  if (!TCR_4(__kmp_init_parallel))
1163  __kmp_parallel_initialize();
1164 
1165  this_thr = __kmp_threads[global_tid];
1166  serial_team = this_thr->th.th_serial_team;
1167 
1168  /* utilize the serialized team held by this thread */
1169  KMP_DEBUG_ASSERT(serial_team);
1170  KMP_MB();
1171 
1172  if (__kmp_tasking_mode != tskm_immediate_exec) {
1173  KMP_DEBUG_ASSERT(
1174  this_thr->th.th_task_team ==
1175  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1176  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1177  NULL);
1178  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1179  "team %p, new task_team = NULL\n",
1180  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1181  this_thr->th.th_task_team = NULL;
1182  }
1183 
1184 #if OMP_40_ENABLED
1185  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1186  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1187  proc_bind = proc_bind_false;
1188  } else if (proc_bind == proc_bind_default) {
1189  // No proc_bind clause was specified, so use the current value
1190  // of proc-bind-var for this parallel region.
1191  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1192  }
1193  // Reset for next parallel region
1194  this_thr->th.th_set_proc_bind = proc_bind_default;
1195 #endif /* OMP_40_ENABLED */
1196 
1197 #if OMPT_SUPPORT
1198  ompt_data_t ompt_parallel_data;
1199  ompt_parallel_data.ptr = NULL;
1200  ompt_data_t *implicit_task_data;
1201  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1202  if (ompt_enabled.enabled &&
1203  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1204 
1205  ompt_task_info_t *parent_task_info;
1206  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1207 
1208  parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1209  if (ompt_enabled.ompt_callback_parallel_begin) {
1210  int team_size = 1;
1211 
1212  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1213  &(parent_task_info->task_data), &(parent_task_info->frame),
1214  &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1215  }
1216  }
1217 #endif // OMPT_SUPPORT
1218 
1219  if (this_thr->th.th_team != serial_team) {
1220  // Nested level will be an index in the nested nthreads array
1221  int level = this_thr->th.th_team->t.t_level;
1222 
1223  if (serial_team->t.t_serialized) {
1224  /* this serial team was already used
1225  TODO increase performance by making these locks more specific */
1226  kmp_team_t *new_team;
1227 
1228  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1231 #if OMPT_SUPPORT
1232  ompt_parallel_data,
1233 #endif
1234 #if OMP_40_ENABLED
1235  proc_bind,
1236 #endif
1237  &this_thr->th.th_current_task->td_icvs,
1238  0 USE_NESTED_HOT_ARG(NULL));
1239  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1240  KMP_ASSERT(new_team);
1241 
1242  /* set up the new serialized team and install it */
1243  new_team->t.t_threads[0] = this_thr;
1244  new_team->t.t_parent = this_thr->th.th_team;
1245  serial_team = new_team;
1246  this_thr->th.th_serial_team = serial_team;
1247 
1248  KF_TRACE(
1249  10,
1250  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1251  global_tid, serial_team));
1252 
1253  /* TODO the above breaks the requirement that if we run out of resources,
1254  then we can still guarantee that serialized teams are ok, since we may
1255  need to allocate a new one */
1256  } else {
1257  KF_TRACE(
1258  10,
1259  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1260  global_tid, serial_team));
1261  }
1262 
1263  /* we have to initialize this serial team */
1264  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1267  serial_team->t.t_ident = loc;
1268  serial_team->t.t_serialized = 1;
1269  serial_team->t.t_nproc = 1;
1270  serial_team->t.t_parent = this_thr->th.th_team;
1271  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1272  this_thr->th.th_team = serial_team;
1273  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1274 
1275  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1276  this_thr->th.th_current_task));
1277  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1278  this_thr->th.th_current_task->td_flags.executing = 0;
1279 
1280  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1281 
1282  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1283  implicit task for each serialized task represented by
1284  team->t.t_serialized? */
1285  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1286  &this_thr->th.th_current_task->td_parent->td_icvs);
1287 
1288  // Thread value exists in the nested nthreads array for the next nested
1289  // level
1290  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291  this_thr->th.th_current_task->td_icvs.nproc =
1292  __kmp_nested_nth.nth[level + 1];
1293  }
1294 
1295 #if OMP_40_ENABLED
1296  if (__kmp_nested_proc_bind.used &&
1297  (level + 1 < __kmp_nested_proc_bind.used)) {
1298  this_thr->th.th_current_task->td_icvs.proc_bind =
1299  __kmp_nested_proc_bind.bind_types[level + 1];
1300  }
1301 #endif /* OMP_40_ENABLED */
1302 
1303 #if USE_DEBUGGER
1304  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1305 #endif
1306  this_thr->th.th_info.ds.ds_tid = 0;
1307 
1308  /* set thread cache values */
1309  this_thr->th.th_team_nproc = 1;
1310  this_thr->th.th_team_master = this_thr;
1311  this_thr->th.th_team_serialized = 1;
1312 
1313  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1314  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1315 
1316  propagateFPControl(serial_team);
1317 
1318  /* check if we need to allocate dispatch buffers stack */
1319  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1320  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1321  serial_team->t.t_dispatch->th_disp_buffer =
1322  (dispatch_private_info_t *)__kmp_allocate(
1323  sizeof(dispatch_private_info_t));
1324  }
1325  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1326 
1327  KMP_MB();
1328 
1329  } else {
1330  /* this serialized team is already being used,
1331  * that's fine, just add another nested level */
1332  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1333  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1334  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1335  ++serial_team->t.t_serialized;
1336  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1337 
1338  // Nested level will be an index in the nested nthreads array
1339  int level = this_thr->th.th_team->t.t_level;
1340  // Thread value exists in the nested nthreads array for the next nested
1341  // level
1342  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1343  this_thr->th.th_current_task->td_icvs.nproc =
1344  __kmp_nested_nth.nth[level + 1];
1345  }
1346  serial_team->t.t_level++;
1347  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1348  "of serial team %p to %d\n",
1349  global_tid, serial_team, serial_team->t.t_level));
1350 
1351  /* allocate/push dispatch buffers stack */
1352  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1353  {
1354  dispatch_private_info_t *disp_buffer =
1355  (dispatch_private_info_t *)__kmp_allocate(
1356  sizeof(dispatch_private_info_t));
1357  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1358  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1359  }
1360  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362  KMP_MB();
1363  }
1364 #if OMP_40_ENABLED
1365  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1366 #endif
1367 
1368  if (__kmp_env_consistency_check)
1369  __kmp_push_parallel(global_tid, NULL);
1370 #if OMPT_SUPPORT
1371  serial_team->t.ompt_team_info.master_return_address = codeptr;
1372  if (ompt_enabled.enabled &&
1373  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1374  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1375 
1376  ompt_lw_taskteam_t lw_taskteam;
1377  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1378  &ompt_parallel_data, codeptr);
1379 
1380  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1381  // don't use lw_taskteam after linking. Content was swapped.
1382 
1383  /* OMPT implicit task begin */
1384  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1385  if (ompt_enabled.ompt_callback_implicit_task) {
1386  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1387  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1388  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1389  }
1390 
1391  /* OMPT state */
1392  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1393  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1394  }
1395 #endif
1396 }
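/* [Editorial note: illustrative sketch, not part of the original source.]
   Each additional serialized nesting level above pushes one
   dispatch_private_info_t onto the serial team's th_disp_buffer list and
   increments t_serialized, so nested regions that serialize, e.g.

     #pragma omp parallel if(0)   // outer level: t_serialized becomes 1
     #pragma omp parallel if(0)   // inner level: t_serialized becomes 2

   each keep their own loop-dispatch state; the matching end of the
   serialized region is expected to pop the buffer and decrement the count. */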
1397 
1398 /* most of the work for a fork */
1399 /* return true if we really went parallel, false if serialized */
1400 int __kmp_fork_call(ident_t *loc, int gtid,
1401  enum fork_context_e call_context, // Intel, GNU, ...
1402  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1403 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1404 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1405  va_list *ap
1406 #else
1407  va_list ap
1408 #endif
1409  ) {
1410  void **argv;
1411  int i;
1412  int master_tid;
1413  int master_this_cons;
1414  kmp_team_t *team;
1415  kmp_team_t *parent_team;
1416  kmp_info_t *master_th;
1417  kmp_root_t *root;
1418  int nthreads;
1419  int master_active;
1420  int master_set_numthreads;
1421  int level;
1422 #if OMP_40_ENABLED
1423  int active_level;
1424  int teams_level;
1425 #endif
1426 #if KMP_NESTED_HOT_TEAMS
1427  kmp_hot_team_ptr_t **p_hot_teams;
1428 #endif
1429  { // KMP_TIME_BLOCK
1430  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1431  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1432 
1433  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1434  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1435  /* Some systems prefer the stack for the root thread(s) to start with */
1436  /* some gap from the parent stack to prevent false sharing. */
1437  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1438  /* These 2 lines below are so this does not get optimized out */
1439  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1440  __kmp_stkpadding += (short)((kmp_int64)dummy);
1441  }
1442 
1443  /* initialize if needed */
1444  KMP_DEBUG_ASSERT(
1445  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1446  if (!TCR_4(__kmp_init_parallel))
1447  __kmp_parallel_initialize();
1448 
1449  /* setup current data */
1450  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1451  // shutdown
1452  parent_team = master_th->th.th_team;
1453  master_tid = master_th->th.th_info.ds.ds_tid;
1454  master_this_cons = master_th->th.th_local.this_construct;
1455  root = master_th->th.th_root;
1456  master_active = root->r.r_active;
1457  master_set_numthreads = master_th->th.th_set_nproc;
1458 
1459 #if OMPT_SUPPORT
1460  ompt_data_t ompt_parallel_data;
1461  ompt_parallel_data.ptr = NULL;
1462  ompt_data_t *parent_task_data;
1463  ompt_frame_t *ompt_frame;
1464  ompt_data_t *implicit_task_data;
1465  void *return_address = NULL;
1466 
1467  if (ompt_enabled.enabled) {
1468  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1469  NULL, NULL);
1470  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1471  }
1472 #endif
1473 
1474  // Nested level will be an index in the nested nthreads array
1475  level = parent_team->t.t_level;
1476  // used to launch non-serial teams even if nested is not allowed
1477  active_level = parent_team->t.t_active_level;
1478 #if OMP_40_ENABLED
1479  // needed to check nesting inside the teams
1480  teams_level = master_th->th.th_teams_level;
1481 #endif
1482 #if KMP_NESTED_HOT_TEAMS
1483  p_hot_teams = &master_th->th.th_hot_teams;
1484  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1485  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1486  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1487  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1488  // it is either actual or not needed (when active_level > 0)
1489  (*p_hot_teams)[0].hot_team_nth = 1;
1490  }
1491 #endif
1492 
1493 #if OMPT_SUPPORT
1494  if (ompt_enabled.enabled) {
1495  if (ompt_enabled.ompt_callback_parallel_begin) {
1496  int team_size = master_set_numthreads
1497  ? master_set_numthreads
1498  : get__nproc_2(parent_team, master_tid);
1499  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1500  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1501  OMPT_INVOKER(call_context), return_address);
1502  }
1503  master_th->th.ompt_thread_info.state = omp_state_overhead;
1504  }
1505 #endif
1506 
1507  master_th->th.th_ident = loc;
1508 
1509 #if OMP_40_ENABLED
1510  if (master_th->th.th_teams_microtask && ap &&
1511  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1512  // AC: This is the start of a parallel region nested inside a teams construct.
1513  // The team is actual (hot); all workers are ready at the fork barrier.
1514  // No lock is needed to initialize the team a bit, then release the workers.
1515  parent_team->t.t_ident = loc;
1516  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1517  parent_team->t.t_argc = argc;
1518  argv = (void **)parent_team->t.t_argv;
1519  for (i = argc - 1; i >= 0; --i)
1520 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1521 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1522  *argv++ = va_arg(*ap, void *);
1523 #else
1524  *argv++ = va_arg(ap, void *);
1525 #endif
1526  // Increment our nested depth levels, but not increase the serialization
1527  if (parent_team == master_th->th.th_serial_team) {
1528  // AC: we are in serialized parallel
1529  __kmpc_serialized_parallel(loc, gtid);
1530  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1531  // AC: need this so that the enquiry functions work
1532  // correctly; will restore at join time
1533  parent_team->t.t_serialized--;
1534 #if OMPT_SUPPORT
1535  void *dummy;
1536  void **exit_runtime_p;
1537 
1538  ompt_lw_taskteam_t lw_taskteam;
1539 
1540  if (ompt_enabled.enabled) {
1541  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1542  &ompt_parallel_data, return_address);
1543  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1544 
1545  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1546  // don't use lw_taskteam after linking. Content was swapped.
1547 
1548  /* OMPT implicit task begin */
1549  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1550  if (ompt_enabled.ompt_callback_implicit_task) {
1551  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1552  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1553  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1554  }
1555 
1556  /* OMPT state */
1557  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1558  } else {
1559  exit_runtime_p = &dummy;
1560  }
1561 #endif
1562 
1563  {
1564  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1565  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1566  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1567 #if OMPT_SUPPORT
1568  ,
1569  exit_runtime_p
1570 #endif
1571  );
1572  }
1573 
1574 #if OMPT_SUPPORT
1575  *exit_runtime_p = NULL;
1576  if (ompt_enabled.enabled) {
1577  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1578  if (ompt_enabled.ompt_callback_implicit_task) {
1579  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1580  ompt_scope_end, NULL, implicit_task_data, 1,
1581  __kmp_tid_from_gtid(gtid));
1582  }
1583  __ompt_lw_taskteam_unlink(master_th);
1584 
1585  if (ompt_enabled.ompt_callback_parallel_end) {
1586  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1587  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1588  OMPT_INVOKER(call_context), return_address);
1589  }
1590  master_th->th.ompt_thread_info.state = omp_state_overhead;
1591  }
1592 #endif
1593  return TRUE;
1594  }
1595 
1596  parent_team->t.t_pkfn = microtask;
1597  parent_team->t.t_invoke = invoker;
1598  KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1599  parent_team->t.t_active_level++;
1600  parent_team->t.t_level++;
1601 
1602  /* Change number of threads in the team if requested */
1603  if (master_set_numthreads) { // The parallel has num_threads clause
1604  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1605  // AC: can only reduce the number of threads dynamically; cannot increase it
1606  kmp_info_t **other_threads = parent_team->t.t_threads;
1607  parent_team->t.t_nproc = master_set_numthreads;
1608  for (i = 0; i < master_set_numthreads; ++i) {
1609  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1610  }
1611  // Keep extra threads hot in the team for possible next parallels
1612  }
1613  master_th->th.th_set_nproc = 0;
1614  }
1615 
1616 #if USE_DEBUGGER
1617  if (__kmp_debugging) { // Let debugger override number of threads.
1618  int nth = __kmp_omp_num_threads(loc);
1619  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1620  master_set_numthreads = nth;
1621  }
1622  }
1623 #endif
1624 
1625  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1626  "master_th=%p, gtid=%d\n",
1627  root, parent_team, master_th, gtid));
1628  __kmp_internal_fork(loc, gtid, parent_team);
1629  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1630  "master_th=%p, gtid=%d\n",
1631  root, parent_team, master_th, gtid));
1632 
1633  /* Invoke microtask for MASTER thread */
1634  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1635  parent_team->t.t_id, parent_team->t.t_pkfn));
1636 
1637  {
1638  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1639  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1640  if (!parent_team->t.t_invoke(gtid)) {
1641  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1642  }
1643  }
1644  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1645  parent_team->t.t_id, parent_team->t.t_pkfn));
1646  KMP_MB(); /* Flush all pending memory write invalidates. */
1647 
1648  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1649 
1650  return TRUE;
1651  } // Parallel closely nested in teams construct
1652 #endif /* OMP_40_ENABLED */
1653 
1654 #if KMP_DEBUG
1655  if (__kmp_tasking_mode != tskm_immediate_exec) {
1656  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1657  parent_team->t.t_task_team[master_th->th.th_task_state]);
1658  }
1659 #endif
1660 
1661  if (parent_team->t.t_active_level >=
1662  master_th->th.th_current_task->td_icvs.max_active_levels) {
1663  nthreads = 1;
1664  } else {
1665 #if OMP_40_ENABLED
1666  int enter_teams = ((ap == NULL && active_level == 0) ||
1667  (ap && teams_level > 0 && teams_level == level));
1668 #endif
1669  nthreads =
1670  master_set_numthreads
1671  ? master_set_numthreads
1672  : get__nproc_2(
1673  parent_team,
1674  master_tid); // TODO: get nproc directly from current task
1675 
1676  // Check whether we need to take the forkjoin lock (no need for a serialized
1677  // parallel outside of a teams construct). This code was moved here from
1678  // __kmp_reserve_threads() to speed up nested serialized parallels.
1679  if (nthreads > 1) {
1680  if ((!get__nested(master_th) && (root->r.r_in_parallel
1681 #if OMP_40_ENABLED
1682  && !enter_teams
1683 #endif /* OMP_40_ENABLED */
1684  )) ||
1685  (__kmp_library == library_serial)) {
1686  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1687  " threads\n",
1688  gtid, nthreads));
1689  nthreads = 1;
1690  }
1691  }
1692  if (nthreads > 1) {
1693  /* determine how many new threads we can use */
1694  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1695  nthreads = __kmp_reserve_threads(
1696  root, parent_team, master_tid, nthreads
1697 #if OMP_40_ENABLED
1698  /* AC: If we execute teams from a parallel region (on host), then
1699  teams should be created, but each can have only 1 thread if
1700  nesting is disabled. If teams is called from a serial region, then
1701  teams and their threads should be created regardless of the
1702  nesting setting. */
1703  ,
1704  enter_teams
1705 #endif /* OMP_40_ENABLED */
1706  );
1707  if (nthreads == 1) {
1708  // Free the lock for single-thread execution here; for multi-thread
1709  // execution it will be freed later, after the team of threads has been
1710  // created and initialized
1711  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1712  }
1713  }
1714  }
1715  KMP_DEBUG_ASSERT(nthreads > 0);
1716 
1717  // If we temporarily changed the set number of threads then restore it now
1718  master_th->th.th_set_nproc = 0;
1719 
1720  /* create a serialized parallel region? */
1721  if (nthreads == 1) {
1722 /* josh todo: hypothetical question: what do we do for OS X*? */
1723 #if KMP_OS_LINUX && \
1724  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1725  void *args[argc];
1726 #else
1727  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1728 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1729  KMP_ARCH_AARCH64) */
1730 
1731  KA_TRACE(20,
1732  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1733 
1734  __kmpc_serialized_parallel(loc, gtid);
1735 
1736  if (call_context == fork_context_intel) {
1737  /* TODO this sucks, use the compiler itself to pass args! :) */
1738  master_th->th.th_serial_team->t.t_ident = loc;
1739 #if OMP_40_ENABLED
1740  if (!ap) {
1741  // revert change made in __kmpc_serialized_parallel()
1742  master_th->th.th_serial_team->t.t_level--;
1743 // Get args from parent team for teams construct
1744 
1745 #if OMPT_SUPPORT
1746  void *dummy;
1747  void **exit_runtime_p;
1748  ompt_task_info_t *task_info;
1749 
1750  ompt_lw_taskteam_t lw_taskteam;
1751 
1752  if (ompt_enabled.enabled) {
1753  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1754  &ompt_parallel_data, return_address);
1755 
1756  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1757  // don't use lw_taskteam after linking; its content was swapped
1758 
1759  task_info = OMPT_CUR_TASK_INFO(master_th);
1760  exit_runtime_p = &(task_info->frame.exit_frame);
1761  if (ompt_enabled.ompt_callback_implicit_task) {
1762  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1764  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1765  }
1766 
1767  /* OMPT state */
1768  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1769  } else {
1770  exit_runtime_p = &dummy;
1771  }
1772 #endif
1773 
1774  {
1775  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1776  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1777  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1778  parent_team->t.t_argv
1779 #if OMPT_SUPPORT
1780  ,
1781  exit_runtime_p
1782 #endif
1783  );
1784  }
1785 
1786 #if OMPT_SUPPORT
1787  if (ompt_enabled.enabled) {
1788  exit_runtime_p = NULL;
1789  if (ompt_enabled.ompt_callback_implicit_task) {
1790  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1791  ompt_scope_end, NULL, &(task_info->task_data), 1,
1792  __kmp_tid_from_gtid(gtid));
1793  }
1794 
1795  __ompt_lw_taskteam_unlink(master_th);
1796  if (ompt_enabled.ompt_callback_parallel_end) {
1797  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1798  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1799  OMPT_INVOKER(call_context), return_address);
1800  }
1801  master_th->th.ompt_thread_info.state = omp_state_overhead;
1802  }
1803 #endif
1804  } else if (microtask == (microtask_t)__kmp_teams_master) {
1805  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1806  master_th->th.th_serial_team);
1807  team = master_th->th.th_team;
1808  // team->t.t_pkfn = microtask;
1809  team->t.t_invoke = invoker;
1810  __kmp_alloc_argv_entries(argc, team, TRUE);
1811  team->t.t_argc = argc;
1812  argv = (void **)team->t.t_argv;
1813  if (ap) {
1814  for (i = argc - 1; i >= 0; --i)
1815 // TODO: revert workaround for Intel(R) 64 tracker #96
1816 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1817  *argv++ = va_arg(*ap, void *);
1818 #else
1819  *argv++ = va_arg(ap, void *);
1820 #endif
1821  } else {
1822  for (i = 0; i < argc; ++i)
1823  // Get args from parent team for teams construct
1824  argv[i] = parent_team->t.t_argv[i];
1825  }
1826  // AC: revert change made in __kmpc_serialized_parallel()
1827  // because initial code in teams should have level=0
1828  team->t.t_level--;
1829  // AC: call special invoker for outer "parallel" of teams construct
1830  {
1831  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1832  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1833  invoker(gtid);
1834  }
1835  } else {
1836 #endif /* OMP_40_ENABLED */
1837  argv = args;
1838  for (i = argc - 1; i >= 0; --i)
1839 // TODO: revert workaround for Intel(R) 64 tracker #96
1840 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1841  *argv++ = va_arg(*ap, void *);
1842 #else
1843  *argv++ = va_arg(ap, void *);
1844 #endif
1845  KMP_MB();
1846 
1847 #if OMPT_SUPPORT
1848  void *dummy;
1849  void **exit_runtime_p;
1850  ompt_task_info_t *task_info;
1851 
1852  ompt_lw_taskteam_t lw_taskteam;
1853 
1854  if (ompt_enabled.enabled) {
1855  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1856  &ompt_parallel_data, return_address);
1857  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1858  // don't use lw_taskteam after linking; its content was swapped
1859  task_info = OMPT_CUR_TASK_INFO(master_th);
1860  exit_runtime_p = &(task_info->frame.exit_frame);
1861 
1862  /* OMPT implicit task begin */
1863  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1864  if (ompt_enabled.ompt_callback_implicit_task) {
1865  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1866  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1867  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1868  }
1869 
1870  /* OMPT state */
1871  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1872  } else {
1873  exit_runtime_p = &dummy;
1874  }
1875 #endif
1876 
1877  {
1878  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1879  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1880  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1881 #if OMPT_SUPPORT
1882  ,
1883  exit_runtime_p
1884 #endif
1885  );
1886  }
1887 
1888 #if OMPT_SUPPORT
1889  if (ompt_enabled.enabled) {
1890  *exit_runtime_p = NULL;
1891  if (ompt_enabled.ompt_callback_implicit_task) {
1892  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1893  ompt_scope_end, NULL, &(task_info->task_data), 1,
1894  __kmp_tid_from_gtid(gtid));
1895  }
1896 
1897  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1898  __ompt_lw_taskteam_unlink(master_th);
1899  if (ompt_enabled.ompt_callback_parallel_end) {
1900  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1901  &ompt_parallel_data, parent_task_data,
1902  OMPT_INVOKER(call_context), return_address);
1903  }
1904  master_th->th.ompt_thread_info.state = omp_state_overhead;
1905  }
1906 #endif
1907 #if OMP_40_ENABLED
1908  }
1909 #endif /* OMP_40_ENABLED */
1910  } else if (call_context == fork_context_gnu) {
1911 #if OMPT_SUPPORT
1912  ompt_lw_taskteam_t lwt;
1913  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1914  return_address);
1915 
1916  lwt.ompt_task_info.frame.exit_frame = NULL;
1917  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1918 // don't use lw_taskteam after linking; its content was swapped
1919 #endif
1920 
1921  // we were called from GNU native code
1922  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1923  return FALSE;
1924  } else {
1925  KMP_ASSERT2(call_context < fork_context_last,
1926  "__kmp_fork_call: unknown fork_context parameter");
1927  }
1928 
1929  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1930  KMP_MB();
1931  return FALSE;
1932  }
1933 
1934  // GEH: only modify the executing flag in the case when not serialized;
1935  // the serialized case is handled in __kmpc_serialized_parallel
1936  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1937  "curtask=%p, curtask_max_aclevel=%d\n",
1938  parent_team->t.t_active_level, master_th,
1939  master_th->th.th_current_task,
1940  master_th->th.th_current_task->td_icvs.max_active_levels));
1941  // TODO: GEH - cannot do this assertion because root thread not set up as
1942  // executing
1943  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1944  master_th->th.th_current_task->td_flags.executing = 0;
1945 
1946 #if OMP_40_ENABLED
1947  if (!master_th->th.th_teams_microtask || level > teams_level)
1948 #endif /* OMP_40_ENABLED */
1949  {
1950  /* Increment our nested depth level */
1951  KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1952  }
1953 
1954  // See if we need to make a copy of the ICVs.
1955  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1956  if ((level + 1 < __kmp_nested_nth.used) &&
1957  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1958  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1959  } else {
1960  nthreads_icv = 0; // don't update
1961  }
1962 
1963 #if OMP_40_ENABLED
1964  // Figure out the proc_bind_policy for the new team.
1965  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1966  kmp_proc_bind_t proc_bind_icv =
1967  proc_bind_default; // proc_bind_default means don't update
1968  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1969  proc_bind = proc_bind_false;
1970  } else {
1971  if (proc_bind == proc_bind_default) {
1972  // No proc_bind clause specified; use current proc-bind-var for this
1973  // parallel region
1974  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1975  }
1976  /* else: The proc_bind policy was specified explicitly on parallel clause.
1977  This overrides proc-bind-var for this parallel region, but does not
1978  change proc-bind-var. */
1979  // Figure the value of proc-bind-var for the child threads.
1980  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1981  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1982  master_th->th.th_current_task->td_icvs.proc_bind)) {
1983  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1984  }
1985  }
1986 
1987  // Reset for next parallel region
1988  master_th->th.th_set_proc_bind = proc_bind_default;
1989 #endif /* OMP_40_ENABLED */
1990 
1991  if ((nthreads_icv > 0)
1992 #if OMP_40_ENABLED
1993  || (proc_bind_icv != proc_bind_default)
1994 #endif /* OMP_40_ENABLED */
1995  ) {
1996  kmp_internal_control_t new_icvs;
1997  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1998  new_icvs.next = NULL;
1999  if (nthreads_icv > 0) {
2000  new_icvs.nproc = nthreads_icv;
2001  }
2002 
2003 #if OMP_40_ENABLED
2004  if (proc_bind_icv != proc_bind_default) {
2005  new_icvs.proc_bind = proc_bind_icv;
2006  }
2007 #endif /* OMP_40_ENABLED */
2008 
2009  /* allocate a new parallel team */
2010  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2011  team = __kmp_allocate_team(root, nthreads, nthreads,
2012 #if OMPT_SUPPORT
2013  ompt_parallel_data,
2014 #endif
2015 #if OMP_40_ENABLED
2016  proc_bind,
2017 #endif
2018  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2019  } else {
2020  /* allocate a new parallel team */
2021  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2022  team = __kmp_allocate_team(root, nthreads, nthreads,
2023 #if OMPT_SUPPORT
2024  ompt_parallel_data,
2025 #endif
2026 #if OMP_40_ENABLED
2027  proc_bind,
2028 #endif
2029  &master_th->th.th_current_task->td_icvs,
2030  argc USE_NESTED_HOT_ARG(master_th));
2031  }
2032  KF_TRACE(
2033  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2034 
2035  /* setup the new team */
2036  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2037  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2038  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2039  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2040  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2041 #if OMPT_SUPPORT
2042  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2043  return_address);
2044 #endif
2045  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2046 // TODO: parent_team->t.t_level == INT_MAX ???
2047 #if OMP_40_ENABLED
2048  if (!master_th->th.th_teams_microtask || level > teams_level) {
2049 #endif /* OMP_40_ENABLED */
2050  int new_level = parent_team->t.t_level + 1;
2051  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2052  new_level = parent_team->t.t_active_level + 1;
2053  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2054 #if OMP_40_ENABLED
2055  } else {
2056  // AC: Do not increase parallel level at start of the teams construct
2057  int new_level = parent_team->t.t_level;
2058  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2059  new_level = parent_team->t.t_active_level;
2060  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2061  }
2062 #endif /* OMP_40_ENABLED */
2063  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2064  // set master's schedule as new run-time schedule
2065  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2066 
2067 #if OMP_40_ENABLED
2068  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069 #endif
2070 
2071  // Update the floating point rounding in the team if required.
2072  propagateFPControl(team);
2073 
2074  if (__kmp_tasking_mode != tskm_immediate_exec) {
2075  // Set master's task team to team's task team. Unless this is hot team, it
2076  // should be NULL.
2077  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078  parent_team->t.t_task_team[master_th->th.th_task_state]);
2079  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080  "%p, new task_team %p / team %p\n",
2081  __kmp_gtid_from_thread(master_th),
2082  master_th->th.th_task_team, parent_team,
2083  team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085  if (active_level || master_th->th.th_task_team) {
2086  // Save master's task_state for later restoration
2087  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2088  if (master_th->th.th_task_state_top >=
2089  master_th->th.th_task_state_stack_sz) { // increase size
2090  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091  kmp_uint8 *old_stack, *new_stack;
2092  kmp_uint32 i;
2093  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096  }
2097  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098  ++i) { // zero-init rest of stack
2099  new_stack[i] = 0;
2100  }
2101  old_stack = master_th->th.th_task_state_memo_stack;
2102  master_th->th.th_task_state_memo_stack = new_stack;
2103  master_th->th.th_task_state_stack_sz = new_size;
2104  __kmp_free(old_stack);
2105  }
2106  // Store master's task_state on stack
2107  master_th->th
2108  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109  master_th->th.th_task_state;
2110  master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112  if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2113  // Restore master's nested state if nested hot team
2114  master_th->th.th_task_state =
2115  master_th->th
2116  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2117  } else {
2118 #endif
2119  master_th->th.th_task_state = 0;
2120 #if KMP_NESTED_HOT_TEAMS
2121  }
2122 #endif
2123  }
2124 #if !KMP_NESTED_HOT_TEAMS
2125  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2126  (team == root->r.r_hot_team));
2127 #endif
2128  }
2129 
2130  KA_TRACE(
2131  20,
2132  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2133  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2134  team->t.t_nproc));
2135  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2136  (team->t.t_master_tid == 0 &&
2137  (team->t.t_parent == root->r.r_root_team ||
2138  team->t.t_parent->t.t_serialized)));
2139  KMP_MB();
2140 
2141  /* now, setup the arguments */
2142  argv = (void **)team->t.t_argv;
2143 #if OMP_40_ENABLED
2144  if (ap) {
2145 #endif /* OMP_40_ENABLED */
2146  for (i = argc - 1; i >= 0; --i) {
2147 // TODO: revert workaround for Intel(R) 64 tracker #96
2148 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2149  void *new_argv = va_arg(*ap, void *);
2150 #else
2151  void *new_argv = va_arg(ap, void *);
2152 #endif
2153  KMP_CHECK_UPDATE(*argv, new_argv);
2154  argv++;
2155  }
2156 #if OMP_40_ENABLED
2157  } else {
2158  for (i = 0; i < argc; ++i) {
2159  // Get args from parent team for teams construct
2160  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2161  }
2162  }
2163 #endif /* OMP_40_ENABLED */
2164 
2165  /* now actually fork the threads */
2166  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2167  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2168  root->r.r_active = TRUE;
2169 
2170  __kmp_fork_team_threads(root, team, master_th, gtid);
2171  __kmp_setup_icv_copy(team, nthreads,
2172  &master_th->th.th_current_task->td_icvs, loc);
2173 
2174 #if OMPT_SUPPORT
2175  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2176 #endif
2177 
2178  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2179 
2180 #if USE_ITT_BUILD
2181  if (team->t.t_active_level == 1 // only report frames at level 1
2182 #if OMP_40_ENABLED
2183  && !master_th->th.th_teams_microtask // not in teams construct
2184 #endif /* OMP_40_ENABLED */
2185  ) {
2186 #if USE_ITT_NOTIFY
2187  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2188  (__kmp_forkjoin_frames_mode == 3 ||
2189  __kmp_forkjoin_frames_mode == 1)) {
2190  kmp_uint64 tmp_time = 0;
2191  if (__itt_get_timestamp_ptr)
2192  tmp_time = __itt_get_timestamp();
2193  // Internal fork - report frame begin
2194  master_th->th.th_frame_time = tmp_time;
2195  if (__kmp_forkjoin_frames_mode == 3)
2196  team->t.t_region_time = tmp_time;
2197  } else
2198 // only one notification scheme (either "submit" or "forking/joined", not both)
2199 #endif /* USE_ITT_NOTIFY */
2200  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2201  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2202  // Mark start of "parallel" region for VTune.
2203  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2204  }
2205  }
2206 #endif /* USE_ITT_BUILD */
2207 
2208  /* now go on and do the work */
2209  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2210  KMP_MB();
2211  KF_TRACE(10,
2212  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2213  root, team, master_th, gtid));
2214 
2215 #if USE_ITT_BUILD
2216  if (__itt_stack_caller_create_ptr) {
2217  team->t.t_stack_id =
2218  __kmp_itt_stack_caller_create(); // create new stack stitching id
2219  // before entering fork barrier
2220  }
2221 #endif /* USE_ITT_BUILD */
2222 
2223 #if OMP_40_ENABLED
2224  // AC: skip __kmp_internal_fork for the teams construct; let only the master
2225  // threads execute
2226  if (ap)
2227 #endif /* OMP_40_ENABLED */
2228  {
2229  __kmp_internal_fork(loc, gtid, team);
2230  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2231  "master_th=%p, gtid=%d\n",
2232  root, team, master_th, gtid));
2233  }
2234 
2235  if (call_context == fork_context_gnu) {
2236  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2237  return TRUE;
2238  }
2239 
2240  /* Invoke microtask for MASTER thread */
2241  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2242  team->t.t_id, team->t.t_pkfn));
2243  } // END of timer KMP_fork_call block
2244 
2245  {
2246  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2247  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2248  if (!team->t.t_invoke(gtid)) {
2249  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2250  }
2251  }
2252  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2253  team->t.t_id, team->t.t_pkfn));
2254  KMP_MB(); /* Flush all pending memory write invalidates. */
2255 
2256  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2257 
2258 #if OMPT_SUPPORT
2259  if (ompt_enabled.enabled) {
2260  master_th->th.ompt_thread_info.state = omp_state_overhead;
2261  }
2262 #endif
2263 
2264  return TRUE;
2265 }
2266 
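/* Editor's sketch (not part of kmp_runtime.cpp): __kmp_fork_call() above is normally
   reached through the compiler entry point __kmpc_fork_call(), which also calls
   __kmp_join_call() on return.  A rough picture of the lowering for

       #pragma omp parallel shared(x)
       { x += 1; }

   under a clang/icc-style outlining (the names outlined_body and user_entry are
   illustrative, not real symbols):

       static void outlined_body(kmp_int32 *gtid, kmp_int32 *btid, int *x) {
         *x += 1; // body of the parallel region
       }

       void user_entry(ident_t *loc, int *x) {
         // One shared argument is forwarded through the varargs and lands in
         // team->t.t_argv via the va_arg loops seen in __kmp_fork_call().
         __kmpc_fork_call(loc, 1, (kmpc_micro)outlined_body, x);
       }

   __kmpc_fork_call() invokes __kmp_fork_call() with call_context ==
   fork_context_intel, which is why that branch dominates the code above. */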
2267 #if OMPT_SUPPORT
2268 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2269  kmp_team_t *team) {
2270  // restore state outside the region
2271  thread->th.ompt_thread_info.state =
2272  ((team->t.t_serialized) ? omp_state_work_serial
2273  : omp_state_work_parallel);
2274 }
2275 
2276 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2277  kmp_team_t *team, ompt_data_t *parallel_data,
2278  fork_context_e fork_context, void *codeptr) {
2279  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2280  if (ompt_enabled.ompt_callback_parallel_end) {
2281  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2282  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2283  codeptr);
2284  }
2285 
2286  task_info->frame.enter_frame = NULL;
2287  __kmp_join_restore_state(thread, team);
2288 }
2289 #endif
2290 
2291 void __kmp_join_call(ident_t *loc, int gtid
2292 #if OMPT_SUPPORT
2293  ,
2294  enum fork_context_e fork_context
2295 #endif
2296 #if OMP_40_ENABLED
2297  ,
2298  int exit_teams
2299 #endif /* OMP_40_ENABLED */
2300  ) {
2301  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2302  kmp_team_t *team;
2303  kmp_team_t *parent_team;
2304  kmp_info_t *master_th;
2305  kmp_root_t *root;
2306  int master_active;
2307  int i;
2308 
2309  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2310 
2311  /* setup current data */
2312  master_th = __kmp_threads[gtid];
2313  root = master_th->th.th_root;
2314  team = master_th->th.th_team;
2315  parent_team = team->t.t_parent;
2316 
2317  master_th->th.th_ident = loc;
2318 
2319 #if OMPT_SUPPORT
2320  if (ompt_enabled.enabled) {
2321  master_th->th.ompt_thread_info.state = omp_state_overhead;
2322  }
2323 #endif
2324 
2325 #if KMP_DEBUG
2326  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2327  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2328  "th_task_team = %p\n",
2329  __kmp_gtid_from_thread(master_th), team,
2330  team->t.t_task_team[master_th->th.th_task_state],
2331  master_th->th.th_task_team));
2332  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2333  team->t.t_task_team[master_th->th.th_task_state]);
2334  }
2335 #endif
2336 
2337  if (team->t.t_serialized) {
2338 #if OMP_40_ENABLED
2339  if (master_th->th.th_teams_microtask) {
2340  // We are in teams construct
2341  int level = team->t.t_level;
2342  int tlevel = master_th->th.th_teams_level;
2343  if (level == tlevel) {
2344  // AC: we haven't incremented it earlier at start of teams construct,
2345  // so do it here - at the end of teams construct
2346  team->t.t_level++;
2347  } else if (level == tlevel + 1) {
2348  // AC: we are exiting parallel inside teams, need to increment
2349  // serialization in order to restore it in the next call to
2350  // __kmpc_end_serialized_parallel
2351  team->t.t_serialized++;
2352  }
2353  }
2354 #endif /* OMP_40_ENABLED */
2355  __kmpc_end_serialized_parallel(loc, gtid);
2356 
2357 #if OMPT_SUPPORT
2358  if (ompt_enabled.enabled) {
2359  __kmp_join_restore_state(master_th, parent_team);
2360  }
2361 #endif
2362 
2363  return;
2364  }
2365 
2366  master_active = team->t.t_master_active;
2367 
2368 #if OMP_40_ENABLED
2369  if (!exit_teams)
2370 #endif /* OMP_40_ENABLED */
2371  {
2372  // AC: No barrier for internal teams at exit from teams construct.
2373  // But there is a barrier for the external team (league).
2374  __kmp_internal_join(loc, gtid, team);
2375  }
2376 #if OMP_40_ENABLED
2377  else {
2378  master_th->th.th_task_state =
2379  0; // AC: no tasking in teams (out of any parallel)
2380  }
2381 #endif /* OMP_40_ENABLED */
2382 
2383  KMP_MB();
2384 
2385 #if OMPT_SUPPORT
2386  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2387  void *codeptr = team->t.ompt_team_info.master_return_address;
2388 #endif
2389 
2390 #if USE_ITT_BUILD
2391  if (__itt_stack_caller_create_ptr) {
2392  __kmp_itt_stack_caller_destroy(
2393  (__itt_caller)team->t
2394  .t_stack_id); // destroy the stack stitching id after join barrier
2395  }
2396 
2397  // Mark end of "parallel" region for VTune.
2398  if (team->t.t_active_level == 1
2399 #if OMP_40_ENABLED
2400  && !master_th->th.th_teams_microtask /* not in teams construct */
2401 #endif /* OMP_40_ENABLED */
2402  ) {
2403  master_th->th.th_ident = loc;
2404  // only one notification scheme (either "submit" or "forking/joined", not
2405  // both)
2406  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2407  __kmp_forkjoin_frames_mode == 3)
2408  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2409  master_th->th.th_frame_time, 0, loc,
2410  master_th->th.th_team_nproc, 1);
2411  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2412  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2413  __kmp_itt_region_joined(gtid);
2414  } // active_level == 1
2415 #endif /* USE_ITT_BUILD */
2416 
2417 #if OMP_40_ENABLED
2418  if (master_th->th.th_teams_microtask && !exit_teams &&
2419  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2420  team->t.t_level == master_th->th.th_teams_level + 1) {
2421  // AC: We need to leave the team structure intact at the end of parallel
2422  // inside the teams construct, so that at the next parallel same (hot) team
2423  // works, only adjust nesting levels
2424 
2425  /* Decrement our nested depth level */
2426  team->t.t_level--;
2427  team->t.t_active_level--;
2428  KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2429 
2430  /* Restore number of threads in the team if needed */
2431  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2432  int old_num = master_th->th.th_team_nproc;
2433  int new_num = master_th->th.th_teams_size.nth;
2434  kmp_info_t **other_threads = team->t.t_threads;
2435  team->t.t_nproc = new_num;
2436  for (i = 0; i < old_num; ++i) {
2437  other_threads[i]->th.th_team_nproc = new_num;
2438  }
2439  // Adjust the states of the unused threads of the team
2440  for (i = old_num; i < new_num; ++i) {
2441  // Re-initialize thread's barrier data.
2442  int b;
2443  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2444  for (b = 0; b < bs_last_barrier; ++b) {
2445  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2446  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2447 #if USE_DEBUGGER
2448  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2449 #endif
2450  }
2451  if (__kmp_tasking_mode != tskm_immediate_exec) {
2452  // Synchronize thread's task state
2453  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2454  }
2455  }
2456  }
2457 
2458 #if OMPT_SUPPORT
2459  if (ompt_enabled.enabled) {
2460  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2461  codeptr);
2462  }
2463 #endif
2464 
2465  return;
2466  }
2467 #endif /* OMP_40_ENABLED */
2468 
2469  /* do cleanup and restore the parent team */
2470  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2471  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2472 
2473  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2474 
2475  /* jc: The following lock has instructions with REL and ACQ semantics,
2476  separating the parallel user code called in this parallel region
2477  from the serial user code called after this function returns. */
2478  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2479 
2480 #if OMP_40_ENABLED
2481  if (!master_th->th.th_teams_microtask ||
2482  team->t.t_level > master_th->th.th_teams_level)
2483 #endif /* OMP_40_ENABLED */
2484  {
2485  /* Decrement our nested depth level */
2486  KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2487  }
2488  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491  if (ompt_enabled.enabled) {
2492  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493  if (ompt_enabled.ompt_callback_implicit_task) {
2494  int ompt_team_size = team->t.t_nproc;
2495  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2496  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2497  __kmp_tid_from_gtid(gtid));
2498  }
2499 
2500  task_info->frame.exit_frame = NULL;
2501  task_info->task_data = ompt_data_none;
2502  }
2503 #endif
2504 
2505  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2506  master_th, team));
2507  __kmp_pop_current_task_from_thread(master_th);
2508 
2509 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2510  // Restore master thread's partition.
2511  master_th->th.th_first_place = team->t.t_first_place;
2512  master_th->th.th_last_place = team->t.t_last_place;
2513 #endif /* OMP_40_ENABLED */
2514 
2515  updateHWFPControl(team);
2516 
2517  if (root->r.r_active != master_active)
2518  root->r.r_active = master_active;
2519 
2520  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2521  master_th)); // this will free worker threads
2522 
2523  /* this race was fun to find. make sure the following is in the critical
2524  region otherwise assertions may fail occasionally since the old team may be
2525  reallocated and the hierarchy appears inconsistent. it is actually safe to
2526  run and won't cause any bugs, but will cause those assertion failures. it's
2527  only one deref&assign so might as well put this in the critical region */
2528  master_th->th.th_team = parent_team;
2529  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2530  master_th->th.th_team_master = parent_team->t.t_threads[0];
2531  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2532 
2533  /* restore serialized team, if need be */
2534  if (parent_team->t.t_serialized &&
2535  parent_team != master_th->th.th_serial_team &&
2536  parent_team != root->r.r_root_team) {
2537  __kmp_free_team(root,
2538  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2539  master_th->th.th_serial_team = parent_team;
2540  }
2541 
2542  if (__kmp_tasking_mode != tskm_immediate_exec) {
2543  if (master_th->th.th_task_state_top >
2544  0) { // Restore task state from memo stack
2545  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2546  // Remember master's state if we re-use this nested hot team
2547  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2548  master_th->th.th_task_state;
2549  --master_th->th.th_task_state_top; // pop
2550  // Now restore state at this level
2551  master_th->th.th_task_state =
2552  master_th->th
2553  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2554  }
2555  // Copy the task team from the parent team to the master thread
2556  master_th->th.th_task_team =
2557  parent_team->t.t_task_team[master_th->th.th_task_state];
2558  KA_TRACE(20,
2559  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2560  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2561  parent_team));
2562  }
2563 
2564  // TODO: GEH - cannot do this assertion because root thread not set up as
2565  // executing
2566  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2567  master_th->th.th_current_task->td_flags.executing = 1;
2568 
2569  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2570 
2571 #if OMPT_SUPPORT
2572  if (ompt_enabled.enabled) {
2573  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2574  codeptr);
2575  }
2576 #endif
2577 
2578  KMP_MB();
2579  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2580 }
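/* Editor's sketch (not part of kmp_runtime.cpp): the early return taken above when
   team->t.t_serialized is set is the join path of a serialized nested region.  With
   nesting disabled, an inner parallel region forks and joins without ever reaching
   __kmp_internal_join():

       #include <omp.h>
       #include <stdio.h>

       int main(void) {
         omp_set_nested(0);                       // inner regions will be serialized
         #pragma omp parallel num_threads(2)
         {
           #pragma omp parallel num_threads(2)    // serialized fork/join
           printf("level %d, team size %d\n",
                  omp_get_level(), omp_get_team_size(omp_get_level()));
         }
         return 0;
       }

   Each inner region runs with a team size of 1 and is torn down through
   __kmpc_end_serialized_parallel(). */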
2581 
2582 /* Check whether we should push an internal control record onto the
2583  serial team stack. If so, do it. */
2584 void __kmp_save_internal_controls(kmp_info_t *thread) {
2585 
2586  if (thread->th.th_team != thread->th.th_serial_team) {
2587  return;
2588  }
2589  if (thread->th.th_team->t.t_serialized > 1) {
2590  int push = 0;
2591 
2592  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2593  push = 1;
2594  } else {
2595  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2596  thread->th.th_team->t.t_serialized) {
2597  push = 1;
2598  }
2599  }
2600  if (push) { /* push a record on the serial team's stack */
2601  kmp_internal_control_t *control =
2602  (kmp_internal_control_t *)__kmp_allocate(
2603  sizeof(kmp_internal_control_t));
2604 
2605  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2606 
2607  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2608 
2609  control->next = thread->th.th_team->t.t_control_stack_top;
2610  thread->th.th_team->t.t_control_stack_top = control;
2611  }
2612  }
2613 }
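/* Editor's sketch (not part of kmp_runtime.cpp): the ICV setters below
   (__kmp_set_num_threads, __kmp_set_max_active_levels, __kmp_set_schedule, ...) all
   call __kmp_save_internal_controls() first, so that ICV changes made inside nested
   serialized parallel regions can be undone when those regions end; the matching
   pop/restore lives in __kmpc_end_serialized_parallel().  A user-level shape that is
   expected to exercise this path when nesting is disabled:

       #include <omp.h>

       void nested_serialized(void) {
         omp_set_nested(0);
         #pragma omp parallel num_threads(2)
         {
           #pragma omp parallel             // serialized: runs on the serial team
           {
             #pragma omp parallel           // serialized again, one level deeper
             omp_set_num_threads(3);        // a control record is pushed before nproc changes
           }
         }
       }
*/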
2614 
2615 /* Changes set_nproc */
2616 void __kmp_set_num_threads(int new_nth, int gtid) {
2617  kmp_info_t *thread;
2618  kmp_root_t *root;
2619 
2620  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2621  KMP_DEBUG_ASSERT(__kmp_init_serial);
2622 
2623  if (new_nth < 1)
2624  new_nth = 1;
2625  else if (new_nth > __kmp_max_nth)
2626  new_nth = __kmp_max_nth;
2627 
2628  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2629  thread = __kmp_threads[gtid];
2630 
2631  __kmp_save_internal_controls(thread);
2632 
2633  set__nproc(thread, new_nth);
2634 
2635  // If this omp_set_num_threads() call will cause the hot team size to be
2636  // reduced (in the absence of a num_threads clause), then reduce it now,
2637  // rather than waiting for the next parallel region.
2638  root = thread->th.th_root;
2639  if (__kmp_init_parallel && (!root->r.r_active) &&
2640  (root->r.r_hot_team->t.t_nproc > new_nth)
2641 #if KMP_NESTED_HOT_TEAMS
2642  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2643 #endif
2644  ) {
2645  kmp_team_t *hot_team = root->r.r_hot_team;
2646  int f;
2647 
2648  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2649 
2650  // Release the extra threads we don't need any more.
2651  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2652  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2653  if (__kmp_tasking_mode != tskm_immediate_exec) {
2654  // When decreasing team size, threads no longer in the team should unref
2655  // task team.
2656  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2657  }
2658  __kmp_free_thread(hot_team->t.t_threads[f]);
2659  hot_team->t.t_threads[f] = NULL;
2660  }
2661  hot_team->t.t_nproc = new_nth;
2662 #if KMP_NESTED_HOT_TEAMS
2663  if (thread->th.th_hot_teams) {
2664  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2665  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2666  }
2667 #endif
2668 
2669  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2670 
2671  // Update the t_nproc field in the threads that are still active.
2672  for (f = 0; f < new_nth; f++) {
2673  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2674  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2675  }
2676  // Set a special flag to indicate an omp_set_num_threads() call
2677  hot_team->t.t_size_changed = -1;
2678  }
2679 }
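/* Editor's sketch (not part of kmp_runtime.cpp): __kmp_set_num_threads() backs
   omp_set_num_threads().  Minimal use, including the immediate hot-team shrink
   described above:

       #include <omp.h>
       #include <stdio.h>

       int main(void) {
         #pragma omp parallel
         { }                       // hot team created with the default size

         omp_set_num_threads(2);   // hot team trimmed here, not at the next fork

         #pragma omp parallel
         #pragma omp single
         printf("team size is now %d\n", omp_get_num_threads());
         return 0;
       }

   Values below 1 are clamped to 1 and values above the thread limit are clamped to
   __kmp_max_nth, as done at the top of the function. */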
2680 
2681 /* Changes max_active_levels */
2682 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2683  kmp_info_t *thread;
2684 
2685  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2686  "%d = (%d)\n",
2687  gtid, max_active_levels));
2688  KMP_DEBUG_ASSERT(__kmp_init_serial);
2689 
2690  // validate max_active_levels
2691  if (max_active_levels < 0) {
2692  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2693  // We ignore this call if the user has specified a negative value.
2694  // The current setting won't be changed. The last valid setting will be
2695  // used. A warning will be issued (if warnings are allowed as controlled by
2696  // the KMP_WARNINGS env var).
2697  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2698  "max_active_levels for thread %d = (%d)\n",
2699  gtid, max_active_levels));
2700  return;
2701  }
2702  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2703  // it's OK, the max_active_levels is within the valid range: [ 0;
2704  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2705  // We allow a zero value. (implementation defined behavior)
2706  } else {
2707  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2708  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2709  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2710  // Current upper limit is MAX_INT. (implementation defined behavior)
2711  // If the input exceeds the upper limit, we correct the input to be the
2712  // upper limit. (implementation defined behavior)
2713  // Actually, the flow should never get here as long as we use the MAX_INT limit.
2714  }
2715  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2716  "max_active_levels for thread %d = (%d)\n",
2717  gtid, max_active_levels));
2718 
2719  thread = __kmp_threads[gtid];
2720 
2721  __kmp_save_internal_controls(thread);
2722 
2723  set__max_active_levels(thread, max_active_levels);
2724 }
2725 
2726 /* Gets max_active_levels */
2727 int __kmp_get_max_active_levels(int gtid) {
2728  kmp_info_t *thread;
2729 
2730  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2731  KMP_DEBUG_ASSERT(__kmp_init_serial);
2732 
2733  thread = __kmp_threads[gtid];
2734  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2735  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2736  "curtask_maxaclevel=%d\n",
2737  gtid, thread->th.th_current_task,
2738  thread->th.th_current_task->td_icvs.max_active_levels));
2739  return thread->th.th_current_task->td_icvs.max_active_levels;
2740 }
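/* Editor's sketch (not part of kmp_runtime.cpp): these two routines back
   omp_set_max_active_levels() and omp_get_max_active_levels().  Out-of-range inputs
   are only warned about, as implemented above:

       #include <omp.h>
       #include <stdio.h>

       int main(void) {
         omp_set_max_active_levels(2);                 // allow two active levels
         printf("%d\n", omp_get_max_active_levels());  // prints 2
         omp_set_max_active_levels(-1);                // negative: warned and ignored
         printf("%d\n", omp_get_max_active_levels());  // still prints 2
         return 0;
       }
*/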
2741 
2742 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2743 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2744  kmp_info_t *thread;
2745  // kmp_team_t *team;
2746 
2747  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2748  gtid, (int)kind, chunk));
2749  KMP_DEBUG_ASSERT(__kmp_init_serial);
2750 
2751  // Check if the kind parameter is valid, correct if needed.
2752  // Valid parameters should fit in one of two intervals - standard or extended:
2753  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2754  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2755  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2756  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2757  // TODO: Hint needs attention in case we change the default schedule.
2758  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2759  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2760  __kmp_msg_null);
2761  kind = kmp_sched_default;
2762  chunk = 0; // ignore chunk value in case of bad kind
2763  }
2764 
2765  thread = __kmp_threads[gtid];
2766 
2767  __kmp_save_internal_controls(thread);
2768 
2769  if (kind < kmp_sched_upper_std) {
2770  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2771  // differentiate static chunked vs. unchunked: chunk should be invalid to
2772  // indicate unchunked schedule (which is the default)
2773  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2774  } else {
2775  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2776  __kmp_sch_map[kind - kmp_sched_lower - 1];
2777  }
2778  } else {
2779  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2780  // kmp_sched_lower - 2 ];
2781  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2782  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2783  kmp_sched_lower - 2];
2784  }
2785  if (kind == kmp_sched_auto || chunk < 1) {
2786  // ignore parameter chunk for schedule auto
2787  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2788  } else {
2789  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2790  }
2791 }
2792 
2793 /* Gets def_sched_var ICV values */
2794 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2795  kmp_info_t *thread;
2796  enum sched_type th_type;
2797 
2798  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2799  KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801  thread = __kmp_threads[gtid];
2802 
2803  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2804 
2805  switch (th_type) {
2806  case kmp_sch_static:
2807  case kmp_sch_static_greedy:
2808  case kmp_sch_static_balanced:
2809  *kind = kmp_sched_static;
2810  *chunk = 0; // chunk was not set, so report this via a zero value
2811  return;
2812  case kmp_sch_static_chunked:
2813  *kind = kmp_sched_static;
2814  break;
2815  case kmp_sch_dynamic_chunked:
2816  *kind = kmp_sched_dynamic;
2817  break;
2818  case kmp_sch_guided_chunked:
2819  case kmp_sch_guided_iterative_chunked:
2820  case kmp_sch_guided_analytical_chunked:
2821  *kind = kmp_sched_guided;
2822  break;
2823  case kmp_sch_auto:
2824  *kind = kmp_sched_auto;
2825  break;
2826  case kmp_sch_trapezoidal:
2827  *kind = kmp_sched_trapezoidal;
2828  break;
2829 #if KMP_STATIC_STEAL_ENABLED
2830  case kmp_sch_static_steal:
2831  *kind = kmp_sched_static_steal;
2832  break;
2833 #endif
2834  default:
2835  KMP_FATAL(UnknownSchedulingType, th_type);
2836  }
2837 
2838  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2839 }
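/* Editor's sketch (not part of kmp_runtime.cpp): __kmp_set_schedule() and
   __kmp_get_schedule() back omp_set_schedule() and omp_get_schedule(); kmp_sched_t
   mirrors the standard omp_sched_t values.  A round trip through run-sched-var:

       #include <omp.h>
       #include <stdio.h>

       int main(void) {
         omp_sched_t kind;
         int chunk;

         omp_set_schedule(omp_sched_dynamic, 4);          // run-sched-var := (dynamic, 4)
         omp_get_schedule(&kind, &chunk);
         printf("kind=%d chunk=%d\n", (int)kind, chunk);  // dynamic, 4

         omp_set_schedule(omp_sched_static, 0);           // chunk < 1 selects unchunked static
         omp_get_schedule(&kind, &chunk);
         printf("kind=%d chunk=%d\n", (int)kind, chunk);  // static, chunk reported as 0
         return 0;
       }
*/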
2840 
2841 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2842 
2843  int ii, dd;
2844  kmp_team_t *team;
2845  kmp_info_t *thr;
2846 
2847  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2848  KMP_DEBUG_ASSERT(__kmp_init_serial);
2849 
2850  // validate level
2851  if (level == 0)
2852  return 0;
2853  if (level < 0)
2854  return -1;
2855  thr = __kmp_threads[gtid];
2856  team = thr->th.th_team;
2857  ii = team->t.t_level;
2858  if (level > ii)
2859  return -1;
2860 
2861 #if OMP_40_ENABLED
2862  if (thr->th.th_teams_microtask) {
2863  // AC: we are in teams region where multiple nested teams have same level
2864  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2865  if (level <=
2866  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2867  KMP_DEBUG_ASSERT(ii >= tlevel);
2868  // AC: As we need to step over the teams league, we artificially
2869  // increase ii
2870  if (ii == tlevel) {
2871  ii += 2; // three teams have same level
2872  } else {
2873  ii++; // two teams have same level
2874  }
2875  }
2876  }
2877 #endif
2878 
2879  if (ii == level)
2880  return __kmp_tid_from_gtid(gtid);
2881 
2882  dd = team->t.t_serialized;
2883  level++;
2884  while (ii > level) {
2885  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2886  }
2887  if ((team->t.t_serialized) && (!dd)) {
2888  team = team->t.t_parent;
2889  continue;
2890  }
2891  if (ii > level) {
2892  team = team->t.t_parent;
2893  dd = team->t.t_serialized;
2894  ii--;
2895  }
2896  }
2897 
2898  return (dd > 1) ? (0) : (team->t.t_master_tid);
2899 }
2900 
2901 int __kmp_get_team_size(int gtid, int level) {
2902 
2903  int ii, dd;
2904  kmp_team_t *team;
2905  kmp_info_t *thr;
2906 
2907  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2908  KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910  // validate level
2911  if (level == 0)
2912  return 1;
2913  if (level < 0)
2914  return -1;
2915  thr = __kmp_threads[gtid];
2916  team = thr->th.th_team;
2917  ii = team->t.t_level;
2918  if (level > ii)
2919  return -1;
2920 
2921 #if OMP_40_ENABLED
2922  if (thr->th.th_teams_microtask) {
2923  // AC: we are in teams region where multiple nested teams have same level
2924  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2925  if (level <=
2926  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2927  KMP_DEBUG_ASSERT(ii >= tlevel);
2928  // AC: As we need to step over the teams league, we artificially
2929  // increase ii
2930  if (ii == tlevel) {
2931  ii += 2; // three teams have same level
2932  } else {
2933  ii++; // two teams have same level
2934  }
2935  }
2936  }
2937 #endif
2938 
2939  while (ii > level) {
2940  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2941  }
2942  if (team->t.t_serialized && (!dd)) {
2943  team = team->t.t_parent;
2944  continue;
2945  }
2946  if (ii > level) {
2947  team = team->t.t_parent;
2948  ii--;
2949  }
2950  }
2951 
2952  return team->t.t_nproc;
2953 }
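/* Editor's sketch (not part of kmp_runtime.cpp): the two walkers above implement
   omp_get_ancestor_thread_num() and omp_get_team_size(), stepping over serialized
   teams (and over the extra levels introduced by a teams construct).  For example:

       #include <omp.h>
       #include <stdio.h>

       int main(void) {
         omp_set_nested(1);
         omp_set_max_active_levels(2);
         #pragma omp parallel num_threads(3)
         #pragma omp parallel num_threads(2)
         printf("ancestors: L1=%d L2=%d  sizes: L1=%d L2=%d\n",
                omp_get_ancestor_thread_num(1), omp_get_ancestor_thread_num(2),
                omp_get_team_size(1), omp_get_team_size(2));
         return 0;
       }

   omp_get_ancestor_thread_num(0) is always 0 and omp_get_team_size(0) is always 1,
   matching the early returns above. */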
2954 
2955 kmp_r_sched_t __kmp_get_schedule_global() {
2956  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2957  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2958  // independently, so the updated schedule can be obtained here.
2959 
2960  kmp_r_sched_t r_sched;
2961 
2962  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2963  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2964  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2965  // different roots (even in OMP 2.5)
2966  if (__kmp_sched == kmp_sch_static) {
2967  // replace STATIC with more detailed schedule (balanced or greedy)
2968  r_sched.r_sched_type = __kmp_static;
2969  } else if (__kmp_sched == kmp_sch_guided_chunked) {
2970  // replace GUIDED with more detailed schedule (iterative or analytical)
2971  r_sched.r_sched_type = __kmp_guided;
2972  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2973  r_sched.r_sched_type = __kmp_sched;
2974  }
2975 
2976  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2977  // __kmp_chunk may be wrong here (if it was not ever set)
2978  r_sched.chunk = KMP_DEFAULT_CHUNK;
2979  } else {
2980  r_sched.chunk = __kmp_chunk;
2981  }
2982 
2983  return r_sched;
2984 }
2985 
2986 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2987  at least argc number of *t_argv entries for the requested team. */
2988 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2989 
2990  KMP_DEBUG_ASSERT(team);
2991  if (!realloc || argc > team->t.t_max_argc) {
2992 
2993  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2994  "current entries=%d\n",
2995  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2996  /* if previously allocated heap space for args, free them */
2997  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2998  __kmp_free((void *)team->t.t_argv);
2999 
3000  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3001  /* use unused space in the cache line for arguments */
3002  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3003  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3004  "argv entries\n",
3005  team->t.t_id, team->t.t_max_argc));
3006  team->t.t_argv = &team->t.t_inline_argv[0];
3007  if (__kmp_storage_map) {
3008  __kmp_print_storage_map_gtid(
3009  -1, &team->t.t_inline_argv[0],
3010  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3011  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3012  team->t.t_id);
3013  }
3014  } else {
3015  /* allocate space for arguments in the heap */
3016  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3017  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3018  : 2 * argc;
3019  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3020  "argv entries\n",
3021  team->t.t_id, team->t.t_max_argc));
3022  team->t.t_argv =
3023  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3024  if (__kmp_storage_map) {
3025  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3026  &team->t.t_argv[team->t.t_max_argc],
3027  sizeof(void *) * team->t.t_max_argc,
3028  "team_%d.t_argv", team->t.t_id);
3029  }
3030  }
3031  }
3032 }
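/* Editor's sketch (not part of kmp_runtime.cpp): the sizing policy above keeps up to
   KMP_INLINE_ARGV_ENTRIES arguments in the team structure itself; larger argument
   lists go to the heap, sized to at least KMP_MIN_MALLOC_ARGV_ENTRIES and otherwise
   to 2 * argc so that modest growth avoids a reallocation.  The same computation in
   isolation (the constants are illustrative, not the runtime's actual values):

       enum { INLINE_ENTRIES = 4, MIN_MALLOC_ENTRIES = 100 };

       static int argv_capacity_for(int argc) {
         if (argc <= INLINE_ENTRIES)
           return INLINE_ENTRIES;   // fits in the inline array inside kmp_team_t
         return (argc <= (MIN_MALLOC_ENTRIES >> 1)) ? MIN_MALLOC_ENTRIES : 2 * argc;
       }
*/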
3033 
3034 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3035  int i;
3036  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3037  team->t.t_threads =
3038  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3039  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3040  sizeof(dispatch_shared_info_t) * num_disp_buff);
3041  team->t.t_dispatch =
3042  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3043  team->t.t_implicit_task_taskdata =
3044  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3045  team->t.t_max_nproc = max_nth;
3046 
3047  /* setup dispatch buffers */
3048  for (i = 0; i < num_disp_buff; ++i) {
3049  team->t.t_disp_buffer[i].buffer_index = i;
3050 #if OMP_45_ENABLED
3051  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3052 #endif
3053  }
3054 }
3055 
3056 static void __kmp_free_team_arrays(kmp_team_t *team) {
3057  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3058  int i;
3059  for (i = 0; i < team->t.t_max_nproc; ++i) {
3060  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3061  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3062  team->t.t_dispatch[i].th_disp_buffer = NULL;
3063  }
3064  }
3065  __kmp_free(team->t.t_threads);
3066  __kmp_free(team->t.t_disp_buffer);
3067  __kmp_free(team->t.t_dispatch);
3068  __kmp_free(team->t.t_implicit_task_taskdata);
3069  team->t.t_threads = NULL;
3070  team->t.t_disp_buffer = NULL;
3071  team->t.t_dispatch = NULL;
3072  team->t.t_implicit_task_taskdata = 0;
3073 }
3074 
3075 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3076  kmp_info_t **oldThreads = team->t.t_threads;
3077 
3078  __kmp_free(team->t.t_disp_buffer);
3079  __kmp_free(team->t.t_dispatch);
3080  __kmp_free(team->t.t_implicit_task_taskdata);
3081  __kmp_allocate_team_arrays(team, max_nth);
3082 
3083  KMP_MEMCPY(team->t.t_threads, oldThreads,
3084  team->t.t_nproc * sizeof(kmp_info_t *));
3085 
3086  __kmp_free(oldThreads);
3087 }
3088 
3089 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3090 
3091  kmp_r_sched_t r_sched =
3092  __kmp_get_schedule_global(); // get current state of scheduling globals
3093 
3094 #if OMP_40_ENABLED
3095  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3096 #endif /* OMP_40_ENABLED */
3097 
3098  kmp_internal_control_t g_icvs = {
3099  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3100  (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3101  // for nested parallelism (per thread)
3102  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3103  // adjustment of threads (per thread)
3104  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3105  // whether blocktime is explicitly set
3106  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3107 #if KMP_USE_MONITOR
3108  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3109 // intervals
3110 #endif
3111  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3112  // next parallel region (per thread)
3113  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3114  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3115  // for max_active_levels
3116  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3117 // {sched,chunk} pair
3118 #if OMP_40_ENABLED
3119  __kmp_nested_proc_bind.bind_types[0],
3120  __kmp_default_device,
3121 #endif /* OMP_40_ENABLED */
3122  NULL // struct kmp_internal_control *next;
3123  };
3124 
3125  return g_icvs;
3126 }
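/* Editor's sketch (not part of kmp_runtime.cpp): most fields of g_icvs are seeded
   from environment variables parsed in kmp_settings.cpp and are observable through
   the standard API.  For example, running the program below with
   OMP_NUM_THREADS=4 OMP_DYNAMIC=false OMP_NESTED=true OMP_MAX_ACTIVE_LEVELS=2
   OMP_SCHEDULE="guided,8" reports the corresponding ICV values:

       #include <omp.h>
       #include <stdio.h>

       int main(void) {
         omp_sched_t kind;
         int chunk;
         omp_get_schedule(&kind, &chunk);
         printf("nproc=%d dynamic=%d nested=%d max_active_levels=%d sched=(%d,%d)\n",
                omp_get_max_threads(), omp_get_dynamic(), omp_get_nested(),
                omp_get_max_active_levels(), (int)kind, chunk);
         return 0;
       }
*/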
3127 
3128 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3129 
3130  kmp_internal_control_t gx_icvs;
3131  gx_icvs.serial_nesting_level =
3132  0; // probably =team->t.t_serial like in save_inter_controls
3133  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3134  gx_icvs.next = NULL;
3135 
3136  return gx_icvs;
3137 }
3138 
3139 static void __kmp_initialize_root(kmp_root_t *root) {
3140  int f;
3141  kmp_team_t *root_team;
3142  kmp_team_t *hot_team;
3143  int hot_team_max_nth;
3144  kmp_r_sched_t r_sched =
3145  __kmp_get_schedule_global(); // get current state of scheduling globals
3146  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3147  KMP_DEBUG_ASSERT(root);
3148  KMP_ASSERT(!root->r.r_begin);
3149 
3150  /* setup the root state structure */
3151  __kmp_init_lock(&root->r.r_begin_lock);
3152  root->r.r_begin = FALSE;
3153  root->r.r_active = FALSE;
3154  root->r.r_in_parallel = 0;
3155  root->r.r_blocktime = __kmp_dflt_blocktime;
3156  root->r.r_nested = __kmp_dflt_nested;
3157  root->r.r_cg_nthreads = 1;
3158 
3159  /* setup the root team for this task */
3160  /* allocate the root team structure */
3161  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3162 
3163  root_team =
3164  __kmp_allocate_team(root,
3165  1, // new_nproc
3166  1, // max_nproc
3167 #if OMPT_SUPPORT
3168  ompt_data_none, // root parallel id
3169 #endif
3170 #if OMP_40_ENABLED
3171  __kmp_nested_proc_bind.bind_types[0],
3172 #endif
3173  &r_icvs,
3174  0 // argc
3175  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3176  );
3177 #if USE_DEBUGGER
3178  // Non-NULL value should be assigned to make the debugger display the root
3179  // team.
3180  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3181 #endif
3182 
3183  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3184 
3185  root->r.r_root_team = root_team;
3186  root_team->t.t_control_stack_top = NULL;
3187 
3188  /* initialize root team */
3189  root_team->t.t_threads[0] = NULL;
3190  root_team->t.t_nproc = 1;
3191  root_team->t.t_serialized = 1;
3192  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3193  root_team->t.t_sched.sched = r_sched.sched;
3194  KA_TRACE(
3195  20,
3196  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3197  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3198 
3199  /* setup the hot team for this task */
3200  /* allocate the hot team structure */
3201  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3202 
3203  hot_team =
3204  __kmp_allocate_team(root,
3205  1, // new_nproc
3206  __kmp_dflt_team_nth_ub * 2, // max_nproc
3207 #if OMPT_SUPPORT
3208  ompt_data_none, // root parallel id
3209 #endif
3210 #if OMP_40_ENABLED
3211  __kmp_nested_proc_bind.bind_types[0],
3212 #endif
3213  &r_icvs,
3214  0 // argc
3215  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3216  );
3217  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3218 
3219  root->r.r_hot_team = hot_team;
3220  root_team->t.t_control_stack_top = NULL;
3221 
3222  /* first-time initialization */
3223  hot_team->t.t_parent = root_team;
3224 
3225  /* initialize hot team */
3226  hot_team_max_nth = hot_team->t.t_max_nproc;
3227  for (f = 0; f < hot_team_max_nth; ++f) {
3228  hot_team->t.t_threads[f] = NULL;
3229  }
3230  hot_team->t.t_nproc = 1;
3231  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3232  hot_team->t.t_sched.sched = r_sched.sched;
3233  hot_team->t.t_size_changed = 0;
3234 }
3235 
3236 #ifdef KMP_DEBUG
3237 
3238 typedef struct kmp_team_list_item {
3239  kmp_team_p const *entry;
3240  struct kmp_team_list_item *next;
3241 } kmp_team_list_item_t;
3242 typedef kmp_team_list_item_t *kmp_team_list_t;
3243 
3244 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3245  kmp_team_list_t list, // List of teams.
3246  kmp_team_p const *team // Team to add.
3247  ) {
3248 
3249  // List must terminate with item where both entry and next are NULL.
3250  // Team is added to the list only once.
3251  // List is sorted in ascending order by team id.
3252  // Team id is *not* a key.
3253 
3254  kmp_team_list_t l;
3255 
3256  KMP_DEBUG_ASSERT(list != NULL);
3257  if (team == NULL) {
3258  return;
3259  }
3260 
3261  __kmp_print_structure_team_accum(list, team->t.t_parent);
3262  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3263 
3264  // Search list for the team.
3265  l = list;
3266  while (l->next != NULL && l->entry != team) {
3267  l = l->next;
3268  }
3269  if (l->next != NULL) {
3270  return; // Team has been added before, exit.
3271  }
3272 
3273  // Team is not found. Search list again for insertion point.
3274  l = list;
3275  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3276  l = l->next;
3277  }
3278 
3279  // Insert team.
3280  {
3281  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3282  sizeof(kmp_team_list_item_t));
3283  *item = *l;
3284  l->entry = team;
3285  l->next = item;
3286  }
3287 }
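
// A minimal standalone sketch of the list discipline used by
// __kmp_print_structure_team_accum above: the list ends with a sentinel node
// (entry == NULL and next == NULL in the code above; a zeroed node below), and
// a new element is inserted *before* node `l` by copying `*l` into a freshly
// malloc'ed node and then rewriting `l` in place. The item_t/sorted_insert
// names are hypothetical; this is illustrative only and not compiled into the
// runtime.
#if 0
#include <cstdio>
#include <cstdlib>

struct item_t {
  int id;       // sort key (ascending)
  item_t *next; // NULL only in the terminating sentinel
};

static void sorted_insert(item_t *list, int id) {
  item_t *l = list;
  while (l->next != NULL && l->id <= id) // find the insertion point
    l = l->next;
  item_t *copy = (item_t *)malloc(sizeof(item_t));
  *copy = *l;     // push the current node (or sentinel) one slot down
  l->id = id;     // reuse the node in place for the new element
  l->next = copy;
}

int main() {
  item_t *list = (item_t *)calloc(1, sizeof(item_t)); // sentinel node
  sorted_insert(list, 3);
  sorted_insert(list, 1);
  sorted_insert(list, 2);
  for (item_t *l = list; l->next != NULL; l = l->next)
    printf("%d\n", l->id); // prints 1, 2, 3
  while (list != NULL) { // free the list, sentinel included
    item_t *next = list->next;
    free(list);
    list = next;
  }
  return 0;
}
#endif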
3288 
3289 static void __kmp_print_structure_team(char const *title,
3290                                        kmp_team_p const *team) {
3292  __kmp_printf("%s", title);
3293  if (team != NULL) {
3294  __kmp_printf("%2x %p\n", team->t.t_id, team);
3295  } else {
3296  __kmp_printf(" - (nil)\n");
3297  }
3298 }
3299 
3300 static void __kmp_print_structure_thread(char const *title,
3301  kmp_info_p const *thread) {
3302  __kmp_printf("%s", title);
3303  if (thread != NULL) {
3304  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3305  } else {
3306  __kmp_printf(" - (nil)\n");
3307  }
3308 }
3309 
3310 void __kmp_print_structure(void) {
3311 
3312  kmp_team_list_t list;
3313 
3314  // Initialize list of teams.
3315  list =
3316  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3317  list->entry = NULL;
3318  list->next = NULL;
3319 
3320  __kmp_printf("\n------------------------------\nGlobal Thread "
3321  "Table\n------------------------------\n");
3322  {
3323  int gtid;
3324  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3325  __kmp_printf("%2d", gtid);
3326  if (__kmp_threads != NULL) {
3327  __kmp_printf(" %p", __kmp_threads[gtid]);
3328  }
3329  if (__kmp_root != NULL) {
3330  __kmp_printf(" %p", __kmp_root[gtid]);
3331  }
3332  __kmp_printf("\n");
3333  }
3334  }
3335 
3336  // Print out __kmp_threads array.
3337  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3338  "----------\n");
3339  if (__kmp_threads != NULL) {
3340  int gtid;
3341  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3342  kmp_info_t const *thread = __kmp_threads[gtid];
3343  if (thread != NULL) {
3344  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3345  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3346  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3347  __kmp_print_structure_team(" Serial Team: ",
3348  thread->th.th_serial_team);
3349  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3350  __kmp_print_structure_thread(" Master: ",
3351  thread->th.th_team_master);
3352  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3353  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3354 #if OMP_40_ENABLED
3355  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3356 #endif
3357  __kmp_print_structure_thread(" Next in pool: ",
3358  thread->th.th_next_pool);
3359  __kmp_printf("\n");
3360  __kmp_print_structure_team_accum(list, thread->th.th_team);
3361  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3362  }
3363  }
3364  } else {
3365  __kmp_printf("Threads array is not allocated.\n");
3366  }
3367 
3368  // Print out __kmp_root array.
3369  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3370  "--------\n");
3371  if (__kmp_root != NULL) {
3372  int gtid;
3373  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3374  kmp_root_t const *root = __kmp_root[gtid];
3375  if (root != NULL) {
3376  __kmp_printf("GTID %2d %p:\n", gtid, root);
3377  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3378  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3379  __kmp_print_structure_thread(" Uber Thread: ",
3380  root->r.r_uber_thread);
3381  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3382  __kmp_printf(" Nested?: %2d\n", root->r.r_nested);
3383  __kmp_printf(" In Parallel: %2d\n", root->r.r_in_parallel);
3384  __kmp_printf("\n");
3385  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3386  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3387  }
3388  }
3389  } else {
3390  __kmp_printf("Ubers array is not allocated.\n");
3391  }
3392 
3393  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3394  "--------\n");
3395  while (list->next != NULL) {
3396  kmp_team_p const *team = list->entry;
3397  int i;
3398  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3399  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3400  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3401  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3402  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3403  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3404  for (i = 0; i < team->t.t_nproc; ++i) {
3405  __kmp_printf(" Thread %2d: ", i);
3406  __kmp_print_structure_thread("", team->t.t_threads[i]);
3407  }
3408  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3409  __kmp_printf("\n");
3410  list = list->next;
3411  }
3412 
3413  // Print out __kmp_thread_pool and __kmp_team_pool.
3414  __kmp_printf("\n------------------------------\nPools\n----------------------"
3415  "--------\n");
3416  __kmp_print_structure_thread("Thread pool: ",
3417  CCAST(kmp_info_t *, __kmp_thread_pool));
3418  __kmp_print_structure_team("Team pool: ",
3419  CCAST(kmp_team_t *, __kmp_team_pool));
3420  __kmp_printf("\n");
3421 
3422  // Free team list.
3423  while (list != NULL) {
3424  kmp_team_list_item_t *item = list;
3425  list = list->next;
3426  KMP_INTERNAL_FREE(item);
3427  }
3428 }
3429 
3430 #endif
3431 
3432 //---------------------------------------------------------------------------
3433 // Stuff for per-thread fast random number generator
3434 // Table of primes
3435 static const unsigned __kmp_primes[] = {
3436  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3437  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3438  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3439  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3440  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3441  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3442  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3443  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3444  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3445  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3446  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3447 
3448 //---------------------------------------------------------------------------
3449 // __kmp_get_random: Get a random number using a linear congruential method.
3450 unsigned short __kmp_get_random(kmp_info_t *thread) {
3451  unsigned x = thread->th.th_x;
3452  unsigned short r = x >> 16;
3453 
3454  thread->th.th_x = x * thread->th.th_a + 1;
3455 
3456  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3457  thread->th.th_info.ds.ds_tid, r));
3458 
3459  return r;
3460 }
3461 //--------------------------------------------------------
3462 // __kmp_init_random: Initialize a random number generator
3463 void __kmp_init_random(kmp_info_t *thread) {
3464  unsigned seed = thread->th.th_info.ds.ds_tid;
3465 
3466  thread->th.th_a =
3467  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3468  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3469  KA_TRACE(30,
3470  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3471 }
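
// A minimal standalone sketch of the generator above: a 32-bit linear
// congruential step x = x * a + 1 (mod 2^32 via unsigned wraparound), where `a`
// is a per-thread multiplier picked from the primes table by thread id and the
// high 16 bits of the previous state are returned. The tiny_rng names are
// hypothetical; this is illustrative only and not compiled into the runtime.
#if 0
#include <cstdio>

struct tiny_rng {
  unsigned a; // per-thread multiplier, e.g. one of the __kmp_primes entries
  unsigned x; // current state
};

static void tiny_rng_init(tiny_rng *r, unsigned seed, unsigned multiplier) {
  r->a = multiplier;
  r->x = (seed + 1) * r->a + 1; // same seeding rule as __kmp_init_random
}

static unsigned short tiny_rng_next(tiny_rng *r) {
  unsigned short result = (unsigned short)(r->x >> 16); // high 16 bits of state
  r->x = r->x * r->a + 1; // advance; unsigned arithmetic wraps mod 2^32
  return result;
}

int main() {
  tiny_rng r;
  tiny_rng_init(&r, /*seed=*/0, /*multiplier=*/0x9e3779b1u);
  for (int i = 0; i < 4; ++i)
    printf("%u\n", (unsigned)tiny_rng_next(&r));
  return 0;
}
#endif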
3472 
3473 #if KMP_OS_WINDOWS
3474 /* reclaim array entries for root threads that are already dead, returns number
3475  * reclaimed */
3476 static int __kmp_reclaim_dead_roots(void) {
3477  int i, r = 0;
3478 
3479  for (i = 0; i < __kmp_threads_capacity; ++i) {
3480  if (KMP_UBER_GTID(i) &&
3481  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3482  !__kmp_root[i]
3483  ->r.r_active) { // AC: reclaim only roots died in non-active state
3484  r += __kmp_unregister_root_other_thread(i);
3485  }
3486  }
3487  return r;
3488 }
3489 #endif
3490 
3491 /* This function attempts to create free entries in __kmp_threads and
3492  __kmp_root, and returns the number of free entries generated.
3493 
3494  For Windows* OS static library, the first mechanism used is to reclaim array
3495  entries for root threads that are already dead.
3496 
3497  On all platforms, expansion is attempted on the arrays __kmp_threads and
3498  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3499  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3500  threadprivate cache array has been created. Synchronization with
3501  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3502 
3503  After any dead root reclamation, if the clipping value allows array expansion
3504  to result in the generation of a total of nWish free slots, the function does
3505  that expansion. If not, but the clipping value allows array expansion to
3506  result in the generation of a total of nNeed free slots, the function does
3507  that expansion. Otherwise, nothing is done beyond the possible initial root
3508  thread reclamation. However, if nNeed is zero, a best-effort attempt is made
3509  to fulfil nWish as far as possible, i.e. the function will attempt to create
3510  as many free slots as possible up to nWish.
3511 
3512  If any argument is negative, the behavior is undefined. */
3513 static int __kmp_expand_threads(int nWish, int nNeed) {
3514  int added = 0;
3515  int old_tp_cached;
3516  int __kmp_actual_max_nth;
3517 
3518  if (nNeed > nWish) /* normalize the arguments */
3519  nWish = nNeed;
3520 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3521  /* only for Windows static library */
3522  /* reclaim array entries for root threads that are already dead */
3523  added = __kmp_reclaim_dead_roots();
3524 
3525  if (nNeed) {
3526  nNeed -= added;
3527  if (nNeed < 0)
3528  nNeed = 0;
3529  }
3530  if (nWish) {
3531  nWish -= added;
3532  if (nWish < 0)
3533  nWish = 0;
3534  }
3535 #endif
3536  if (nWish <= 0)
3537  return added;
3538 
3539  while (1) {
3540  int nTarget;
3541  int minimumRequiredCapacity;
3542  int newCapacity;
3543  kmp_info_t **newThreads;
3544  kmp_root_t **newRoot;
3545 
3546  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3547  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3548  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3549  // > __kmp_max_nth in one of two ways:
3550  //
3551  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3552  // may not be resused by another thread, so we may need to increase
3553  // may not be reused by another thread, so we may need to increase
3554  //
3555  // 2) New foreign root(s) are encountered. We always register new foreign
3556  // roots. This may cause a smaller # of threads to be allocated at
3557  // subsequent parallel regions, but the worker threads hang around (and
3558  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3559  //
3560  // Anyway, that is the reason for moving the check to see if
3561  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3562  // instead of having it performed here. -BB
3563  old_tp_cached = __kmp_tp_cached;
3564  __kmp_actual_max_nth =
3565  old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3566  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3567 
3568  /* compute expansion headroom to check if we can expand and whether to aim
3569  for nWish or nNeed */
3570  nTarget = nWish;
3571  if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3572  /* can't fulfil nWish, so try nNeed */
3573  if (nNeed) {
3574  nTarget = nNeed;
3575  if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3576  /* possible expansion too small -- give up */
3577  break;
3578  }
3579  } else {
3580  /* best-effort */
3581  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3582  if (!nTarget) {
3583  /* cannot expand at all -- give up */
3584  break;
3585  }
3586  }
3587  }
3588  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3589 
3590  newCapacity = __kmp_threads_capacity;
3591  do {
3592  newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
3593  ? (newCapacity << 1)
3594  : __kmp_actual_max_nth;
3595  } while (newCapacity < minimumRequiredCapacity);
3596  newThreads = (kmp_info_t **)__kmp_allocate(
3597  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
3598  CACHE_LINE);
3599  newRoot = (kmp_root_t **)((char *)newThreads +
3600  sizeof(kmp_info_t *) * newCapacity);
3601  KMP_MEMCPY(newThreads, __kmp_threads,
3602  __kmp_threads_capacity * sizeof(kmp_info_t *));
3603  KMP_MEMCPY(newRoot, __kmp_root,
3604  __kmp_threads_capacity * sizeof(kmp_root_t *));
3605  memset(newThreads + __kmp_threads_capacity, 0,
3606  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
3607  memset(newRoot + __kmp_threads_capacity, 0,
3608  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
3609 
3610  if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3611  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
3612  allocated a threadprivate cache while we were allocating the expanded
3613  array, and our new capacity is larger than the threadprivate cache
3614  capacity, so we should deallocate the expanded arrays and try again.
3615  This is the first check of a double-check pair. */
3616  __kmp_free(newThreads);
3617  continue; /* start over and try again */
3618  }
3619  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3620  if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3621  /* Same check as above, but this time with the lock so we can be sure if
3622  we can succeed. */
3623  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3624  __kmp_free(newThreads);
3625  continue; /* start over and try again */
3626  } else {
3627  /* success */
3628  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
3629  // investigated.
3630  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3631  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3632  added += newCapacity - __kmp_threads_capacity;
3633  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3634  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3635  break; /* succeeded, so we can exit the loop */
3636  }
3637  }
3638  return added;
3639 }
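
// A minimal sketch of the capacity-growth rule in the loop above: repeatedly
// double the current capacity until it covers the required minimum, clipping
// the result to an upper bound (the threadprivate cache capacity or the system
// maximum in the runtime). grow_capacity is a hypothetical helper; the sketch
// assumes required <= max_allowed, which the headroom check against
// __kmp_actual_max_nth guarantees above. Illustrative only; not compiled into
// the runtime.
#if 0
#include <cstdio>

static int grow_capacity(int current, int required, int max_allowed) {
  int capacity = current;
  do {
    capacity = (capacity <= (max_allowed >> 1)) ? (capacity << 1) : max_allowed;
  } while (capacity < required);
  return capacity;
}

int main() {
  printf("%d\n", grow_capacity(32, 33, 1024));  // 64: one doubling is enough
  printf("%d\n", grow_capacity(32, 200, 1024)); // 256: keeps doubling
  printf("%d\n", grow_capacity(32, 90, 100));   // 100: clipped to the maximum
  return 0;
}
#endif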
3640 
3641 /* Register the current thread as a root thread and obtain our gtid. We must
3642  have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3643  the thread that calls from __kmp_do_serial_initialize(). */
3644 int __kmp_register_root(int initial_thread) {
3645  kmp_info_t *root_thread;
3646  kmp_root_t *root;
3647  int gtid;
3648  int capacity;
3649  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3650  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3651  KMP_MB();
3652 
3653  /* 2007-03-02:
3654  If the initial thread has not invoked the OpenMP RTL yet, and this thread is
3655  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3656  does not work as expected -- it may return false (meaning there is at least
3657  one empty slot in the __kmp_threads array), but it is possible that the only
3658  free slot is #0, which is reserved for the initial thread and so cannot be
3659  used for this one. The following code works around this problem.
3660  
3661  However, the right solution seems to be not to reserve slot #0 for the
3662  initial thread, because:
3663  (1) there is no magic in slot #0,
3664  (2) we cannot detect the initial thread reliably (the first thread that does
3665  serial initialization may not be a real initial thread).
3666  */
3667  capacity = __kmp_threads_capacity;
3668  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3669  --capacity;
3670  }
3671 
3672  /* see if there are too many threads */
3673  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1, 1)) {
3674  if (__kmp_tp_cached) {
3675  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3676  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3677  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3678  } else {
3679  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3680  __kmp_msg_null);
3681  }
3682  }
3683 
3684  /* find an available thread slot */
3685  /* Don't reassign the zero slot since we need that to only be used by initial
3686  thread */
3687  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3688  gtid++)
3689  ;
3690  KA_TRACE(1,
3691  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3692  KMP_ASSERT(gtid < __kmp_threads_capacity);
3693 
3694  /* update global accounting */
3695  __kmp_all_nth++;
3696  TCW_4(__kmp_nth, __kmp_nth + 1);
3697 
3698  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3699  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3700  if (__kmp_adjust_gtid_mode) {
3701  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3702  if (TCR_4(__kmp_gtid_mode) != 2) {
3703  TCW_4(__kmp_gtid_mode, 2);
3704  }
3705  } else {
3706  if (TCR_4(__kmp_gtid_mode) != 1) {
3707  TCW_4(__kmp_gtid_mode, 1);
3708  }
3709  }
3710  }
3711 
3712 #ifdef KMP_ADJUST_BLOCKTIME
3713  /* Adjust blocktime to zero if necessary */
3714  /* Middle initialization might not have occurred yet */
3715  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3716  if (__kmp_nth > __kmp_avail_proc) {
3717  __kmp_zero_bt = TRUE;
3718  }
3719  }
3720 #endif /* KMP_ADJUST_BLOCKTIME */
3721 
3722  /* setup this new hierarchy */
3723  if (!(root = __kmp_root[gtid])) {
3724  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3725  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3726  }
3727 
3728 #if KMP_STATS_ENABLED
3729  // Initialize stats as soon as possible (right after gtid assignment).
3730  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3731  KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3732  KMP_SET_THREAD_STATE(SERIAL_REGION);
3733  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3734 #endif
3735  __kmp_initialize_root(root);
3736 
3737  /* setup new root thread structure */
3738  if (root->r.r_uber_thread) {
3739  root_thread = root->r.r_uber_thread;
3740  } else {
3741  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3742  if (__kmp_storage_map) {
3743  __kmp_print_thread_storage_map(root_thread, gtid);
3744  }
3745  root_thread->th.th_info.ds.ds_gtid = gtid;
3746 #if OMPT_SUPPORT
3747  root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3748 #endif
3749  root_thread->th.th_root = root;
3750  if (__kmp_env_consistency_check) {
3751  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3752  }
3753 #if USE_FAST_MEMORY
3754  __kmp_initialize_fast_memory(root_thread);
3755 #endif /* USE_FAST_MEMORY */
3756 
3757 #if KMP_USE_BGET
3758  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3759  __kmp_initialize_bget(root_thread);
3760 #endif
3761  __kmp_init_random(root_thread); // Initialize random number generator
3762  }
3763 
3764  /* setup the serial team held in reserve by the root thread */
3765  if (!root_thread->th.th_serial_team) {
3766  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3767  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3768  root_thread->th.th_serial_team =
3769  __kmp_allocate_team(root, 1, 1,
3770 #if OMPT_SUPPORT
3771  ompt_data_none, // root parallel id
3772 #endif
3773 #if OMP_40_ENABLED
3774  proc_bind_default,
3775 #endif
3776  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3777  }
3778  KMP_ASSERT(root_thread->th.th_serial_team);
3779  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3780  root_thread->th.th_serial_team));
3781 
3782  /* drop root_thread into place */
3783  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3784 
3785  root->r.r_root_team->t.t_threads[0] = root_thread;
3786  root->r.r_hot_team->t.t_threads[0] = root_thread;
3787  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3788  // AC: the team created in reserve, not for execution (it is unused for now).
3789  root_thread->th.th_serial_team->t.t_serialized = 0;
3790  root->r.r_uber_thread = root_thread;
3791 
3792  /* initialize the thread, get it ready to go */
3793  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3794  TCW_4(__kmp_init_gtid, TRUE);
3795 
3796  /* prepare the master thread for get_gtid() */
3797  __kmp_gtid_set_specific(gtid);
3798 
3799 #if USE_ITT_BUILD
3800  __kmp_itt_thread_name(gtid);
3801 #endif /* USE_ITT_BUILD */
3802 
3803 #ifdef KMP_TDATA_GTID
3804  __kmp_gtid = gtid;
3805 #endif
3806  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3807  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3808 
3809  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3810  "plain=%u\n",
3811  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3812  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3813  KMP_INIT_BARRIER_STATE));
3814  { // Initialize barrier data.
3815  int b;
3816  for (b = 0; b < bs_last_barrier; ++b) {
3817  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3818 #if USE_DEBUGGER
3819  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3820 #endif
3821  }
3822  }
3823  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3824  KMP_INIT_BARRIER_STATE);
3825 
3826 #if KMP_AFFINITY_SUPPORTED
3827 #if OMP_40_ENABLED
3828  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3829  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3830  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3831  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3832 #endif
3833 
3834  if (TCR_4(__kmp_init_middle)) {
3835  __kmp_affinity_set_init_mask(gtid, TRUE);
3836  }
3837 #endif /* KMP_AFFINITY_SUPPORTED */
3838 
3839  __kmp_root_counter++;
3840 
3841 #if OMPT_SUPPORT
3842  if (!initial_thread && ompt_enabled.enabled) {
3843 
3844  ompt_thread_t *root_thread = ompt_get_thread();
3845 
3846  ompt_set_thread_state(root_thread, omp_state_overhead);
3847 
3848  if (ompt_enabled.ompt_callback_thread_begin) {
3849  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3850  ompt_thread_initial, __ompt_get_thread_data_internal());
3851  }
3852  ompt_data_t *task_data;
3853  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3854  if (ompt_enabled.ompt_callback_task_create) {
3855  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3856  NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3857  // initial task has nothing to return to
3858  }
3859 
3860  ompt_set_thread_state(root_thread, omp_state_work_serial);
3861  }
3862 #endif
3863 
3864  KMP_MB();
3865  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3866 
3867  return gtid;
3868 }
3869 
3870 #if KMP_NESTED_HOT_TEAMS
3871 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3872  const int max_level) {
3873  int i, n, nth;
3874  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3875  if (!hot_teams || !hot_teams[level].hot_team) {
3876  return 0;
3877  }
3878  KMP_DEBUG_ASSERT(level < max_level);
3879  kmp_team_t *team = hot_teams[level].hot_team;
3880  nth = hot_teams[level].hot_team_nth;
3881  n = nth - 1; // master is not freed
3882  if (level < max_level - 1) {
3883  for (i = 0; i < nth; ++i) {
3884  kmp_info_t *th = team->t.t_threads[i];
3885  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3886  if (i > 0 && th->th.th_hot_teams) {
3887  __kmp_free(th->th.th_hot_teams);
3888  th->th.th_hot_teams = NULL;
3889  }
3890  }
3891  }
3892  __kmp_free_team(root, team, NULL);
3893  return n;
3894 }
3895 #endif
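
// A minimal sketch of the counting scheme used by __kmp_free_hot_teams above:
// a team of nth threads contributes nth - 1 freed threads (its master is kept),
// plus whatever the nested hot teams owned by its threads contribute at the
// next level. count_freed is a hypothetical helper that assumes a uniform
// hierarchy where every thread at level L owns an nth[L+1]-thread team;
// illustrative only and not compiled into the runtime.
#if 0
#include <cstdio>

static int count_freed(const int *nth, int level, int max_level) {
  if (level >= max_level)
    return 0;               // no team at this level
  int n = nth[level] - 1;   // master is not freed
  for (int i = 0; i < nth[level]; ++i)
    n += count_freed(nth, level + 1, max_level); // each thread's nested team
  return n;
}

int main() {
  int nth[] = {4, 2}; // 4-thread outer hot team, 2-thread nested hot teams
  printf("%d\n", count_freed(nth, 0, 2)); // 3 outer workers + 4 * 1 = 7
  return 0;
}
#endif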
3896 
3897 // Resets a root thread and clears its root and hot teams.
3898 // Returns the number of __kmp_threads entries directly and indirectly freed.
3899 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3900  kmp_team_t *root_team = root->r.r_root_team;
3901  kmp_team_t *hot_team = root->r.r_hot_team;
3902  int n = hot_team->t.t_nproc;
3903  int i;
3904 
3905  KMP_DEBUG_ASSERT(!root->r.r_active);
3906 
3907  root->r.r_root_team = NULL;
3908  root->r.r_hot_team = NULL;
3909  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3910  // before call to __kmp_free_team().
3911  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3912 #if KMP_NESTED_HOT_TEAMS
3913  if (__kmp_hot_teams_max_level >
3914  0) { // need to free nested hot teams and their threads if any
3915  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3916  kmp_info_t *th = hot_team->t.t_threads[i];
3917  if (__kmp_hot_teams_max_level > 1) {
3918  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3919  }
3920  if (th->th.th_hot_teams) {
3921  __kmp_free(th->th.th_hot_teams);
3922  th->th.th_hot_teams = NULL;
3923  }
3924  }
3925  }
3926 #endif
3927  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3928 
3929  // Before we can reap the thread, we need to make certain that all other
3930  // threads in the teams that had this root as ancestor have stopped trying to
3931  // steal tasks.
3932  if (__kmp_tasking_mode != tskm_immediate_exec) {
3933  __kmp_wait_to_unref_task_teams();
3934  }
3935 
3936 #if KMP_OS_WINDOWS
3937  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3938  KA_TRACE(
3939  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3940  "\n",
3941  (LPVOID) & (root->r.r_uber_thread->th),
3942  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3943  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3944 #endif /* KMP_OS_WINDOWS */
3945 
3946 #if OMPT_SUPPORT
3947  if (ompt_enabled.ompt_callback_thread_end) {
3948  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3949  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3950  }
3951 #endif
3952 
3953  TCW_4(__kmp_nth,
3954  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3955  root->r.r_cg_nthreads--;
3956 
3957  __kmp_reap_thread(root->r.r_uber_thread, 1);
3958 
3959  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3960  // it instead of freeing it.
3961  root->r.r_uber_thread = NULL;
3962  /* mark root as no longer in use */
3963  root->r.r_begin = FALSE;
3964 
3965  return n;
3966 }
3967 
3968 void __kmp_unregister_root_current_thread(int gtid) {
3969  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3970  /* this lock should be ok, since unregister_root_current_thread is never
3971  called during an abort, only during a normal close. furthermore, if you
3972  have the forkjoin lock, you should never try to get the initz lock */
3973  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3974  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3975  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3976  "exiting T#%d\n",
3977  gtid));
3978  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3979  return;
3980  }
3981  kmp_root_t *root = __kmp_root[gtid];
3982 
3983  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3984  KMP_ASSERT(KMP_UBER_GTID(gtid));
3985  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3986  KMP_ASSERT(root->r.r_active == FALSE);
3987 
3988  KMP_MB();
3989 
3990 #if OMP_45_ENABLED
3991  kmp_info_t *thread = __kmp_threads[gtid];
3992  kmp_team_t *team = thread->th.th_team;
3993  kmp_task_team_t *task_team = thread->th.th_task_team;
3994 
3995  // we need to wait for the proxy tasks before finishing the thread
3996  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3997 #if OMPT_SUPPORT
3998  // the runtime is shutting down so we won't report any events
3999  thread->th.ompt_thread_info.state = omp_state_undefined;
4000 #endif
4001  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4002  }
4003 #endif
4004 
4005  __kmp_reset_root(gtid, root);
4006 
4007  /* free up this thread slot */
4008  __kmp_gtid_set_specific(KMP_GTID_DNE);
4009 #ifdef KMP_TDATA_GTID
4010  __kmp_gtid = KMP_GTID_DNE;
4011 #endif
4012 
4013  KMP_MB();
4014  KC_TRACE(10,
4015  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4016 
4017  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4018 }
4019 
4020 #if KMP_OS_WINDOWS
4021 /* __kmp_forkjoin_lock must be already held
4022  Unregisters a root thread that is not the current thread. Returns the number
4023  of __kmp_threads entries freed as a result. */
4024 static int __kmp_unregister_root_other_thread(int gtid) {
4025  kmp_root_t *root = __kmp_root[gtid];
4026  int r;
4027 
4028  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4029  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4030  KMP_ASSERT(KMP_UBER_GTID(gtid));
4031  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4032  KMP_ASSERT(root->r.r_active == FALSE);
4033 
4034  r = __kmp_reset_root(gtid, root);
4035  KC_TRACE(10,
4036  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4037  return r;
4038 }
4039 #endif
4040 
4041 #if KMP_DEBUG
4042 void __kmp_task_info() {
4043 
4044  kmp_int32 gtid = __kmp_entry_gtid();
4045  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4046  kmp_info_t *this_thr = __kmp_threads[gtid];
4047  kmp_team_t *steam = this_thr->th.th_serial_team;
4048  kmp_team_t *team = this_thr->th.th_team;
4049 
4050  __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4051  "ptask=%p\n",
4052  gtid, tid, this_thr, team, this_thr->th.th_current_task,
4053  team->t.t_implicit_task_taskdata[tid].td_parent);
4054 }
4055 #endif // KMP_DEBUG
4056 
4057 /* TODO optimize with one big memclr, take out what isn't needed, split
4058  responsibility to workers as much as possible, and delay initialization of
4059  features as much as possible */
4060 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4061  int tid, int gtid) {
4062  /* this_thr->th.th_info.ds.ds_gtid is setup in
4063  kmp_allocate_thread/create_worker.
4064  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4065  kmp_info_t *master = team->t.t_threads[0];
4066  KMP_DEBUG_ASSERT(this_thr != NULL);
4067  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4068  KMP_DEBUG_ASSERT(team);
4069  KMP_DEBUG_ASSERT(team->t.t_threads);
4070  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4071  KMP_DEBUG_ASSERT(master);
4072  KMP_DEBUG_ASSERT(master->th.th_root);
4073 
4074  KMP_MB();
4075 
4076  TCW_SYNC_PTR(this_thr->th.th_team, team);
4077 
4078  this_thr->th.th_info.ds.ds_tid = tid;
4079  this_thr->th.th_set_nproc = 0;
4080  if (__kmp_tasking_mode != tskm_immediate_exec)
4081  // When tasking is possible, threads are not safe to reap until they are
4082  // done tasking; this will be set when tasking code is exited in wait
4083  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4084  else // no tasking --> always safe to reap
4085  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4086 #if OMP_40_ENABLED
4087  this_thr->th.th_set_proc_bind = proc_bind_default;
4088 #if KMP_AFFINITY_SUPPORTED
4089  this_thr->th.th_new_place = this_thr->th.th_current_place;
4090 #endif
4091 #endif
4092  this_thr->th.th_root = master->th.th_root;
4093 
4094  /* setup the thread's cache of the team structure */
4095  this_thr->th.th_team_nproc = team->t.t_nproc;
4096  this_thr->th.th_team_master = master;
4097  this_thr->th.th_team_serialized = team->t.t_serialized;
4098  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4099 
4100  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4101 
4102  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4103  tid, gtid, this_thr, this_thr->th.th_current_task));
4104 
4105  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4106  team, tid, TRUE);
4107 
4108  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4109  tid, gtid, this_thr, this_thr->th.th_current_task));
4110  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4111  // __kmp_initialize_team()?
4112 
4113  /* TODO no worksharing in speculative threads */
4114  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4115 
4116  this_thr->th.th_local.this_construct = 0;
4117 
4118  if (!this_thr->th.th_pri_common) {
4119  this_thr->th.th_pri_common =
4120  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4121  if (__kmp_storage_map) {
4122  __kmp_print_storage_map_gtid(
4123  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4124  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4125  }
4126  this_thr->th.th_pri_head = NULL;
4127  }
4128 
4129  /* Initialize dynamic dispatch */
4130  {
4131  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4132  // Use team max_nproc since this will never change for the team.
4133  size_t disp_size =
4134  sizeof(dispatch_private_info_t) *
4135  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4136  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4137  team->t.t_max_nproc));
4138  KMP_ASSERT(dispatch);
4139  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4140  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4141 
4142  dispatch->th_disp_index = 0;
4143 #if OMP_45_ENABLED
4144  dispatch->th_doacross_buf_idx = 0;
4145 #endif
4146  if (!dispatch->th_disp_buffer) {
4147  dispatch->th_disp_buffer =
4148  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4149 
4150  if (__kmp_storage_map) {
4151  __kmp_print_storage_map_gtid(
4152  gtid, &dispatch->th_disp_buffer[0],
4153  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4154  ? 1
4155  : __kmp_dispatch_num_buffers],
4156  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4157  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4158  gtid, team->t.t_id, gtid);
4159  }
4160  } else {
4161  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4162  }
4163 
4164  dispatch->th_dispatch_pr_current = 0;
4165  dispatch->th_dispatch_sh_current = 0;
4166 
4167  dispatch->th_deo_fcn = 0; /* ORDERED */
4168  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4169  }
4170 
4171  this_thr->th.th_next_pool = NULL;
4172 
4173  if (!this_thr->th.th_task_state_memo_stack) {
4174  size_t i;
4175  this_thr->th.th_task_state_memo_stack =
4176  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4177  this_thr->th.th_task_state_top = 0;
4178  this_thr->th.th_task_state_stack_sz = 4;
4179  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4180  ++i) // zero init the stack
4181  this_thr->th.th_task_state_memo_stack[i] = 0;
4182  }
4183 
4184  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4185  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4186 
4187  KMP_MB();
4188 }
4189 
4190 /* allocate a new thread for the requesting team. this is only called from
4191  within a forkjoin critical section. we will first try to get an available
4192  thread from the thread pool. if none is available, we will fork a new one
4193  assuming we are able to create a new one. this should be assured, as the
4194  caller should check on this first. */
4195 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4196  int new_tid) {
4197  kmp_team_t *serial_team;
4198  kmp_info_t *new_thr;
4199  int new_gtid;
4200 
4201  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4202  KMP_DEBUG_ASSERT(root && team);
4203 #if !KMP_NESTED_HOT_TEAMS
4204  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4205 #endif
4206  KMP_MB();
4207 
4208  /* first, try to get one from the thread pool */
4209  if (__kmp_thread_pool) {
4210 
4211  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4212  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4213  if (new_thr == __kmp_thread_pool_insert_pt) {
4214  __kmp_thread_pool_insert_pt = NULL;
4215  }
4216  TCW_4(new_thr->th.th_in_pool, FALSE);
4217  // Don't touch th_active_in_pool or th_active.
4218  // The worker thread adjusts those flags as it sleeps/awakens.
4219  __kmp_thread_pool_nth--;
4220 
4221  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4222  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4223  KMP_ASSERT(!new_thr->th.th_team);
4224  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4225  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4226 
4227  /* setup the thread structure */
4228  __kmp_initialize_info(new_thr, team, new_tid,
4229  new_thr->th.th_info.ds.ds_gtid);
4230  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4231 
4232  TCW_4(__kmp_nth, __kmp_nth + 1);
4233  root->r.r_cg_nthreads++;
4234 
4235  new_thr->th.th_task_state = 0;
4236  new_thr->th.th_task_state_top = 0;
4237  new_thr->th.th_task_state_stack_sz = 4;
4238 
4239 #ifdef KMP_ADJUST_BLOCKTIME
4240  /* Adjust blocktime back to zero if necessary */
4241  /* Middle initialization might not have occurred yet */
4242  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4243  if (__kmp_nth > __kmp_avail_proc) {
4244  __kmp_zero_bt = TRUE;
4245  }
4246  }
4247 #endif /* KMP_ADJUST_BLOCKTIME */
4248 
4249 #if KMP_DEBUG
4250  // If the thread entered the pool via __kmp_free_thread, wait_flag should not
4251  // be KMP_BARRIER_PARENT_FLAG.
4252  int b;
4253  kmp_balign_t *balign = new_thr->th.th_bar;
4254  for (b = 0; b < bs_last_barrier; ++b)
4255  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4256 #endif
4257 
4258  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4259  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4260 
4261  KMP_MB();
4262  return new_thr;
4263  }
4264 
4265  /* no, we'll fork a new one */
4266  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4267  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4268 
4269 #if KMP_USE_MONITOR
4270  // If this is the first worker thread the RTL is creating, then also
4271  // launch the monitor thread. We try to do this as early as possible.
4272  if (!TCR_4(__kmp_init_monitor)) {
4273  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4274  if (!TCR_4(__kmp_init_monitor)) {
4275  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4276  TCW_4(__kmp_init_monitor, 1);
4277  __kmp_create_monitor(&__kmp_monitor);
4278  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4279 #if KMP_OS_WINDOWS
4280  // AC: wait until monitor has started. This is a fix for CQ232808.
4281  // The reason is that if the library is loaded/unloaded in a loop with
4282  // small (parallel) work in between, then there is a high probability that
4283  // the monitor thread starts only after the library has shut down. At
4284  // shutdown it is too late to cope with the problem, because when the master
4285  // is in DllMain (process detach) the monitor has no chance to start (it is
4286  // blocked), and the master has no means to inform the monitor that the
4287  // library has gone, because all the memory which the monitor can access
4288  // is going to be released/reset.
4289  while (TCR_4(__kmp_init_monitor) < 2) {
4290  KMP_YIELD(TRUE);
4291  }
4292  KF_TRACE(10, ("after monitor thread has started\n"));
4293 #endif
4294  }
4295  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4296  }
4297 #endif
4298 
4299  KMP_MB();
4300  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4301  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4302  }
4303 
4304  /* allocate space for it. */
4305  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4306 
4307  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4308 
4309  if (__kmp_storage_map) {
4310  __kmp_print_thread_storage_map(new_thr, new_gtid);
4311  }
4312 
4313  // add the reserve serialized team, initialized from the team's master thread
4314  {
4315  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4316  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4317  new_thr->th.th_serial_team = serial_team =
4318  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4319 #if OMPT_SUPPORT
4320  ompt_data_none, // root parallel id
4321 #endif
4322 #if OMP_40_ENABLED
4323  proc_bind_default,
4324 #endif
4325  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4326  }
4327  KMP_ASSERT(serial_team);
4328  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4329  // execution (it is unused for now).
4330  serial_team->t.t_threads[0] = new_thr;
4331  KF_TRACE(10,
4332  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4333  new_thr));
4334 
4335  /* setup the thread structures */
4336  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4337 
4338 #if USE_FAST_MEMORY
4339  __kmp_initialize_fast_memory(new_thr);
4340 #endif /* USE_FAST_MEMORY */
4341 
4342 #if KMP_USE_BGET
4343  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4344  __kmp_initialize_bget(new_thr);
4345 #endif
4346 
4347  __kmp_init_random(new_thr); // Initialize random number generator
4348 
4349  /* Initialize these only once when thread is grabbed for a team allocation */
4350  KA_TRACE(20,
4351  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4352  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4353 
4354  int b;
4355  kmp_balign_t *balign = new_thr->th.th_bar;
4356  for (b = 0; b < bs_last_barrier; ++b) {
4357  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4358  balign[b].bb.team = NULL;
4359  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4360  balign[b].bb.use_oncore_barrier = 0;
4361  }
4362 
4363  new_thr->th.th_spin_here = FALSE;
4364  new_thr->th.th_next_waiting = 0;
4365 
4366 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4367  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4368  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4369  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4370  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4371 #endif
4372 
4373  TCW_4(new_thr->th.th_in_pool, FALSE);
4374  new_thr->th.th_active_in_pool = FALSE;
4375  TCW_4(new_thr->th.th_active, TRUE);
4376 
4377  /* adjust the global counters */
4378  __kmp_all_nth++;
4379  __kmp_nth++;
4380 
4381  root->r.r_cg_nthreads++;
4382 
4383  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4384  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4385  if (__kmp_adjust_gtid_mode) {
4386  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4387  if (TCR_4(__kmp_gtid_mode) != 2) {
4388  TCW_4(__kmp_gtid_mode, 2);
4389  }
4390  } else {
4391  if (TCR_4(__kmp_gtid_mode) != 1) {
4392  TCW_4(__kmp_gtid_mode, 1);
4393  }
4394  }
4395  }
4396 
4397 #ifdef KMP_ADJUST_BLOCKTIME
4398  /* Adjust blocktime back to zero if necessary */
4399  /* Middle initialization might not have occurred yet */
4400  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4401  if (__kmp_nth > __kmp_avail_proc) {
4402  __kmp_zero_bt = TRUE;
4403  }
4404  }
4405 #endif /* KMP_ADJUST_BLOCKTIME */
4406 
4407  /* actually fork it and create the new worker thread */
4408  KF_TRACE(
4409  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4410  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4411  KF_TRACE(10,
4412  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4413 
4414  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4415  new_gtid));
4416  KMP_MB();
4417  return new_thr;
4418 }
4419 
4420 /* Reinitialize team for reuse.
4421  The hot team code calls this routine at every fork barrier, so the EPCC
4422  barrier tests are extremely sensitive to changes in it, esp. writes to the team
4423  struct, which cause a cache invalidation in all threads.
4424  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4425 static void __kmp_reinitialize_team(kmp_team_t *team,
4426  kmp_internal_control_t *new_icvs,
4427  ident_t *loc) {
4428  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4429  team->t.t_threads[0], team));
4430  KMP_DEBUG_ASSERT(team && new_icvs);
4431  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4432  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4433 
4434  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4435  // Copy ICVs to the master thread's implicit taskdata
4436  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4437  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4438 
4439  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4440  team->t.t_threads[0], team));
4441 }
4442 
4443 /* Initialize the team data structure.
4444  This assumes the t_threads and t_max_nproc are already set.
4445  Also, we don't touch the arguments */
4446 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4447  kmp_internal_control_t *new_icvs,
4448  ident_t *loc) {
4449  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4450 
4451  /* verify */
4452  KMP_DEBUG_ASSERT(team);
4453  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4454  KMP_DEBUG_ASSERT(team->t.t_threads);
4455  KMP_MB();
4456 
4457  team->t.t_master_tid = 0; /* not needed */
4458  /* team->t.t_master_bar; not needed */
4459  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4460  team->t.t_nproc = new_nproc;
4461 
4462  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4463  team->t.t_next_pool = NULL;
4464  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4465  * up hot team */
4466 
4467  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4468  team->t.t_invoke = NULL; /* not needed */
4469 
4470  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4471  team->t.t_sched.sched = new_icvs->sched.sched;
4472 
4473 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4474  team->t.t_fp_control_saved = FALSE; /* not needed */
4475  team->t.t_x87_fpu_control_word = 0; /* not needed */
4476  team->t.t_mxcsr = 0; /* not needed */
4477 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4478 
4479  team->t.t_construct = 0;
4480 
4481  team->t.t_ordered.dt.t_value = 0;
4482  team->t.t_master_active = FALSE;
4483 
4484  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4485 
4486 #ifdef KMP_DEBUG
4487  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4488 #endif
4489  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4490 
4491  team->t.t_control_stack_top = NULL;
4492 
4493  __kmp_reinitialize_team(team, new_icvs, loc);
4494 
4495  KMP_MB();
4496  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4497 }
4498 
4499 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4500 /* Sets full mask for thread and returns old mask, no changes to structures. */
4501 static void
4502 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4503  if (KMP_AFFINITY_CAPABLE()) {
4504  int status;
4505  if (old_mask != NULL) {
4506  status = __kmp_get_system_affinity(old_mask, TRUE);
4507  int error = errno;
4508  if (status != 0) {
4509  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4510  __kmp_msg_null);
4511  }
4512  }
4513  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4514  }
4515 }
4516 #endif
4517 
4518 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4519 
4520 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4521 // It calculates the worker + master thread's partition based upon the parent
4522 // thread's partition, and binds each worker to a place in its partition.
4523 // The master thread's partition should already include its current binding.
4524 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4525  // Copy the master thread's place partition to the team struct
4526  kmp_info_t *master_th = team->t.t_threads[0];
4527  KMP_DEBUG_ASSERT(master_th != NULL);
4528  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4529  int first_place = master_th->th.th_first_place;
4530  int last_place = master_th->th.th_last_place;
4531  int masters_place = master_th->th.th_current_place;
4532  team->t.t_first_place = first_place;
4533  team->t.t_last_place = last_place;
4534 
4535  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4536  "bound to place %d partition = [%d,%d]\n",
4537  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4538  team->t.t_id, masters_place, first_place, last_place));
4539 
4540  switch (proc_bind) {
4541 
4542  case proc_bind_default:
4543  // serial teams might have the proc_bind policy set to proc_bind_default. It
4544  // doesn't matter, as we don't rebind master thread for any proc_bind policy
4545  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4546  break;
4547 
4548  case proc_bind_master: {
4549  int f;
4550  int n_th = team->t.t_nproc;
4551  for (f = 1; f < n_th; f++) {
4552  kmp_info_t *th = team->t.t_threads[f];
4553  KMP_DEBUG_ASSERT(th != NULL);
4554  th->th.th_first_place = first_place;
4555  th->th.th_last_place = last_place;
4556  th->th.th_new_place = masters_place;
4557 
4558  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4559  "partition = [%d,%d]\n",
4560  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4561  f, masters_place, first_place, last_place));
4562  }
4563  } break;
4564 
4565  case proc_bind_close: {
4566  int f;
4567  int n_th = team->t.t_nproc;
4568  int n_places;
4569  if (first_place <= last_place) {
4570  n_places = last_place - first_place + 1;
4571  } else {
4572  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4573  }
4574  if (n_th <= n_places) {
4575  int place = masters_place;
4576  for (f = 1; f < n_th; f++) {
4577  kmp_info_t *th = team->t.t_threads[f];
4578  KMP_DEBUG_ASSERT(th != NULL);
4579 
4580  if (place == last_place) {
4581  place = first_place;
4582  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4583  place = 0;
4584  } else {
4585  place++;
4586  }
4587  th->th.th_first_place = first_place;
4588  th->th.th_last_place = last_place;
4589  th->th.th_new_place = place;
4590 
4591  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4592  "partition = [%d,%d]\n",
4593  __kmp_gtid_from_thread(team->t.t_threads[f]),
4594  team->t.t_id, f, place, first_place, last_place));
4595  }
4596  } else {
4597  int S, rem, gap, s_count;
4598  S = n_th / n_places;
4599  s_count = 0;
4600  rem = n_th - (S * n_places);
4601  gap = rem > 0 ? n_places / rem : n_places;
4602  int place = masters_place;
4603  int gap_ct = gap;
4604  for (f = 0; f < n_th; f++) {
4605  kmp_info_t *th = team->t.t_threads[f];
4606  KMP_DEBUG_ASSERT(th != NULL);
4607 
4608  th->th.th_first_place = first_place;
4609  th->th.th_last_place = last_place;
4610  th->th.th_new_place = place;
4611  s_count++;
4612 
4613  if ((s_count == S) && rem && (gap_ct == gap)) {
4614  // do nothing, add an extra thread to place on next iteration
4615  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4616  // we added an extra thread to this place; move to next place
4617  if (place == last_place) {
4618  place = first_place;
4619  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4620  place = 0;
4621  } else {
4622  place++;
4623  }
4624  s_count = 0;
4625  gap_ct = 1;
4626  rem--;
4627  } else if (s_count == S) { // place full; don't add extra
4628  if (place == last_place) {
4629  place = first_place;
4630  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4631  place = 0;
4632  } else {
4633  place++;
4634  }
4635  gap_ct++;
4636  s_count = 0;
4637  }
4638 
4639  KA_TRACE(100,
4640  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4641  "partition = [%d,%d]\n",
4642  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4643  th->th.th_new_place, first_place, last_place));
4644  }
4645  KMP_DEBUG_ASSERT(place == masters_place);
4646  }
4647  } break;
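
  // A minimal sketch of the "close" distribution implemented above for the
  // case n_th > n_places: each place receives S = n_th / n_places threads, and
  // the rem = n_th % n_places leftover threads are handed out one per place,
  // to places spaced roughly every `gap` places apart. close_distribution is a
  // hypothetical helper that only reports how many threads land on each place;
  // illustrative only and not compiled into the runtime.
#if 0
#include <cstdio>

static void close_distribution(int n_th, int n_places) {
  int S = n_th / n_places;
  int rem = n_th - S * n_places;
  int gap = rem > 0 ? n_places / rem : n_places;
  int place = 0, s_count = 0, gap_ct = gap, count_on_place = 0;
  for (int f = 0; f < n_th; f++) {
    count_on_place++;
    s_count++;
    if (s_count == S && rem && gap_ct == gap) {
      // keep filling: this place takes one extra thread on the next iteration
    } else if (s_count == S + 1 && rem && gap_ct == gap) {
      printf("place %d gets %d threads\n", place, count_on_place);
      place++; s_count = 0; gap_ct = 1; rem--; count_on_place = 0;
    } else if (s_count == S) {
      printf("place %d gets %d threads\n", place, count_on_place);
      place++; gap_ct++; s_count = 0; count_on_place = 0;
    }
  }
}

int main() {
  close_distribution(10, 4); // S = 2, rem = 2, gap = 2: prints 3, 2, 3, 2
  return 0;
}
#endif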
4648 
4649  case proc_bind_spread: {
4650  int f;
4651  int n_th = team->t.t_nproc;
4652  int n_places;
4653  int thidx;
4654  if (first_place <= last_place) {
4655  n_places = last_place - first_place + 1;
4656  } else {
4657  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4658  }
4659  if (n_th <= n_places) {
4660  int place = -1;
4661 
4662  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4663  int S = n_places / n_th;
4664  int s_count, rem, gap, gap_ct;
4665 
4666  place = masters_place;
4667  rem = n_places - n_th * S;
4668  gap = rem ? n_th / rem : 1;
4669  gap_ct = gap;
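      // Each thread owns a contiguous sub-partition of S places; the rem
      // leftover places are handed out one extra per thread, roughly gap
      // threads apart. For illustration, n_places=10 and n_th=3 give S=3,
      // rem=1, gap=3, so the threads own 4, 3 and 3 places respectively.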
4670  thidx = n_th;
4671  if (update_master_only == 1)
4672  thidx = 1;
4673  for (f = 0; f < thidx; f++) {
4674  kmp_info_t *th = team->t.t_threads[f];
4675  KMP_DEBUG_ASSERT(th != NULL);
4676 
4677  th->th.th_first_place = place;
4678  th->th.th_new_place = place;
4679  s_count = 1;
4680  while (s_count < S) {
4681  if (place == last_place) {
4682  place = first_place;
4683  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4684  place = 0;
4685  } else {
4686  place++;
4687  }
4688  s_count++;
4689  }
4690  if (rem && (gap_ct == gap)) {
4691  if (place == last_place) {
4692  place = first_place;
4693  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4694  place = 0;
4695  } else {
4696  place++;
4697  }
4698  rem--;
4699  gap_ct = 0;
4700  }
4701  th->th.th_last_place = place;
4702  gap_ct++;
4703 
4704  if (place == last_place) {
4705  place = first_place;
4706  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4707  place = 0;
4708  } else {
4709  place++;
4710  }
4711 
4712  KA_TRACE(100,
4713  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4714  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4715  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4716  f, th->th.th_new_place, th->th.th_first_place,
4717  th->th.th_last_place, __kmp_affinity_num_masks));
4718  }
4719  } else {
4720  /* The available computation places form a uniform space, so we can
4721  create T partitions of roughly P/T places each and put each thread
4722  into the first place of its partition. */
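      /* For illustration, n_places=8, n_th=3 and masters_place=0 give a
         spacing of (8+1)/3 = 3.0 and partitions [0,2], [3,5] and [6,7]
         (the last partition is clipped to n_places-1). */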
4723  double current = static_cast<double>(masters_place);
4724  double spacing =
4725  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4726  int first, last;
4727  kmp_info_t *th;
4728 
4729  thidx = n_th + 1;
4730  if (update_master_only == 1)
4731  thidx = 1;
4732  for (f = 0; f < thidx; f++) {
4733  first = static_cast<int>(current);
4734  last = static_cast<int>(current + spacing) - 1;
4735  KMP_DEBUG_ASSERT(last >= first);
4736  if (first >= n_places) {
4737  if (masters_place) {
4738  first -= n_places;
4739  last -= n_places;
4740  if (first == (masters_place + 1)) {
4741  KMP_DEBUG_ASSERT(f == n_th);
4742  first--;
4743  }
4744  if (last == masters_place) {
4745  KMP_DEBUG_ASSERT(f == (n_th - 1));
4746  last--;
4747  }
4748  } else {
4749  KMP_DEBUG_ASSERT(f == n_th);
4750  first = 0;
4751  last = 0;
4752  }
4753  }
4754  if (last >= n_places) {
4755  last = (n_places - 1);
4756  }
4757  place = first;
4758  current += spacing;
4759  if (f < n_th) {
4760  KMP_DEBUG_ASSERT(0 <= first);
4761  KMP_DEBUG_ASSERT(n_places > first);
4762  KMP_DEBUG_ASSERT(0 <= last);
4763  KMP_DEBUG_ASSERT(n_places > last);
4764  KMP_DEBUG_ASSERT(last_place >= first_place);
4765  th = team->t.t_threads[f];
4766  KMP_DEBUG_ASSERT(th);
4767  th->th.th_first_place = first;
4768  th->th.th_new_place = place;
4769  th->th.th_last_place = last;
4770 
4771  KA_TRACE(100,
4772  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4773  "partition = [%d,%d], spacing = %.4f\n",
4774  __kmp_gtid_from_thread(team->t.t_threads[f]),
4775  team->t.t_id, f, th->th.th_new_place,
4776  th->th.th_first_place, th->th.th_last_place, spacing));
4777  }
4778  }
4779  }
4780  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4781  } else {
4782  int S, rem, gap, s_count;
4783  S = n_th / n_places;
4784  s_count = 0;
4785  rem = n_th - (S * n_places);
4786  gap = rem > 0 ? n_places / rem : n_places;
4787  int place = masters_place;
4788  int gap_ct = gap;
4789  thidx = n_th;
4790  if (update_master_only == 1)
4791  thidx = 1;
4792  for (f = 0; f < thidx; f++) {
4793  kmp_info_t *th = team->t.t_threads[f];
4794  KMP_DEBUG_ASSERT(th != NULL);
4795 
4796  th->th.th_first_place = place;
4797  th->th.th_last_place = place;
4798  th->th.th_new_place = place;
4799  s_count++;
4800 
4801  if ((s_count == S) && rem && (gap_ct == gap)) {
4802  // do nothing, add an extra thread to place on next iteration
4803  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4804  // we added an extra thread to this place; move on to next place
4805  if (place == last_place) {
4806  place = first_place;
4807  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4808  place = 0;
4809  } else {
4810  place++;
4811  }
4812  s_count = 0;
4813  gap_ct = 1;
4814  rem--;
4815  } else if (s_count == S) { // place is full; don't add extra thread
4816  if (place == last_place) {
4817  place = first_place;
4818  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4819  place = 0;
4820  } else {
4821  place++;
4822  }
4823  gap_ct++;
4824  s_count = 0;
4825  }
4826 
4827  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4828  "partition = [%d,%d]\n",
4829  __kmp_gtid_from_thread(team->t.t_threads[f]),
4830  team->t.t_id, f, th->th.th_new_place,
4831  th->th.th_first_place, th->th.th_last_place));
4832  }
4833  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4834  }
4835  } break;
4836 
4837  default:
4838  break;
4839  }
4840 
4841  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4842 }
4843 
4844 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4845 
4846 /* allocate a new team data structure to use. take one off of the free pool if
4847  available */
4848 kmp_team_t *
4849 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4850 #if OMPT_SUPPORT
4851  ompt_data_t ompt_parallel_data,
4852 #endif
4853 #if OMP_40_ENABLED
4854  kmp_proc_bind_t new_proc_bind,
4855 #endif
4856  kmp_internal_control_t *new_icvs,
4857  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4858  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4859  int f;
4860  kmp_team_t *team;
4861  int use_hot_team = !root->r.r_active;
4862  int level = 0;
4863 
4864  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4865  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4866  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4867  KMP_MB();
4868 
4869 #if KMP_NESTED_HOT_TEAMS
4870  kmp_hot_team_ptr_t *hot_teams;
4871  if (master) {
4872  team = master->th.th_team;
4873  level = team->t.t_active_level;
4874  if (master->th.th_teams_microtask) { // in teams construct?
4875  if (master->th.th_teams_size.nteams > 1 &&
4876  ( // #teams > 1
4877  team->t.t_pkfn ==
4878  (microtask_t)__kmp_teams_master || // inner fork of the teams
4879  master->th.th_teams_level <
4880  team->t.t_level)) { // or nested parallel inside the teams
4881  ++level; // don't increment if #teams==1 or for the outer fork of the
4882  // teams construct; increment otherwise
4883  }
4884  }
4885  hot_teams = master->th.th_hot_teams;
4886  if (level < __kmp_hot_teams_max_level && hot_teams &&
4887  hot_teams[level]
4888  .hot_team) { // hot team has already been allocated for given level
4889  use_hot_team = 1;
4890  } else {
4891  use_hot_team = 0;
4892  }
4893  }
4894 #endif
4895  // Optimization to use a "hot" team
4896  if (use_hot_team && new_nproc > 1) {
4897  KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4898 #if KMP_NESTED_HOT_TEAMS
4899  team = hot_teams[level].hot_team;
4900 #else
4901  team = root->r.r_hot_team;
4902 #endif
4903 #if KMP_DEBUG
4904  if (__kmp_tasking_mode != tskm_immediate_exec) {
4905  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4906  "task_team[1] = %p before reinit\n",
4907  team->t.t_task_team[0], team->t.t_task_team[1]));
4908  }
4909 #endif
4910 
4911  // Has the number of threads changed?
4912  /* Let's assume the most common case is that the number of threads is
4913  unchanged, and put that case first. */
4914  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4915  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4916  // This case can mean that omp_set_num_threads() was called and the hot
4917  // team size was already reduced, so we check the special flag
4918  if (team->t.t_size_changed == -1) {
4919  team->t.t_size_changed = 1;
4920  } else {
4921  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4922  }
4923 
4924  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4925  kmp_r_sched_t new_sched = new_icvs->sched;
4926  // set master's schedule as new run-time schedule
4927  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4928 
4929  __kmp_reinitialize_team(team, new_icvs,
4930  root->r.r_uber_thread->th.th_ident);
4931 
4932  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4933  team->t.t_threads[0], team));
4934  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4935 
4936 #if OMP_40_ENABLED
4937 #if KMP_AFFINITY_SUPPORTED
4938  if ((team->t.t_size_changed == 0) &&
4939  (team->t.t_proc_bind == new_proc_bind)) {
4940  if (new_proc_bind == proc_bind_spread) {
4941  __kmp_partition_places(
4942  team, 1); // add flag to update only master for spread
4943  }
4944  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4945  "proc_bind = %d, partition = [%d,%d]\n",
4946  team->t.t_id, new_proc_bind, team->t.t_first_place,
4947  team->t.t_last_place));
4948  } else {
4949  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4950  __kmp_partition_places(team);
4951  }
4952 #else
4953  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4954 #endif /* KMP_AFFINITY_SUPPORTED */
4955 #endif /* OMP_40_ENABLED */
4956  } else if (team->t.t_nproc > new_nproc) {
4957  KA_TRACE(20,
4958  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4959  new_nproc));
4960 
4961  team->t.t_size_changed = 1;
4962 #if KMP_NESTED_HOT_TEAMS
4963  if (__kmp_hot_teams_mode == 0) {
4964  // AC: the saved number of threads should match the team's value in this
4965  // mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
4966  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4967  hot_teams[level].hot_team_nth = new_nproc;
4968 #endif // KMP_NESTED_HOT_TEAMS
4969  /* release the extra threads we don't need any more */
4970  for (f = new_nproc; f < team->t.t_nproc; f++) {
4971  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4972  if (__kmp_tasking_mode != tskm_immediate_exec) {
4973  // When decreasing team size, threads no longer in the team should
4974  // unref task team.
4975  team->t.t_threads[f]->th.th_task_team = NULL;
4976  }
4977  __kmp_free_thread(team->t.t_threads[f]);
4978  team->t.t_threads[f] = NULL;
4979  }
4980 #if KMP_NESTED_HOT_TEAMS
4981  } // (__kmp_hot_teams_mode == 0)
4982  else {
4983  // When keeping extra threads in team, switch threads to wait on own
4984  // b_go flag
4985  for (f = new_nproc; f < team->t.t_nproc; ++f) {
4986  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4987  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4988  for (int b = 0; b < bs_last_barrier; ++b) {
4989  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4990  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4991  }
4992  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4993  }
4994  }
4995  }
4996 #endif // KMP_NESTED_HOT_TEAMS
4997  team->t.t_nproc = new_nproc;
4998  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4999  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5000  __kmp_reinitialize_team(team, new_icvs,
5001  root->r.r_uber_thread->th.th_ident);
5002 
5003  /* update the remaining threads */
5004  for (f = 0; f < new_nproc; ++f) {
5005  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5006  }
5007  // restore the current task state of the master thread: should be the
5008  // implicit task
5009  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5010  team->t.t_threads[0], team));
5011 
5012  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5013 
5014 #ifdef KMP_DEBUG
5015  for (f = 0; f < team->t.t_nproc; f++) {
5016  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5017  team->t.t_threads[f]->th.th_team_nproc ==
5018  team->t.t_nproc);
5019  }
5020 #endif
5021 
5022 #if OMP_40_ENABLED
5023  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5024 #if KMP_AFFINITY_SUPPORTED
5025  __kmp_partition_places(team);
5026 #endif
5027 #endif
5028  } else { // team->t.t_nproc < new_nproc
5029 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5030  kmp_affin_mask_t *old_mask;
5031  if (KMP_AFFINITY_CAPABLE()) {
5032  KMP_CPU_ALLOC(old_mask);
5033  }
5034 #endif
5035 
5036  KA_TRACE(20,
5037  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5038  new_nproc));
5039 
5040  team->t.t_size_changed = 1;
5041 
5042 #if KMP_NESTED_HOT_TEAMS
5043  int avail_threads = hot_teams[level].hot_team_nth;
5044  if (new_nproc < avail_threads)
5045  avail_threads = new_nproc;
5046  kmp_info_t **other_threads = team->t.t_threads;
5047  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5048  // Adjust barrier data of reserved threads (if any) of the team
5049  // Other data will be set in __kmp_initialize_info() below.
5050  int b;
5051  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5052  for (b = 0; b < bs_last_barrier; ++b) {
5053  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5054  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5055 #if USE_DEBUGGER
5056  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5057 #endif
5058  }
5059  }
5060  if (hot_teams[level].hot_team_nth >= new_nproc) {
5061  // we have all needed threads in reserve, no need to allocate any;
5062  // this is only possible in mode 1, as mode 0 cannot have reserved threads
5063  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5064  team->t.t_nproc = new_nproc; // just get reserved threads involved
5065  } else {
5066  // we may have some threads in reserve, but not enough
5067  team->t.t_nproc =
5068  hot_teams[level]
5069  .hot_team_nth; // get reserved threads involved if any
5070  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5071 #endif // KMP_NESTED_HOT_TEAMS
5072  if (team->t.t_max_nproc < new_nproc) {
5073  /* reallocate larger arrays */
5074  __kmp_reallocate_team_arrays(team, new_nproc);
5075  __kmp_reinitialize_team(team, new_icvs, NULL);
5076  }
5077 
5078 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5079  /* Temporarily set full mask for master thread before creation of
5080  workers. The reason is that workers inherit their affinity from the
5081  master, so if a lot of workers are created on a single core quickly, they
5082  don't get a chance to set their own affinity for a long time. */
5083  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5084 #endif
5085 
5086  /* allocate new threads for the hot team */
5087  for (f = team->t.t_nproc; f < new_nproc; f++) {
5088  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5089  KMP_DEBUG_ASSERT(new_worker);
5090  team->t.t_threads[f] = new_worker;
5091 
5092  KA_TRACE(20,
5093  ("__kmp_allocate_team: team %d init T#%d arrived: "
5094  "join=%llu, plain=%llu\n",
5095  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5096  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5097  team->t.t_bar[bs_plain_barrier].b_arrived));
5098 
5099  { // Initialize barrier data for new threads.
5100  int b;
5101  kmp_balign_t *balign = new_worker->th.th_bar;
5102  for (b = 0; b < bs_last_barrier; ++b) {
5103  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5104  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5105  KMP_BARRIER_PARENT_FLAG);
5106 #if USE_DEBUGGER
5107  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5108 #endif
5109  }
5110  }
5111  }
5112 
5113 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5114  if (KMP_AFFINITY_CAPABLE()) {
5115  /* Restore initial master thread's affinity mask */
5116  __kmp_set_system_affinity(old_mask, TRUE);
5117  KMP_CPU_FREE(old_mask);
5118  }
5119 #endif
5120 #if KMP_NESTED_HOT_TEAMS
5121  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5122 #endif // KMP_NESTED_HOT_TEAMS
5123  /* make sure everyone is synchronized */
5124  int old_nproc = team->t.t_nproc; // save old value and use to update only
5125  // new threads below
5126  __kmp_initialize_team(team, new_nproc, new_icvs,
5127  root->r.r_uber_thread->th.th_ident);
5128 
5129  /* reinitialize the threads */
5130  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5131  for (f = 0; f < team->t.t_nproc; ++f)
5132  __kmp_initialize_info(team->t.t_threads[f], team, f,
5133  __kmp_gtid_from_tid(f, team));
5134  if (level) { // set th_task_state for new threads in nested hot team
5135  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5136  // only need to set the th_task_state for the new threads. th_task_state
5137  // for master thread will not be accurate until after this in
5138  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5139  // correct value.
5140  for (f = old_nproc; f < team->t.t_nproc; ++f)
5141  team->t.t_threads[f]->th.th_task_state =
5142  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5143  } else { // set th_task_state for new threads in non-nested hot team
5144  int old_state =
5145  team->t.t_threads[0]->th.th_task_state; // copy master's state
5146  for (f = old_nproc; f < team->t.t_nproc; ++f)
5147  team->t.t_threads[f]->th.th_task_state = old_state;
5148  }
5149 
5150 #ifdef KMP_DEBUG
5151  for (f = 0; f < team->t.t_nproc; ++f) {
5152  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5153  team->t.t_threads[f]->th.th_team_nproc ==
5154  team->t.t_nproc);
5155  }
5156 #endif
5157 
5158 #if OMP_40_ENABLED
5159  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5160 #if KMP_AFFINITY_SUPPORTED
5161  __kmp_partition_places(team);
5162 #endif
5163 #endif
5164  } // Check changes in number of threads
5165 
5166 #if OMP_40_ENABLED
5167  kmp_info_t *master = team->t.t_threads[0];
5168  if (master->th.th_teams_microtask) {
5169  for (f = 1; f < new_nproc; ++f) {
5170  // propagate teams construct specific info to workers
5171  kmp_info_t *thr = team->t.t_threads[f];
5172  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5173  thr->th.th_teams_level = master->th.th_teams_level;
5174  thr->th.th_teams_size = master->th.th_teams_size;
5175  }
5176  }
5177 #endif /* OMP_40_ENABLED */
5178 #if KMP_NESTED_HOT_TEAMS
5179  if (level) {
5180  // Sync barrier state for nested hot teams, not needed for outermost hot
5181  // team.
5182  for (f = 1; f < new_nproc; ++f) {
5183  kmp_info_t *thr = team->t.t_threads[f];
5184  int b;
5185  kmp_balign_t *balign = thr->th.th_bar;
5186  for (b = 0; b < bs_last_barrier; ++b) {
5187  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5188  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5189 #if USE_DEBUGGER
5190  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5191 #endif
5192  }
5193  }
5194  }
5195 #endif // KMP_NESTED_HOT_TEAMS
5196 
5197  /* reallocate space for arguments if necessary */
5198  __kmp_alloc_argv_entries(argc, team, TRUE);
5199  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5200  // The hot team re-uses the previous task team,
5201  // if untouched during the previous release->gather phase.
5202 
5203  KF_TRACE(10, (" hot_team = %p\n", team));
5204 
5205 #if KMP_DEBUG
5206  if (__kmp_tasking_mode != tskm_immediate_exec) {
5207  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5208  "task_team[1] = %p after reinit\n",
5209  team->t.t_task_team[0], team->t.t_task_team[1]));
5210  }
5211 #endif
5212 
5213 #if OMPT_SUPPORT
5214  __ompt_team_assign_id(team, ompt_parallel_data);
5215 #endif
5216 
5217  KMP_MB();
5218 
5219  return team;
5220  }
5221 
5222  /* next, let's try to take one from the team pool */
5223  KMP_MB();
5224  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5225  /* TODO: consider resizing undersized teams instead of reaping them, now
5226  that we have a resizing mechanism */
5227  if (team->t.t_max_nproc >= max_nproc) {
5228  /* take this team from the team pool */
5229  __kmp_team_pool = team->t.t_next_pool;
5230 
5231  /* setup the team for fresh use */
5232  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5233 
5234  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5235  "task_team[1] %p to NULL\n",
5236  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5237  team->t.t_task_team[0] = NULL;
5238  team->t.t_task_team[1] = NULL;
5239 
5240  /* reallocate space for arguments if necessary */
5241  __kmp_alloc_argv_entries(argc, team, TRUE);
5242  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5243 
5244  KA_TRACE(
5245  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5246  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5247  { // Initialize barrier data.
5248  int b;
5249  for (b = 0; b < bs_last_barrier; ++b) {
5250  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5251 #if USE_DEBUGGER
5252  team->t.t_bar[b].b_master_arrived = 0;
5253  team->t.t_bar[b].b_team_arrived = 0;
5254 #endif
5255  }
5256  }
5257 
5258 #if OMP_40_ENABLED
5259  team->t.t_proc_bind = new_proc_bind;
5260 #endif
5261 
5262  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5263  team->t.t_id));
5264 
5265 #if OMPT_SUPPORT
5266  __ompt_team_assign_id(team, ompt_parallel_data);
5267 #endif
5268 
5269  KMP_MB();
5270 
5271  return team;
5272  }
5273 
5274  /* reap team if it is too small, then loop back and check the next one */
5275  // not sure if this is wise, but it will be redone during the hot-teams
5276  // rewrite.
5277  /* TODO: Use technique to find the right size hot-team, don't reap them */
5278  team = __kmp_reap_team(team);
5279  __kmp_team_pool = team;
5280  }
5281 
5282  /* nothing available in the pool, no matter, make a new team! */
5283  KMP_MB();
5284  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5285 
5286  /* and set it up */
5287  team->t.t_max_nproc = max_nproc;
5288  /* NOTE well, for some reason allocating one big buffer and dividing it up
5289  seems to really hurt performance a lot on the P4, so let's not use this */
5290  __kmp_allocate_team_arrays(team, max_nproc);
5291 
5292  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5293  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5294 
5295  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5296  "%p to NULL\n",
5297  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5298  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5299  // memory, no need to duplicate
5300  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5301  // memory, no need to duplicate
5302 
5303  if (__kmp_storage_map) {
5304  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5305  }
5306 
5307  /* allocate space for arguments */
5308  __kmp_alloc_argv_entries(argc, team, FALSE);
5309  team->t.t_argc = argc;
5310 
5311  KA_TRACE(20,
5312  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5313  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5314  { // Initialize barrier data.
5315  int b;
5316  for (b = 0; b < bs_last_barrier; ++b) {
5317  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5318 #if USE_DEBUGGER
5319  team->t.t_bar[b].b_master_arrived = 0;
5320  team->t.t_bar[b].b_team_arrived = 0;
5321 #endif
5322  }
5323  }
5324 
5325 #if OMP_40_ENABLED
5326  team->t.t_proc_bind = new_proc_bind;
5327 #endif
5328 
5329 #if OMPT_SUPPORT
5330  __ompt_team_assign_id(team, ompt_parallel_data);
5331  team->t.ompt_serialized_team_info = NULL;
5332 #endif
5333 
5334  KMP_MB();
5335 
5336  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5337  team->t.t_id));
5338 
5339  return team;
5340 }
5341 
5342 /* TODO implement hot-teams at all levels */
5343 /* TODO implement lazy thread release on demand (disband request) */
5344 
5345 /* free the team. return it to the team pool. release all the threads
5346  * associated with it */
5347 void __kmp_free_team(kmp_root_t *root,
5348  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5349  int f;
5350  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5351  team->t.t_id));
5352 
5353  /* verify state */
5354  KMP_DEBUG_ASSERT(root);
5355  KMP_DEBUG_ASSERT(team);
5356  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5357  KMP_DEBUG_ASSERT(team->t.t_threads);
5358 
5359  int use_hot_team = team == root->r.r_hot_team;
5360 #if KMP_NESTED_HOT_TEAMS
5361  int level;
5362  kmp_hot_team_ptr_t *hot_teams;
5363  if (master) {
5364  level = team->t.t_active_level - 1;
5365  if (master->th.th_teams_microtask) { // in teams construct?
5366  if (master->th.th_teams_size.nteams > 1) {
5367  ++level; // level was not increased in teams construct for
5368  // team_of_masters
5369  }
5370  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5371  master->th.th_teams_level == team->t.t_level) {
5372  ++level; // level was not increased in teams construct for
5373  // team_of_workers before the parallel
5374  } // team->t.t_level will be increased inside parallel
5375  }
5376  hot_teams = master->th.th_hot_teams;
5377  if (level < __kmp_hot_teams_max_level) {
5378  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5379  use_hot_team = 1;
5380  }
5381  }
5382 #endif // KMP_NESTED_HOT_TEAMS
5383 
5384  /* team is done working */
5385  TCW_SYNC_PTR(team->t.t_pkfn,
5386  NULL); // Important for Debugging Support Library.
5387  team->t.t_copyin_counter = 0; // init counter for possible reuse
5388  // Do not reset pointer to parent team to NULL for hot teams.
5389 
5390  /* if we are non-hot team, release our threads */
5391  if (!use_hot_team) {
5392  if (__kmp_tasking_mode != tskm_immediate_exec) {
5393  // Wait for threads to reach reapable state
5394  for (f = 1; f < team->t.t_nproc; ++f) {
5395  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5396  kmp_info_t *th = team->t.t_threads[f];
5397  volatile kmp_uint32 *state = &th->th.th_reap_state;
5398  while (*state != KMP_SAFE_TO_REAP) {
5399 #if KMP_OS_WINDOWS
5400  // On Windows a thread can be killed at any time, check this
5401  DWORD ecode;
5402  if (!__kmp_is_thread_alive(th, &ecode)) {
5403  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5404  break;
5405  }
5406 #endif
5407  // first check if thread is sleeping
5408  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5409  if (fl.is_sleeping())
5410  fl.resume(__kmp_gtid_from_thread(th));
5411  KMP_CPU_PAUSE();
5412  }
5413  }
5414 
5415  // Delete task teams
5416  int tt_idx;
5417  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5418  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5419  if (task_team != NULL) {
5420  for (f = 0; f < team->t.t_nproc;
5421  ++f) { // Have all threads unref task teams
5422  team->t.t_threads[f]->th.th_task_team = NULL;
5423  }
5424  KA_TRACE(
5425  20,
5426  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5427  __kmp_get_gtid(), task_team, team->t.t_id));
5428 #if KMP_NESTED_HOT_TEAMS
5429  __kmp_free_task_team(master, task_team);
5430 #endif
5431  team->t.t_task_team[tt_idx] = NULL;
5432  }
5433  }
5434  }
5435 
5436  // Reset pointer to parent team only for non-hot teams.
5437  team->t.t_parent = NULL;
5438  team->t.t_level = 0;
5439  team->t.t_active_level = 0;
5440 
5441  /* free the worker threads */
5442  for (f = 1; f < team->t.t_nproc; ++f) {
5443  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5444  __kmp_free_thread(team->t.t_threads[f]);
5445  team->t.t_threads[f] = NULL;
5446  }
5447 
5448  /* put the team back in the team pool */
5449  /* TODO limit size of team pool, call reap_team if pool too large */
5450  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5451  __kmp_team_pool = (volatile kmp_team_t *)team;
5452  }
5453 
5454  KMP_MB();
5455 }
5456 
5457 /* reap the team. destroy it, reclaim all its resources and free its memory */
5458 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5459  kmp_team_t *next_pool = team->t.t_next_pool;
5460 
5461  KMP_DEBUG_ASSERT(team);
5462  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5463  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5464  KMP_DEBUG_ASSERT(team->t.t_threads);
5465  KMP_DEBUG_ASSERT(team->t.t_argv);
5466 
5467  /* TODO clean the threads that are a part of this? */
5468 
5469  /* free stuff */
5470  __kmp_free_team_arrays(team);
5471  if (team->t.t_argv != &team->t.t_inline_argv[0])
5472  __kmp_free((void *)team->t.t_argv);
5473  __kmp_free(team);
5474 
5475  KMP_MB();
5476  return next_pool;
5477 }
5478 
5479 // Free the thread. Don't reap it, just place it on the pool of available
5480 // threads.
5481 //
5482 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5483 // binding for the affinity mechanism to be useful.
5484 //
5485 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5486 // However, we want to avoid a potential performance problem by always
5487 // scanning through the list to find the correct point at which to insert
5488 // the thread (potential N**2 behavior). To do this we keep track of the
5489 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5490 // With single-level parallelism, threads will always be added to the tail
5491 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5492 // parallelism, all bets are off and we may need to scan through the entire
5493 // free list.
5494 //
5495 // This change also has a potentially large performance benefit, for some
5496 // applications. Previously, as threads were freed from the hot team, they
5497 // would be placed back on the free list in inverse order. If the hot team
5498 // grew back to its original size, then the freed threads would be placed
5499 // back on the hot team in reverse order. This could cause bad cache
5500 // locality problems on programs where the size of the hot team regularly
5501 // grew and shrunk.
5502 //
5503 // Now, for single-level parallelism, the OMP tid is always == gtid.
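      // For illustration, if the pool holds gtids {2, 3, 5, 7} and gtid 6 is
      // freed, the scan starts at the saved insert point (or at the head, if
      // that point is already past gtid 6) and links the thread in between
      // gtids 5 and 7, keeping the pool sorted.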
5504 void __kmp_free_thread(kmp_info_t *this_th) {
5505  int gtid;
5506  kmp_info_t **scan;
5507  kmp_root_t *root = this_th->th.th_root;
5508 
5509  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5510  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5511 
5512  KMP_DEBUG_ASSERT(this_th);
5513 
5514  // When moving a thread to the pool, switch it to wait on its own b_go flag
5515  // with an uninitialized (NULL) team.
5516  int b;
5517  kmp_balign_t *balign = this_th->th.th_bar;
5518  for (b = 0; b < bs_last_barrier; ++b) {
5519  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5520  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5521  balign[b].bb.team = NULL;
5522  balign[b].bb.leaf_kids = 0;
5523  }
5524  this_th->th.th_task_state = 0;
5525 
5526  /* put thread back on the free pool */
5527  TCW_PTR(this_th->th.th_team, NULL);
5528  TCW_PTR(this_th->th.th_root, NULL);
5529  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5530 
5531  // If the __kmp_thread_pool_insert_pt is already past the new insert
5532  // point, then we need to re-scan the entire list.
5533  gtid = this_th->th.th_info.ds.ds_gtid;
5534  if (__kmp_thread_pool_insert_pt != NULL) {
5535  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5536  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5537  __kmp_thread_pool_insert_pt = NULL;
5538  }
5539  }
5540 
5541  // Scan down the list to find the place to insert the thread.
5542  // scan is the address of a link in the list, possibly the address of
5543  // __kmp_thread_pool itself.
5544  //
5545  // In the absence of nested parallelism, the for loop will have 0 iterations.
5546  if (__kmp_thread_pool_insert_pt != NULL) {
5547  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5548  } else {
5549  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5550  }
5551  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5552  scan = &((*scan)->th.th_next_pool))
5553  ;
5554 
5555  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5556  // to its address.
5557  TCW_PTR(this_th->th.th_next_pool, *scan);
5558  __kmp_thread_pool_insert_pt = *scan = this_th;
5559  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5560  (this_th->th.th_info.ds.ds_gtid <
5561  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5562  TCW_4(this_th->th.th_in_pool, TRUE);
5563  __kmp_thread_pool_nth++;
5564 
5565  TCW_4(__kmp_nth, __kmp_nth - 1);
5566  root->r.r_cg_nthreads--;
5567 
5568 #ifdef KMP_ADJUST_BLOCKTIME
5569  /* Adjust blocktime back to user setting or default if necessary */
5570  /* Middle initialization might never have occurred */
5571  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5572  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5573  if (__kmp_nth <= __kmp_avail_proc) {
5574  __kmp_zero_bt = FALSE;
5575  }
5576  }
5577 #endif /* KMP_ADJUST_BLOCKTIME */
5578 
5579  KMP_MB();
5580 }
5581 
5582 /* ------------------------------------------------------------------------ */
5583 
5584 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5585  int gtid = this_thr->th.th_info.ds.ds_gtid;
5586  /* void *stack_data;*/
5587  kmp_team_t *(*volatile pteam);
5588 
5589  KMP_MB();
5590  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5591 
5592  if (__kmp_env_consistency_check) {
5593  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5594  }
5595 
5596 #if OMPT_SUPPORT
5597  ompt_data_t *thread_data;
5598  if (ompt_enabled.enabled) {
5599  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5600  thread_data->ptr = NULL;
5601 
5602  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5603  this_thr->th.ompt_thread_info.wait_id = 0;
5604  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5605  if (ompt_enabled.ompt_callback_thread_begin) {
5606  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5607  ompt_thread_worker, thread_data);
5608  }
5609  }
5610 #endif
5611 
5612 #if OMPT_SUPPORT
5613  if (ompt_enabled.enabled) {
5614  this_thr->th.ompt_thread_info.state = omp_state_idle;
5615  }
5616 #endif
5617  /* This is the place where threads wait for work */
5618  while (!TCR_4(__kmp_global.g.g_done)) {
5619  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5620  KMP_MB();
5621 
5622  /* wait for work to do */
5623  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5624 
5625  /* No tid yet since not part of a team */
5626  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5627 
5628 #if OMPT_SUPPORT
5629  if (ompt_enabled.enabled) {
5630  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5631  }
5632 #endif
5633 
5634  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5635 
5636  /* have we been allocated? */
5637  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5638  /* we were just woken up, so run our new task */
5639  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5640  int rc;
5641  KA_TRACE(20,
5642  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5643  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5644  (*pteam)->t.t_pkfn));
5645 
5646  updateHWFPControl(*pteam);
5647 
5648 #if OMPT_SUPPORT
5649  if (ompt_enabled.enabled) {
5650  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5651  }
5652 #endif
5653 
5654  {
5655  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5656  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5657  rc = (*pteam)->t.t_invoke(gtid);
5658  }
5659  KMP_ASSERT(rc);
5660 
5661  KMP_MB();
5662  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5663  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5664  (*pteam)->t.t_pkfn));
5665  }
5666 #if OMPT_SUPPORT
5667  if (ompt_enabled.enabled) {
5668  /* no frame set while outside task */
5669  __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5670 
5671  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5672  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
5673  }
5674 #endif
5675  /* join barrier after parallel region */
5676  __kmp_join_barrier(gtid);
5677  }
5678  }
5679  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5680 
5681 #if OMPT_SUPPORT
5682  if (ompt_enabled.ompt_callback_thread_end) {
5683  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5684  }
5685 #endif
5686 
5687  this_thr->th.th_task_team = NULL;
5688  /* run the destructors for the threadprivate data for this thread */
5689  __kmp_common_destroy_gtid(gtid);
5690 
5691  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5692  KMP_MB();
5693  return this_thr;
5694 }
5695 
5696 /* ------------------------------------------------------------------------ */
5697 
5698 void __kmp_internal_end_dest(void *specific_gtid) {
5699 #if KMP_COMPILER_ICC
5700 #pragma warning(push)
5701 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5702 // significant bits
5703 #endif
5704  // Make sure no significant bits are lost
5705  int gtid = (kmp_intptr_t)specific_gtid - 1;
5706 #if KMP_COMPILER_ICC
5707 #pragma warning(pop)
5708 #endif
5709 
5710  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5711  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5712  * this is because 0 is reserved for the nothing-stored case */
5713 
5714  /* josh: One reason for setting the gtid specific data even when it is being
5715  destroyed by pthread is to allow gtid lookup through thread specific data
5716  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5717  that gets executed in the call to __kmp_internal_end_thread, actually
5718  gets the gtid through the thread specific data. Setting it here seems
5719  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5720  to run smoothly.
5721  todo: get rid of this after we remove the dependence on
5722  __kmp_gtid_get_specific */
5723  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5724  __kmp_gtid_set_specific(gtid);
5725 #ifdef KMP_TDATA_GTID
5726  __kmp_gtid = gtid;
5727 #endif
5728  __kmp_internal_end_thread(gtid);
5729 }
5730 
5731 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5732 
5733 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases
5734 // destructors work perfectly, but in real libomp.so I have no evidence it is
5735 // ever called. However, -fini linker option in makefile.mk works fine.
5736 
5737 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5738  __kmp_internal_end_atexit();
5739 }
5740 
5741 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5742 
5743 #endif
5744 
5745 /* [Windows] josh: when the atexit handler is called, there may still be more
5746  than one thread alive */
5747 void __kmp_internal_end_atexit(void) {
5748  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5749  /* [Windows]
5750  josh: ideally, we want to completely shutdown the library in this atexit
5751  handler, but stat code that depends on thread specific data for gtid fails
5752  because that data becomes unavailable at some point during the shutdown, so
5753  we call __kmp_internal_end_thread instead. We should eventually remove the
5754  dependency on __kmp_get_specific_gtid in the stat code and use
5755  __kmp_internal_end_library to cleanly shutdown the library.
5756 
5757  // TODO: Can some of this comment about GVS be removed?
5758  I suspect that the offending stat code is executed when the calling thread
5759  tries to clean up a dead root thread's data structures, resulting in GVS
5760  code trying to close the GVS structures for that thread, but since the stat
5761  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5762  the calling thread is cleaning up itself instead of another thread, it gets
5763  confused. This happens because allowing a thread to unregister and cleanup
5764  another thread is a recent modification for addressing an issue.
5765  Based on the current design (20050722), a thread may end up
5766  trying to unregister another thread only if thread death does not trigger
5767  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5768  thread specific data destructor function to detect thread death. For
5769  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5770  is nothing. Thus, the workaround is applicable only for Windows static
5771  stat library. */
5772  __kmp_internal_end_library(-1);
5773 #if KMP_OS_WINDOWS
5774  __kmp_close_console();
5775 #endif
5776 }
5777 
5778 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5779  // It is assumed __kmp_forkjoin_lock is acquired.
5780 
5781  int gtid;
5782 
5783  KMP_DEBUG_ASSERT(thread != NULL);
5784 
5785  gtid = thread->th.th_info.ds.ds_gtid;
5786 
5787  if (!is_root) {
5788 
5789  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5790  /* Assume the threads are at the fork barrier here */
5791  KA_TRACE(
5792  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5793  gtid));
5794  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5795  * (GEH) */
5796  ANNOTATE_HAPPENS_BEFORE(thread);
5797  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5798  __kmp_release_64(&flag);
5799  }
5800 
5801  // Terminate OS thread.
5802  __kmp_reap_worker(thread);
5803 
5804  // The thread was killed asynchronously. If it was actively
5805  // spinning in the thread pool, decrement the global count.
5806  //
5807  // There is a small timing hole here - if the worker thread was just waking
5808  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5809  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5810  // the global counter might not get updated.
5811  //
5812  // Currently, this can only happen as the library is unloaded,
5813  // so there are no harmful side effects.
5814  if (thread->th.th_active_in_pool) {
5815  thread->th.th_active_in_pool = FALSE;
5816  KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5817  KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5818  }
5819 
5820  // Decrement # of [worker] threads in the pool.
5821  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5822  --__kmp_thread_pool_nth;
5823  }
5824 
5825  __kmp_free_implicit_task(thread);
5826 
5827 // Free the fast memory for tasking
5828 #if USE_FAST_MEMORY
5829  __kmp_free_fast_memory(thread);
5830 #endif /* USE_FAST_MEMORY */
5831 
5832  __kmp_suspend_uninitialize_thread(thread);
5833 
5834  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5835  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5836 
5837  --__kmp_all_nth;
5838 // __kmp_nth was decremented when thread is added to the pool.
5839 
5840 #ifdef KMP_ADJUST_BLOCKTIME
5841  /* Adjust blocktime back to user setting or default if necessary */
5842  /* Middle initialization might never have occurred */
5843  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5844  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5845  if (__kmp_nth <= __kmp_avail_proc) {
5846  __kmp_zero_bt = FALSE;
5847  }
5848  }
5849 #endif /* KMP_ADJUST_BLOCKTIME */
5850 
5851  /* free the memory being used */
5852  if (__kmp_env_consistency_check) {
5853  if (thread->th.th_cons) {
5854  __kmp_free_cons_stack(thread->th.th_cons);
5855  thread->th.th_cons = NULL;
5856  }
5857  }
5858 
5859  if (thread->th.th_pri_common != NULL) {
5860  __kmp_free(thread->th.th_pri_common);
5861  thread->th.th_pri_common = NULL;
5862  }
5863 
5864  if (thread->th.th_task_state_memo_stack != NULL) {
5865  __kmp_free(thread->th.th_task_state_memo_stack);
5866  thread->th.th_task_state_memo_stack = NULL;
5867  }
5868 
5869 #if KMP_USE_BGET
5870  if (thread->th.th_local.bget_data != NULL) {
5871  __kmp_finalize_bget(thread);
5872  }
5873 #endif
5874 
5875 #if KMP_AFFINITY_SUPPORTED
5876  if (thread->th.th_affin_mask != NULL) {
5877  KMP_CPU_FREE(thread->th.th_affin_mask);
5878  thread->th.th_affin_mask = NULL;
5879  }
5880 #endif /* KMP_AFFINITY_SUPPORTED */
5881 
5882  __kmp_reap_team(thread->th.th_serial_team);
5883  thread->th.th_serial_team = NULL;
5884  __kmp_free(thread);
5885 
5886  KMP_MB();
5887 
5888 } // __kmp_reap_thread
5889 
5890 static void __kmp_internal_end(void) {
5891  int i;
5892 
5893  /* First, unregister the library */
5894  __kmp_unregister_library();
5895 
5896 #if KMP_OS_WINDOWS
5897  /* In Win static library, we can't tell when a root actually dies, so we
5898  reclaim the data structures for any root threads that have died but not
5899  unregistered themselves, in order to shut down cleanly.
5900  In Win dynamic library we also can't tell when a thread dies. */
5901  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5902 // dead roots
5903 #endif
5904 
5905  for (i = 0; i < __kmp_threads_capacity; i++)
5906  if (__kmp_root[i])
5907  if (__kmp_root[i]->r.r_active)
5908  break;
5909  KMP_MB(); /* Flush all pending memory write invalidates. */
5910  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5911 
5912  if (i < __kmp_threads_capacity) {
5913 #if KMP_USE_MONITOR
5914  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5915  KMP_MB(); /* Flush all pending memory write invalidates. */
5916 
5917  // Need to check that monitor was initialized before reaping it. If we are
5918  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5919  // __kmp_monitor will appear to contain valid data, but it is only valid in
5920  // the parent process, not the child.
5921  // New behavior (201008): instead of keying off of the flag
5922  // __kmp_init_parallel, the monitor thread creation is keyed off
5923  // of the new flag __kmp_init_monitor.
5924  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5925  if (TCR_4(__kmp_init_monitor)) {
5926  __kmp_reap_monitor(&__kmp_monitor);
5927  TCW_4(__kmp_init_monitor, 0);
5928  }
5929  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5930  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5931 #endif // KMP_USE_MONITOR
5932  } else {
5933 /* TODO move this to cleanup code */
5934 #ifdef KMP_DEBUG
5935  /* make sure that everything has properly ended */
5936  for (i = 0; i < __kmp_threads_capacity; i++) {
5937  if (__kmp_root[i]) {
5938  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
5939  // there can be uber threads alive here
5940  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5941  }
5942  }
5943 #endif
5944 
5945  KMP_MB();
5946 
5947  // Reap the worker threads.
5948  // This is valid for now, but be careful if threads are reaped sooner.
5949  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
5950  // Get the next thread from the pool.
5951  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5952  __kmp_thread_pool = thread->th.th_next_pool;
5953  // Reap it.
5954  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5955  thread->th.th_next_pool = NULL;
5956  thread->th.th_in_pool = FALSE;
5957  __kmp_reap_thread(thread, 0);
5958  }
5959  __kmp_thread_pool_insert_pt = NULL;
5960 
5961  // Reap teams.
5962  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
5963  // Get the next team from the pool.
5964  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5965  __kmp_team_pool = team->t.t_next_pool;
5966  // Reap it.
5967  team->t.t_next_pool = NULL;
5968  __kmp_reap_team(team);
5969  }
5970 
5971  __kmp_reap_task_teams();
5972 
5973  for (i = 0; i < __kmp_threads_capacity; ++i) {
5974  // TBD: Add some checking...
5975  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5976  }
5977 
5978  /* Make sure all threadprivate destructors get run by joining with all
5979  worker threads before resetting this flag */
5980  TCW_SYNC_4(__kmp_init_common, FALSE);
5981 
5982  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5983  KMP_MB();
5984 
5985 #if KMP_USE_MONITOR
5986  // See note above: One of the possible fixes for CQ138434 / CQ140126
5987  //
5988  // FIXME: push both code fragments down and CSE them?
5989  // push them into __kmp_cleanup() ?
5990  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5991  if (TCR_4(__kmp_init_monitor)) {
5992  __kmp_reap_monitor(&__kmp_monitor);
5993  TCW_4(__kmp_init_monitor, 0);
5994  }
5995  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5996  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5997 #endif
5998  } /* else !__kmp_global.t_active */
5999  TCW_4(__kmp_init_gtid, FALSE);
6000  KMP_MB(); /* Flush all pending memory write invalidates. */
6001 
6002  __kmp_cleanup();
6003 #if OMPT_SUPPORT
6004  ompt_fini();
6005 #endif
6006 }
6007 
6008 void __kmp_internal_end_library(int gtid_req) {
6009  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6010  /* this shouldn't be a race condition because __kmp_internal_end() is the
6011  only place to clear __kmp_serial_init */
6012  /* we'll check this later too, after we get the lock */
6013  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6014  // redundant, because the next check will work in any case.
6015  if (__kmp_global.g.g_abort) {
6016  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6017  /* TODO abort? */
6018  return;
6019  }
6020  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6021  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6022  return;
6023  }
6024 
6025  KMP_MB(); /* Flush all pending memory write invalidates. */
6026 
6027  /* find out who we are and what we should do */
6028  {
6029  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6030  KA_TRACE(
6031  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6032  if (gtid == KMP_GTID_SHUTDOWN) {
6033  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6034  "already shutdown\n"));
6035  return;
6036  } else if (gtid == KMP_GTID_MONITOR) {
6037  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6038  "registered, or system shutdown\n"));
6039  return;
6040  } else if (gtid == KMP_GTID_DNE) {
6041  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6042  "shutdown\n"));
6043  /* we don't know who we are, but we may still shut down the library */
6044  } else if (KMP_UBER_GTID(gtid)) {
6045  /* unregister ourselves as an uber thread. gtid is no longer valid */
6046  if (__kmp_root[gtid]->r.r_active) {
6047  __kmp_global.g.g_abort = -1;
6048  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6049  KA_TRACE(10,
6050  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6051  gtid));
6052  return;
6053  } else {
6054  KA_TRACE(
6055  10,
6056  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6057  __kmp_unregister_root_current_thread(gtid);
6058  }
6059  } else {
6060 /* worker threads may call this function through the atexit handler, if they
6061  * call exit() */
6062 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6063  TODO: do a thorough shutdown instead */
6064 #ifdef DUMP_DEBUG_ON_EXIT
6065  if (__kmp_debug_buf)
6066  __kmp_dump_debug_buffer();
6067 #endif
6068  return;
6069  }
6070  }
6071  /* synchronize the termination process */
6072  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6073 
6074  /* have we already finished */
6075  if (__kmp_global.g.g_abort) {
6076  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6077  /* TODO abort? */
6078  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6079  return;
6080  }
6081  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6082  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6083  return;
6084  }
6085 
6086  /* We need this lock to enforce mutex between this reading of
6087  __kmp_threads_capacity and the writing by __kmp_register_root.
6088  Alternatively, we can use a counter of roots that is atomically updated by
6089  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6090  __kmp_internal_end_*. */
6091  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6092 
6093  /* now we can safely conduct the actual termination */
6094  __kmp_internal_end();
6095 
6096  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6097  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6098 
6099  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6100 
6101 #ifdef DUMP_DEBUG_ON_EXIT
6102  if (__kmp_debug_buf)
6103  __kmp_dump_debug_buffer();
6104 #endif
6105 
6106 #if KMP_OS_WINDOWS
6107  __kmp_close_console();
6108 #endif
6109 
6110  __kmp_fini_allocator();
6111 
6112 } // __kmp_internal_end_library
6113 
6114 void __kmp_internal_end_thread(int gtid_req) {
6115  int i;
6116 
6117  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6118  /* this shouldn't be a race condition because __kmp_internal_end() is the
6119  * only place to clear __kmp_serial_init */
6120  /* we'll check this later too, after we get the lock */
6121  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6122  // redundant, because the next check will work in any case.
6123  if (__kmp_global.g.g_abort) {
6124  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6125  /* TODO abort? */
6126  return;
6127  }
6128  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6129  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6130  return;
6131  }
6132 
6133  KMP_MB(); /* Flush all pending memory write invalidates. */
6134 
6135  /* find out who we are and what we should do */
6136  {
6137  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6138  KA_TRACE(10,
6139  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6140  if (gtid == KMP_GTID_SHUTDOWN) {
6141  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6142  "already shutdown\n"));
6143  return;
6144  } else if (gtid == KMP_GTID_MONITOR) {
6145  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6146  "registered, or system shutdown\n"));
6147  return;
6148  } else if (gtid == KMP_GTID_DNE) {
6149  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6150  "shutdown\n"));
6151  return;
6152  /* we don't know who we are */
6153  } else if (KMP_UBER_GTID(gtid)) {
6154  /* unregister ourselves as an uber thread. gtid is no longer valid */
6155  if (__kmp_root[gtid]->r.r_active) {
6156  __kmp_global.g.g_abort = -1;
6157  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6158  KA_TRACE(10,
6159  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6160  gtid));
6161  return;
6162  } else {
6163  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6164  gtid));
6165  __kmp_unregister_root_current_thread(gtid);
6166  }
6167  } else {
6168  /* just a worker thread, let's leave */
6169  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6170 
6171  if (gtid >= 0) {
6172  __kmp_threads[gtid]->th.th_task_team = NULL;
6173  }
6174 
6175  KA_TRACE(10,
6176  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6177  gtid));
6178  return;
6179  }
6180  }
6181 #if defined KMP_DYNAMIC_LIB
6182  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
6183  // uber thread; it is better to shut down later, in the library destructor.
6184  // The reason for this change is a performance problem when a non-OpenMP
6185  // thread in a loop forks and joins many OpenMP threads. We can save a lot
6186  // of time by keeping worker threads alive until program shutdown.
6187  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6188  // and Windows (DPD200287443) that occurs when using critical sections from
6189  // foreign threads.
6190  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6191  return;
6192 #endif
6193  /* synchronize the termination process */
6194  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6195 
6196  /* have we already finished */
6197  if (__kmp_global.g.g_abort) {
6198  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6199  /* TODO abort? */
6200  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6201  return;
6202  }
6203  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6204  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6205  return;
6206  }
6207 
6208  /* We need this lock to enforce mutex between this reading of
6209  __kmp_threads_capacity and the writing by __kmp_register_root.
6210  Alternatively, we can use a counter of roots that is atomically updated by
6211  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6212  __kmp_internal_end_*. */
6213 
6214  /* should we finish the run-time? are all siblings done? */
6215  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6216 
6217  for (i = 0; i < __kmp_threads_capacity; ++i) {
6218  if (KMP_UBER_GTID(i)) {
6219  KA_TRACE(
6220  10,
6221  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6222  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6223  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6224  return;
6225  }
6226  }
6227 
6228  /* now we can safely conduct the actual termination */
6229 
6230  __kmp_internal_end();
6231 
6232  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6233  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6234 
6235  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6236 
6237 #ifdef DUMP_DEBUG_ON_EXIT
6238  if (__kmp_debug_buf)
6239  __kmp_dump_debug_buffer();
6240 #endif
6241 } // __kmp_internal_end_thread
6242 
6243 // -----------------------------------------------------------------------------
6244 // Library registration stuff.
6245 
6246 static long __kmp_registration_flag = 0;
6247 // Random value used to indicate library initialization.
6248 static char *__kmp_registration_str = NULL;
6249 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6250 
6251 static inline char *__kmp_reg_status_name() {
6252  /* On RHEL 3u5, if linked statically, getpid() returns different values in
6253     each thread. If registration and unregistration happen in different
6254     threads (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6255     env var cannot be found, because its name will contain a different pid. */
6256  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6257 } // __kmp_reg_status_name
6258 
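/* Illustrative sketch, excluded from the build: the registration variable name
   produced by __kmp_reg_status_name() above is just the fixed prefix plus the
   current pid, e.g. "__KMP_REGISTERED_LIB_12345" for a hypothetical pid 12345. */
#if 0
static void __kmp_example_reg_status_name(void) {
  char buf[64];
  int pid = 12345; // hypothetical pid; the runtime uses getpid()
  snprintf(buf, sizeof(buf), "__KMP_REGISTERED_LIB_%d", pid);
  // buf now holds "__KMP_REGISTERED_LIB_12345"
  (void)buf;
}
#endif
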
6259 void __kmp_register_library_startup(void) {
6260 
6261  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6262  int done = 0;
6263  union {
6264  double dtime;
6265  long ltime;
6266  } time;
6267 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6268  __kmp_initialize_system_tick();
6269 #endif
6270  __kmp_read_system_time(&time.dtime);
6271  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6272  __kmp_registration_str =
6273  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6274  __kmp_registration_flag, KMP_LIBRARY_FILE);
6275 
6276  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6277  __kmp_registration_str));
6278 
6279  while (!done) {
6280 
6281  char *value = NULL; // Actual value of the environment variable.
6282 
6283  // Set the environment variable, but do not overwrite it if it already exists.
6284  __kmp_env_set(name, __kmp_registration_str, 0);
6285  // Check that the variable was actually written.
6286  value = __kmp_env_get(name);
6287  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6288 
6289  done = 1; // Ok, environment variable set successfully, exit the loop.
6290 
6291  } else {
6292 
6293  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6294  // Check whether it is alive or dead.
6295  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6296  char *tail = value;
6297  char *flag_addr_str = NULL;
6298  char *flag_val_str = NULL;
6299  char const *file_name = NULL;
6300  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6301  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6302  file_name = tail;
6303  if (tail != NULL) {
6304  long *flag_addr = 0;
6305  long flag_val = 0;
6306  KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6307  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6308  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6309  // First, check whether the environment-encoded address is mapped into
6310  // this process's address space.
6311  // If so, dereference it to see if it still has the right value.
6312  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6313  neighbor = 1;
6314  } else {
6315  // If not, then we know the other copy of the library is no longer
6316  // running.
6317  neighbor = 2;
6318  }
6319  }
6320  }
6321  switch (neighbor) {
6322  case 0: // Cannot parse environment variable -- neighbor status unknown.
6323  // Assume it is the incompatible format of a future version of the
6324  // library. Assume the other library is alive.
6325  // WARN( ... ); // TODO: Issue a warning.
6326  file_name = "unknown library";
6327  // Attention! Falling through to the next case. That's intentional.
6328  case 1: { // Neighbor is alive.
6329  // Check whether a duplicate library is allowed.
6330  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6331  if (!__kmp_str_match_true(duplicate_ok)) {
6332  // That's not allowed. Issue fatal error.
6333  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6334  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6335  }
6336  KMP_INTERNAL_FREE(duplicate_ok);
6337  __kmp_duplicate_library_ok = 1;
6338  done = 1; // Exit the loop.
6339  } break;
6340  case 2: { // Neighbor is dead.
6341  // Clear the variable and try to register library again.
6342  __kmp_env_unset(name);
6343  } break;
6344  default: { KMP_DEBUG_ASSERT(0); } break;
6345  }
6346  }
6347  KMP_INTERNAL_FREE((void *)value);
6348  }
6349  KMP_INTERNAL_FREE((void *)name);
6350 
6351 } // func __kmp_register_library_startup
6352 
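/* Illustrative sketch, excluded from the build: how a registration value of the
   "%p-%lx-%s" form written above could be decomposed. The sample value and the
   helper name are hypothetical; the runtime itself uses __kmp_str_split(). */
#if 0
static void __kmp_example_parse_registration(void) {
  char value[] = "0x7f12c4000010-cafe3a21-libomp.so"; // hypothetical sample
  char *flag_addr_str = strtok(value, "-"); // "0x7f12c4000010"
  char *flag_val_str = strtok(NULL, "-");   // "cafe3a21"
  char *file_name = strtok(NULL, "-");      // "libomp.so"
  void *flag_addr = NULL;
  unsigned long flag_val = 0;
  if (flag_addr_str && flag_val_str && file_name) {
    sscanf(flag_addr_str, "%p", &flag_addr);
    sscanf(flag_val_str, "%lx", &flag_val);
  }
  // A "live" neighbor is one whose flag address is still mapped in this
  // process and whose flag value matches; otherwise the variable is stale
  // and __kmp_register_library_startup() unsets it and retries.
  (void)flag_addr;
  (void)flag_val;
}
#endif
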
6353 void __kmp_unregister_library(void) {
6354 
6355  char *name = __kmp_reg_status_name();
6356  char *value = __kmp_env_get(name);
6357 
6358  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6359  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6360  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6361  // Ok, this is our variable. Delete it.
6362  __kmp_env_unset(name);
6363  }
6364 
6365  KMP_INTERNAL_FREE(__kmp_registration_str);
6366  KMP_INTERNAL_FREE(value);
6367  KMP_INTERNAL_FREE(name);
6368 
6369  __kmp_registration_flag = 0;
6370  __kmp_registration_str = NULL;
6371 
6372 } // __kmp_unregister_library
6373 
6374 // End of Library registration stuff.
6375 // -----------------------------------------------------------------------------
6376 
6377 #if KMP_MIC_SUPPORTED
6378 
6379 static void __kmp_check_mic_type() {
6380  kmp_cpuid_t cpuid_state = {0};
6381  kmp_cpuid_t *cs_p = &cpuid_state;
6382  __kmp_x86_cpuid(1, 0, cs_p);
6383  // We don't support mic1 at the moment
6384  if ((cs_p->eax & 0xff0) == 0xB10) {
6385  __kmp_mic_type = mic2;
6386  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6387  __kmp_mic_type = mic3;
6388  } else {
6389  __kmp_mic_type = non_mic;
6390  }
6391 }
6392 
6393 #endif /* KMP_MIC_SUPPORTED */
6394 
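/* Illustrative sketch, excluded from the build: how the CPUID leaf-1 EAX masks
   used above classify a signature. The sample EAX value is hypothetical. */
#if 0
static void __kmp_example_mic_signature(void) {
  unsigned eax = 0x00050670; // hypothetical leaf-1 EAX value
  int is_mic2 = ((eax & 0xff0) == 0xB10);     // mic2 signature check, as above
  int is_mic3 = ((eax & 0xf0ff0) == 0x50670); // mic3 signature check, as above
  // Here is_mic2 == 0 and is_mic3 == 1, so __kmp_mic_type would be set to mic3.
  (void)is_mic2;
  (void)is_mic3;
}
#endif
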
6395 static void __kmp_do_serial_initialize(void) {
6396  int i, gtid;
6397  int size;
6398 
6399  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6400 
6401  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6402  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6403  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6404  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6405  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6406 
6407 #if OMPT_SUPPORT
6408  ompt_pre_init();
6409 #endif
6410 
6411  __kmp_validate_locks();
6412 
6413  /* Initialize internal memory allocator */
6414  __kmp_init_allocator();
6415 
6416  /* Register the library startup via an environment variable and check to see
6417  whether another copy of the library is already registered. */
6418 
6419  __kmp_register_library_startup();
6420 
6421  /* TODO reinitialization of library */
6422  if (TCR_4(__kmp_global.g.g_done)) {
6423  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6424  }
6425 
6426  __kmp_global.g.g_abort = 0;
6427  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6428 
6429 /* initialize the locks */
6430 #if KMP_USE_ADAPTIVE_LOCKS
6431 #if KMP_DEBUG_ADAPTIVE_LOCKS
6432  __kmp_init_speculative_stats();
6433 #endif
6434 #endif
6435 #if KMP_STATS_ENABLED
6436  __kmp_stats_init();
6437 #endif
6438  __kmp_init_lock(&__kmp_global_lock);
6439  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6440  __kmp_init_lock(&__kmp_debug_lock);
6441  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6442  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6443  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6444  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6445  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6446  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6447  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6448  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6449  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6450  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6451  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6452  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6453  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6454  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6455  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6456 #if KMP_USE_MONITOR
6457  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6458 #endif
6459  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6460 
6461  /* conduct initialization and initial setup of configuration */
6462 
6463  __kmp_runtime_initialize();
6464 
6465 #if KMP_MIC_SUPPORTED
6466  __kmp_check_mic_type();
6467 #endif
6468 
6469 // Some global variable initialization moved here from kmp_env_initialize()
6470 #ifdef KMP_DEBUG
6471  kmp_diag = 0;
6472 #endif
6473  __kmp_abort_delay = 0;
6474 
6475  // From __kmp_init_dflt_team_nth()
6476  /* assume the entire machine will be used */
6477  __kmp_dflt_team_nth_ub = __kmp_xproc;
6478  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6479  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6480  }
6481  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6482  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6483  }
6484  __kmp_max_nth = __kmp_sys_max_nth;
6485  __kmp_cg_max_nth = __kmp_sys_max_nth;
6486  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6487  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6488  __kmp_teams_max_nth = __kmp_sys_max_nth;
6489  }
6490 
6491  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6492  // part
6493  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6494 #if KMP_USE_MONITOR
6495  __kmp_monitor_wakeups =
6496  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6497  __kmp_bt_intervals =
6498  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6499 #endif
6500  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6501  __kmp_library = library_throughput;
6502  // From KMP_SCHEDULE initialization
6503  __kmp_static = kmp_sch_static_balanced;
6504 // AC: do not use analytical here, because it is non-monotonous
6505 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6506 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6507 // need to repeat assignment
6508 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6509 // bit control and barrier method control parts
6510 #if KMP_FAST_REDUCTION_BARRIER
6511 #define kmp_reduction_barrier_gather_bb ((int)1)
6512 #define kmp_reduction_barrier_release_bb ((int)1)
6513 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6514 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6515 #endif // KMP_FAST_REDUCTION_BARRIER
6516  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6517  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6518  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6519  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6520  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6521 #if KMP_FAST_REDUCTION_BARRIER
6522  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6523  // lin_64 ): hyper,1
6524  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6525  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6526  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6527  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6528  }
6529 #endif // KMP_FAST_REDUCTION_BARRIER
6530  }
6531 #if KMP_FAST_REDUCTION_BARRIER
6532 #undef kmp_reduction_barrier_release_pat
6533 #undef kmp_reduction_barrier_gather_pat
6534 #undef kmp_reduction_barrier_release_bb
6535 #undef kmp_reduction_barrier_gather_bb
6536 #endif // KMP_FAST_REDUCTION_BARRIER
6537 #if KMP_MIC_SUPPORTED
6538  if (__kmp_mic_type == mic2) { // KNC
6539  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6540  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6541  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6542  1; // forkjoin release
6543  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6544  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6545  }
6546 #if KMP_FAST_REDUCTION_BARRIER
6547  if (__kmp_mic_type == mic2) { // KNC
6548  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6549  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6550  }
6551 #endif // KMP_FAST_REDUCTION_BARRIER
6552 #endif // KMP_MIC_SUPPORTED
6553 
6554 // From KMP_CHECKS initialization
6555 #ifdef KMP_DEBUG
6556  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6557 #else
6558  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6559 #endif
6560 
6561  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6562  __kmp_foreign_tp = TRUE;
6563 
6564  __kmp_global.g.g_dynamic = FALSE;
6565  __kmp_global.g.g_dynamic_mode = dynamic_default;
6566 
6567  __kmp_env_initialize(NULL);
6568 
6569 // Print all messages in message catalog for testing purposes.
6570 #ifdef KMP_DEBUG
6571  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6572  if (__kmp_str_match_true(val)) {
6573  kmp_str_buf_t buffer;
6574  __kmp_str_buf_init(&buffer);
6575  __kmp_i18n_dump_catalog(&buffer);
6576  __kmp_printf("%s", buffer.str);
6577  __kmp_str_buf_free(&buffer);
6578  }
6579  __kmp_env_free(&val);
6580 #endif
6581 
6582  __kmp_threads_capacity =
6583  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6584  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6585  __kmp_tp_capacity = __kmp_default_tp_capacity(
6586  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6587 
6588  // If the library is shut down properly, both pools must be NULL. Just in
6589  // case, set them to NULL -- some memory may leak, but subsequent code will
6590  // work even if pools are not freed.
6591  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6592  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6593  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6594  __kmp_thread_pool = NULL;
6595  __kmp_thread_pool_insert_pt = NULL;
6596  __kmp_team_pool = NULL;
6597 
6598  /* Allocate all of the variable sized records */
6599  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6600  * expandable */
6601  /* Since allocation is cache-aligned, just add extra padding at the end */
6602  size =
6603  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6604  CACHE_LINE;
6605  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6606  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6607  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6608 
6609  /* init thread counts */
6610  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6611  0); // Asserts fail if the library is reinitializing and
6612  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6613  __kmp_all_nth = 0;
6614  __kmp_nth = 0;
6615 
6616  /* setup the uber master thread and hierarchy */
6617  gtid = __kmp_register_root(TRUE);
6618  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6619  KMP_ASSERT(KMP_UBER_GTID(gtid));
6620  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6621 
6622  KMP_MB(); /* Flush all pending memory write invalidates. */
6623 
6624  __kmp_common_initialize();
6625 
6626 #if KMP_OS_UNIX
6627  /* invoke the child fork handler */
6628  __kmp_register_atfork();
6629 #endif
6630 
6631 #if !defined KMP_DYNAMIC_LIB
6632  {
6633  /* Invoke the exit handler when the program finishes, only for static
6634  library. For dynamic library, we already have _fini and DllMain. */
6635  int rc = atexit(__kmp_internal_end_atexit);
6636  if (rc != 0) {
6637  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6638  __kmp_msg_null);
6639  }
6640  }
6641 #endif
6642 
6643 #if KMP_HANDLE_SIGNALS
6644 #if KMP_OS_UNIX
6645  /* NOTE: make sure that this is called before the user installs their own
6646  signal handlers so that the user handlers are called first. this way they
6647  can return false, not call our handler, avoid terminating the library, and
6648  continue execution where they left off. */
6649  __kmp_install_signals(FALSE);
6650 #endif /* KMP_OS_UNIX */
6651 #if KMP_OS_WINDOWS
6652  __kmp_install_signals(TRUE);
6653 #endif /* KMP_OS_WINDOWS */
6654 #endif
6655 
6656  /* we have finished the serial initialization */
6657  __kmp_init_counter++;
6658 
6659  __kmp_init_serial = TRUE;
6660 
6661  if (__kmp_settings) {
6662  __kmp_env_print();
6663  }
6664 
6665 #if OMP_40_ENABLED
6666  if (__kmp_display_env || __kmp_display_env_verbose) {
6667  __kmp_env_print_2();
6668  }
6669 #endif // OMP_40_ENABLED
6670 
6671 #if OMPT_SUPPORT
6672  ompt_post_init();
6673 #endif
6674 
6675  KMP_MB();
6676 
6677  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6678 }
6679 
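/* Illustrative sketch, excluded from the build: the single cache-aligned block
   allocated in __kmp_do_serial_initialize() above holds both pointer arrays
   back to back, which is why only __kmp_threads is freed later. The capacity
   below is hypothetical. */
#if 0
static void __kmp_example_threads_root_layout(void) {
  const int capacity = 4; // hypothetical __kmp_threads_capacity
  size_t bytes =
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * capacity + CACHE_LINE;
  char *block = (char *)__kmp_allocate(bytes);
  kmp_info_t **threads = (kmp_info_t **)block; // slots [0, capacity)
  kmp_root_t **root =
      (kmp_root_t **)(block + sizeof(kmp_info_t *) * capacity); // next slots
  (void)threads;
  (void)root;
  __kmp_free(block);
}
#endif
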
6680 void __kmp_serial_initialize(void) {
6681  if (__kmp_init_serial) {
6682  return;
6683  }
6684  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6685  if (__kmp_init_serial) {
6686  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6687  return;
6688  }
6689  __kmp_do_serial_initialize();
6690  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6691 }
6692 
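/* Illustrative sketch, excluded from the build: the check / lock / re-check
   idiom shared by __kmp_serial_initialize() above and by
   __kmp_middle_initialize() and __kmp_parallel_initialize() below. The
   __kmp_example_init_done flag is a hypothetical stand-in for
   __kmp_init_serial et al. */
#if 0
static int __kmp_example_init_done = 0;
static void __kmp_example_initialize_once(void) {
  if (__kmp_example_init_done) // fast path: already initialized
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_example_init_done) { // re-check under the lock
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  /* ... perform the one-time initialization work here ... */
  __kmp_example_init_done = 1;
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
#endif
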
6693 static void __kmp_do_middle_initialize(void) {
6694  int i, j;
6695  int prev_dflt_team_nth;
6696 
6697  if (!__kmp_init_serial) {
6698  __kmp_do_serial_initialize();
6699  }
6700 
6701  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6702 
6703  // Save the previous value for the __kmp_dflt_team_nth so that
6704  // we can avoid some reinitialization if it hasn't changed.
6705  prev_dflt_team_nth = __kmp_dflt_team_nth;
6706 
6707 #if KMP_AFFINITY_SUPPORTED
6708  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6709  // number of cores on the machine.
6710  __kmp_affinity_initialize();
6711 
6712  // Run through the __kmp_threads array and set the affinity mask
6713  // for each root thread that is currently registered with the RTL.
6714  for (i = 0; i < __kmp_threads_capacity; i++) {
6715  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6716  __kmp_affinity_set_init_mask(i, TRUE);
6717  }
6718  }
6719 #endif /* KMP_AFFINITY_SUPPORTED */
6720 
6721  KMP_ASSERT(__kmp_xproc > 0);
6722  if (__kmp_avail_proc == 0) {
6723  __kmp_avail_proc = __kmp_xproc;
6724  }
6725 
6726  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6727  // correct them now
6728  j = 0;
6729  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6730  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6731  __kmp_avail_proc;
6732  j++;
6733  }
6734 
6735  if (__kmp_dflt_team_nth == 0) {
6736 #ifdef KMP_DFLT_NTH_CORES
6737  // Default #threads = #cores
6738  __kmp_dflt_team_nth = __kmp_ncores;
6739  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6740  "__kmp_ncores (%d)\n",
6741  __kmp_dflt_team_nth));
6742 #else
6743  // Default #threads = #available OS procs
6744  __kmp_dflt_team_nth = __kmp_avail_proc;
6745  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6746  "__kmp_avail_proc(%d)\n",
6747  __kmp_dflt_team_nth));
6748 #endif /* KMP_DFLT_NTH_CORES */
6749  }
6750 
6751  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6752  __kmp_dflt_team_nth = KMP_MIN_NTH;
6753  }
6754  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6755  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6756  }
6757 
6758  // There's no harm in continuing if the following check fails,
6759  // but it indicates an error in the previous logic.
6760  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6761 
6762  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6763  // Run through the __kmp_threads array and set the num threads icv for each
6764  // root thread that is currently registered with the RTL (which has not
6765  // already explicitly set its nthreads-var with a call to
6766  // omp_set_num_threads()).
6767  for (i = 0; i < __kmp_threads_capacity; i++) {
6768  kmp_info_t *thread = __kmp_threads[i];
6769  if (thread == NULL)
6770  continue;
6771  if (thread->th.th_current_task->td_icvs.nproc != 0)
6772  continue;
6773 
6774  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6775  }
6776  }
6777  KA_TRACE(
6778  20,
6779  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6780  __kmp_dflt_team_nth));
6781 
6782 #ifdef KMP_ADJUST_BLOCKTIME
6783  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6784  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6785  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6786  if (__kmp_nth > __kmp_avail_proc) {
6787  __kmp_zero_bt = TRUE;
6788  }
6789  }
6790 #endif /* KMP_ADJUST_BLOCKTIME */
6791 
6792  /* we have finished middle initialization */
6793  TCW_SYNC_4(__kmp_init_middle, TRUE);
6794 
6795  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6796 }
6797 
6798 void __kmp_middle_initialize(void) {
6799  if (__kmp_init_middle) {
6800  return;
6801  }
6802  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6803  if (__kmp_init_middle) {
6804  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6805  return;
6806  }
6807  __kmp_do_middle_initialize();
6808  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6809 }
6810 
6811 void __kmp_parallel_initialize(void) {
6812  int gtid = __kmp_entry_gtid(); // this might be a new root
6813 
6814  /* synchronize parallel initialization (for sibling) */
6815  if (TCR_4(__kmp_init_parallel))
6816  return;
6817  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6818  if (TCR_4(__kmp_init_parallel)) {
6819  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6820  return;
6821  }
6822 
6823  /* TODO reinitialization after we have already shut down */
6824  if (TCR_4(__kmp_global.g.g_done)) {
6825  KA_TRACE(
6826  10,
6827  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6828  __kmp_infinite_loop();
6829  }
6830 
6831  /* jc: The lock __kmp_initz_lock is already held, so calling
6832  __kmp_serial_initialize would cause a deadlock. So we call
6833  __kmp_do_serial_initialize directly. */
6834  if (!__kmp_init_middle) {
6835  __kmp_do_middle_initialize();
6836  }
6837 
6838  /* begin initialization */
6839  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6840  KMP_ASSERT(KMP_UBER_GTID(gtid));
6841 
6842 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6843  // Save the FP control regs.
6844  // Worker threads will set theirs to these values at thread startup.
6845  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6846  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6847  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6848 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6849 
6850 #if KMP_OS_UNIX
6851 #if KMP_HANDLE_SIGNALS
6852  /* must be after __kmp_serial_initialize */
6853  __kmp_install_signals(TRUE);
6854 #endif
6855 #endif
6856 
6857  __kmp_suspend_initialize();
6858 
6859 #if defined(USE_LOAD_BALANCE)
6860  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6861  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6862  }
6863 #else
6864  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6865  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6866  }
6867 #endif
6868 
6869  if (__kmp_version) {
6870  __kmp_print_version_2();
6871  }
6872 
6873  /* we have finished parallel initialization */
6874  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6875 
6876  KMP_MB();
6877  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6878 
6879  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6880 }
6881 
6882 /* ------------------------------------------------------------------------ */
6883 
6884 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6885  kmp_team_t *team) {
6886  kmp_disp_t *dispatch;
6887 
6888  KMP_MB();
6889 
6890  /* none of the threads have encountered any constructs, yet. */
6891  this_thr->th.th_local.this_construct = 0;
6892 #if KMP_CACHE_MANAGE
6893  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6894 #endif /* KMP_CACHE_MANAGE */
6895  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6896  KMP_DEBUG_ASSERT(dispatch);
6897  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6898  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6899  // this_thr->th.th_info.ds.ds_tid ] );
6900 
6901  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6902 #if OMP_45_ENABLED
6903  dispatch->th_doacross_buf_idx =
6904  0; /* reset the doacross dispatch buffer counter */
6905 #endif
6906  if (__kmp_env_consistency_check)
6907  __kmp_push_parallel(gtid, team->t.t_ident);
6908 
6909  KMP_MB(); /* Flush all pending memory write invalidates. */
6910 }
6911 
6912 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6913  kmp_team_t *team) {
6914  if (__kmp_env_consistency_check)
6915  __kmp_pop_parallel(gtid, team->t.t_ident);
6916 
6917  __kmp_finish_implicit_task(this_thr);
6918 }
6919 
6920 int __kmp_invoke_task_func(int gtid) {
6921  int rc;
6922  int tid = __kmp_tid_from_gtid(gtid);
6923  kmp_info_t *this_thr = __kmp_threads[gtid];
6924  kmp_team_t *team = this_thr->th.th_team;
6925 
6926  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6927 #if USE_ITT_BUILD
6928  if (__itt_stack_caller_create_ptr) {
6929  __kmp_itt_stack_callee_enter(
6930  (__itt_caller)
6931  team->t.t_stack_id); // inform ittnotify about entering user's code
6932  }
6933 #endif /* USE_ITT_BUILD */
6934 #if INCLUDE_SSC_MARKS
6935  SSC_MARK_INVOKING();
6936 #endif
6937 
6938 #if OMPT_SUPPORT
6939  void *dummy;
6940  void **exit_runtime_p;
6941  ompt_data_t *my_task_data;
6942  ompt_data_t *my_parallel_data;
6943  int ompt_team_size;
6944 
6945  if (ompt_enabled.enabled) {
6946  exit_runtime_p = &(
6947  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6948  } else {
6949  exit_runtime_p = &dummy;
6950  }
6951 
6952  my_task_data =
6953  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6954  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6955  if (ompt_enabled.ompt_callback_implicit_task) {
6956  ompt_team_size = team->t.t_nproc;
6957  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6958  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6959  __kmp_tid_from_gtid(gtid));
6960  }
6961 #endif
6962 
6963  {
6964  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6965  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6966  rc =
6967  __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6968  tid, (int)team->t.t_argc, (void **)team->t.t_argv
6969 #if OMPT_SUPPORT
6970  ,
6971  exit_runtime_p
6972 #endif
6973  );
6974 #if OMPT_SUPPORT
6975  *exit_runtime_p = NULL;
6976 #endif
6977  }
6978 
6979 #if USE_ITT_BUILD
6980  if (__itt_stack_caller_create_ptr) {
6981  __kmp_itt_stack_callee_leave(
6982  (__itt_caller)
6983  team->t.t_stack_id); // inform ittnotify about leaving user's code
6984  }
6985 #endif /* USE_ITT_BUILD */
6986  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6987 
6988  return rc;
6989 }
6990 
6991 #if OMP_40_ENABLED
6992 void __kmp_teams_master(int gtid) {
6993  // This routine is called by all master threads in teams construct
6994  kmp_info_t *thr = __kmp_threads[gtid];
6995  kmp_team_t *team = thr->th.th_team;
6996  ident_t *loc = team->t.t_ident;
6997  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6998  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6999  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7000  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7001  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7002 // Launch the league of teams now, but do not let workers execute
7003 // (they wait on the fork barrier until the next parallel region)
7004 #if INCLUDE_SSC_MARKS
7005  SSC_MARK_FORKING();
7006 #endif
7007  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7008  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7009  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7010 #if INCLUDE_SSC_MARKS
7011  SSC_MARK_JOINING();
7012 #endif
7013 
7014  // AC: last parameter "1" eliminates join barrier which won't work because
7015  // worker threads are in a fork barrier waiting for more parallel regions
7016  __kmp_join_call(loc, gtid
7017 #if OMPT_SUPPORT
7018  ,
7019  fork_context_intel
7020 #endif
7021  ,
7022  1);
7023 }
7024 
7025 int __kmp_invoke_teams_master(int gtid) {
7026  kmp_info_t *this_thr = __kmp_threads[gtid];
7027  kmp_team_t *team = this_thr->th.th_team;
7028 #if KMP_DEBUG
7029  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7030  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7031  (void *)__kmp_teams_master);
7032 #endif
7033  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7034  __kmp_teams_master(gtid);
7035  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7036  return 1;
7037 }
7038 #endif /* OMP_40_ENABLED */
7039 
7040 /* This sets the requested number of threads for the next parallel region
7041    encountered by this team. Since this should be enclosed in the forkjoin
7042    critical section, it should avoid race conditions with asymmetrical nested
7043    parallelism. */
7044 
7045 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7046  kmp_info_t *thr = __kmp_threads[gtid];
7047 
7048  if (num_threads > 0)
7049  thr->th.th_set_nproc = num_threads;
7050 }
7051 
7052 #if OMP_40_ENABLED
7053 
7054 /* this sets the requested number of teams for the teams region and/or
7055  the number of threads for the next parallel region encountered */
7056 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7057  int num_threads) {
7058  kmp_info_t *thr = __kmp_threads[gtid];
7059  KMP_DEBUG_ASSERT(num_teams >= 0);
7060  KMP_DEBUG_ASSERT(num_threads >= 0);
7061 
7062  if (num_teams == 0)
7063  num_teams = 1; // default number of teams is 1.
7064  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7065  if (!__kmp_reserve_warn) {
7066  __kmp_reserve_warn = 1;
7067  __kmp_msg(kmp_ms_warning,
7068  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7069  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7070  }
7071  num_teams = __kmp_teams_max_nth;
7072  }
7073  // Set number of teams (number of threads in the outer "parallel" of the
7074  // teams)
7075  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7076 
7077  // Remember the number of threads for inner parallel regions
7078  if (num_threads == 0) {
7079  if (!TCR_4(__kmp_init_middle))
7080  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7081  num_threads = __kmp_avail_proc / num_teams;
7082  if (num_teams * num_threads > __kmp_teams_max_nth) {
7083  // adjust num_threads w/o warning as it is not user setting
7084  // adjust num_threads without a warning, as it is not a user setting
7085  }
7086  } else {
7087  if (num_teams * num_threads > __kmp_teams_max_nth) {
7088  int new_threads = __kmp_teams_max_nth / num_teams;
7089  if (!__kmp_reserve_warn) { // user asked for too many threads
7090  __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7091  __kmp_msg(kmp_ms_warning,
7092  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7093  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7094  }
7095  num_threads = new_threads;
7096  }
7097  }
7098  thr->th.th_teams_size.nth = num_threads;
7099 }
7100 
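/* Illustrative sketch, excluded from the build: the clamping performed above,
   with hypothetical limits __kmp_avail_proc = 64 and __kmp_teams_max_nth = 64.
   A request for 8 teams with no explicit thread count gives 64 / 8 = 8 threads
   per team; a request for 8 teams of 16 threads (128 total) exceeds the limit
   and is trimmed back to 64 / 8 = 8 threads per team. */
#if 0
static int __kmp_example_teams_thread_count(int num_teams, int num_threads) {
  const int avail_proc = 64, teams_max_nth = 64; // hypothetical limits
  if (num_teams == 0)
    num_teams = 1; // default number of teams is 1
  if (num_threads == 0)
    num_threads = avail_proc / num_teams; // divide procs among the teams
  if (num_teams * num_threads > teams_max_nth)
    num_threads = teams_max_nth / num_teams; // respect the overall limit
  return num_threads;
}
#endif
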
7101 // Set the proc_bind var to use in the following parallel region.
7102 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7103  kmp_info_t *thr = __kmp_threads[gtid];
7104  thr->th.th_set_proc_bind = proc_bind;
7105 }
7106 
7107 #endif /* OMP_40_ENABLED */
7108 
7109 /* Launch the worker threads into the microtask. */
7110 
7111 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7112  kmp_info_t *this_thr = __kmp_threads[gtid];
7113 
7114 #ifdef KMP_DEBUG
7115  int f;
7116 #endif /* KMP_DEBUG */
7117 
7118  KMP_DEBUG_ASSERT(team);
7119  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7120  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7121  KMP_MB(); /* Flush all pending memory write invalidates. */
7122 
7123  team->t.t_construct = 0; /* no single directives seen yet */
7124  team->t.t_ordered.dt.t_value =
7125  0; /* thread 0 enters the ordered section first */
7126 
7127  /* Reset the identifiers on the dispatch buffer */
7128  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7129  if (team->t.t_max_nproc > 1) {
7130  int i;
7131  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7132  team->t.t_disp_buffer[i].buffer_index = i;
7133 #if OMP_45_ENABLED
7134  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7135 #endif
7136  }
7137  } else {
7138  team->t.t_disp_buffer[0].buffer_index = 0;
7139 #if OMP_45_ENABLED
7140  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7141 #endif
7142  }
7143 
7144  KMP_MB(); /* Flush all pending memory write invalidates. */
7145  KMP_ASSERT(this_thr->th.th_team == team);
7146 
7147 #ifdef KMP_DEBUG
7148  for (f = 0; f < team->t.t_nproc; f++) {
7149  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7150  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7151  }
7152 #endif /* KMP_DEBUG */
7153 
7154  /* release the worker threads so they may begin working */
7155  __kmp_fork_barrier(gtid, 0);
7156 }
7157 
7158 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7159  kmp_info_t *this_thr = __kmp_threads[gtid];
7160 
7161  KMP_DEBUG_ASSERT(team);
7162  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7163  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7164  KMP_MB(); /* Flush all pending memory write invalidates. */
7165 
7166 /* Join barrier after fork */
7167 
7168 #ifdef KMP_DEBUG
7169  if (__kmp_threads[gtid] &&
7170  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7171  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7172  __kmp_threads[gtid]);
7173  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7174  "team->t.t_nproc=%d\n",
7175  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7176  team->t.t_nproc);
7177  __kmp_print_structure();
7178  }
7179  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7180  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7181 #endif /* KMP_DEBUG */
7182 
7183  __kmp_join_barrier(gtid); /* wait for everyone */
7184 #if OMPT_SUPPORT
7185  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7186  if (this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7187  ompt_data_t *tId = OMPT_CUR_TASK_DATA(this_thr);
7188  ompt_data_t *pId = OMPT_CUR_TEAM_DATA(this_thr);
7189  this_thr->th.ompt_thread_info.state = omp_state_overhead;
7190 #if OMPT_OPTIONAL
7191  void *codeptr = NULL;
7192  if (KMP_MASTER_TID(ds_tid) &&
7193  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7194  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7195  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7196 
7197  if (ompt_enabled.ompt_callback_sync_region_wait) {
7198  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7199  ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7200  }
7201  if (ompt_enabled.ompt_callback_sync_region) {
7202  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7203  ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7204  }
7205 #endif
7206  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7207  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7208  ompt_scope_end, NULL, tId, 0, ds_tid);
7209  }
7210  // return to idle state
7211  this_thr->th.ompt_thread_info.state = omp_state_overhead;
7212  }
7213 #endif
7214 
7215  KMP_MB(); /* Flush all pending memory write invalidates. */
7216  KMP_ASSERT(this_thr->th.th_team == team);
7217 }
7218 
7219 /* ------------------------------------------------------------------------ */
7220 
7221 #ifdef USE_LOAD_BALANCE
7222 
7223 // Return the number of worker threads actively spinning in the hot team, if
7224 // we are at the outermost level of parallelism. Otherwise, return 0.
7225 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7226  int i;
7227  int retval;
7228  kmp_team_t *hot_team;
7229 
7230  if (root->r.r_active) {
7231  return 0;
7232  }
7233  hot_team = root->r.r_hot_team;
7234  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7235  return hot_team->t.t_nproc - 1; // Don't count master thread
7236  }
7237 
7238  // Skip the master thread - it is accounted for elsewhere.
7239  retval = 0;
7240  for (i = 1; i < hot_team->t.t_nproc; i++) {
7241  if (hot_team->t.t_threads[i]->th.th_active) {
7242  retval++;
7243  }
7244  }
7245  return retval;
7246 }
7247 
7248 // Perform an automatic adjustment to the number of
7249 // threads used by the next parallel region.
7250 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7251  int retval;
7252  int pool_active;
7253  int hot_team_active;
7254  int team_curr_active;
7255  int system_active;
7256 
7257  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7258  set_nproc));
7259  KMP_DEBUG_ASSERT(root);
7260  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7261  ->th.th_current_task->td_icvs.dynamic == TRUE);
7262  KMP_DEBUG_ASSERT(set_nproc > 1);
7263 
7264  if (set_nproc == 1) {
7265  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7266  return 1;
7267  }
7268 
7269  // Threads that are active in the thread pool, active in the hot team for this
7270  // particular root (if we are at the outer par level), and the currently
7271  // executing thread (to become the master) are available to add to the new
7272  // team, but are currently contributing to the system load, and must be
7273  // accounted for.
7274  pool_active = TCR_4(__kmp_thread_pool_active_nth);
7275  hot_team_active = __kmp_active_hot_team_nproc(root);
7276  team_curr_active = pool_active + hot_team_active + 1;
7277 
7278  // Check the system load.
7279  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7280  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7281  "hot team active = %d\n",
7282  system_active, pool_active, hot_team_active));
7283 
7284  if (system_active < 0) {
7285  // There was an error reading the necessary info from /proc, so use the
7286  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7287  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7288  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7289  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7290 
7291  // Make this call behave like the thread limit algorithm.
7292  retval = __kmp_avail_proc - __kmp_nth +
7293  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7294  if (retval > set_nproc) {
7295  retval = set_nproc;
7296  }
7297  if (retval < KMP_MIN_NTH) {
7298  retval = KMP_MIN_NTH;
7299  }
7300 
7301  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7302  retval));
7303  return retval;
7304  }
7305 
7306  // There is a slight delay in the load balance algorithm in detecting newly
7307  // running procs. The real system load at this instant should be at least as
7308  // large as the number of active OpenMP threads available to add to the team.
7309  if (system_active < team_curr_active) {
7310  system_active = team_curr_active;
7311  }
7312  retval = __kmp_avail_proc - system_active + team_curr_active;
7313  if (retval > set_nproc) {
7314  retval = set_nproc;
7315  }
7316  if (retval < KMP_MIN_NTH) {
7317  retval = KMP_MIN_NTH;
7318  }
7319 
7320  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7321  return retval;
7322 } // __kmp_load_balance_nproc()
7323 
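/* Illustrative sketch, excluded from the build: the load-balance formula above
   with hypothetical numbers. With 16 available procs, 10 runnable threads
   system-wide, and 3 of them already belonging to this root (pool + hot team +
   master), the suggested team size is 16 - 10 + 3 = 9, then clamped to the
   caller's requested set_nproc. */
#if 0
static int __kmp_example_load_balance(int set_nproc) {
  const int avail_proc = 16;      // hypothetical __kmp_avail_proc
  const int system_active = 10;   // hypothetical system-wide runnable threads
  const int team_curr_active = 3; // pool + hot team + master for this root
  int retval = avail_proc - system_active + team_curr_active; // = 9
  if (retval > set_nproc)
    retval = set_nproc;
  if (retval < 1) // stand-in for KMP_MIN_NTH
    retval = 1;
  return retval;
}
#endif
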
7324 #endif /* USE_LOAD_BALANCE */
7325 
7326 /* ------------------------------------------------------------------------ */
7327 
7328 /* NOTE: this is called with the __kmp_init_lock held */
7329 void __kmp_cleanup(void) {
7330  int f;
7331 
7332  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7333 
7334  if (TCR_4(__kmp_init_parallel)) {
7335 #if KMP_HANDLE_SIGNALS
7336  __kmp_remove_signals();
7337 #endif
7338  TCW_4(__kmp_init_parallel, FALSE);
7339  }
7340 
7341  if (TCR_4(__kmp_init_middle)) {
7342 #if KMP_AFFINITY_SUPPORTED
7343  __kmp_affinity_uninitialize();
7344 #endif /* KMP_AFFINITY_SUPPORTED */
7345  __kmp_cleanup_hierarchy();
7346  TCW_4(__kmp_init_middle, FALSE);
7347  }
7348 
7349  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7350 
7351  if (__kmp_init_serial) {
7352  __kmp_runtime_destroy();
7353  __kmp_init_serial = FALSE;
7354  }
7355 
7356  for (f = 0; f < __kmp_threads_capacity; f++) {
7357  if (__kmp_root[f] != NULL) {
7358  __kmp_free(__kmp_root[f]);
7359  __kmp_root[f] = NULL;
7360  }
7361  }
7362  __kmp_free(__kmp_threads);
7363  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7364  // there is no need to free __kmp_root separately.
7365  __kmp_threads = NULL;
7366  __kmp_root = NULL;
7367  __kmp_threads_capacity = 0;
7368 
7369 #if KMP_USE_DYNAMIC_LOCK
7370  __kmp_cleanup_indirect_user_locks();
7371 #else
7372  __kmp_cleanup_user_locks();
7373 #endif
7374 
7375 #if KMP_AFFINITY_SUPPORTED
7376  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7377  __kmp_cpuinfo_file = NULL;
7378 #endif /* KMP_AFFINITY_SUPPORTED */
7379 
7380 #if KMP_USE_ADAPTIVE_LOCKS
7381 #if KMP_DEBUG_ADAPTIVE_LOCKS
7382  __kmp_print_speculative_stats();
7383 #endif
7384 #endif
7385  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7386  __kmp_nested_nth.nth = NULL;
7387  __kmp_nested_nth.size = 0;
7388  __kmp_nested_nth.used = 0;
7389  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7390  __kmp_nested_proc_bind.bind_types = NULL;
7391  __kmp_nested_proc_bind.size = 0;
7392  __kmp_nested_proc_bind.used = 0;
7393 
7394  __kmp_i18n_catclose();
7395 
7396 #if KMP_STATS_ENABLED
7397  __kmp_stats_fini();
7398 #endif
7399 
7400  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7401 }
7402 
7403 /* ------------------------------------------------------------------------ */
7404 
7405 int __kmp_ignore_mppbeg(void) {
7406  char *env;
7407 
7408  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7409  if (__kmp_str_match_false(env))
7410  return FALSE;
7411  }
7412  // By default __kmpc_begin() is a no-op.
7413  return TRUE;
7414 }
7415 
7416 int __kmp_ignore_mppend(void) {
7417  char *env;
7418 
7419  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7420  if (__kmp_str_match_false(env))
7421  return FALSE;
7422  }
7423  // By default __kmpc_end() is a no-op.
7424  return TRUE;
7425 }
7426 
7427 void __kmp_internal_begin(void) {
7428  int gtid;
7429  kmp_root_t *root;
7430 
7431  /* this is a very important step as it will register new sibling threads
7432  and assign these new uber threads a new gtid */
7433  gtid = __kmp_entry_gtid();
7434  root = __kmp_threads[gtid]->th.th_root;
7435  KMP_ASSERT(KMP_UBER_GTID(gtid));
7436 
7437  if (root->r.r_begin)
7438  return;
7439  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7440  if (root->r.r_begin) {
7441  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7442  return;
7443  }
7444 
7445  root->r.r_begin = TRUE;
7446 
7447  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7448 }
7449 
7450 /* ------------------------------------------------------------------------ */
7451 
7452 void __kmp_user_set_library(enum library_type arg) {
7453  int gtid;
7454  kmp_root_t *root;
7455  kmp_info_t *thread;
7456 
7457  /* first, make sure we are initialized so we can get our gtid */
7458 
7459  gtid = __kmp_entry_gtid();
7460  thread = __kmp_threads[gtid];
7461 
7462  root = thread->th.th_root;
7463 
7464  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7465  library_serial));
7466  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7467  thread */
7468  KMP_WARNING(SetLibraryIncorrectCall);
7469  return;
7470  }
7471 
7472  switch (arg) {
7473  case library_serial:
7474  thread->th.th_set_nproc = 0;
7475  set__nproc(thread, 1);
7476  break;
7477  case library_turnaround:
7478  thread->th.th_set_nproc = 0;
7479  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7480  : __kmp_dflt_team_nth_ub);
7481  break;
7482  case library_throughput:
7483  thread->th.th_set_nproc = 0;
7484  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7485  : __kmp_dflt_team_nth_ub);
7486  break;
7487  default:
7488  KMP_FATAL(UnknownLibraryType, arg);
7489  }
7490 
7491  __kmp_aux_set_library(arg);
7492 }
7493 
7494 void __kmp_aux_set_stacksize(size_t arg) {
7495  if (!__kmp_init_serial)
7496  __kmp_serial_initialize();
7497 
7498 #if KMP_OS_DARWIN
7499  if (arg & (0x1000 - 1)) {
7500  arg &= ~(0x1000 - 1);
7501  if (arg + 0x1000) /* check for overflow if we round up */
7502  arg += 0x1000;
7503  }
7504 #endif
7505  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7506 
7507  /* only change the default stacksize before the first parallel region */
7508  if (!TCR_4(__kmp_init_parallel)) {
7509  size_t value = arg; /* argument is in bytes */
7510 
7511  if (value < __kmp_sys_min_stksize)
7512  value = __kmp_sys_min_stksize;
7513  else if (value > KMP_MAX_STKSIZE)
7514  value = KMP_MAX_STKSIZE;
7515 
7516  __kmp_stksize = value;
7517 
7518  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7519  }
7520 
7521  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7522 }
7523 
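/* Illustrative sketch, excluded from the build: the Darwin page rounding above
   (minus the overflow check). A hypothetical request of 0x12345 bytes is not a
   multiple of 0x1000, so it is rounded up to 0x13000 before the min/max
   clamping against __kmp_sys_min_stksize and KMP_MAX_STKSIZE. */
#if 0
static size_t __kmp_example_round_stacksize(size_t arg) {
  if (arg & (0x1000 - 1)) {       // not a multiple of the 4K page size
    arg &= ~(size_t)(0x1000 - 1); // 0x12345 -> 0x12000
    arg += 0x1000;                // 0x12000 -> 0x13000
  }
  return arg;
}
#endif
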
7524 /* set the behaviour of the runtime library */
7525 /* TODO this can cause some odd behaviour with sibling parallelism... */
7526 void __kmp_aux_set_library(enum library_type arg) {
7527  __kmp_library = arg;
7528 
7529  switch (__kmp_library) {
7530  case library_serial: {
7531  KMP_INFORM(LibraryIsSerial);
7532  (void)__kmp_change_library(TRUE);
7533  } break;
7534  case library_turnaround:
7535  (void)__kmp_change_library(TRUE);
7536  break;
7537  case library_throughput:
7538  (void)__kmp_change_library(FALSE);
7539  break;
7540  default:
7541  KMP_FATAL(UnknownLibraryType, arg);
7542  }
7543 }
7544 
7545 /* ------------------------------------------------------------------------ */
7546 
7547 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7548  int blocktime = arg; /* argument is in milliseconds */
7549 #if KMP_USE_MONITOR
7550  int bt_intervals;
7551 #endif
7552  int bt_set;
7553 
7554  __kmp_save_internal_controls(thread);
7555 
7556  /* Normalize and set blocktime for the teams */
7557  if (blocktime < KMP_MIN_BLOCKTIME)
7558  blocktime = KMP_MIN_BLOCKTIME;
7559  else if (blocktime > KMP_MAX_BLOCKTIME)
7560  blocktime = KMP_MAX_BLOCKTIME;
7561 
7562  set__blocktime_team(thread->th.th_team, tid, blocktime);
7563  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7564 
7565 #if KMP_USE_MONITOR
7566  /* Calculate and set blocktime intervals for the teams */
7567  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7568 
7569  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7570  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7571 #endif
7572 
7573  /* Record that blocktime has been explicitly set */
7574  bt_set = TRUE;
7575 
7576  set__bt_set_team(thread->th.th_team, tid, bt_set);
7577  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7578 #if KMP_USE_MONITOR
7579  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7580  "bt_intervals=%d, monitor_updates=%d\n",
7581  __kmp_gtid_from_tid(tid, thread->th.th_team),
7582  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7583  __kmp_monitor_wakeups));
7584 #else
7585  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7586  __kmp_gtid_from_tid(tid, thread->th.th_team),
7587  thread->th.th_team->t.t_id, tid, blocktime));
7588 #endif
7589 }
7590 
7591 void __kmp_aux_set_defaults(char const *str, int len) {
7592  if (!__kmp_init_serial) {
7593  __kmp_serial_initialize();
7594  }
7595  __kmp_env_initialize(str);
7596 
7597  if (__kmp_settings
7598 #if OMP_40_ENABLED
7599  || __kmp_display_env || __kmp_display_env_verbose
7600 #endif // OMP_40_ENABLED
7601  ) {
7602  __kmp_env_print();
7603  }
7604 } // __kmp_aux_set_defaults
7605 
7606 /* ------------------------------------------------------------------------ */
7607 /* internal fast reduction routines */
7608 
7609 PACKED_REDUCTION_METHOD_T
7610 __kmp_determine_reduction_method(
7611  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7612  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7613  kmp_critical_name *lck) {
7614 
7615  // Default reduction method: critical construct ( lck != NULL, like in current
7616  // PAROPT )
7617  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7618  // can be selected by RTL
7619  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7620  // can be selected by RTL
7621  // Finally, it is up to the OpenMP RTL to decide which method to select
7622  // among those generated by PAROPT.
7623 
7624  PACKED_REDUCTION_METHOD_T retval;
7625 
7626  int team_size;
7627 
7628  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7629  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7630 
7631 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
7632  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7633 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7634 
7635  retval = critical_reduce_block;
7636 
7637  // Another way of getting the team size (with 1 dynamic dereference) is slower
7638  team_size = __kmp_get_team_num_threads(global_tid);
7639  if (team_size == 1) {
7640 
7641  retval = empty_reduce_block;
7642 
7643  } else {
7644 
7645  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7646  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7647 
7648 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7649 
7650 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || \
7651  KMP_OS_DARWIN
7652 
7653  int teamsize_cutoff = 4;
7654 
7655 #if KMP_MIC_SUPPORTED
7656  if (__kmp_mic_type != non_mic) {
7657  teamsize_cutoff = 8;
7658  }
7659 #endif
7660  if (tree_available) {
7661  if (team_size <= teamsize_cutoff) {
7662  if (atomic_available) {
7663  retval = atomic_reduce_block;
7664  }
7665  } else {
7666  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7667  }
7668  } else if (atomic_available) {
7669  retval = atomic_reduce_block;
7670  }
7671 #else
7672 #error "Unknown or unsupported OS"
7673 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7674 // KMP_OS_DARWIN
7675 
7676 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7677 
7678 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7679 
7680  // basic tuning
7681 
7682  if (atomic_available) {
7683  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7684  retval = atomic_reduce_block;
7685  }
7686  } // otherwise: use critical section
7687 
7688 #elif KMP_OS_DARWIN
7689 
7690  if (atomic_available && (num_vars <= 3)) {
7691  retval = atomic_reduce_block;
7692  } else if (tree_available) {
7693  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7694  (reduce_size < (2000 * sizeof(kmp_real64)))) {
7695  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7696  }
7697  } // otherwise: use critical section
7698 
7699 #else
7700 #error "Unknown or unsupported OS"
7701 #endif
7702 
7703 #else
7704 #error "Unknown or unsupported architecture"
7705 #endif
7706  }
7707 
7708  // KMP_FORCE_REDUCTION
7709 
7710  // If the team is serialized (team_size == 1), ignore the forced reduction
7711  // method and stay with the unsynchronized method (empty_reduce_block)
7712  if (__kmp_force_reduction_method != reduction_method_not_defined &&
7713  team_size != 1) {
7714 
7715  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7716 
7717  int atomic_available, tree_available;
7718 
7719  switch ((forced_retval = __kmp_force_reduction_method)) {
7720  case critical_reduce_block:
7721  KMP_ASSERT(lck); // lck should be != 0
7722  break;
7723 
7724  case atomic_reduce_block:
7725  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7726  if (!atomic_available) {
7727  KMP_WARNING(RedMethodNotSupported, "atomic");
7728  forced_retval = critical_reduce_block;
7729  }
7730  break;
7731 
7732  case tree_reduce_block:
7733  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7734  if (!tree_available) {
7735  KMP_WARNING(RedMethodNotSupported, "tree");
7736  forced_retval = critical_reduce_block;
7737  } else {
7738 #if KMP_FAST_REDUCTION_BARRIER
7739  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7740 #endif
7741  }
7742  break;
7743 
7744  default:
7745  KMP_ASSERT(0); // "unsupported method specified"
7746  }
7747 
7748  retval = forced_retval;
7749  }
7750 
7751  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7752 
7753 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7754 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7755 
7756  return (retval);
7757 }
7758 
7759 // this function is for testing set/get/determine reduce method
7760 kmp_int32 __kmp_get_reduce_method(void) {
7761  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7762 }
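
/* Illustrative sketch, excluded from the build: the packed reduction method
   appears to keep the barrier kind in its low 8 bits, which is why the test
   hook above shifts right by 8 to expose just the method. The helper name is
   hypothetical. */
#if 0
static kmp_int32 __kmp_example_unpack_reduce_method(kmp_int32 packed) {
  return packed >> 8; // e.g. ((method << 8) | barrier_kind) >> 8 == method
}
#endif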