1 /*****************************************************************************
2 * Copyright (C) 2006-2010 by Michael Rieker, Jason Ansel, Kapil Arya, and *
3 * Gene Cooperman *
4 * mrieker@nii.net, jansel@csail.mit.edu, kapil@ccs.neu.edu, and *
5 * gene@ccs.neu.edu *
6 * *
7 * This file is part of the MTCP module of DMTCP (DMTCP:mtcp). *
8 * *
9 * DMTCP:mtcp is free software: you can redistribute it and/or *
10 * modify it under the terms of the GNU Lesser General Public License as *
11 * published by the Free Software Foundation, either version 3 of the *
12 * License, or (at your option) any later version. *
13 * *
14 * DMTCP:dmtcp/src is distributed in the hope that it will be useful, *
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
17 * GNU Lesser General Public License for more details. *
18 * *
19 * You should have received a copy of the GNU Lesser General Public *
20 * License along with DMTCP:dmtcp/src. If not, see *
21 * <http://www.gnu.org/licenses/>. *
22 *****************************************************************************/
23
24 /********************************************************************************************************************************/
25 /* */
26 /* Multi-threaded checkpoint library */
27 /* */
28 /* Link this in as part of your program that you want checkpoints taken */
29 /* Call the mtcp_init routine at the beginning of your program */
30 /* Call the mtcp_ok routine when it's OK to do checkpointing */
31 /* Call the mtcp_no routine when you want checkpointing inhibited */
32 /* */
33 /* This module also contains a __clone wrapper routine */
34 /* */
35 /********************************************************************************************************************************/
36
37
38 // Set _GNU_SOURCE in order to expose glibc-defined sigandset()
39 #define _GNU_SOURCE
40 #include <asm/ldt.h> // for struct user_desc
41 //#include <asm/segment.h> // for GDT_ENTRY_TLS_... stuff
42 #include <dirent.h>
43 #include <dlfcn.h>
44 #include <errno.h>
45 #include <fcntl.h>
46 #include <pthread.h>
47 #include <semaphore.h>
48 #include <sched.h>
49 #include <signal.h>
50 #include <stdarg.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <sys/mman.h>
55 #include <sys/resource.h>
56 #include <sys/sem.h>
57 #include <sys/stat.h>
58 #include <sys/syscall.h>
59 #include <sys/ioctl.h>
60 #include <termios.h> // for tcdrain, tcsetattr, etc.
61 #include <unistd.h>
62 #include <ucontext.h>
63 #include <sys/types.h> // for gettid, tkill, waitpid
64 #include <sys/wait.h> // for waitpid
65 #include <linux/unistd.h> // for gettid, tkill
66 #include <gnu/libc-version.h>
67
68 #define MTCP_SYS_STRCPY
69 #define MTCP_SYS_STRLEN
70 #include "mtcp_internal.h"
71
72 /* required for ptrace sake */
73 #include <sys/user.h>
74 #include "mtcp_ptrace.h"
75
/* Flag consulted by the (currently disabled) DEBUG_WAIT spin loop below.
 * Initialise it to 0 instead of 1 to make that loop fall straight through.
 */
static int WAIT = 1;

/* DEBUG_WAIT: in the disabled branch it forces a restarting thread to stop,
 * without use of a system call, by spinning while WAIT is non-zero.
 * In the active configuration it expands to nothing.
 */
#if 0
static int WAIT = 1;
# define DEBUG_WAIT                                  \
  if (DEBUG_RESTARTING) {                            \
    int i, j;                                        \
    for (i = 0; WAIT && i < 1000000000; i++)         \
      for (j = 0; j < 1000000000; j++)               \
        ;                                            \
  }
#else
# define DEBUG_WAIT
#endif
91
/* 1 when GDT_ENTRY_TLS_ENTRIES is available on a non-x86_64 build, so that
 * struct Thread saves that many GDT TLS entries; 0 otherwise (one entry).
 */
#if !defined(__x86_64__) && defined(GDT_ENTRY_TLS_ENTRIES)
# define MTCP__SAVE_MANY_GDT_ENTRIES 1
#else
# define MTCP__SAVE_MANY_GDT_ENTRIES 0
#endif
97
/* SAVEDSP names the slot of a ucontext_t (as filled in by getcontext ())
 * that holds the saved stack pointer: gregs[15] on x86_64, gregs[7]
 * otherwise.
 */
#if defined(__x86_64__)
# define MYREG_RSP 15
# define SAVEDSP uc_mcontext.gregs[MYREG_RSP]
#else
# define MYREG_ESP 7
# define SAVEDSP uc_mcontext.gregs[MYREG_ESP]
#endif
106
/* The segment register that addresses thread-local storage differs between
 * the two supported architectures: gs on i386, fs on x86_64. - Gene
 */
#if defined(__i386__)
# define TLSSEGREG gs
#elif defined(__x86_64__)
# define TLSSEGREG fs
#endif
114
115 /* Offset computed (&x.pid - &x) for
116 * struct pthread x;
117 * as found in: glibc-2.5/nptl/descr.h
118 * It was 0x4c and 0x48 for pid and tid for i386.
119 * Roughly, the definition is:
120 *glibc-2.5/nptl/descr.h:
121 * struct pthread
122 * {
123 * union {
124 * tcbheader_t tcbheader;
125 * void *__padding[16];
126 * };
127 * list_t list;
128 * pid_t tid;
129 * pid_t pid;
130 * ...
131 * } __attribute ((aligned (TCB_ALIGNMENT)));
132 *
133 *glibc-2.5/nptl/sysdeps/pthread/list.h:
134 * typedef struct list_head
135 * {
136 * struct list_head *next;
137 * struct list_head *prev;
138 * } list_t;
139 *
140 * NOTE: glibc-2.10 changes the size of __padding from 16 to 24. --KAPIL
141 *
 * NOTE: glibc-2.10 further changes the size of tcbhead_t without updating the
 * size of __padding in struct pthread. We need to add an extra 512 bytes
 * to accommodate this. -- KAPIL
145 */
146 #if __GLIBC_PREREQ (2,12)
147 /* WHEN WE HAVE CONFIDENCE IN THIS VERSION, REMOVE ALL OTHER __GLIBC_PREREQ
148 * AND MAKE THIS THE ONLY VERSION. IT SHOULD BE BACKWARDS COMPATIBLE.
149 */
150 /* These function definitions should succeed independently of the glibc version.
151 * They use get_thread_area() to match (tid, pid) and find offset.
152 * In other code, on restart, that offset is used to set (tid,pid) to
153 * the latest tid and pid of the new thread, instead of the (tid,pid)
154 * of the original thread.
155 * SEE: "struct pthread" in glibc-2.XX/nptl/descr.h for 'struct pthread'.
156 */
/* Forward declarations needed by the offset-discovery functions below. */
static void *mtcp_get_tls_base_addr(void);
static int mtcp_get_tls_segreg(void);
/* The 'unused' attribute can be removed once this __GLIBC_PREREQ branch is
 * the only one left.
 */
static char *memsubarray (char *array, char *subarray, int len)
  __attribute__ ((unused));
static int TLS_TID_OFFSET(void);
164
/* Return the byte offset of the 'tid' field inside the calling thread's
 * 'struct pthread'.  The offset is discovered on the first call and cached
 * in a function-local static; later calls just return the cached value.
 *
 * Discovery: 'struct pthread' has the adjacent fields tid and pid, in that
 * order, so we build that pair from the current tid/pid and search for the
 * byte pattern starting at the address returned by mtcp_get_tls_base_addr().
 *
 * How that address is obtained (see mtcp_get_tls_base_addr()): the segment
 * register / 8 is the entry_number of the "thread area", a
 * 'struct user_desc'.  Filling in entry_number of an empty user_desc and
 * calling get_thread_area() fills in the rest; its base_addr field points at
 * the 'struct pthread' for this thread.
 *
 * Aborts (mtcp_abort) if the pattern is not found or if the offset found is
 * not a multiple of sizeof(int).  Only warns if the offset differs from the
 * value expected for this architecture.
 */
static int TLS_TID_OFFSET(void) {
  static int tid_offset = -1;
  struct { pid_t tid; pid_t pid; } wanted;
  void *descr;
  char *hit;
#ifdef __x86_64__
  const size_t usual_offset = 512+26*sizeof(void *);
#else
  const size_t usual_offset = 26*sizeof(void *);
#endif

  /* Already computed on an earlier call. */
  if (tid_offset != -1)
    return tid_offset;

  wanted.tid = mtcp_sys_kernel_gettid();
  wanted.pid = mtcp_sys_getpid();
  descr = mtcp_get_tls_base_addr();

  /* A false hit probably can't happen since a new 'struct pthread' is
   * zeroed out before tid and pid are added.
   */
  hit = memsubarray((char *)descr, (char *)&wanted, sizeof(wanted));
  if (hit == NULL) {
    mtcp_printf("MTCP: Couldn't find offsets of tid/pid in thread_area.\n");
    mtcp_abort();
  }
  tid_offset = hit - (char *)descr;

  if (tid_offset != usual_offset)
    mtcp_printf("MTCP: Warning: tid_offset = %d; different from expected.\n"
                " Continuing anyway. If this fails, please try again.\n",
                tid_offset);
  DPRINTF(("tid_offset: %d\n", tid_offset));

  if (tid_offset % sizeof(int) != 0) {
    mtcp_printf("MTCP: tid_offset is not divisible by sizeof(int).\n");
    mtcp_abort();
  }
  /* Should we do a double-check, and spawn a new thread and see
   * if its TID matches at this tid_offset?  This would give greater
   * confidence, but for the reasons above, it's probably not necessary.
   */
  return tid_offset;
}
/* Return the byte offset of the 'pid' field inside 'struct pthread'.
 *
 * pid directly follows tid in 'struct pthread', so the result is
 * TLS_TID_OFFSET() plus the distance from tid to pid in a {tid, pid} pair.
 * The value is computed on the first call and cached in a function-local
 * static.
 */
static int TLS_PID_OFFSET(void) {
  static int pid_offset = -1;
  if (pid_offset == -1) {
    struct {pid_t tid; pid_t pid;} tid_pid;
    /* Take the pointer difference first, entirely inside tid_pid, and only
     * then add the integer offset.  Adding tid_offset to the pointer before
     * subtracting would form an address far outside tid_pid, which is
     * undefined pointer arithmetic.
     */
    int pid_within_pair = (int)((char *)&(tid_pid.pid) - (char *)&tid_pid);
    pid_offset = TLS_TID_OFFSET() + pid_within_pair;
    DPRINTF(("pid_offset: %d\n", pid_offset));
  }
  return pid_offset;
}
226 #elif __GLIBC_PREREQ (2,11)
227 # ifdef __x86_64__
228 # define TLS_PID_OFFSET() \
229 (512+26*sizeof(void *)+sizeof(pid_t)) // offset of pid in pthread struct
230 # define TLS_TID_OFFSET() (512+26*sizeof(void *)) // offset of tid in pthread struct
231 # else
232 # define TLS_PID_OFFSET() \
233 (26*sizeof(void *)+sizeof(pid_t)) // offset of pid in pthread struct
234 # define TLS_TID_OFFSET() (26*sizeof(void *)) // offset of tid in pthread struct
235 # endif
236 #elif __GLIBC_PREREQ (2,10)
237 # define TLS_PID_OFFSET() \
238 (26*sizeof(void *)+sizeof(pid_t)) // offset of pid in pthread struct
239 # define TLS_TID_OFFSET() (26*sizeof(void *)) // offset of tid in pthread struct
240 #else
241 # define TLS_PID_OFFSET() \
242 (18*sizeof(void *)+sizeof(pid_t)) // offset of pid in pthread struct
243 # define TLS_TID_OFFSET() (18*sizeof(void *)) // offset of tid in pthread struct
244 #endif
245
/* GETTID() goes through the libc syscall() entry point; that call is
 * hijacked by DMTCP for PID/TID-Virtualization.
 */
#define GETTID() (int)syscall(SYS_gettid)

/* Semaphore used by mtcp_init to wait for the checkpoint thread it starts. */
sem_t sem_start;
250
251 typedef struct Thread Thread;
252
253 struct Thread { Thread *next; // next thread in 'threads' list
254 Thread **prev; // prev thread in 'threads' list
255 int tid; // this thread's id as returned by mtcp_sys_kernel_gettid ()
256 int original_tid; // this is the the thread's "original" tid
257 MtcpState state; // see ST_... below
258 Thread *parent; // parent thread (or NULL if top-level thread)
259 Thread *children; // one of this thread's child threads
260 Thread *siblings; // one of this thread's sibling threads
261
262 int clone_flags; // parameters to __clone that created this thread
263 int *parent_tidptr;
264 int *given_tidptr; // (this is what __clone caller passed in)
265 int *actual_tidptr; // (this is what we passed to the system call, either given_tidptr or &child_tid)
266 int child_tid; // this is used for child_tidptr if the original call did not
267 // ... have both CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID
268 int (*fn) (void *arg); // thread's initial function entrypoint and argument
269 void *arg;
270
271 sigset_t sigblockmask; // blocked signals
272 sigset_t sigpending; // pending signals
273
274 ///JA: new code ported from v54b
275 ucontext_t savctx; // context saved on suspend
276
277 mtcp_segreg_t fs, gs; // thread local storage pointers
278 pthread_t pth; // added for pthread_join
279 #if MTCP__SAVE_MANY_GDT_ENTRIES
280 struct user_desc gdtentrytls[GDT_ENTRY_TLS_ENTRIES];
281 #else
282 struct user_desc gdtentrytls[1];
283 #endif
284 };
285
/*
 * struct MtcpRestartThreadArg
 *
 * DMTCP requires the original_tids of the threads being created during
 * the RESTARTING phase.  The MtcpRestartThreadArg structure is used to pass
 * the original_tid of the thread being created from MTCP to DMTCP.
 *
 *   actual clone call: clone (fn, child_stack, flags, void *, ... )
 *   new clone call   : clone (fn, child_stack, flags,
 *                             (struct MtcpRestartThreadArg *), ...)
 *
 * DMTCP automatically extracts arg from this structure and passes that
 * to the _real_clone call.
 *
 * IMPORTANT NOTE: While updating, this structure must be kept in sync
 * with the structure defined with the same name in mtcpinterface.cpp
 */
struct MtcpRestartThreadArg {
  void *arg;            /* the argument DMTCP forwards to _real_clone */
  pid_t original_tid;   /* original tid of the thread being created */
};
306
/* Values held in Thread.state */

/* thread is running normally but with checkpointing disabled */
#define ST_RUNDISABLED 0
/* thread is running normally and has checkpointing enabled */
#define ST_RUNENABLED 1
/* thread is running normally with cp disabled, but checkpoint thread is
 * waiting for it to enable */
#define ST_SIGDISABLED 2
/* thread is running normally with cp enabled, and checkpoint thread has
 * signalled it to stop */
#define ST_SIGENABLED 3
/* thread context being saved (very brief) */
#define ST_SUSPINPROG 4
/* thread is suspended waiting for checkpoint to complete */
#define ST_SUSPENDED 5
/* thread is the checkpointing thread (special state just for that thread) */
#define ST_CKPNTHREAD 6
314
315 /* Global data */
316
317 void *mtcp_libc_dl_handle = NULL; // dlopen handle for whatever libc.so is loaded with application program
318 Area mtcp_libc_area; // some area of that libc.so
319
320 /* DMTCP Info Variables */
321
322 /* These are reset by dmtcphijack.so at initialization. */
323 int dmtcp_exists = 0; /* Are we running under DMTCP? */
324 int dmtcp_info_pid_virtualization_enabled = 0;
325 /* The following two DMTCP Info variables are defined in mtcp_printf.c */
326 //int dmtcp_info_stderr_fd = 2;
327 //int dmtcp_info_jassertlog_fd = -1;
328 int dmtcp_info_restore_working_directory = -1;
329
330 /* Static data */
331
332 static sigset_t sigpending_global; // pending signals for the process
333 static char const *nscd_mmap_str = "/var/run/nscd/"; // OpenSUSE
334 static char const *nscd_mmap_str2 = "/var/cache/nscd"; // Debian / Ubuntu
335 static char const *nscd_mmap_str3 = "/var/db/nscd"; // RedHat (Linux 2.6.9)
336 static char const *dev_zero_deleted_str = "/dev/zero (deleted)";
337 static char const *dev_null_deleted_str = "/dev/null (deleted)";
338 static char const *sys_v_shmem_file = "/SYSV";
339 //static char const *perm_checkpointfilename = NULL;
340 //static char const *temp_checkpointfilename = NULL;
341 static char perm_checkpointfilename[MAXPATHLEN];
342 static char temp_checkpointfilename[MAXPATHLEN];
343 static size_t checkpointsize;
344 static int intervalsecs;
345 static pid_t motherpid;
346 static size_t restore_size;
347 static int showtiming;
348 static int threadenabledefault;
349 static int verify_count; // number of checkpoints to go
350 static int verify_total; // value given by envar
351 static pid_t mtcp_ckpt_gzip_child_pid = -1;
352 static int volatile checkpointhreadstarting = 0;
353 static MtcpState restoreinprog = MTCP_STATE_INITIALIZER;
354 static MtcpState threadslocked = MTCP_STATE_INITIALIZER;
355 static pthread_t checkpointhreadid;
356 static struct timeval restorestarted;
357 static int DEBUG_RESTARTING = 0;
358 static Thread *motherofall = NULL;
359 static Thread *ckpthread = NULL;
360 static Thread *threads = NULL;
361 struct sigaction sigactions[NSIG]; // signal handlers
362 static VA restore_begin, restore_end;
363 static void *restore_start; /* will be bound to fnc, mtcp_restore_start */
364 static void *saved_sysinfo;
365 static void *saved_heap_start = NULL;
366 static char saved_working_directory[MTCP_MAX_PATH];
367 static void (*callback_sleep_between_ckpt)(int sec) = NULL;
368 static void (*callback_pre_ckpt)() = NULL;
369 static void (*callback_post_ckpt)(int is_restarting) = NULL;
370 static int (*callback_ckpt_fd)(int fd) = NULL;
371 static void (*callback_write_dmtcp_header)(int fd) = NULL;
372 static void (*callback_restore_virtual_pid_table)() = NULL;
373
/* Pointer to the clone routine used to create threads; it takes the same
 * seven arguments as the __clone wrapper defined in this file.
 */
static int (*clone_entry) (int (*fn) (void *arg), void *child_stack,
                           int flags, void *arg, int *parent_tidptr,
                           struct user_desc *newtls, int *child_tidptr);
381
382 /* temp stack used internally by restore so we don't go outside the
383 * libmtcp.so address range for anything;
384 * including "+ 1" since will set %esp/%rsp to tempstack+STACKSIZE
385 */
386 static long long tempstack[STACKSIZE + 1];
387
388 /* Internal routines */
389
390 static long set_tid_address (int *tidptr);
391
392 static char *memsubarray (char *array, char *subarray, int len)
393 __attribute__ ((unused));
394 static int mtcp_get_tls_segreg(void);
395 static void *mtcp_get_tls_base_addr(void);
396 static int threadcloned (void *threadv);
397 static void setupthread (Thread *thread);
398 static void setup_clone_entry (void);
399 static void threadisdead (Thread *thread);
400 static void *checkpointhread (void *dummy);
401 static int test_use_compression(void);
402 static int open_ckpt_to_write(int fd, int pipe_fds[2], char *gzip_path);
403 static void checkpointeverything (void);
404 static void writefiledescrs (int fd);
405 static void writememoryarea (int fd, Area *area,
406 int stack_was_seen, int vsyscall_exists);
407 static void writecs (int fd, char cs);
408 static void writefile (int fd, void const *buff, size_t size);
409 static void preprocess_special_segments(int *vsyscall_exists);
410 static void stopthisthread (int signum);
411 static void wait_for_all_restored (void);
412 static void save_sig_state (Thread *thisthread);
413 static void restore_sig_state (Thread *thisthread);
414 static void save_sig_handlers (void);
415 static void restore_sig_handlers (Thread *thisthread);
416 static void save_tls_state (Thread *thisthread);
417 static void renametempoverperm (void);
418 static Thread *getcurrenthread (void);
419 static void lock_threads (void);
420 static void unlk_threads (void);
421 static int readmapsline (int mapsfd, Area *area);
422 static void restore_heap(void);
423 static void finishrestore (void);
424 static int restarthread (void *threadv);
425 static void restore_tls_state (Thread *thisthread);
426 static void setup_sig_handler (void);
427 static void sync_shared_mem(void);
428
429 /* FIXME:
430 * dmtcp/src/syscallsreal.c has wrappers around signal, sigaction, sigprocmask
431 * The wrappers go to these mtcp_real_XXX versions so that MTCP can call
432 * the actual system calls and avoid the wrappers. But if that is still
433 * an issue, then we can create mtcp_sys_signal(), etc., for direct calls.
434 *
435 * Update:
436 * mtcp_real_XXX versions have been renamed to _real_XXX in DMTCP.
437 * sigprocmask should not be used in multi-threaded process, use
438 * pthread_sigmask instead.
439 */
440 int _real_sigaction(int signum, const struct sigaction *act,
441 struct sigaction *oldact){
442 if (dmtcp_exists) {
443 mtcp_printf("mtcp %s: This function mustn't be called when working under DMTCP\n",
444 __FUNCTION__);
445 mtcp_abort();
446 }
447 return sigaction(signum, act, oldact);
448 }
449
450
451 /********************************************************************************************************************************/
452 /* */
453 /* This routine must be called at startup time to initiate checkpointing */
454 /* */
455 /* Input: */
456 /* */
457 /* checkpointfilename = name to give the checkpoint file */
458 /* interval = interval, in seconds, to write the checkpoint file */
459 /* clonenabledefault = 0 : clone checkpointing blocked by default (call mtcp_ok in the thread to enable) */
460 /* 1 : clone checkpointing enabled by default (call mtcp_no in the thread to block if you want) */
461 /* */
462 /* envar MTCP_WRAPPER_LIBC_SO = what library to use for inner wrappers (default libc.??.so) */
463 /* envar MTCP_VERIFY_CHECKPOINT = every n checkpoints, verify by doing a restore to resume */
464 /* default is 0, ie, don't ever verify */
465 /* */
466 /********************************************************************************************************************************/
/* Weak-symbol hooks: an alternative to the DMTCP callbacks.  While MTCP is
 * immature, both mechanisms are allowed, in case the flexibility of a second
 * hook mechanism is useful in the future.  The mechanism is invisible unless
 * the end user compiles w/ -Wl,-export-dynamic.  The definitions here are
 * empty.
 */
__attribute__ ((weak)) void mtcpHookPreCheckpoint(void)
{
}

__attribute__ ((weak)) void mtcpHookPostCheckpoint(void)
{
}

__attribute__ ((weak)) void mtcpHookRestart(void)
{
}
477
478 /* Statically allocate this. Malloc is dangerous here if application is
479 * defining its own (possibly not thread-safe) malloc routine.
480 */
481 static Thread ckptThreadStorage;
482
483 void mtcp_init (char const *checkpointfilename, int interval, int clonenabledefault)
484 {
485 char *p, *tmp, *endp;
486 int len;
487 Thread *ckptThreadDescriptor = & ckptThreadStorage;
488 mtcp_segreg_t TLSSEGREG;
489 #ifdef PTRACE
490 init_thread_local();
491 #endif
492
493 if (sizeof(void *) != sizeof(long)) {
494 mtcp_printf("ERROR: sizeof(void *) != sizeof(long) on this architecture.\n"
495 " This code assumes they are equal.\n");
496 mtcp_abort ();
497 }
498
499 #ifndef __x86_64__
500 /* Nobody else has a right to preload on internal processes generated
501 * by mtcp_check_XXX() -- not even DMTCP, if it's currently operating.
502 *
503 * Saving LD_PRELOAD in a temp env var and restoring it later --Kapil.
504 *
505 * TODO: To insert some sort of error checking to make sure that we
506 * are correctly setting LD_PRELOAD after we are done with
507 * vdso check.
508 */
509
510 // Shouldn't this removal of LD_PRELOAD be around fork/exec of gzip ?
511 // setenv( "MTCP_TMP_LD_PRELOAD", getenv("LD_PRELOAD"), 1);
512 // unsetenv("LD_PRELOAD");
513 // Allow user program to run with randomize_va
514 // mtcp_check_vdso_enabled();
515 // setenv("LD_PRELOAD", getenv("MTCP_TMP_LD_PRELOAD"), 1);
516 // unsetenv("MTCP_TMP_LD_PRELOAD");
517 #endif
518
519 #if 0
520 { struct user_desc u_info;
521 u_info.entry_number = 12;
522 if (-1 == mtcp_sys_get_thread_area(&u_info) && mtcp_sys_errno == ENOSYS)
523 mtcp_printf(
524 "Apparently, get_thread_area is not implemented in your kernel.\n"
525 " If this doesn't work, please try on a more recent kernel,\n"
526 " or one configured to support get_thread_area.\n"
527 );
528 }
529 #endif
530
531 intervalsecs = interval;
532
533 if (strlen(mtcp_ckpt_newname) >= MAXPATHLEN) {
534 mtcp_printf("mtcp mtcp_init: new ckpt file name (%s) too long (>=512 bytes)\n",
535 mtcp_ckpt_newname);
536 mtcp_abort();
537 }
538 strncpy(perm_checkpointfilename,checkpointfilename,MAXPATHLEN); // this is what user wants the checkpoint file called
539 len = strlen (perm_checkpointfilename); // make up another name, same as that, with ".temp" on the end
540 memcpy(temp_checkpointfilename, perm_checkpointfilename, len);
541 strncpy(temp_checkpointfilename + len, ".temp",MAXPATHLEN-len);
542 // ... we use it to write to in case we crash while writing
543 // we will leave the previous good one intact
544
545 #ifdef PTRACE
546 /* TODO: USE flock WHEN WRITING TO THESE THREE FILES (NOT YET DONE FOR ptrace_setoptions_file? */
547 memset(ptrace_shared_file, '\0', MAXPATHLEN);
548 sprintf(ptrace_shared_file, "%s/ptrace_shared_file.txt", dir);
549 memset(ptrace_setoptions_file, '\0', MAXPATHLEN);
550 sprintf(ptrace_setoptions_file, "%s/ptrace_setoptions_file.txt", dir);
551 memset(checkpoint_threads_file, '\0', MAXPATHLEN);
552 sprintf(checkpoint_threads_file, "%s/checkpoint_threads_file.txt", dir);
553 #endif
554
555 DPRINTF (("mtcp_init*: main tid %d\n", mtcp_sys_kernel_gettid ()));
556 /* If MTCP_INIT_PAUSE set, sleep 15 seconds and allow for gdb attach. */
557 if (getenv("MTCP_INIT_PAUSE")) {
558 mtcp_printf("Pausing 15 seconds. Do: gdb attach %d\n", mtcp_sys_getpid());
559 sleep(15);
560 }
561
562 threadenabledefault = clonenabledefault; // save this away where it's easy to get
563
564 p = getenv ("MTCP_SHOWTIMING");
565 showtiming = ((p != NULL) && (*p & 1));
566
567 /* Maybe dump out some stuff about the TLS */
568
569 mtcp_dump_tls (__FILE__, __LINE__);
570
571 /* Save this process's pid. Then verify that the TLS has it where it should be. */
572 /* When we do a restore, we will have to modify each thread's TLS with the new motherpid. */
573 /* We also assume that GS uses the first GDT entry for its descriptor. */
574
575 motherpid = mtcp_sys_getpid (); /* libc/getpid can lie if we had
576 * used kernel fork() instead of libc fork().
577 */
578 {
579 pid_t tls_pid, tls_tid;
580 tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_PID_OFFSET());
581 tls_tid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_TID_OFFSET());
582
583 if ((tls_pid != motherpid) || (tls_tid != motherpid)) {
584 mtcp_printf ("mtcp_init: getpid %d, tls pid %d, tls tid %d, must all match\n",
585 motherpid, tls_pid, tls_tid);
586 mtcp_abort ();
587 }
588 }
589
590 /* Get verify envar */
591
592 tmp = getenv ("MTCP_VERIFY_CHECKPOINT");
593 verify_total = 0;
594 if (tmp != NULL) {
595 verify_total = strtol (tmp, &p, 0);
596 if ((*p != '\0') || (verify_total < 0)) {
597 mtcp_printf ("mtcp_init: bad MTCP_VERIFY_CHECKPOINT %s\n", tmp);
598 mtcp_abort ();
599 }
600 }
601
602 /* If the user has defined a signal, use that to suspend. Otherwise, use MTCP_DEFAULT_SIGNAL */
603
604 tmp = getenv("MTCP_SIGCKPT");
605 if (tmp == NULL)
606 STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
607 else
608 {
609 errno = 0;
610 STOPSIGNAL = strtol(tmp, &endp, 0);
611
612 if ((errno != 0) || (tmp == endp))
613 {
614 mtcp_printf("mtcp_init: Your chosen SIGCKPT of \"%s\" does not "
615 "translate to a number,\n"
616 " and cannot be used. Signal %d "
617 "will be used instead.\n", tmp, MTCP_DEFAULT_SIGNAL);
618 STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
619 }
620 else if (STOPSIGNAL < 1 || STOPSIGNAL > 31)
621 {
622 mtcp_printf("mtcp_init: Your chosen SIGCKPT of \"%d\" is not a valid "
623 "signal, and cannot be used.\n"
624 " Signal %d will be used instead.\n",
625 STOPSIGNAL, MTCP_DEFAULT_SIGNAL);
626 STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
627 }
628 }
629
630 /* Set up signal handler so we can interrupt the thread for checkpointing */
631 setup_sig_handler ();
632
633 /* Get size and address of the shareable - used to separate it from the rest of the stuff */
634 /* All routines needed to perform restore must be within this address range */
635
636 restore_begin = (((VA)mtcp_shareable_begin) & -MTCP_PAGE_SIZE);
637 restore_size = ((VA)mtcp_shareable_end - restore_begin + MTCP_PAGE_SIZE - 1) & -MTCP_PAGE_SIZE;
638 restore_end = restore_begin + restore_size;
639 restore_start = mtcp_restore_start;
640
641 /* Setup clone_entry to point to glibc's __clone routine */
642
643 setup_clone_entry ();
644
645 /* Set up caller as one of our threads so we can work on it */
646
647 memset (ckptThreadDescriptor, 0, sizeof *ckptThreadDescriptor);
648 setupthread (ckptThreadDescriptor);
649 ckptThreadDescriptor -> child_tid = mtcp_sys_kernel_gettid (); // need to set this up so the checkpointhread can see we haven't exited
650 set_tid_address (&(ckptThreadDescriptor -> child_tid)); // we are assuming mtcp_init has been called before application may have called set_tid_address
651 // ... or else we will end up overwriting that set_tid_address value
652 motherofall = ckptThreadDescriptor;
653
654 /* Spawn off a thread that will perform the checkpoints from time to time */
655
656 checkpointhreadstarting = 1;
657 /* If we return from a fork(), we don't know what is the semaphore value. */
658 errno = 0;
659 while (sem_trywait(&sem_start) == -1 && (errno == EAGAIN || errno == EINTR)) {
660 if ( errno == EAGAIN )
661 sem_post(&sem_start);
662 errno = 0;
663 }
664 if (errno != 0)
665 perror("ERROR: continue anyway from " __FILE__ ":mtcp_init:sem_trywait()");
666 /* Now we successfully locked it. The sempaphore value is zero. */
667 if (pthread_create (&checkpointhreadid, NULL, checkpointhread, NULL) < 0) {
668 mtcp_printf ("mtcp_init: error creating checkpoint thread: %s\n", strerror (errno));
669 mtcp_abort ();
670 }
671 if (checkpointhreadstarting) mtcp_abort (); // make sure the clone wrapper executed (ie, not just the standard clone)
672 /* Stop until checkpoint thread has finished initializing.
673 * Some programs (like gcl) implement their own glibc functions in
674 * a non-thread-safe manner. In case we're using non-thread-safe glibc,
675 * don't run the checkpoint thread and user thread at the same time.
676 */
677 errno = 0;
678 while (-1 == sem_wait(&sem_start) && errno == EINTR)
679 errno = 0;
680 /* The child thread checkpointhread will now wake us. */
681 }
682
683 /********************************************************************************************************************************
684 *
685 * The routine mtcp_set_callbacks below may be called BEFORE the first
686 * MTCP checkpoint, to add special functionality to checkpointing
687 *
688 * Its arguments (callback functions) are:
689 *
690 * sleep_between_ckpt: Called in between checkpoints to replace the default "sleep(sec)" functionality,
691 * when this function returns checkpoint will start
692 * pre_ckpt: Called after all user threads are suspended, but BEFORE checkpoint written
693 * post_ckpt: Called after checkpoint, and after restore. is_restarting will be 1 for restore 0 for after checkpoint
694 * ckpt_fd: Called to test if mtcp should checkpoint a given FD returns 1 if it should
695 *
696 *******************************************************************************************************************************/
697
698 void mtcp_set_callbacks(void (*sleep_between_ckpt)(int sec),
699 void (*pre_ckpt)(),
700 void (*post_ckpt)(int is_restarting),
701 int (*ckpt_fd)(int fd),
702 void (*write_dmtcp_header)(int fd),
703 void (*restore_virtual_pid_table)())
704 {
705 callback_sleep_between_ckpt = sleep_between_ckpt;
706 callback_pre_ckpt = pre_ckpt;
707 callback_post_ckpt = post_ckpt;
708 callback_ckpt_fd = ckpt_fd;
709 callback_write_dmtcp_header = write_dmtcp_header;
710 callback_restore_virtual_pid_table = restore_virtual_pid_table;
711 }
712
/*************************************************************************/
/*                                                                       */
/*  Dump out the TLS stuff pointed to by %gs                             */
/*                                                                       */
/*  The whole body is compiled out, so as built this function does       */
/*  nothing with 'file' and 'line' and simply returns.                   */
/*                                                                       */
/*************************************************************************/

void mtcp_dump_tls (char const *file, int line)
{
#if 0
  int i, j, mypid;
  sigset_t blockall, oldsigmask;
  struct user_desc gdtentry;
  unsigned char byt;
  unsigned short gs;

  static int mutex = 0;

  /* Block all signals whilst we have the futex */
  memset (&blockall, -1, sizeof blockall);
  if (sigprocmask (SIG_SETMASK, &blockall, &oldsigmask) < 0) {
    abort ();
  }

  /* Block other threads from doing this so the output doesn't mix */
  while (!atomic_setif_int (&mutex, 1, 0)) {
    mtcp_sys_futex (&mutex, FUTEX_WAIT, 1, NULL, NULL, 0);
  }

  /* Get the segment for the TLS stuff */
  asm volatile ("movw %%gs,%0" : "=g" (gs));
  mtcp_printf("mtcp_init: gs=%X at %s:%d\n", gs, file, line);
  if (gs != 0) {

    /* We only handle GDT based stuff */
    if (gs & 4) mtcp_printf(" *** part of LDT\n");

    /* It's in the GDT */
    else {

      /* Read the TLS descriptor */
      gdtentry.entry_number = gs / 8;
      i = mtcp_sys_get_thread_area (&gdtentry);
      if (i < 0) mtcp_printf(" error getting GDT entry %d: %d\n", gdtentry.entry_number, mtcp_sys_errno);
      else {

        /* Print out descriptor and first 80 bytes of data */
        mtcp_printf(" limit %X, baseaddr %X\n", gdtentry.limit, gdtentry.base_addr);
        for (i = 0; i < 80; i += 16) {
          for (j = 16; -- j >= 0;) {
            if ((j & 3) == 3) fputc (' ', stderr);
            asm volatile ("movb %%gs:(%1),%0" : "=r" (byt) : "r" (i + j));
            mtcp_printf("%2.2X", byt);
          }
          mtcp_printf(" : gs+%2.2X\n", i);
        }
        for (i = 0; i < 80; i += 16) {
          for (j = 16; -- j >= 0;) {
            if ((j & 3) == 3) fputc (' ', stderr);
            byt = ((unsigned char *)gdtentry.base_addr)[i+j];
            mtcp_printf("%2.2X", byt);
          }
          mtcp_printf(" : %8.8X\n", gdtentry.base_addr + i);
        }

        /* Offset 4C should be the process id */
        asm volatile ("mov %%gs:0x4C,%0" : "=r" (i));
        mtcp_printf("mtcp_init: getpid=%d, gettid=%d, tls=%d\n", getpid (), mtcp_sys_kernel_gettid (), i);
      }
    }
  }

  /* Release mutex and restore signal delivery */
  mutex = 0;
  mtcp_sys_futex (&mutex, FUTEX_WAKE, 1, NULL, NULL, 0);
  if (_real_sigprocmask (SIG_SETMASK, &oldsigmask, NULL) < 0) {
    abort ();
  }
#endif
}
801
802 /*****************************************************************************/
803 /* */
804 /* This is our clone system call wrapper */
805 /* */
806 /* Note: */
807 /* */
808 /* pthread_create eventually calls __clone to create threads */
809 /* It uses flags = 0x3D0F00: */
810 /* CLONE_VM = VM shared between processes */
811 /* CLONE_FS = fs info shared between processes (root, cwd, umask) */
812 /* CLONE_FILES = open files shared between processes (fd table) */
813 /* CLONE_SIGHAND = signal handlers and blocked signals shared */
814 /* (sigaction common to parent and child) */
815 /* CLONE_THREAD = add to same thread group */
816 /* CLONE_SYSVSEM = share system V SEM_UNDO semantics */
817 /* CLONE_SETTLS = create a new TLS for the child from newtls parameter*/
818 /* CLONE_PARENT_SETTID = set the TID in the parent (before MM copy) */
819 /* CLONE_CHILD_CLEARTID = clear the TID in the child and do */
820 /* futex wake at that address */
821 /* CLONE_DETACHED = create clone detached */
822 /* */
823 /*****************************************************************************/
824
825 int __clone (int (*fn) (void *arg), void *child_stack, int flags, void *arg,
826 int *parent_tidptr, struct user_desc *newtls, int *child_tidptr)
827 {
828 int rc;
829 Thread *thread;
830 #ifdef PTRACE
831 int i;
832 #endif
833
834 /* Maybe they decided not to call mtcp_init */
835 if (motherofall != NULL) {
836
837 /* They called mtcp_init meaning we are to do checkpointing.
838 * So we are going to track this thread.
839 */
840
841 thread = malloc (sizeof *thread);
842 memset (thread, 0, sizeof *thread);
843 thread -> fn = fn; // this is the user's function
844 thread -> arg = arg; // ... and the parameter
845 thread -> parent = getcurrenthread ();
846 if (checkpointhreadstarting) {
847 checkpointhreadstarting = 0;
848 mtcp_state_init(&thread->state, ST_CKPNTHREAD);
849 } else {
850 mtcp_state_init(&thread->state, ST_RUNDISABLED);
851 }
852
853 DPRINTF (("mtcp wrapper clone*: calling clone thread=%p,"
854 " fn=%p, flags=0x%X\n", thread, fn, flags));
855 DPRINTF (("mtcp wrapper clone*: parent_tidptr=%p, newtls=%p,"
856 " child_tidptr=%p\n", parent_tidptr, newtls, child_tidptr));
857 //asm volatile ("int3");
858
859 /* Save exactly what the caller is supplying */
860
861 thread -> clone_flags = flags;
862 thread -> parent_tidptr = parent_tidptr;
863 thread -> given_tidptr = child_tidptr;
864
865 /* We need the CLEARTID feature so we can detect */
866 /* when the thread has exited */
867 /* So if the caller doesn't want it, we enable it */
868 /* Retain what the caller originally gave us so we can pass the tid back */
869
870 if (!(flags & CLONE_CHILD_CLEARTID)) {
871 child_tidptr = &(thread -> child_tid);
872 }
873 thread -> actual_tidptr = child_tidptr;
874 DPRINTF (("mtcp wrapper clone*: thread %p -> actual_tidptr %p\n",
875 thread, thread -> actual_tidptr));
876
877 /* Alter call parameters, forcing CLEARTID and make it call the wrapper routine */
878
879 flags |= CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID;
880 fn = threadcloned;
881 arg = thread;
882 }
883
884 /* mtcp_init not called, no checkpointing, but make sure clone_entry is */
885 /* set up so we can call the real clone */
886
887 else if (clone_entry == NULL) setup_clone_entry ();
888
889 /* Now create the thread */
890
891 DPRINTF (("mtcp wrapper clone*: clone fn=%p, child_stack=%p, flags=%X, arg=%p\n", fn, child_stack, flags, arg));
892 DPRINTF (("mtcp wrapper clone*: parent_tidptr=%p, newtls=%p, child_tidptr=%p\n", parent_tidptr, newtls, child_tidptr));
893 rc = (*clone_entry) (fn, child_stack, flags, arg, parent_tidptr, newtls, child_tidptr);
894 if (rc < 0) {
895 DPRINTF (("mtcp wrapper clone*: clone rc=%d, errno=%d\n", rc, errno));
896 } else {
897 DPRINTF (("mtcp wrapper clone*: clone rc=%d\n", rc));
898 }
899
900 #ifdef PTRACE
901 /*************************************************************************/
902 /* Code added to keep record of new tasks and processes in a file */
903 /*************************************************************************/
904
905 // initialize the ptrace_tid_pairs array
906 if (!init_ptrace_pairs) {
907 for (i = 0; i < MAX_PTRACE_PAIRS_COUNT; i++) {
908 ptrace_pairs[i].last_command = PTRACE_UNSPECIFIED_COMMAND;
909 ptrace_pairs[i].singlestep_waited_on = FALSE;
910 ptrace_pairs[i].free = TRUE;
911 ptrace_pairs[i].inferior_st = 'u'; // undefined
912 }
913 init_ptrace_pairs = 1;
914 }
915
916 // initialize the semaphore used when motherofall reads the ptrace shared file
917 if (!init_ptrace_read_pairs_sem) {
918 sem_init(&ptrace_read_pairs_sem, 0, 0);
919 init_ptrace_read_pairs_sem = 1;
920 }
921
922 if (!init__sem) {
923 sem_init(&__sem, 0, 1);
924 init__sem = 1;
925 }
926
927 if (is_ptrace_setoptions == TRUE) writeptraceinfo (setoptions_superior, rc);
928 else {
929 // read from file
930 int setoptions_fd = -1;
931 pid_t inferior;
932 pid_t superior;
933
934 setoptions_fd = open(ptrace_setoptions_file, O_RDONLY);
935
936 if (setoptions_fd != -1) {
937 while (readall(setoptions_fd, &superior, sizeof(pid_t)) > 0) {
938 readall(setoptions_fd, &inferior, sizeof(pid_t));
939 if (inferior == GETTID()) {
940 setoptions_superior = superior;
941 is_ptrace_setoptions = TRUE;
942 writeptraceinfo (setoptions_superior, rc);
943 }
944 }
945 if ( close(setoptions_fd) != 0 ) {
946 mtcp_printf("__clone: Error closing file: %s\n",
947 strerror(errno));
948 mtcp_abort();
949 }
950 }
951 }
952 /* the structure of checkpoint_threads_file is pairs of pid and tid */
953 write_info_to_file (2, getpid(), rc);
954 /*************************************************************************/
955 /* Done recording new tasks and processes. */
956 /*************************************************************************/
957 #endif
958
959 return (rc);
960 }
961
962 void fill_in_pthread (pid_t tid, pthread_t pth) {
963 struct Thread *thread;
964 for (thread = threads; thread != NULL; thread = thread -> next) {
965 if (thread -> tid == tid) {
966 thread -> pth = pth;
967 break;
968 }
969 }
970 }
971
972 void delete_thread_on_pthread_join (pthread_t pth) {
973 struct Thread *thread;
974 for (thread = threads; thread != NULL; thread = thread -> next) {
975 if (thread -> pth == pth) {
976 threadisdead (thread);
977 break;
978 }
979 }
980 }
981
/* Export `clone` as a global function symbol that is an alias of `__clone`,
 * so references to either name resolve to the wrapper defined in this file.
 */
asm (".global clone ; .type clone,@function ; clone = __clone");
983
984 /*****************************************************************************/
985 /* */
986 /* This routine is called (via clone) as the top-level routine of a thread */
987 /* that we are tracking. */
988 /* */
989 /* It fills in remaining items of our thread struct, calls the user function,*/
990 /* then cleans up the thread struct before exiting. */
991 /* */
992 /*****************************************************************************/
993
994 static int threadcloned (void *threadv)
995
996 {
997 int rc;
998 Thread *const thread = threadv;
999
1000 DPRINTF (("mtcp threadcloned*: starting thread %p\n", thread));
1001
1002 setupthread (thread);
1003
1004 /* The new TLS should have the process ID in place at TLS_PID_OFFSET() */
1005 /* This is a verification step and is therefore optional as such */
1006 {
1007 pid_t tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_PID_OFFSET());
1008 if ((tls_pid != motherpid) && (tls_pid != (pid_t)-1)) {
1009 mtcp_printf ("mtcp threadcloned: getpid %d, tls pid %d at offset %d, must match\n",
1010 motherpid, tls_pid, TLS_PID_OFFSET());
1011 mtcp_printf (" %X\n", motherpid);
1012 for (rc = 0; rc < 256; rc += 4) {
1013 tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + rc);
1014 mtcp_printf (" %d: %X", rc, tls_pid);
1015 if ((rc & 31) == 28) mtcp_printf ("\n");
1016 }
1017 mtcp_abort ();
1018 }
1019 }
1020
1021 /* If the caller wants the child tid but didn't have CLEARTID, pass the tid back to it */
1022
1023 if ((thread -> clone_flags & (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) == CLONE_CHILD_SETTID) {
1024 *(thread -> given_tidptr) = thread -> child_tid;
1025 }
1026
1027 /* Maybe enable checkpointing by default */
1028
1029 if (threadenabledefault) mtcp_ok ();
1030
1031 #ifdef PTRACE
|
Event implicit_func_decl: |
function "init_thread_local" declared implicitly |
|
Event caretline: |
^ |
1032 init_thread_local();
1033 #endif
1034
1035 /* Call the user's function for whatever processing they want done */
1036
1037 DPRINTF (("mtcp threadcloned*: calling %p (%p)\n", thread -> fn, thread -> arg));
1038 rc = (*(thread -> fn)) (thread -> arg);
1039 DPRINTF (("mtcp threadcloned*: returned %d\n", rc));
1040
1041 /* Make sure checkpointing is inhibited while we clean up and exit */
1042 /* Otherwise, checkpointer might wait forever for us to re-enable */
1043
1044 mtcp_no ();
1045
1046 /* Do whatever to unlink and free thread block */
1047
1048 threadisdead (thread);
1049
1050 /* Return the user's status as the exit code */
1051
1052 return (rc);
1053 }
1054
1055 /*****************************************************************************/
1056 /* */
1057 /* set_tid_address wrapper routine */
1058 /* */
1059 /* We save the new address of the tidptr that will get cleared when the */
1060 /* thread exits */
1061 /* */
1062 /*****************************************************************************/
1063
1064 static long set_tid_address (int *tidptr)
1065
1066 {
1067 long rc;
1068 Thread *thread;
1069
1070 thread = getcurrenthread ();
1071 DPRINTF (("set_tid_address wrapper*: thread %p -> tid %d, tidptr %p\n",
1072 thread, thread -> tid, tidptr));
1073 thread -> actual_tidptr = tidptr; // save new tidptr so subsequent restore will create with new pointer
1074 rc = mtcp_sys_set_tid_address(tidptr);
1075 return (rc); // now we tell kernel to change it for the current thread
1076 }
1077
1078 /*****************************************************************************/
1079 /* */
1080 /* Link thread struct to the lists and finish filling it in */
1081 /* */
1082 /* Input: */
1083 /* */
1084 /* thread = thread to set up */
1085 /* */
1086 /* Output: */
1087 /* */
1088 /* thread linked to 'threads' list and 'motherofall' tree */
1089 /* thread -> tid = filled in with thread id */
/* thread -> original_tid = filled in with GETTID ()                         */
/* (thread -> state is not modified by this routine and no signal handler    */
/* is installed here; the caller is responsible for the thread's state)      */
1093 /* */
1094 /*****************************************************************************/
1095
1096 static void setupthread (Thread *thread)
1097
1098 {
1099 Thread *parent;
1100
1101 /* Save the thread's ID number and put in threads list so we can look it up */
1102 /* Set state to disable checkpointing so checkpointer won't race between adding to list and setting up handler */
1103
1104 thread -> tid = mtcp_sys_kernel_gettid ();
1105 thread -> original_tid = GETTID ();
1106
1107 DPRINTF (("mtcp setupthread*: thread %p -> tid %d\n", thread, thread->tid));
1108
1109 lock_threads ();
1110
1111 if ((thread -> next = threads) != NULL) {
1112 thread -> next -> prev = &(thread -> next);
1113 }
1114 thread -> prev = &threads;
1115 threads = thread;
1116
1117 parent = thread -> parent;
1118 if (parent != NULL) {
1119 thread -> siblings = parent -> children;
1120 parent -> children = thread;
1121 }
1122
1123 unlk_threads ();
1124 }
1125
1126 /*****************************************************************************/
1127 /* */
1128 /* Set up 'clone_entry' variable */
1129 /* */
1130 /* Output: */
1131 /* */
1132 /* clone_entry = points to clone routine within libc.so */
1133 /* */
1134 /*****************************************************************************/
1135
1136 static void setup_clone_entry (void)
1137
1138 {
1139 char *p, *tmp;
1140 int mapsfd;
1141
1142 /* Get name of whatever concoction we have for a libc shareable image */
1143 /* This is used by the wrapper routines */
1144
1145 tmp = getenv ("MTCP_WRAPPER_LIBC_SO");
1146 if (tmp != NULL) {
1147 if (strlen(tmp) >= sizeof(mtcp_libc_area.name)) {
1148 mtcp_printf("mtcp setup_clone_entry: libc area name (%s) too long (>=1024 chars)\n",
1149 tmp);
1150 mtcp_abort();
1151 }
1152 strncpy (mtcp_libc_area.name, tmp, sizeof mtcp_libc_area.name);
1153 } else {
1154 mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
1155 if (mapsfd < 0) {
1156 mtcp_printf ("mtcp_init: error opening /proc/self/maps: %s\n", strerror (mtcp_sys_errno));
1157 mtcp_abort ();
1158 }
1159 p = NULL;
1160 while (readmapsline (mapsfd, &mtcp_libc_area)) {
1161 p = strstr (mtcp_libc_area.name, "/libc");
1162 if ((p != NULL) && ((p[5] == '-') || (p[5] == '.'))) break;
1163 }
1164 close (mapsfd);
1165 if (p == NULL) {
1166 mtcp_printf ("mtcp_init: cannot find */libc[-.]* in /proc/self/maps\n");
1167 mtcp_abort ();
1168 }
1169 }
1170 mtcp_libc_dl_handle = dlopen (mtcp_libc_area.name, RTLD_LAZY | RTLD_GLOBAL);
1171 if (mtcp_libc_dl_handle == NULL) {
1172 mtcp_printf ("mtcp_init: error opening libc shareable %s: %s\n", mtcp_libc_area.name, dlerror ());
1173 mtcp_abort ();
1174 }
1175
1176 /* Find the clone routine therein */
1177
1178 clone_entry = mtcp_get_libc_symbol ("__clone");
1179 }
1180
1181 /********************************************************************************************************************************/
1182 /* */
1183 /* Thread has exited - unlink it from lists and free struct */
1184 /* */
1185 /* Input: */
1186 /* */
1187 /* thread = thread that has exited */
1188 /* */
1189 /* Output: */
1190 /* */
1191 /* thread removed from 'threads' list and motherofall tree */
1192 /* thread pointer no longer valid */
1193 /* checkpointer woken if waiting for this thread */
1194 /* */
1195 /********************************************************************************************************************************/
1196
1197 static void threadisdead (Thread *thread)
1198
1199 {
1200 Thread **lthread, *parent, *xthread;
1201
1202 lock_threads ();
1203
1204 DPRINTF (("mtcp threadisdead*: thread %p -> tid %d\n", thread, thread -> tid));
1205
1206 /* Remove thread block from 'threads' list */
1207
1208 if ((*(thread -> prev) = thread -> next) != NULL) {
1209 thread -> next -> prev = thread -> prev;
1210 }
1211
1212 /* Remove thread block from parent's list of children */
1213
1214 parent = thread -> parent;
1215 if (parent != NULL) {
1216 for (lthread = &(parent -> children); (xthread = *lthread) != thread; lthread = &(xthread -> siblings)) {}
1217 *lthread = xthread -> siblings;
1218 }
1219
1220 /* If this thread has children, give them to its parent */
1221
1222 if (parent != NULL) {
1223 while ((xthread = thread -> children) != NULL) {
1224 thread -> children = xthread -> siblings;
1225 xthread -> siblings = parent -> children;
1226 parent -> children = xthread;
1227 }
1228 } else {
1229 while ((xthread = thread -> children) != NULL) {
1230 thread -> children = xthread -> siblings;
1231 xthread -> siblings = motherofall;
1232 motherofall = xthread;
1233 }
1234 }
1235
1236 unlk_threads ();
1237
1238 /* If checkpointer is waiting for us, wake it to see this thread no longer in list */
1239
1240 mtcp_state_futex (&(thread -> state), FUTEX_WAKE, 1, NULL);
1241
1242 mtcp_state_destroy( &(thread -> state) );
1243
1244 free (thread);
1245 }
1246
1247 void *mtcp_get_libc_symbol (char const *name)
1248
1249 {
1250 void *temp;
1251
1252 temp = dlsym (mtcp_libc_dl_handle, name);
1253 if (temp == NULL) {
1254 mtcp_printf ("mtcp_get_libc_symbol: error getting %s from %s: %s\n",
1255 name, mtcp_libc_area.name, dlerror ());
1256 mtcp_abort ();
1257 }
1258 return (temp);
1259 }
1260
1261 /********************************************************************************************************************************/
1262 /* */
1263 /* Call this when it's OK to checkpoint */
1264 /* */
1265 /********************************************************************************************************************************/
1266
1267 int mtcp_ok (void)
1268
1269 {
1270 Thread *thread;
1271
1272 if (getenv("MTCP_NO_CHECKPOINT"))
1273 return 0;
1274 thread = getcurrenthread ();
1275
1276 again:
1277 switch (mtcp_state_value(&thread -> state)) {
1278
1279 /* Thread was running normally with checkpointing disabled. Enable checkpointing then just return. */
1280
1281 case ST_RUNDISABLED: {
1282 if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_RUNDISABLED)) goto again;
1283 return (0);
1284 }
1285
1286 /* Thread was running normally with checkpointing already enabled. So just return as is. */
1287
1288 case ST_RUNENABLED: {
1289 return (1);
1290 }
1291
1292 /* Thread was running with checkpointing disabled, but the checkpointhread wants to write a checkpoint. So mark the thread */
1293 /* as having checkpointing enabled, then just 'manually' call the signal handler as if the signal to suspend were just sent. */
1294
1295 case ST_SIGDISABLED: {
1296 if (!mtcp_state_set (&(thread -> state), ST_SIGENABLED, ST_SIGDISABLED)) goto again;
1297 stopthisthread (0);
1298 return (0);
1299 }
1300
1301 /* Thread is running with checkpointing enabled, but the checkpointhread wants to write a checkpoint and has sent a signal */
1302 /* telling the thread to call 'stopthisthread'. So we'll just keep going as is until the signal is actually delivered. */
1303
1304 case ST_SIGENABLED: {
1305 return (1);
1306 }
1307
1308 /* Thread is the checkpointhread so we just ignore the call (from threadcloned routine). */
1309
1310 case ST_CKPNTHREAD: {
1311 return (-1);
1312 }
1313
1314 /* How'd we get here? */
1315
1316 default: {
1317 mtcp_abort ();
1318 return (0); /* NOTREACHED : stop compiler warning */
1319 }
1320 }
1321 }
1322
1323 /* Likewise, disable checkpointing */
1324
1325 int mtcp_no (void)
1326 {
1327 Thread *thread;
1328
1329 if (getenv("MTCP_NO_CHECKPOINT"))
1330 return 0;
1331 thread = getcurrenthread ();
1332
1333 again:
1334 switch (mtcp_state_value(&thread -> state)) {
1335 case ST_RUNDISABLED: {
1336 return (0);
1337 }
1338
1339 case ST_RUNENABLED: {
1340 if (!mtcp_state_set (&(thread -> state), ST_RUNDISABLED, ST_RUNENABLED)) goto again;
1341 return (1);
1342 }
1343
1344 case ST_SIGDISABLED: {
1345 return (0);
1346 }
1347
1348 case ST_SIGENABLED: {
1349 stopthisthread (0);
1350 goto again;
1351 }
1352
1353 default: {
1354 mtcp_abort ();
1355 return (0); /* NOTREACHED : stop compiler warning */
1356 }
1357 }
1358 }
1359
1360 /* This is used by ../dmtcp/src/mtcpinterface.cpp */
1361 void mtcp_kill_ckpthread (void)
1362 {
1363 Thread *thread;
1364
1365 lock_threads ();
1366 for (thread = threads; thread != NULL; thread = thread -> next) {
1367 if ( mtcp_state_value(&thread -> state) == ST_CKPNTHREAD ) {
1368 unlk_threads ();
1369 DPRINTF(("mtcp_kill_ckpthread: Kill checkpinthread, tid=%d\n",thread->tid));
1370 mtcp_sys_kernel_tkill(thread -> tid, STOPSIGNAL);
1371 return;
1372 }
1373 }
1374 unlk_threads ();
1375 }
1376
1377
1378 /*************************************************************************/
1379 /* */
1380 /* Save and restore terminal settings. */
1381 /* */
1382 /*************************************************************************/
1383
/* Terminal state captured by save_term_settings():
 *   saved_termios_exists - nonzero when saved_termios holds valid attributes
 *   saved_termios        - attributes of stdin at save time
 *   win                  - window size of stdin at save time
 */
static int saved_termios_exists = 0;
static struct termios saved_termios;
static struct winsize win;

/*
 * Capture the terminal attributes of stdin when stdin is a tty and they can
 * be read; in that case also query the window size into 'win'.
 */
static void save_term_settings() {
  int have_attrs = ( isatty(STDIN_FILENO)
                     && tcgetattr(STDIN_FILENO, &saved_termios) >= 0 );

  saved_termios_exists = have_attrs;
  if (have_attrs) {
    ioctl (STDIN_FILENO, TIOCGWINSZ, (char *) &win);
  }
}
/*
 * Apply *termios_p to fd and keep re-applying it until the attributes read
 * back stop changing.
 *
 *   fd               - terminal file descriptor
 *   optional_actions - when the change takes effect (TCSANOW, TCSADRAIN,
 *                      TCSAFLUSH); passed through to tcsetattr()
 *   termios_p        - attributes to apply
 *
 * Returns 0 on success, -1 as soon as tcsetattr() or tcgetattr() fails
 * (errno is left as set by the failing call).
 */
int safe_tcsetattr(int fd, int optional_actions,
                   const struct termios *termios_p) {
  struct termios old_termios, new_termios;
  /* We will compare old and new, and we don't want uninitialized data */
  memset(&new_termios, 0, sizeof(new_termios));
  /* tcsetattr returns success as long as at least one of requested
   * changes was executed.  So, repeat until no more changes.
   */
  do {
    memcpy(&old_termios, &new_termios, sizeof(new_termios));
    if (tcsetattr(fd, optional_actions, termios_p) == -1) return -1;
    if (tcgetattr(fd, &new_termios) == -1) return -1;
  } while (memcmp(&new_termios, &old_termios, sizeof(new_termios)) != 0);
  return 0;
}
1409 static void restore_term_settings() {
1410 if (saved_termios_exists){
1411 /* First check if we are in foreground. If not, skip this and print
1412 * warning. If we try to call tcsetattr in background, we will hang up.
1413 */
1414 int foreground = (tcgetpgrp(STDIN_FILENO) == getpgrp());
1415 DPRINTF(("restore terminal attributes, check foreground status first: %d\n",
1416 foreground));
1417 if (foreground) {
1418 if ( ( ! isatty(STDIN_FILENO)
1419 || safe_tcsetattr(STDIN_FILENO, TCSANOW, &saved_termios) == -1) )
1420 DPRINTF(("WARNING: mtcp finishrestore*: failed to restore terminal\n"));
1421 else {
1422 struct winsize cur_win;
1423 DPRINTF(("mtcp finishrestore*: restored terminal\n"));
1424 ioctl (STDIN_FILENO, TIOCGWINSZ, (char *) &cur_win);
1425 /* ws_row/ws_col was probably not 0/0 prior to checkpoint. We change
1426 * it back to last known row/col prior to checkpoint, and then send a
1427 * SIGWINCH (see below) to notify process that window might have changed
1428 */
1429 if (cur_win.ws_row == 0 && cur_win.ws_col == 0)
1430 ioctl (STDIN_FILENO, TIOCSWINSZ, (char *) &win);
1431 }
1432 } else {
1433 DPRINTF(("WARNING: mtcp finishrestore*: skip restore terminal step\n"
1434 " -- we are in BACKGROUND\n"));
1435 }
1436 }
1437 if (kill(getpid(), SIGWINCH) == -1) {} /* No remedy if error */
1438 }
1439
1440
1441 /*************************************************************************/
1442 /* */
1443 /* This executes as a thread. It sleeps for the checkpoint interval */
1444 /* seconds, then wakes to write the checkpoint file. */
1445 /* */
1446 /*************************************************************************/
1447
1448 static void *checkpointhread (void *dummy)
1449 {
1450 int needrescan;
1451 struct timespec sleeperiod;
1452 struct timeval started, stopped;
1453 Thread *thread;
1454 char * dmtcp_checkpoint_filename = NULL;
1455
1456 /* This is the start function of the checkpoint thread.
1457 * We also call getcontext to get a snapshot of this call frame,
1458 * since we will never exit this call frame. We always return
1459 * to this call frame at time of startup, on restart. Hence, restart
1460 * will forget any modifications to our local variables since restart.
1461 */
1462 static int originalstartup = 1;
1463
1464 #ifdef PTRACE
1465 init_thread_local();
1466 check_size_for_ptrace_file (ptrace_shared_file);
1467 check_size_for_ptrace_file (ptrace_setoptions_file);
1468 check_size_for_ptrace_file (checkpoint_threads_file);
1469 #endif
1470
1471 /* We put a timeout in case the thread being waited for exits whilst we are waiting */
1472
1473 static struct timespec const enabletimeout = { 10, 0 };
1474
1475 DPRINTF (("mtcp checkpointhread*: %d started\n", mtcp_sys_kernel_gettid ()));
1476
1477 /* Set up our restart point, ie, we get jumped to here after a restore */
1478
1479 ckpthread = getcurrenthread ();
1480
1481 save_sig_state( ckpthread );
1482 save_tls_state (ckpthread);
1483 /* Release user thread after we've initialized. */
1484 sem_post(&sem_start);
1485 if (getcontext (&(ckpthread -> savctx)) < 0) mtcp_abort ();
1486
1487 DPRINTF (("mtcp checkpointhread*: after getcontext. current_tid %d, original_tid:%d\n",
1488 mtcp_sys_kernel_gettid(), ckpthread->original_tid));
1489 if (originalstartup)
1490 originalstartup = 0;
1491 else {
1492
1493 /* We are being restored. Wait for all other threads to finish being restored before resuming checkpointing. */
1494
1495 DPRINTF (("mtcp checkpointhread*: waiting for other threads after restore\n"));
1496 wait_for_all_restored ();
1497 #ifdef PTRACE
1498 create_file (GETTID());
1499 #endif
1500 DPRINTF (("mtcp checkpointhread*: resuming after restore\n"));
1501 }
1502
/* Reset the verification counter - on init, this will set it to its start value. */
1504 /* After a verification, it will reset it to its start value. After a normal */
1505 /* restore, it will set it to its start value. So this covers all cases. */
1506
1507 verify_count = verify_total;
1508 DPRINTF (("After verify count mtcp checkpointhread*: %d started\n",
1509 mtcp_sys_kernel_gettid ()));
1510
1511 while (1) {
1512 #ifdef PTRACE
1513 int ptraced_by = 0;
1514 #endif
1515
1516 /* Wait a while between writing checkpoint files */
1517
1518 if (callback_sleep_between_ckpt == NULL)
1519 {
1520 memset (&sleeperiod, 0, sizeof sleeperiod);
1521 sleeperiod.tv_sec = intervalsecs;
1522 while ((nanosleep (&sleeperiod, &sleeperiod) < 0) && (errno == EINTR)) {}
1523 }
1524 else
1525 {
1526 DPRINTF(("mtcp checkpointhread*: before callback_sleep_between_ckpt(%d)\n",intervalsecs));
1527 (*callback_sleep_between_ckpt)(intervalsecs);
1528 DPRINTF(("mtcp checkpointhread*: after callback_sleep_between_ckpt(%d)\n",intervalsecs));
1529 }
1530
1531 mtcp_sys_gettimeofday (&started, NULL);
1532 checkpointsize = 0;
1533
1534 #ifdef PTRACE
1535 // Refresh ptrace information
1536 has_ptrace_file = 0;
1537 delete_ptrace_leader = -1;
1538 has_setoptions_file = 0;
1539 delete_setoptions_leader = -1;
1540 has_checkpoint_file = 0;
1541 delete_checkpoint_leader = -1;
1542 process_ptrace_info( &delete_ptrace_leader, &has_ptrace_file,
1543 &delete_setoptions_leader, &has_setoptions_file,
1544 &delete_checkpoint_leader, &has_checkpoint_file);
1545
1546 for (thread = threads; thread != NULL; thread = thread -> next) {
1547 int i;
1548 for (i = 0; i < ptrace_pairs_count; i++) {
1549 DPRINTF(("COMPARE: intf=%d, tid=%d\n",
1550 ptrace_pairs[i].inferior, thread->original_tid));
1551 if( ptrace_pairs[i].inferior == thread->original_tid ){
1552 ptraced_by = ptrace_pairs[i].superior;
1553 break;
1554 }
1555 }
1556 if( ptraced_by )
1557 break;
1558 }
1559
1560 DPRINTF(("\n\n%d ptraced by %d\n\n",(thread) ? thread->tid : 0,ptraced_by));
1561 if( ptraced_by ){
1562 DPRINTF(("\n\n%d Wait for superior %d\n\n",thread->tid,ptraced_by));
1563 ptrace_wait4(ptraced_by);
1564 //sleep(1);
1565 DPRINTF(("\n\n%d Wait for superior %d - SUCCESS\n\n",thread->tid,ptraced_by));
1566 }
1567 #endif
1568
1569 /* Halt all other threads - force them to call stopthisthread */
1570 /* If any have blocked checkpointing, wait for them to unblock before signalling */
1571
1572 rescan:
1573 needrescan = 0;
1574 lock_threads ();
1575 for (thread = threads; thread != NULL; thread = thread -> next) {
1576
1577 /* If thread no longer running, remove it from thread list */
1578
1579 again:
1580 if (*(thread -> actual_tidptr) == 0) {
1581 DPRINTF (("mtcp checkpointhread*: thread %d disappeared\n", thread -> tid));
1582 unlk_threads ();
1583 threadisdead (thread);
1584 goto rescan;
1585 }
1586
1587 /* Do various things based on thread's state */
1588
1589 switch (mtcp_state_value (&thread -> state) ) {
1590
1591 /* Thread is running but has checkpointing disabled */
1592 /* Tell the mtcp_ok routine that we are waiting for it */
1593 /* We will need to rescan so we will see it suspended */
1594
1595 case ST_RUNDISABLED: {
1596 if (!mtcp_state_set (&(thread -> state), ST_SIGDISABLED, ST_RUNDISABLED)) goto again;
1597 needrescan = 1;
1598 break;
1599 }
1600
1601 /* Thread is running and has checkpointing enabled */
1602 /* Send it a signal so it will call stopthisthread */
1603 /* We will need to rescan (hopefully it will be suspended by then) */
1604
1605 case ST_RUNENABLED: {
1606 if (!mtcp_state_set (&(thread -> state), ST_SIGENABLED, ST_RUNENABLED)) goto again;
1607 #ifdef PTRACE
1608 ptrace_save_threads_state ();
1609 int index;
1610 char inferior_st = 'N';
1611 char inf_st;
1612 for (index = 0; index < ptrace_pairs_count; index++) {
1613 inf_st = procfs_state(ptrace_pairs[index].inferior);
1614 DPRINTF(("tid = %d now=%c stored=%c superior = %d inferior = %d\n",
1615 GETTID(), inf_st, ptrace_pairs[index].inferior_st,
1616 ptrace_pairs[index].superior, ptrace_pairs[index].inferior));
1617 if (ptrace_pairs[index].inferior == thread -> original_tid) {
1618 inferior_st = ptrace_pairs[index].inferior_st;
1619 break;
1620 }
1621 }
1622 DPRINTF(("%d %c\n", GETTID(), inferior_st));
1623 if (inferior_st == 'N') {
1624 // superior
1625 if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1626 if (mtcp_sys_errno != ESRCH) {
1627 mtcp_printf("mtcp checkpointhread: error signalling thread %d: %s\n",
1628 thread -> tid, strerror (mtcp_sys_errno));
1629 }
1630 unlk_threads ();
1631 threadisdead (thread);
1632 goto rescan;
1633 }
1634 }
1635 else {
1636 // inferior
1637 DPRINTF(("++++++++++++++++++++++++++++++++%c %d\n", inferior_st, thread -> original_tid));
1638 if (inferior_st != 'T') {
1639 if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1640 if (mtcp_sys_errno != ESRCH) {
1641 mtcp_printf ("mtcp checkpointhread: error signalling thread %d: %s\n",
1642 thread -> tid, strerror (mtcp_sys_errno));
1643 }
1644 unlk_threads ();
1645 threadisdead (thread);
1646 goto rescan;
1647 }
1648 }
1649 create_file( thread -> original_tid );
1650 }
1651 #else
1652 if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1653 if (mtcp_sys_errno != ESRCH) {
1654 mtcp_printf ("mtcp checkpointhread: error signalling thread %d: %s\n",
1655 thread -> tid, strerror (mtcp_sys_errno));
1656 }
1657 unlk_threads ();
1658 threadisdead (thread);
1659 goto rescan;
1660 }
1661 #endif
1662 needrescan = 1;
1663 break;
1664 }
1665
1666 /* Thread is running, we have signalled it to stop, but it has
1667 * checkpointing disabled. So we wait for it to change state.
1668 * We have to unlock because it may need lock to change state.
1669 */
1670
1671 case ST_SIGDISABLED: {
1672 unlk_threads ();
1673 mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SIGDISABLED,
1674 &enabletimeout);
1675 goto rescan;
1676 }
1677
1678 /* Thread is running and we have sent signal to stop it */
1679 /* So we have to wait for it to change state (enter signal handler) */
1680 /* We have to unlock because it may try to use lock meanwhile */
1681
1682 case ST_SIGENABLED: {
1683 unlk_threads ();
1684 mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SIGENABLED,
1685 &enabletimeout);
1686 goto rescan;
1687 }
1688
1689 /* Thread has entered signal handler and is saving its context.
1690 * So we have to wait for it to finish doing so. We don't need
1691 * to unlock because it won't use lock before changing state.
1692 */
1693
1694 case ST_SUSPINPROG: {
1695 mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SUSPINPROG,
1696 &enabletimeout);
1697 goto again;
1698 }
1699
1700 /* Thread is suspended and all ready for us to write checkpoint file */
1701
1702 case ST_SUSPENDED: {
1703 break;
1704 }
1705
1706 /* Don't do anything to the checkpointhread (this) thread */
1707
1708 case ST_CKPNTHREAD: {
1709 break;
1710 }
1711
1712 /* Who knows? */
1713
1714 default: {
1715 mtcp_abort ();
1716 }
1717 }
1718 }
1719 unlk_threads ();
1720
1721 /* If need to rescan (ie, some thread possibly not in ST_SUSPENDED STATE),
1722 * check them all again
1723 */
1724
1725 if (needrescan) goto rescan;
1726 RMB; // matched by WMB in stopthisthread
1727 DPRINTF (("mtcp checkpointhread*: everything suspended\n"));
1728
1729 /* If no threads, we're all done */
1730
1731 if (threads == NULL) {
1732 DPRINTF (("mtcp checkpointhread*: exiting (no threads)\n"));
1733 return (NULL);
1734 }
1735
1736 /* Call weak symbol of this file, possibly overridden by user's strong symbol.
1737 * User must compile his/her code with -Wl,-export-dynamic to make it visible.
1738 */
1739 mtcpHookPreCheckpoint();
1740
1741 if (!dmtcp_exists) {
1742 save_sig_handlers();
1743 }
1744
1745 /* All other threads halted in 'stopthisthread' routine (they are all
1746 * in state ST_SUSPENDED). It's safe to write checkpoint file now.
1747 */
1748 if (callback_pre_ckpt != NULL){
1749 // Here we want to synchronize the shared memory pages with the backup files
1750 DPRINTF(("mtcp checkpointhread*: syncing shared memory with backup files\n"));
1751 sync_shared_mem();
1752
1753 DPRINTF(("mtcp checkpointhread*: before callback_pre_ckpt() (&%x,%x) \n",
1754 &callback_pre_ckpt, callback_pre_ckpt));
1755 (*callback_pre_ckpt)(&dmtcp_checkpoint_filename);
1756 if (dmtcp_checkpoint_filename &&
1757 strcmp(dmtcp_checkpoint_filename, "/dev/null") != 0) {
1758 mtcp_sys_strcpy(perm_checkpointfilename, dmtcp_checkpoint_filename);
1759 DPRINTF(("mtcp checkpointhread*: Checkpoint filename changed to %s\n",
1760 perm_checkpointfilename));
1761 }
1762 }
1763
1764 #ifdef PTRACE
1765 /* If old stale files of these names exist, we append, with big problems
1766 * It's okay if files don't exist and unlink fails.
1767 * Pre_ckpt is a barrier from coordinator. So, all processes finished
1768 * reading ptrace pairs from files prior to this barrier.
1769 */
1770 unlink(ptrace_shared_file);
1771 unlink(ptrace_setoptions_file);
1772 unlink(checkpoint_threads_file);
1773 #endif
1774
1775 mtcp_saved_break = (void*) mtcp_sys_brk(NULL); // kernel returns mm->brk when passed zero
1776 /* Do this once, same for all threads. But restore for each thread. */
1777 if (mtcp_have_thread_sysinfo_offset())
1778 saved_sysinfo = mtcp_get_thread_sysinfo();
1779 /* Do this once. It's the same for all threads. */
1780 save_term_settings();
1781
1782 if (getcwd(saved_working_directory, MTCP_MAX_PATH) == NULL) {
1783 // buffer wasn't large enough
1784 perror("getcwd");
1785 mtcp_printf ("getcwd failed.");
1786 mtcp_abort ();
1787 }
1788
1789 DPRINTF (("mtcp checkpointhread*: mtcp_saved_break=%p\n", mtcp_saved_break));
1790
1791 if ( dmtcp_checkpoint_filename == NULL ||
1792 strcmp (dmtcp_checkpoint_filename, "/dev/null") != 0) {
1793 checkpointeverything ();
1794 } else {
1795 mtcp_printf("mtcp checkpointhread*: received \'/dev/null\'" \
1796 " as ckpt filename.\n*** Skipping checkpoint. ***\n");
1797 }
1798
1799 if (callback_post_ckpt != NULL){
1800 DPRINTF(("mtcp checkpointhread*: before callback_post_ckpt() (&%x,%x) \n",
1801 &callback_post_ckpt, callback_post_ckpt));
1802 (*callback_post_ckpt)(0);
1803 }
1804 if (showtiming) {
1805 mtcp_sys_gettimeofday (&stopped, NULL);
1806 stopped.tv_usec += (stopped.tv_sec - started.tv_sec) * 1000000 - started.tv_usec;
1807 mtcp_printf ("mtcp checkpoint: time %u uS, size %u megabytes," \
1808 " avg rate %u MB/s\n",
1809 stopped.tv_usec, (unsigned int)(checkpointsize / 1000000),
1810 (unsigned int)(checkpointsize / stopped.tv_usec));
1811 }
1812
1813 /* Call weak symbol of this file, possibly overridden by user's strong symbol.
1814 * User must compile his/her code with -Wl,-export-dynamic to make it visible.
1815 */
1816 mtcpHookPostCheckpoint();
1817
1818 /* Resume all threads. But if we're doing a checkpoint verify,
1819 * abort all threads except the main thread, as we don't want them
1820 * running when we exec the mtcp_restore program.
1821 */
1822
1823 DPRINTF (("mtcp checkpointhread*: resuming everything\n"));
1824 lock_threads();
1825 for (thread = threads; thread != NULL; thread = thread -> next) {
1826 if (mtcp_state_value(&(thread -> state)) != ST_CKPNTHREAD) {
1827 if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_SUSPENDED))
1828 mtcp_abort();
1829 mtcp_state_futex(&(thread -> state), FUTEX_WAKE, 1, NULL);
1830 }
1831 }
1832 unlk_threads ();
1833 DPRINTF (("mtcp checkpointhread*: everything resumed\n"));
1834 /* But if we're doing a restore verify, just exit. The main thread is doing the exec to start the restore. */
1835 #ifdef PTRACE
1836 create_file (GETTID());
1837 #endif
1838 if ((verify_total != 0) && (verify_count == 0)) return (NULL);
1839 }
1840 }
1841
/**
 * Decide whether the checkpoint image should be compressed with gzip.
 *
 * The setting is read from the environment variable MTCP_GZIP, or from
 * DMTCP_GZIP when MTCP_GZIP is unset.  When neither is set, compression
 * defaults to enabled.  The value must be an integer as understood by
 * strtol() with base 0: zero disables compression, any other number
 * enables it.  An empty or non-numeric value produces a warning and
 * disables compression.
 *
 * Returns 1 if compression should be used, 0 otherwise.
 */
static int test_use_compression(void)
{
  const char *do_we_compress;
  char *endptr;
  long value;

  do_we_compress = getenv("MTCP_GZIP");
  // allow alternate name for env var
  if (do_we_compress == NULL)
    do_we_compress = getenv("DMTCP_GZIP");
  // env var is unset, let's default to enabled
  // to disable compression, run with MTCP_GZIP=0
  if (do_we_compress == NULL)
    return 1;

  value = strtol(do_we_compress, &endptr, 0);
  if ( *do_we_compress == '\0' || *endptr != '\0' ) {
    mtcp_printf("WARNING: MTCP_GZIP/DMTCP_GZIP defined as %s (not a number)\n"
                " Checkpoint image will not be compressed.\n",
                do_we_compress);
    return 0;
  }

  /* Use the parsed number rather than a textual comparison with "0", so
   * that spellings such as "00" or "0x0" also disable compression.
   */
  return (value != 0) ? 1 : 0;
}
1873
1874 static int open_ckpt_to_write(int fd, int pipe_fds[2], char *gzip_path)
1875 {
1876 pid_t cpid;
1877 char *gzip_args[] = { "gzip", "-1", "-", NULL };
1878
1879 gzip_args[0] = gzip_path;
1880
1881 cpid = mtcp_sys_fork();
1882 if (cpid == -1) {
1883 mtcp_printf("WARNING: error forking child process `%s`. Compression will "
1884 "not be used [%s].\n", gzip_path, strerror(mtcp_sys_errno));
1885 close(pipe_fds[0]);
1886 close(pipe_fds[1]);
1887 //fall through to return fd
1888 } else if (cpid > 0) { /* parent process */
1889 //Before running gzip in child process, we must not use LD_PRELOAD.
1890 // See revision log 342 for details concerning bash.
1891 mtcp_ckpt_gzip_child_pid = cpid;
1892 if (close(pipe_fds[0]) == -1)
1893 mtcp_printf("WARNING: (in open_ckpt_to_write) close failed: %s\n",
1894 strerror(errno));
1895 if (close(fd) == -1)
1896 mtcp_printf("WARNING: (in open_ckpt_to_write) close failed: %s\n",
1897 strerror(errno));
1898 fd=pipe_fds[1];//change return value
1899 } else { /* child process */
1900 static int (*libc_unsetenv) (const char *name);
1901 static int (*libc_execvp) (const char *path, char *const argv[]);
1902
1903 close(pipe_fds[1]);
1904 dup2(pipe_fds[0], STDIN_FILENO);
1905 close(pipe_fds[0]);
1906 dup2(fd, STDOUT_FILENO);
1907 close(fd);
1908
1909 // Don't load dmtcphijack.so, etc. in exec.
1910 unsetenv("LD_PRELOAD"); // If in bash, this is bash env. var. version
1911 libc_unsetenv = mtcp_get_libc_symbol("unsetenv");
1912 (*libc_unsetenv)("LD_PRELOAD");
1913
1914 libc_execvp = mtcp_get_libc_symbol("execvp");
1915 (*libc_execvp)(gzip_path, gzip_args);
1916
1917 /* should not arrive here */
1918 mtcp_printf("ERROR: compression failed! No checkpointing will be"
1919 "performed! Cancel now!\n");
1920 mtcp_sys_exit(1);
1921 }
1922
1923 return fd;
1924 }
1925
1926
1927 /********************************************************************************************************************************/
1928 /* */
1929 /* This routine is called from time-to-time to write a new checkpoint file. */
1930 /* It assumes all the threads are suspended. */
1931 /* */
1932 /********************************************************************************************************************************/
1933
1934 static void checkpointeverything (void)
1935 {
1936 Area area;
1937 int fd, mapsfd;
1938 VA area_begin, area_end;
1939 int stack_was_seen = 0;
1940 int vsyscall_exists = 0;
1941 int forked_checkpointing = 0;
1942 int forked_cpid;
1943 int use_compression = -1; /* decide later */
1944 int pipe_fds[2]; /* for potential piping */
1945 char *gzip_cmd = "gzip";
1946 char gzip_path[MTCP_MAX_PATH];
1947 char tmpDMTCPHeaderBuf[] = "/tmp/dmtcp.XXXXXX";
1948 char *tmpDMTCPHeaderFileName = tmpDMTCPHeaderBuf;
1949 int tmpDMTCPHeaderFd = -1;
1950
1951 static void *const frpointer = finishrestore;
1952
1953 DPRINTF (("mtcp checkpointeverything*: tid %d\n", mtcp_sys_kernel_gettid ()));
1954
1955 if (getenv("MTCP_FORKED_CHECKPOINT") != NULL)
1956 forked_checkpointing = 1;
1957 #ifdef TEST_FORKED_CHECKPOINTING
1958 forked_checkpointing = 1;
1959 #endif
1960
1961 if (callback_write_dmtcp_header != 0) {
1962 /* Temp file for DMTCP header; will be written into the checkpoint file. */
1963 tmpDMTCPHeaderFd = mkstemp(tmpDMTCPHeaderFileName);
1964 if (tmpDMTCPHeaderFd < 0) {
1965 mtcp_printf("error %d creating temp file: %s\n", errno, strerror(errno));
1966 mtcp_abort();
1967 }
1968
1969 if (unlink(tmpDMTCPHeaderFileName) == -1) {
1970 mtcp_printf("NOTE: error %d unlinking temp file: %s\n", errno,
1971 strerror(errno));
1972 }
1973
1974 /* Better to do this in parent, not child, for most accurate header info */
1975 (*callback_write_dmtcp_header)(tmpDMTCPHeaderFd);
1976 }
1977
1978 if (forked_checkpointing) {
1979 forked_cpid = mtcp_sys_fork();
1980 if (forked_cpid == -1) {
1981 mtcp_printf("WARNING: Failed to do forked checkpointing,"
1982 " trying normal checkpoint\n");
1983 } else if (forked_cpid > 0) {
1984 /* Parent process*/
1985 if (tmpDMTCPHeaderFd != -1)
1986 close(tmpDMTCPHeaderFd);
1987 // Calling waitpid here, but on 32-bit Linux, libc:waitpid() calls wait4()
1988 if ( waitpid(forked_cpid, NULL, 0) == -1 )
1989 DPRINTF (("mtcp restoreverything*: error waitpid: errno: %d",
1990 mtcp_sys_errno));
1991 DPRINTF (("mtcp checkpointeverything*: checkpoint complete\n"));
1992 return;
1993 } else {
1994 pid_t grandchild_pid = mtcp_sys_fork();
1995 if (grandchild_pid == -1) {
1996 mtcp_printf("WARNING: Forked checkpoint failed, no checkpoint available\n");
1997 } else if (grandchild_pid > 0) {
1998 mtcp_sys_exit(0); /* child exits */
1999 }
2000 /* grandchild continues; no need now to waitpid() on grandchild */
2001 DPRINTF (("mtcp checkpointeverything*: inside grandchild process\n"));
2002 }
2003 }
2004
2005 /* 1. Test if using compression */
2006 use_compression = test_use_compression();
2007 /* 2. Get gzip path */
2008 if (use_compression && mtcp_find_executable(gzip_cmd, gzip_path) == NULL) {
2009 mtcp_printf("WARNING: gzip cannot be executed. Compression will "
2010 "not be used.\n");
2011 use_compression = 0;
2012 }
2013 /* 3. Create pipe */
2014 /* Note: Must use mtcp_sys_pipe(), to go to kernel, since
2015 * DMTCP has a wrapper around glibc promoting pipes to socketpairs,
2016 * DMTCP doesn't directly checkpoint/restart pipes.
2017 */
2018 if ( use_compression && mtcp_sys_pipe(pipe_fds) == -1 ) {
2019 mtcp_printf("WARNING: error creating pipe. Compression will "
2020 "not be used.\n");
2021 use_compression = 0;
2022 }
2023 /* 4. Open fd to checkpoint image on disk */
2024 /* Create temp checkpoint file and write magic number to it */
2025 /* This is a callback to DMTCP. DMTCP writes header and returns fd. */
2026 fd = mtcp_safe_open(temp_checkpointfilename,
2027 O_CREAT | O_TRUNC | O_WRONLY, 0600);
2028 if (fd < 0) {
2029 mtcp_printf("mtcp.c: checkpointeverything: error creating %s: %s\n",
2030 temp_checkpointfilename, strerror(mtcp_sys_errno));
2031 mtcp_abort();
2032 }
2033 /* 5. We now have the information to pipe to gzip, or directly to fd
2034 * We do it this way, so that gzip will be direct child of forked process
2035 * when using forked checkpointing.
2036 */
2037
2038 #if 1
2039 /* Temporary fix, until DMTCP uses its own separate allocator.
2040 * The else code should really go lower down, just before we checkpoint
2041 * the heap.
2042 */
2043 #else
2044 if (mtcp_sys_break(0) != mtcp_saved_break)
2045 mtcp_printf("\n\n*** ERROR: End of heap grew."
2046 " Continue at your own risk. ***\n\n\n");
2047 #endif
2048
2049 /* Drain stdin and stdout before checkpoint */
2050 tcdrain(STDOUT_FILENO);
2051 tcdrain(STDERR_FILENO);
2052
2053 if (use_compression) /* if use_compression, fork a gzip process */
2054 fd = open_ckpt_to_write(fd, pipe_fds, gzip_path);
2055
2056 if (tmpDMTCPHeaderFd != -1 ) {
2057 char tmpBuff[1024];
2058 int retval = -1;
2059 lseek(tmpDMTCPHeaderFd, 0, SEEK_SET);
2060
2061 while (retval != 0) {
2062 retval = read (tmpDMTCPHeaderFd, tmpBuff, 1024);
2063 if (retval == -1 && (errno == EAGAIN || errno == EINTR))
2064 continue;
2065 if (retval == -1) {
2066 mtcp_printf("Error writing checkpoint file: %s\n", strerror(errno));
2067 mtcp_abort();
2068 }
2069 writefile(fd, tmpBuff, retval);
2070 }
2071 close(tmpDMTCPHeaderFd);
2072 }
2073
2074 // Preprocess special segments like vsyscall, stack, heap etc.
2075 preprocess_special_segments(&vsyscall_exists);
2076
2077 writefile (fd, MAGIC, MAGIC_LEN);
2078
2079 DPRINTF (("mtcp checkpointeverything*: restore_begin %X at %p from [libmtcp.so]\n",
2080 restore_size, restore_begin));
2081
2082 struct rlimit stack_rlimit;
2083 getrlimit(RLIMIT_STACK, &stack_rlimit);
2084
2085 DPRINTF (("mtcp_restart: saved stack resource limit: soft_lim:%p, hard_lim:%p\n",
2086 stack_rlimit.rlim_cur, stack_rlimit.rlim_max));
2087
2088 writecs (fd, CS_STACKRLIMIT);
2089 writefile (fd, &stack_rlimit, sizeof stack_rlimit);
2090
2091 DPRINTF (("mtcp checkpointeverything*: [libmtcp.so] image of size %X at %p\n",
2092 restore_size, restore_begin));
2093
2094 writecs (fd, CS_RESTOREBEGIN);
2095 writefile (fd, &restore_begin, sizeof restore_begin);
2096 writecs (fd, CS_RESTORESIZE);
2097 writefile (fd, &restore_size, sizeof restore_size);
2098 writecs (fd, CS_RESTORESTART);
2099 writefile (fd, &restore_start, sizeof restore_start);
2100 writecs (fd, CS_RESTOREIMAGE);
2101 writefile (fd, (void *)restore_begin, restore_size);
2102 writecs (fd, CS_FINISHRESTORE);
2103 writefile (fd, &frpointer, sizeof frpointer);
2104
2105 /* Write out file descriptors */
2106
2107 writefiledescrs (fd);
2108
2109 /* Finally comes the memory contents */
2110
2111 /**************************************************************************/
2112 /* We can't do any more mallocing at this point because malloc stuff is */
2113 /* outside the limits of the libmtcp.so image, so it won't get */
2114 /* checkpointed, and it's possible that we would checkpoint an */
2115 /* inconsistent state. See note in restoreverything routine. */
2116 /**************************************************************************/
2117
2118 mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
2119
2120 while (readmapsline (mapsfd, &area)) {
2121 area_begin = (VA)area.addr;
2122 area_end = area_begin + area.size;
2123
2124 /* Original comment: Skip anything in kernel address space ---
2125 * beats me what's at FFFFE000..FFFFFFFF - we can't even read it;
2126 * Added: That's the vdso section for earlier Linux 2.6 kernels. For later
2127 * 2.6 kernels, vdso occurs at an earlier address. If it's unreadable,
2128 * then we simply won't copy it. But let's try to read all areas, anyway.
2129 * **COMMENTED OUT:** if (area_begin >= HIGHEST_VA) continue;
2130 */
2131 /* If it's readable, but it's VDSO, it will be dangerous to restore it.
2132 * In 32-bit mode later Red Hat RHEL Linux 2.6.9 releases use 0xffffe000,
2133 * the last page of virtual memory. Note 0xffffe000 >= HIGHEST_VA
2134 * implies we're in 32-bit mode.
2135 */
2136 if (area_begin >= HIGHEST_VA && area_begin == 0xffffe000) continue;
2137 #ifdef __x86_64__
2138 /* And in 64-bit mode later Red Hat RHEL Linux 2.6.9 releases
2139 * use 0xffffffffff600000 for VDSO.
2140 */
2141 if (area_begin >= HIGHEST_VA && area_begin == 0xffffffffff600000) continue;
2142 #endif
2143
2144 /* Skip anything that has no read or execute permission. This occurs
2145 * on one page in a Linux 2.6.9 installation. No idea why. This code
2146 * would also take care of kernel sections since we don't have read/execute
2147 * permission there.
2148 */
2149
2150 if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE))) continue;
2151
2152 // If the process has an area labelled as "/dev/zero (deleted)", we mark
2153 // the area as Anonymous and save the contents to the ckpt image file.
2154 // IF this area has a MAP_SHARED attribute, it should be replaced with
2155 // MAP_PRIVATE and we won't do any harm because, the /dev/zero file is an
2156 // absolute source and sink. Anything written to it will be discarded and
2157 // anything read from it will be all zeros.
2158 // The following call to mmap will create "/dev/zero (deleted)" area
2159 // mmap(addr, size, protection, MAP_SHARED | MAP_ANONYMOUS, 0, 0)
2160 //
2161 // The above explanation also applies to "/dev/null (deleted)"
2162
2163 if ( mtcp_strstartswith(area.name, dev_zero_deleted_str) ||
2164 mtcp_strstartswith(area.name, dev_null_deleted_str) ) {
2165 DPRINTF(("mtcp checkpointeverything: saving area \"%s\" as Anonymous\n",
2166 area.name));
2167 area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2168 area.name[0] = '\0';
2169 }
2170
2171 if (mtcp_strstartswith(area.name, sys_v_shmem_file)) {
2172 DPRINTF(("mtcp checkpointeverything: saving area \"%s\" as Anonymous\n",
2173 area.name));
2174 area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2175 area.name[0] = '\0';
2176 }
2177
2178 /* Special Case Handling: nscd is enabled*/
2179 if ( mtcp_strstartswith(area.name, nscd_mmap_str) ||
2180 mtcp_strstartswith(area.name, nscd_mmap_str2) ||
2181 mtcp_strstartswith(area.name, nscd_mmap_str3) ) {
2182 DPRINTF(("mtcp checkpointeverything: NSCD daemon shared memory area present. MTCP will now try to remap\n" \
2183 " this area in read/write mode and then will fill it with zeros so that\n" \
2184 " glibc will automatically ask NSCD daemon for new shared area\n\n"));
2185 area.prot = PROT_READ | PROT_WRITE;
2186 area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2187
2188 if ( munmap(area.addr, area.size) == -1) {
2189 mtcp_printf ("mtcp checkpointeverything: error unmapping NSCD shared area: %s\n",
2190 strerror (mtcp_sys_errno));
2191 mtcp_abort();
2192 }
2193
2194 if ( mmap(area.addr, area.size, area.prot, area.flags, 0, 0)
2195 == MAP_FAILED ){
2196 mtcp_printf ("mtcp checkpointeverything: error remapping NSCD shared area: %s\n",
2197 strerror (mtcp_sys_errno));
2198 mtcp_abort();
2199 }
2200
2201 memset(area.addr, 0, area.size);
2202 }
2203
2204 /* Force the anonymous flag if it's a private writeable section, as the
2205 * data has probably changed from the contents of the original images.
2206 */
2207
2208 /* We also do this for read-only private sections as it's possible
2209 * to modify a page there, too (via mprotect).
2210 */
2211
2212 if ((area.flags & MAP_PRIVATE) /*&& (area.prot & PROT_WRITE)*/) {
2213 area.flags |= MAP_ANONYMOUS;
2214 }
2215
2216 if ( area.flags & MAP_SHARED ) {
2217 /* invalidate shared memory pages so that the next read to it (when we are
2218 * writing them to ckpt file) will cause them to be reloaded from the disk.
2219 */
2220 if ( msync(area.addr, area.size, MS_INVALIDATE) < 0 ){
2221 mtcp_printf ("mtcp sync_shared_memory: error %d Invalidating %X"
2222 " at %p from %s + %X\n", mtcp_sys_errno, area.size,
2223 area.addr, area.name, area.offset);
2224 mtcp_abort();
2225 }
2226 }
2227
2228
2229 /* Skip any mapping for this image - it got saved as CS_RESTOREIMAGE
2230 * at the beginning.
2231 */
2232
2233 if (area_begin < restore_begin) {
2234 if (area_end <= restore_begin) {
2235 writememoryarea (fd, &area, 0, vsyscall_exists); // the whole thing is before the restore image
2236 } else if (area_end <= restore_end) {
2237 area.size = restore_begin - area_begin; // we just have to chop the end part off
2238 writememoryarea (fd, &area, 0, vsyscall_exists);
2239 } else {
2240 area.size = restore_begin - area_begin; // we have to write stuff that comes before restore image
2241 writememoryarea (fd, &area, 0, vsyscall_exists);
2242 area.offset += restore_end - area_begin; // ... and we have to write stuff that comes after restore image
2243 area.size = area_end - restore_end;
2244 area.addr = (void *)restore_end;
2245 writememoryarea (fd, &area, 0, vsyscall_exists);
2246 }
2247 } else if (area_begin < restore_end) {
2248 if (area_end > restore_end) {
2249 area.offset += restore_end - area_begin; // we have to write stuff that comes after restore image
2250 area.size = area_end - restore_end;
2251 area.addr = (void *)restore_end;
2252 writememoryarea (fd, &area, 0, vsyscall_exists);
2253 }
2254 } else {
2255 if ( strstr (area.name, "[stack]") )
2256 stack_was_seen = 1;
2257 writememoryarea (fd, &area, stack_was_seen, vsyscall_exists); // the whole thing comes after the restore image
2258 }
2259 }
2260
2261 close (mapsfd);
2262
2263 /* That's all folks */
2264
2265 writecs (fd, CS_THEEND);
2266 if (close (fd) < 0) {
2267 mtcp_printf ("mtcp checkpointeverything(grandchild):"
2268 " error closing checkpoint file: %s\n", strerror (errno));
2269 mtcp_abort ();
2270 }
2271 if (use_compression) {
2272 /* IF OUT OF DISK SPACE, REPORT IT HERE. */
2273 if ( waitpid(mtcp_ckpt_gzip_child_pid, NULL, 0 ) == -1 )
2274 mtcp_printf ("mtcp checkpointeverything(grandchild): waitpid: %s\n",
2275 strerror (errno));
2276 mtcp_ckpt_gzip_child_pid = -1;
2277 }
2278
2279 /* Maybe it's time to verify the checkpoint.
2280 * If so, exec an mtcp_restore with the temp file (in case temp file is bad,
2281 * we'll still have the last one).
2282 * If the new file is good, mtcp_restore will rename it over the last one.
2283 */
2284
2285 if (verify_total != 0) -- verify_count;
2286
2287 /* Now that temp checkpoint file is complete, rename it over old permanent
2288 * checkpoint file. Uses rename() syscall, which doesn't change i-nodes.
2289 * So, gzip process can continue to write to file even after renaming.
2290 */
2291
2292 else renametempoverperm ();
2293
2294 if (forked_checkpointing)
2295 mtcp_sys_exit (0); /* grandchild exits */
2296
2297 DPRINTF (("mtcp checkpointeverything*: checkpoint complete\n"));
2298 }
2299
2300 /* True if the given FD should be checkpointed */
2301 static int should_ckpt_fd (int fd)
2302 {
2303 if ( callback_ckpt_fd!=NULL )
2304 return (*callback_ckpt_fd)(fd); //delegate to callback
2305 else if (fd > 2)
2306 return 1;
2307 else
2308 {
2309 /* stdin/stdout/stderr */
2310 /* we only want to checkpoint these if they are from a file */
2311 struct stat statbuf;
2312 fstat(fd, &statbuf);
2313 return S_ISREG(statbuf.st_mode);
2314 }
2315 }
2316
2317 /* Write list of open files to the checkpoint file */
2318
2319 static void writefiledescrs (int fd)
2320
2321 {
2322 char dbuf[BUFSIZ], linkbuf[FILENAMESIZE], *p, procfdname[64];
2323 int doff, dsiz, fddir, fdnum, linklen, rc;
2324 off_t offset;
2325 struct linux_dirent *dent;
2326 struct stat lstatbuf, statbuf;
2327
2328 writecs (fd, CS_FILEDESCRS);
2329
2330 /* Open /proc/self/fd directory - it contains a list of files I have open */
2331
2332 fddir = mtcp_sys_open ("/proc/self/fd", O_RDONLY, 0);
2333 if (fddir < 0) {
2334 mtcp_printf ("mtcp writefiledescrs: error opening directory /proc/self/fd: %s\n", strerror (errno));
2335 mtcp_abort ();
2336 }
2337
2338 /* Check each entry */
2339
2340 while (1) {
2341 dsiz = -1;
2342 if (sizeof dent -> d_ino == 4) dsiz = mtcp_sys_getdents (fddir, dbuf, sizeof dbuf);
2343 if (sizeof dent -> d_ino == 8) dsiz = mtcp_sys_getdents64 (fddir, dbuf, sizeof dbuf);
2344 if (dsiz <= 0) break;
2345
2346 for (doff = 0; doff < dsiz; doff += dent -> d_reclen) {
2347 dent = (struct linux_dirent *) (dbuf + doff);
2348
2349 /* The filename should just be a decimal number = the fd it represents.
2350 * Also, skip the entry for the checkpoint and directory files
2351 * as we don't want the restore to know about them.
2352 */
2353
2354 fdnum = strtol (dent -> d_name, &p, 10);
2355 if ((*p == '\0') && (fdnum >= 0) && (fdnum != fd) && (fdnum != fddir)
2356 && (should_ckpt_fd (fdnum) > 0)) {
2357
2358 /* Read the symbolic link so we get the filename that's open on the fd */
2359
2360 sprintf (procfdname, "/proc/self/fd/%d", fdnum);
2361 linklen = readlink (procfdname, linkbuf, sizeof linkbuf - 1);
2362 if ((linklen >= 0) || (errno != ENOENT)) { // probably was the proc/self/fd directory itself
2363 if (linklen < 0) {
2364 mtcp_printf ("mtcp writefiledescrs: error reading %s: %s\n",
2365 procfdname, strerror (errno));
2366 mtcp_abort ();
2367 }
2368 linkbuf[linklen] = '\0';
2369
2370 DPRINTF (("mtcp writefiledescrs*: checkpointing fd %d -> %s\n",
2371 fdnum, linkbuf));
2372
2373 /* Read about the link itself so we know read/write open flags */
2374
2375 rc = lstat (procfdname, &lstatbuf);
2376 if (rc < 0) {
2377 mtcp_printf ("mtcp writefiledescrs: error statting %s -> %s: %s\n",
2378 procfdname, linkbuf, strerror (-rc));
2379 mtcp_abort ();
2380 }
2381
2382 /* Read about the actual file open on the fd */
2383
2384 rc = stat (linkbuf, &statbuf);
2385 if (rc < 0) {
2386 mtcp_printf ("mtcp writefiledescrs: error statting %s -> %s: %s\n",
2387 procfdname, linkbuf, strerror (-rc));
2388 }
2389
2390 /* Write state information to checkpoint file.
2391 * Replace file's permissions with current access flags
2392 * so restore will know how to open it.
2393 */
2394
2395 else {
2396 offset = 0;
2397 if (S_ISREG (statbuf.st_mode))
2398 offset = mtcp_sys_lseek (fdnum, 0, SEEK_CUR);
2399 statbuf.st_mode = (statbuf.st_mode & ~0777)
2400 | (lstatbuf.st_mode & 0777);
2401 writefile (fd, &fdnum, sizeof fdnum);
2402 writefile (fd, &statbuf, sizeof statbuf);
2403 writefile (fd, &offset, sizeof offset);
2404 writefile (fd, &linklen, sizeof linklen);
2405 writefile (fd, linkbuf, linklen);
2406 }
2407 }
2408 }
2409 }
2410 }
2411 if (dsiz < 0) {
2412 mtcp_printf ("mtcp writefiledescrs: error reading /proc/self/fd: %s\n",
2413 strerror (mtcp_sys_errno));
2414 mtcp_abort ();
2415 }
2416
2417 mtcp_sys_close (fddir);
2418
2419 /* Write end-of-fd-list marker to checkpoint file */
2420
2421 fdnum = -1;
2422 writefile (fd, &fdnum, sizeof fdnum);
2423 }
2424
2425 static void writememoryarea (int fd, Area *area, int stack_was_seen,
2426 int vsyscall_exists)
2427
2428 { static void * orig_stack = NULL;
2429
2430 /* Write corresponding descriptor to the file */
2431
2432 if (orig_stack == NULL && 0 == strcmp(area -> name, "[stack]"))
2433 orig_stack = area -> addr + area -> size;
2434
2435 if (0 == strcmp(area -> name, "[vdso]") && !stack_was_seen)
2436 DPRINTF (("mtcp checkpointeverything*: skipping over [vdso] section"
2437 " %p at %p\n", area -> size, area -> addr));
2438 else if (0 == strcmp(area -> name, "[vsyscall]") && !stack_was_seen)
2439 DPRINTF (("mtcp checkpointeverything*: skipping over [vsyscall] section"
2440 " %p at %p\n", area -> size, area -> addr));
2441 else if (0 == strcmp(area -> name, "[stack]") &&
2442 orig_stack != area -> addr + area -> size)
2443 /* Kernel won't let us munmap this. But we don't need to restore it. */
2444 DPRINTF (("mtcp checkpointeverything*: skipping over [stack] segment"
2445 " %X at %pi (not the orig stack)\n", area -> size, area -> addr));
2446 else if (!(area -> flags & MAP_ANONYMOUS))
2447 DPRINTF (("mtcp checkpointeverything*: save %p at %p from %s + %X\n",
2448 area -> size, area -> addr, area -> name, area -> offset));
2449 else if (area -> name[0] == '\0')
2450 DPRINTF (("mtcp checkpointeverything*: save anonymous %p at %p\n",
2451 area -> size, area -> addr));
2452 else DPRINTF (("mtcp checkpointeverything*: save anonymous %p at %p"
2453 " from %s + %X\n",
2454 area -> size, area -> addr, area -> name, area -> offset));
2455
2456 if ((area -> name[0]) == '\0') {
2457 void *brk = mtcp_sys_brk(NULL);
2458 if (brk > area -> addr && brk <= area -> addr + area -> size)
2459 mtcp_sys_strcpy(area -> name, "[heap]");
2460 }
2461
2462 if ( 0 != strcmp(area -> name, "[vsyscall]")
2463 && ( (0 != strcmp(area -> name, "[vdso]")
2464 || vsyscall_exists /* which implies vdso can be overwritten */
2465 || !stack_was_seen ))) /* If vdso appeared before stack, it can be replaced */
2466 {
2467 writecs (fd, CS_AREADESCRIP);
2468 writefile (fd, area, sizeof *area);
2469
2470 /* Anonymous sections need to have their data copied to the file,
2471 * as there is no file that contains their data
2472 * We also save shared files to checkpoint file to handle shared memory
2473 * implemented with backing files
2474 */
2475 if (area -> flags & MAP_ANONYMOUS || area -> flags & MAP_SHARED) {
2476 writecs (fd, CS_AREACONTENTS);
2477 writefile (fd, area -> addr, area -> size);
2478 }
2479 }
2480 }
2481
/* Emit a one-byte checkpoint section code, cs, to the checkpoint stream fd */

static void writecs (int fd, char cs)

{
  char const section = cs;

  writefile (fd, &section, sizeof section);
}
2489
2490 /* Write something to checkpoint file */
2491
2492 static char zeroes[MTCP_PAGE_SIZE] = { 0 };
2493 static void writefile (int fd, void const *buff, size_t size)
2494
2495 {
2496 char const *bf;
2497 ssize_t rc;
2498 size_t sz, wt;
2499
2500 checkpointsize += size;
2501
2502 bf = buff;
2503 sz = size;
2504 while (sz > 0) {
2505 for (wt = sz; wt > 0; wt /= 2) {
2506 rc = write (fd, bf, wt);
2507 if ((rc >= 0) || (errno != EFAULT)) break;
2508 }
2509
2510 /* Sometimes image page alignment will leave a hole in the middle of an image */
2511 /* ... but the idiot proc/self/maps will include it anyway */
2512
2513 if (wt == 0) {
2514 rc = (sz > sizeof zeroes ? sizeof zeroes : sz);
2515 checkpointsize -= rc; /* Correct now, since writefile will add rc back */
2516 writefile (fd, zeroes, rc);
2517 }
2518
2519 /* Otherwise, check for real error */
2520
2521 else {
2522 if (rc == 0) errno = EPIPE;
2523 if (rc <= 0) {
2524 mtcp_printf ("mtcp writefile: error writing from %p to %s: %s\n",
2525 bf, temp_checkpointfilename, strerror (errno));
2526 mtcp_abort ();
2527 }
2528 }
2529
2530 /* It's ok, we're on to next part */
2531
2532 sz -= rc;
2533 bf += rc;
2534 }
2535 }
2536
2537 static void preprocess_special_segments(int *vsyscall_exists)
2538 {
2539 Area area;
2540 int mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
2541 if (mapsfd < 0) {
2542 mtcp_printf ("mtcp checkpointeverything: error opening"
2543 " /proc/self/maps: %s\n", strerror (mtcp_sys_errno));
2544 mtcp_abort ();
2545 }
2546
2547 while (readmapsline (mapsfd, &area)) {
2548 if (0 == strcmp(area.name, "[vsyscall]")) {
2549 /* Determine if [vsyscall] exists. If [vdso] and [vsyscall] exist,
2550 * [vdso] will be saved and restored.
2551 * NOTE: [vdso] is relocated if /proc/sys/kernel/randomize_va_space == 2.
2552 * We must restore old [vdso] and also keep [vdso] in that case.
2553 * On Linux 2.6.25, 32-bit Linux has: [heap], /lib/ld-2.7.so, [vdso], libs, [stack].
2554 * On Linux 2.6.25, 64-bit Linux has: [stack], [vdso], [vsyscall].
2555 * and at least for gcl, [stack], libmtcp.so, [vsyscall] seen.
2556 * If 32-bit process in 64-bit Linux: [stack] (0xffffd000), [vdso] (0xffffe0000)
2557 * On 32-bit Linux, mtcp_restart has [vdso], /lib/ld-2.7.so, [stack]
2558 * Need to restore old [vdso] into mtcp_restart, to restart.
2559 * With randomize_va_space turned off, libraries start at high address
2560 * 0xb8000000 and are loaded progressively at lower addresses.
2561 * mtcp_restart loads vdso (which looks like a shared library) first.
2562 * But libpthread/libdl/libc libraries are loaded above vdso in user image.
2563 * So, we must use the opposite of the user's setting (no randomization if
2564 * user turned it on, and vice versa). We must also keep the
2565 * new vdso segment, provided by mtcp_restart.
2566 */
2567 *vsyscall_exists = 1;
2568 } else if (!saved_heap_start && strcmp(area.name, "[heap]") == 0) {
2569 // Record start of heap which will later be used in finishrestore()
2570 saved_heap_start = area.addr;
2571 } else if (strcmp(area.name, "[stack]") == 0) {
2572 /*
2573 * When using Matlab with dmtcp_checkpoint, sometimes the bottom most
2574 * page of stack (the page with highest address) which contains the
2575 * environment strings and the argv[] was not shown in /proc/self/maps.
2576 * This happens on some odd combination of environment passed on to
2577 * Matlab process. As a result, the page was not checkpointed and hence
2578 * the process segfaulted on restart. The fix is to try to mprotect this
2579 * page with RWX permission to make the page visible again. This call
2580 * will fail if no stack page was invisible to begin with.
2581 */
2582 int ret = mprotect(area.addr + area.size, 0x1000,
2583 PROT_READ | PROT_WRITE | PROT_EXEC);
2584 if (ret == 0) {
2585 mtcp_printf("mtcp checkpointeverything: bottom-most page of stack\n"
2586 "(page with highest address) was invisible in /proc/self/maps.\n"
2587 "It is made visible again now.\n");
2588 }
2589 }
2590 }
2591 close(mapsfd);
2592 }
2593
2594 /********************************************************************************************************************************/
2595 /* */
2596 /* This signal handler is forced by the main thread doing a 'mtcp_sys_kernel_tkill' to stop these threads so it can do a */
2597 /* checkpoint */
2598 /* */
2599 /********************************************************************************************************************************/
/* Temporarily push the stack down by roughly kbStack kilobytes so that a
 * large stack is already allocated when the process is restarted.
 * The kernel won't grow it automatically for us any more, since it thinks
 * the stack is in a different place after restart.
 *
 * growstackValue is volatile so that the recursion has a visible side effect
 * and the compiler doesn't optimize growstack() away.  It may be redundant
 * given the ((optimize(0))) attribute on the declaration.
 */
static volatile unsigned int growstackValue = 0;
__attribute__ ((optimize(0))) static void growstack (int kbStack);
static void growstack (int kbStack) {
  const int kbPerFrame = 1024;  /* kilobytes reserved by each recursion level */
  char frameFiller[kbPerFrame * 1024] __attribute__ ((unused));
  volatile int keepAlive __attribute__ ((unused)) = 1; /* again, try to prevent compiler optimization */

  /* Bottom of the recursion: just leave a side effect behind. */
  if (kbStack <= 0) {
    growstackValue++;
    return;
  }
  growstack(kbStack - kbPerFrame);
}
2618
/* Signal handler that parks the calling thread for a checkpoint.
 *
 * A thread whose state is ST_CKPNTHREAD returns immediately.  Any other
 * thread that wins the ST_SIGENABLED -> ST_SUSPINPROG transition saves its
 * signal state, TLS state and register context, and then takes one of two
 * paths depending on restoreinprog:
 *
 *   restoreinprog == 0 (original process):
 *       move to ST_SUSPENDED, wake the checkpoint thread, and sleep on the
 *       state futex until the state is no longer ST_SUSPENDED.  If a
 *       verification is due (verify_total != 0 && verify_count == 0), the
 *       main thread execs "mtcp_restart --verify" and other threads exit.
 *
 *   restoreinprog != 0 (restart):
 *       move from ST_SUSPENDED to ST_RUNENABLED and block in
 *       wait_for_all_restored() until every thread has been restored.
 *
 * The signum argument is not referenced by the body.
 */
static void stopthisthread (int signum)

{
  int rc;
  Thread *thread;
  /* Constants used only by the (disabled) backtrace dump below. */
#define BT_SIZE 1024
#define STDERR_FD 826
#define LOG_FD 826

#ifdef PTRACE
  ptrace_unlock_inferiors();
  ptrace_remove_notexisted();
  ptrace_detach_checkpoint_threads ();
  ptrace_detach_user_threads ();
#endif

  DPRINTF (("mtcp stopthisthread*: tid %d returns to %p\n",
            mtcp_sys_kernel_gettid (), __builtin_return_address (0)));

  thread = getcurrenthread (); // see which thread this is

  // If this is the checkpoint thread - exit immediately
  if ( mtcp_state_value(&thread -> state) == ST_CKPNTHREAD ) {
    return ;
  }

  /* Debugging aid, compiled in but never executed because of the "0 &&". */
  if (0 && thread == motherofall) {
#include <execinfo.h>
    void *buffer[BT_SIZE];
    int nptrs;

    DPRINTF (( "printing stacktrace of the motherofall Thread\n\n" ));
    nptrs = backtrace (buffer, BT_SIZE);
    backtrace_symbols_fd ( buffer, nptrs, STDERR_FD );
    backtrace_symbols_fd ( buffer, nptrs, LOG_FD );
  }
  if (mtcp_state_set (&(thread -> state), ST_SUSPINPROG, ST_SIGENABLED)) { // make sure we don't get called twice for same thread
    static int is_first_checkpoint = 1;

    save_sig_state (thread); // save signal state (and block signal delivery)
    save_tls_state (thread); // save thread local storage state

    /* Grow stack only on first ckpt.  Kernel agrees this is main stack and
     * will mmap it.  On second ckpt and later, we would segfault if we tried
     * to grow the former stack beyond the portion that is already mmap'ed.
     * On later checkpoints we only warn when the current stack position is
     * more than 3/4 of kbStack below the position recorded the first time.
     */
    if (thread == motherofall) {
      static char *orig_stack_ptr;
      int kbStack = 2048;
      if (is_first_checkpoint) {
        orig_stack_ptr = (char *)&kbStack;
        is_first_checkpoint = 0;
        DPRINTF(("mtcp_stopthisthread: temp. grow main stack by %d kilobytes\n",
                 kbStack));
        growstack(kbStack);
      } else if (orig_stack_ptr - (char *)&kbStack > 3 * kbStack*1024 / 4) {
        mtcp_printf("WARNING: Stack within %d bytes of end;\n"
                    " Consider increasing 'kbStack' at line %d of mtcp/%s\n",
                    kbStack*1024/4, __LINE__-9, __FILE__);
      }
    }

    ///JA: new code ported from v54b
    /* Capture this thread's register context.  NOTE(review): the restart
     * branch below is presumably reached by resuming this saved context;
     * the code that resumes it is not in this function.
     */
    rc = getcontext (&(thread -> savctx));
    if (rc < 0) {
      mtcp_printf ("mtcp stopthisthread: getcontext rc %d errno %d\n",
                   rc, errno);
      mtcp_abort ();
    }
    DPRINTF (("mtcp stopthisthread*: after getcontext\n"));
    if (mtcp_state_value(&restoreinprog) == 0) {

      /* We are the original process and all context is saved
       * restoreinprog is 0 ; wait for ckpt thread to write ckpt, and resume.
       */

      WMB; // matched by RMB in checkpointhread

      /* Next comes the first time we use the old stack. */
      /* Tell the checkpoint thread that we're all saved away */
      if (!mtcp_state_set (&(thread -> state), ST_SUSPENDED, ST_SUSPINPROG))
        mtcp_abort (); // tell checkpointhread all our context is saved
      mtcp_state_futex (&(thread -> state), FUTEX_WAKE, 1, NULL); // wake checkpoint thread if it's waiting for me

      /* Then we wait for the checkpoint thread to write the checkpoint file then wake us up */

      DPRINTF (("mtcp stopthisthread*: thread %d suspending\n", thread -> tid));
      /* Loop so that a wakeup with the state still ST_SUSPENDED just waits again. */
      while (mtcp_state_value(&thread -> state) == ST_SUSPENDED) {
        mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SUSPENDED, NULL);
      }

#ifdef PTRACE
      DPRINTF (("mtcp stopthisthread*: thread %d after suspending before deleting files\n", thread -> tid));
      delete_file(0, delete_ptrace_leader, has_ptrace_file);
      delete_file(1, delete_setoptions_leader, has_setoptions_file);
      delete_file(2, delete_checkpoint_leader, has_checkpoint_file);
      ptrace_attach_threads(0);
#endif

      /* Maybe there is to be a checkpoint verification.  If so, and we're the main */
      /* thread, exec the restore program.  If so and we're not the main thread, exit. */

      if ((verify_total != 0) && (verify_count == 0)) {

        /* If not the main thread, exit.  Either normal exit() or _exit()
         * seems to cause other threads to exit.
         */

        if (thread != motherofall) {
          mtcp_sys_exit(0);
        }

        /* This is the main thread, verify checkpoint then restart by doing
         * a restart.
         * The restore will rename the file after it has done the restart.
         */

        DPRINTF (("mtcp checkpointeverything*: verifying checkpoint...\n"));
        execlp ("mtcp_restart", "mtcp_restart", "--verify", temp_checkpointfilename, NULL);
        /* Only reached if execlp failed. */
        mtcp_printf ("mtcp checkpointeverything: error execing mtcp_restart %s: %s\n", temp_checkpointfilename, strerror (errno));
        mtcp_abort ();
      }

      /* No verification, resume where we left off */

      DPRINTF (("mtcp stopthisthread*: thread %d resuming\n", thread -> tid));
    }

    /* Else restoreinprog >= 1;  This stuff executes to do a restart */

    else {
      if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_SUSPENDED))
        mtcp_abort (); // checkpoint was written when thread in SUSPENDED state
      wait_for_all_restored ();
      DPRINTF (("mtcp stopthisthread*: thread %d restored\n", thread -> tid));

      if (thread == motherofall) {

        /* If we're a restore verification, rename the temp file
         * over the permanent one
         */

        if (mtcp_restore_verify) renametempoverperm ();
      }

#ifdef PTRACE
      ptrace_attach_threads(1);
#endif
    }
  }
  DPRINTF (("mtcp stopthisthread*: tid %d returning to %p\n",
            mtcp_sys_kernel_gettid (), __builtin_return_address (0)));
#ifdef PTRACE
  ptrace_lock_inferiors();
#endif
}
2775
2776 /********************************************************************************************************************************/
2777 /* */
2778 /* Wait for all threads to finish restoring their context, then release them all to continue on their way. */
2779 /* */
2780 /* Input: */
2781 /* */
/* restoreinprog = number of threads, including this one, that haven't called 'wait_for_all_restored' yet */
2783 /* thread list locked */
2784 /* */
2785 /* Output: */
2786 /* */
2787 /* restoreinprog = decremented */
2788 /* if now zero, all threads woken and thread list unlocked */
2789 /* */
2790 /********************************************************************************************************************************/
2791
/* Restart-time barrier: atomically decrement restoreinprog and either
 * release everybody (if this call brought the count to zero) or sleep on the
 * restoreinprog futex until the count is no longer positive.
 *
 * The thread that brings the count to zero additionally, in this order:
 *   1. sends the process every signal recorded in sigpending_global,
 *   2. runs callback_restore_virtual_pid_table if it is set,
 *   3. wakes all waiters on restoreinprog,
 *   4. calls mtcpHookRestart(),
 *   5. unlocks the thread list.
 */
static void wait_for_all_restored (void)

{
  int rip;

  /* Compare-and-swap loop: retry until rip -> rip - 1 is applied atomically. */
  do rip = mtcp_state_value(&restoreinprog); // dec number of threads cloned but not completed longjmp'ing
  while (!mtcp_state_set (&restoreinprog, rip - 1, rip));
  if (-- rip == 0) {

    /* raise the signals which were pending for the entire process at the time
     * of checkpoint. It is assumed that if a signal is pending for all threads
     * including the ckpt-thread, then it was sent to the process as opposed to
     * sent to individual threads.
     */
    int i;
    for (i = NSIG; i > 0; --i) {
      if (sigismember(&sigpending_global, i) == 1) {
        kill(getpid(), i);
      }
    }

    if (callback_restore_virtual_pid_table != NULL) {
      DPRINTF(("Before callback_restore_virtual_pid_table: Thread:%d \n",
               mtcp_sys_kernel_gettid()));
      (*callback_restore_virtual_pid_table)();
      DPRINTF(("After callback_restore_virtual_pid_table: Thread:%d \n",
               mtcp_sys_kernel_gettid()));
    }

    mtcp_state_futex (&restoreinprog, FUTEX_WAKE, 999999999, NULL); // if this was last of all, wake everyone up

    // NOTE:  This is last safe moment for hook.  All previous threads
    //   have executed the "else" and are waiting on the futex.
    //   This last thread has not yet unlocked the threads: unlk_threads()
    //   So, no race condition occurs.
    //   By comparison, *callback_post_ckpt() is called before creating
    //   additional user threads.  Only motherofall (checkpoint thread existed)
    /* call weak symbol of this file, possibly overridden by the user's strong symbol */
    /* user must compile his/her code with -Wl,-export-dynamic to make it visible */
    mtcpHookRestart();
    unlk_threads (); // ... and release the thread list
  } else {
    /* Re-read the counter on every iteration and wait on the value just read. */
    while ((rip = mtcp_state_value(&restoreinprog)) > 0) { // otherwise, wait for last of all to wake this one up
      mtcp_state_futex (&restoreinprog, FUTEX_WAIT, rip, NULL);
    }
  }
}
2839
2840 /********************************************************************************************************************************/
2841 /* */
2842 /* Save signal mask and list of pending signals delivery */
2843 /* */
2844 /********************************************************************************************************************************/
2845
2846 static void save_sig_state (Thread *thisthread)
2847 {
2848 /* For checkpoint thread, we want to block delivery of all but some special signals*/
2849 if (thisthread == ckpthread) {
2850 /*
2851 * For the checkpoint thread, we should not block SIGSETXID which is used
2852 * by the setsid family of system calls to change the session leader. Glibc
2853 * uses this signal to notify the process threads of the change in session
2854 * leader information. This signal is not documented and is used internally
2855 * by glibc. It is defined in <glibc-src-root>/nptl/pthreadP.h
2856 * screen was getting affected by this since it used setsid to change the
2857 * session leaders.
2858 */
2859 #define SIGSETXID (__SIGRTMIN + 1)
2860 sigset_t set;
2861
2862 sigfillset(&set);
2863 sigdelset(&set, SIGSETXID);
2864
2865 if (pthread_sigmask(SIG_SETMASK, &set, NULL) < 0) {
2866 mtcp_printf("mtcp %s: error getting sigal mask: %s\n",
2867 __FUNCTION__, strerror(errno));
2868 mtcp_abort ();
2869 }
2870 }
2871 // Save signal block mask
2872 if (pthread_sigmask (SIG_SETMASK, NULL, &(thisthread -> sigblockmask)) < 0) {
2873 mtcp_printf("mtcp %s: error getting sigal mask: %s\n",
2874 __FUNCTION__, strerror(errno));
2875 mtcp_abort ();
2876 }
2877
2878 // Save pending signals
2879 sigpending ( &(thisthread->sigpending) );
2880 }
2881
2882 /********************************************************************************************************************************/
2883 /* */
2884 /* Restore signal mask and all pending signals */
2885 /* */
2886 /********************************************************************************************************************************/
2887
2888 static void restore_sig_state (Thread *thisthread)
2889 {
2890 int i;
2891 DPRINTF (("mtcp restore_sig_state*: restoring handlers for thread %d\n",
2892 thisthread->original_tid));
2893 if (pthread_sigmask (SIG_SETMASK, &(thisthread -> sigblockmask), NULL) < 0) {
2894 mtcp_printf("mtcp %s: error setting sigal mask: %s\n",
2895 __FUNCTION__, strerror(errno));
2896 mtcp_abort ();
2897 }
2898
2899 // Raise the signals which were pending for only this thread at the time of checkpoint.
2900 for (i = NSIG; i > 0; --i) {
2901 if (sigismember(&(thisthread -> sigpending), i) == 1 &&
2902 sigismember(&(thisthread -> sigblockmask), i) == 1 &&
2903 sigismember(&(sigpending_global), i) == 0) {
2904 raise(i);
2905 }
2906 }
2907 }
2908
2909 /********************************************************************************************************************************/
2910 /* */
2911 /* Save all signal handlers */
2912 /* */
2913 /********************************************************************************************************************************/
2914 static void save_sig_handlers (void)
2915 {
2916 int i;
2917
2918 if (dmtcp_exists) {
2919 mtcp_printf("mtcp:%s Illegal function call when running under DMTCP*****\n",
2920 __FUNCTION__);
2921 // Do a simple return instead of killing the process
2922 return;
2923 //mtcp_abort();
2924 }
2925
2926 /* Now save all the signal handlers */
2927 DPRINTF (("mtcp save_sig_handlers*: saving signal handlers\n"));
2928 for (i = NSIG; i > 0; --i) {
2929 if (_real_sigaction (i, NULL, &sigactions[i]) < 0) {
2930 if (errno == EINVAL)
2931 memset (&sigactions[i], 0, sizeof sigactions[i]);
2932 else {
2933 mtcp_printf ("mtcp save_sig_handlers: error saving signal %d action: %s\n",
2934 i, strerror(errno));
2935 mtcp_abort ();
2936 }
2937 }
2938
2939 DPRINTF (("mtcp save_sig_handlers*: saving signal handler for %d -> %p\n",
2940 i,
2941 (sigactions[i].sa_flags & SA_SIGINFO ?
2942 (void *)(sigactions[i].sa_sigaction) :
2943 (void *)(sigactions[i].sa_handler)) ));
2944 }
2945 }
2946
2947 /********************************************************************************************************************************/
2948 /* */
2949 /* Restore all saved signal handlers */
2950 /* */
2951 /********************************************************************************************************************************/
2952 static void restore_sig_handlers (Thread *thisthread)
2953 {
2954 int i;
2955
2956 if (dmtcp_exists) {
2957 mtcp_printf("mtcp:%s Illegal function when running under DMTCP*****\n",
2958 __FUNCTION__);
2959 // Do a simple return instead of killing the process
2960 return;
2961 //mtcp_abort();
2962 }
2963
2964 DPRINTF (("mtcp restore_sig_handlers*: restoring signal handlers\n"));
2965 #if 0
2966 # define VERBOSE_DEBUG
2967 #endif
2968 for(i = NSIG; i > 0; --i) {
2969 #ifdef VERBOSE_DEBUG
2970 DPRINTF (("mtcp restore_sig_handlers*: restore signal handler for %d -> %p\n",
2971 i,
2972 (sigactions[i].sa_flags & SA_SIGINFO ?
2973 sigactions[i].sa_sigaction :
2974 sigactions[i].sa_handler) ));
2975 #endif
2976
2977 if (_real_sigaction(i, &sigactions[i], NULL) < 0) {
2978 if (errno != EINVAL) {
2979 mtcp_printf ("mtcp restore_sig_handlers:" \
2980 " error restoring signal %d handler: %s\n",
2981 i, strerror(errno));
2982 mtcp_abort ();
2983 }
2984 }
2985 }
2986 }
2987
2988 /********************************************************************************************************************************/
2989 /* */
2990 /* Save state necessary for TLS restore */
2991 /* Linux saves stuff in the GDT, switching it on a per-thread basis */
2992 /* */
2993 /********************************************************************************************************************************/
2994
2995 static void save_tls_state (Thread *thisthread)
2996
2997 {
2998 int i, rc;
2999
3000 #ifdef __i386__
3001 asm volatile ("movw %%fs,%0" : "=m" (thisthread -> fs));
3002 asm volatile ("movw %%gs,%0" : "=m" (thisthread -> gs));
3003 #endif
3004 #ifdef __x86_64__
3005 //asm volatile ("movl %%fs,%0" : "=m" (thisthread -> fs));
3006 //asm volatile ("movl %%gs,%0" : "=m" (thisthread -> gs));
3007 #endif
3008
3009 memset (thisthread -> gdtentrytls, 0, sizeof thisthread -> gdtentrytls);
3010
3011 /* On older Linuxes, we must save several GDT entries available to threads. */
3012
3013 #if MTCP__SAVE_MANY_GDT_ENTRIES
3014 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i ++) {
3015 thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN].entry_number = i;
3016 rc = mtcp_sys_get_thread_area (&(thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN]));
3017 if (rc < 0) {
3018 mtcp_printf ("mtcp checkpointeverything: error saving GDT TLS entry[%d]: %s\n", i, strerror (mtcp_sys_errno));
3019 mtcp_abort ();
3020 }
3021 }
3022
3023 /* With newer Linuxes, we just save the one GDT entry indexed by GS so we don't need the GDT_ENTRY_TLS_... definitions. */
3024 /* We get the particular index of the GDT entry to save by reading GS. */
3025
3026 #else
3027 i = thisthread -> TLSSEGREG / 8;
3028 thisthread -> gdtentrytls[0].entry_number = i;
3029 rc = mtcp_sys_get_thread_area (&(thisthread -> gdtentrytls[0]));
3030 if (rc < 0) {
3031 mtcp_printf ("mtcp checkpointeverything: error saving GDT TLS entry[%d]: %s\n", i, strerror (mtcp_sys_errno));
3032 mtcp_abort ();
3033 }
3034 #endif
3035 }
3036
/* Find the first occurrence of the len-byte sequence at 'subarray' that
 * starts within the first 2048 bytes of 'array'.
 *
 * Returns a pointer to the start of the match inside 'array', or NULL if no
 * candidate position matches.  Aborts via mtcp_abort() when len is smaller
 * than sizeof(int) (this now also covers a negative len), and does so before
 * any byte of 'subarray' is read.
 *
 * The comparison is done byte by byte, so neither buffer has to be aligned
 * for int access.  A match that starts near the end of the 2048-byte window
 * is still compared for its full length, so the caller must make sure that
 * 'array' is readable that far.
 */
static char *memsubarray (char *array, char *subarray, int len) {
  char *candidate;
  int j;

  // Assume subarray length is at least sizeof(int) and < 2048.
  if (len < (int)sizeof(int))
    mtcp_abort();
  for (candidate = array; candidate < array+2048; candidate++) {
    for (j = 0; j < len; j++)
      if (candidate[j] != subarray[j])
        break;
    if (j == len)   /* every byte matched */
      return candidate;
  }
  return NULL;
}
/* Return the current value of the segment register read for TLS purposes:
 * %gs when compiled for i386, %fs when compiled for x86_64.
 * If neither __i386__ nor __x86_64__ is defined, tlssegreg is never
 * assigned and the returned value is indeterminate.
 */
static int mtcp_get_tls_segreg(void)
{ mtcp_segreg_t tlssegreg;
#ifdef __i386__
  asm volatile ("movw %%gs,%0" : "=g" (tlssegreg)); /* any general register */
#endif
#ifdef __x86_64__
  asm volatile ("movl %%fs,%0" : "=q" (tlssegreg)); /* q = a,b,c,d for i386; 8 low bits of r class reg for x86_64 */
#endif
  return (int)tlssegreg;
}
3065 static void *mtcp_get_tls_base_addr(void)
3066 {
3067 struct user_desc gdtentrytls;
3068
3069 #if MTCP__SAVE_MANY_GDT_ENTRIES
3070 if (mtcp_get_tls_segreg() / 8 != GDT_ENTRY_TLS_MIN) {
3071 mtcp_printf ("mtcp_init: gs %X not set to first TLS GDT ENTRY %X\n",
3072 gs, GDT_ENTRY_TLS_MIN * 8 + 3);
3073 mtcp_abort ();
3074 }
3075 #endif
3076
3077 gdtentrytls.entry_number = mtcp_get_tls_segreg() / 8;
3078 if ( mtcp_sys_get_thread_area ( &gdtentrytls ) < 0 ) {
3079 mtcp_printf ("mtcp_init: error getting GDT TLS entry: %s\n",
3080 strerror (mtcp_sys_errno));
3081 mtcp_abort ();
3082 }
3083 return (void *)(*(unsigned long *)&(gdtentrytls.base_addr));
3084 }
3085
3086 static void renametempoverperm (void)
3087
3088 {
3089 if (rename (temp_checkpointfilename, perm_checkpointfilename) < 0) {
3090 mtcp_printf ("mtcp checkpointeverything: error renaming %s to %s: %s\n", temp_checkpointfilename, perm_checkpointfilename,
3091 strerror (errno));
3092 mtcp_abort ();
3093 }
3094 }
3095
3096 /********************************************************************************************************************************/
3097 /* */
3098 /* Get current thread struct pointer */
3099 /* It is keyed by the calling thread's gettid value */
3100 /* Maybe improve someday by using TLS */
3101 /* */
3102 /********************************************************************************************************************************/
3103
3104 static Thread *getcurrenthread (void)
3105
3106 {
3107 int tid;
3108 Thread *thread;
3109
3110 tid = mtcp_sys_kernel_gettid ();
3111 lock_threads ();
3112 for (thread = threads; thread != NULL; thread = thread -> next) {
3113 if (thread -> tid == tid) {
3114 unlk_threads ();
3115 return (thread);
3116 }
3117 }
3118 mtcp_printf ("mtcp getcurrenthread: can't find thread id %d\n", tid);
3119 mtcp_abort ();
3120 return thread; /* NOTREACHED : stop compiler warning */
3121 }
3122
3123 /********************************************************************************************************************************/
3124 /* */
3125 /* Lock and unlock the 'threads' list */
3126 /* */
3127 /********************************************************************************************************************************/
3128
3129 static void lock_threads (void)
3130
3131 {
3132 while (!mtcp_state_set (&threadslocked, 1, 0)) {
3133 mtcp_state_futex (&threadslocked, FUTEX_WAIT, 1, NULL);
3134 }
3135 RMB; // don't prefetch anything until we have the lock
3136 }
3137
/* Release the lock protecting the 'threads' list: switch threadslocked from
 * 1 back to 0 and wake one thread waiting on its futex.
 * The result of the 1 -> 0 switch is not checked, so calling this without
 * holding the lock goes unnoticed (see FIXME below).
 */
static void unlk_threads (void)

{
  WMB; // flush data written before unlocking
  // FIXME: Should we be checking return value of mtcp_state_set? Can it ever fail?
  mtcp_state_set(&threadslocked , 0, 1);
  mtcp_state_futex (&threadslocked, FUTEX_WAKE, 1, NULL);
}
3146
3147 /********************************************************************************************************************************/
3148 /* */
3149 /* Read /proc/self/maps line, converting it to an Area descriptor struct */
3150 /* */
3151 /* Input: */
3152 /* */
3153 /* mapsfd = /proc/self/maps file, positioned to beginning of a line */
3154 /* */
3155 /* Output: */
3156 /* */
3157 /* readmapsline = 0 : was at end-of-file, nothing read */
3158 /* 1 : read and processed one line */
3159 /* *area = filled in */
3160 /* */
3161 /* Note: */
3162 /* */
/* Line from /proc/self/maps is in form: */
3164 /* */
3165 /* <startaddr>-<endaddrexclusive> rwxs <fileoffset> <devmaj>:<devmin> <inode> <filename>\n */
3166 /* all numbers in hexadecimal except inode is in decimal */
3167 /* anonymous will be shown with offset=devmaj=devmin=inode=0 and no ' filename' */
3168 /* */
3169 /********************************************************************************************************************************/
3170
3171 static int readmapsline (int mapsfd, Area *area)
3172
3173 {
3174 char c, rflag, sflag, wflag, xflag;
3175 int i, rc;
3176 struct stat statbuf;
3177 VA devmajor, devminor, devnum, endaddr, inodenum, startaddr;
3178
3179 c = mtcp_readhex (mapsfd, &startaddr);
3180 if (c != '-') {
3181 if ((c == 0) && (startaddr == 0)) return (0);
3182 goto skipeol;
3183 }
3184 c = mtcp_readhex (mapsfd, &endaddr);
3185 if (c != ' ') goto skipeol;
3186 if (endaddr < startaddr) goto skipeol;
3187
3188 rflag = c = mtcp_readchar (mapsfd);
3189 if ((c != 'r') && (c != '-')) goto skipeol;
3190 wflag = c = mtcp_readchar (mapsfd);
3191 if ((c != 'w') && (c != '-')) goto skipeol;
3192 xflag = c = mtcp_readchar (mapsfd);
3193 if ((c != 'x') && (c != '-')) goto skipeol;
3194 sflag = c = mtcp_readchar (mapsfd);
3195 if ((c != 's') && (c != 'p')) goto skipeol;
3196
3197 c = mtcp_readchar (mapsfd);
3198 if (c != ' ') goto skipeol;
3199
3200 c = mtcp_readhex (mapsfd, &devmajor);
3201 if (c != ' ') goto skipeol;
3202 area -> offset = devmajor;
3203
3204 c = mtcp_readhex (mapsfd, &devmajor);
3205 if (c != ':') goto skipeol;
3206 c = mtcp_readhex (mapsfd, &devminor);
3207 if (c != ' ') goto skipeol;
3208 c = mtcp_readdec (mapsfd, &inodenum);
3209 area -> name[0] = '\0';
3210 while (c == ' ') c = mtcp_readchar (mapsfd);
3211 if (c == '/' || c == '[') { /* absolute pathname, or [stack], [vdso], etc. */
3212 i = 0;
3213 do {
3214 area -> name[i++] = c;
3215 if (i == sizeof area -> name) goto skipeol;
3216 c = mtcp_readchar (mapsfd);
3217 } while (c != '\n');
3218 area -> name[i] = '\0';
3219 }
3220 if (mtcp_strstartswith(area -> name, nscd_mmap_str) ||
3221 mtcp_strstartswith(area -> name, nscd_mmap_str2) ||
3222 mtcp_strstartswith(area -> name, nscd_mmap_str3)) {
3223 /* if nscd is active */
3224 } else if ( mtcp_strstartswith(area -> name, sys_v_shmem_file) ) {
3225 /* System V Shared-Memory segments are handled by DMTCP. */
3226 } else if ( mtcp_strendswith(area -> name, " (deleted)") ) {
3227 /* Deleted File */
3228 } else if (area -> name[0] == '/') { /* if an absolute pathname */
3229 rc = stat (area -> name, &statbuf);
3230 if (rc < 0) {
3231 mtcp_printf ("ERROR: mtcp readmapsline: error %d statting %s\n",
3232 -rc, area -> name);
3233 return (1); /* 0 would mean last line of maps; could do mtcp_abort() */
3234 }
3235 devnum = makedev (devmajor, devminor);
3236 if ((devnum != statbuf.st_dev) || (inodenum != statbuf.st_ino)) {
3237 mtcp_printf ("ERROR: mtcp readmapsline: image %s dev:inode %X:%u"
3238 " not eq maps %X:%u\n",
3239 area -> name, statbuf.st_dev, statbuf.st_ino,
3240 devnum, inodenum);
3241 return (1); /* 0 would mean last line of maps; could do mtcp_abort() */
3242 }
3243 } else {
3244 /* Special area like [heap] or anonymous area. */
3245 }
3246
3247 if (c != '\n') goto skipeol;
3248
3249 area -> addr = (void *)startaddr;
3250 area -> size = endaddr - startaddr;
3251 area -> prot = 0;
3252 if (rflag == 'r') area -> prot |= PROT_READ;
3253 if (wflag == 'w') area -> prot |= PROT_WRITE;
3254 if (xflag == 'x') area -> prot |= PROT_EXEC;
3255 area -> flags = MAP_FIXED;
3256 if (sflag == 's') area -> flags |= MAP_SHARED;
3257 if (sflag == 'p') area -> flags |= MAP_PRIVATE;
3258 if (area -> name[0] == '\0') area -> flags |= MAP_ANONYMOUS;
3259
3260 return (1);
3261
3262 skipeol:
3263 DPRINTF (("ERROR: mtcp readmapsline*: bad maps line <%c", c));
3264 while ((c != '\n') && (c != '\0')) {
3265 c = mtcp_readchar (mapsfd);
3266 mtcp_printf ("%c", c);
3267 }
3268 mtcp_printf (">\n");
3269 mtcp_abort ();
3270 return (0); /* NOTREACHED : stop compiler warning */
3271 }
3272
3273 /********************************************************************************************************************************/
3274 /* */
3275 /* Do restore from checkpoint file */
/* This routine is called from the mtcp_restart program to perform the restore */
3277 /* It resides in the libmtcp.so image in exactly the same spot that the checkpointed process had its libmtcp.so loaded at, so this */
3278 /* can't possibly interfere with restoring the checkpointed process */
3279 /* The restore can't use malloc because that might create memory sections. */
3280 /* Strerror seems to mess up with its Locale stuff in here too. */
3281 /* */
3282 /* Input: */
3283 /* */
3284 /* fd = checkpoint file, positioned just after the CS_RESTOREIMAGE data */
3285 /* */
3286 /********************************************************************************************************************************/
3287
/* STRINGS is only referenced in the non-x86_64 code below, so it is marked
 * unused for x86_64 builds.
 */
#ifdef __x86_64__
# define UNUSED_IN_64_BIT __attribute__ ((unused))
#else
# define UNUSED_IN_64_BIT
#endif

/* Static storage for copies of the restart command line and environment. */
#define STRINGS_LEN 10000
static char UNUSED_IN_64_BIT STRINGS[STRINGS_LEN];

/* Entry point of the restore: stash the arguments in static storage, switch
 * to the temporary stack (tempstack) and call mtcp_restoreverything(), which
 * is not expected to return.
 *
 *   fd             checkpoint file descriptor   -> mtcp_restore_cpfd
 *   verify         verification flag            -> mtcp_restore_verify
 *   gzip_child_pid                              -> mtcp_restore_gzip_child_pid
 *   ckpt_newname   NUL-terminated name, copied  -> mtcp_ckpt_newname
 *   cmd_file, argv, envp
 *                  copied into STRINGS (non-x86_64 builds only) so the
 *                  restart can be re-exec'ed
 *
 * NOTE(review): the copy of ckpt_newname is not length-checked here; the
 * size of mtcp_ckpt_newname is declared outside this block.
 */
void mtcp_restore_start (int fd, int verify, pid_t gzip_child_pid,char *ckpt_newname,
                         char *cmd_file, char *argv[], char *envp[] )

{
#ifndef __x86_64__
  int i;
  char *strings = STRINGS;
#endif

  DEBUG_RESTARTING = 1;
  /* If we just replace extendedStack by (tempstack+STACKSIZE) in "asm"
   * below, the optimizer generates non-PIC code if it's not -O0 - Gene
   */
  long long * extendedStack = tempstack + STACKSIZE;

  /* Not used until we do longjmps, but get it out of the way now */

  // FIXME: Should we be checking return value of mtcp_state_set? Can it ever fail?
  mtcp_state_set(&restoreinprog ,1, 0);

  mtcp_sys_gettimeofday (&restorestarted, NULL);

  /* Save parameter away in a static memory location as we're about to wipe the stack */

  mtcp_restore_cpfd = fd;
  mtcp_restore_verify = verify;
  mtcp_restore_gzip_child_pid = gzip_child_pid;
  // Copy newname to save it too
  {
    int i;
    for(i=0;ckpt_newname[i];i++){
      mtcp_ckpt_newname[i] = ckpt_newname[i];
    }
    mtcp_ckpt_newname[i] = '\0';
  }


#ifndef __x86_64__
  // Copy command line to libmtcp.so, so that we can re-exec if randomized vdso
  //   steps on us.  This won't be needed when we use the linker to map areas.
  strings = STRINGS;
  // This version of STRCPY copies source string into STRINGS,
  //   and sets destination string to point there.
  // It only checks that 256 bytes remain in STRINGS, not the actual length
  //   of the source; when space has run out the destination is left unset.
# define STRCPY(x,y) \
  if (strings + 256 < STRINGS + STRINGS_LEN) { \
    mtcp_sys_strcpy(strings,y); \
    x = strings; \
    strings += mtcp_sys_strlen(y) + 1; \
  } else { \
    DPRINTF(("MTCP:  ran out of string space." \
             "  Trying to continue anyway\n")); \
  }
  STRCPY(mtcp_restore_cmd_file, cmd_file);
  for (i = 0; argv[i] != NULL; i++) {
    STRCPY(mtcp_restore_argv[i], argv[i]);
  }
  mtcp_restore_argv[i] = NULL;
  for (i = 0; envp[i] != NULL; i++) {
    STRCPY(mtcp_restore_envp[i], envp[i]);
  }
  mtcp_restore_envp[i] = NULL;
#endif

  /* Switch to a stack area that's part of the shareable's memory address range
   * and thus not used by the checkpointed program
   */

  asm volatile (CLEAN_FOR_64_BIT(mov %0,%%esp\n\t)
                /* This next assembly language confuses gdb,
                   but seems to work fine anyway */
                CLEAN_FOR_64_BIT(xor %%ebp,%%ebp\n\t)
                : : "g" (extendedStack) : "memory");

  /* Once we're on the new stack, we can't access any local variables or parameters */
  /* Call the restoreverything to restore files and memory areas */

  /* This should never return */
  mtcp_restoreverything();
  /* Stop here if it returned anyway. */
  asm volatile ("hlt");
}
3376
3377
3378 /********************************************************************************************************************************/
3379 /* */
3380 /* Restore proper heap */
3381 /* */
3382 /********************************************************************************************************************************/
3383 static void restore_heap()
3384 {
3385 /*
3386 * If the original start of heap is lower than the current end of heap, we
3387 * want to mmap the area between mtcp_saved_break and current break. This
3388 * happens when the size of checkpointed program is smaller then the size of
3389 * mtcp_restart program.
3390 */
3391 void* current_break = mtcp_sys_brk (NULL);
3392 if (current_break > mtcp_saved_break) {
3393 DPRINTF(("mtcp finishrestore: Area between mtcp_saved_break:%p and "
3394 "Current_break:%p not mapped, mapping it now\n",
3395 mtcp_saved_break, current_break));
3396 size_t oldsize = mtcp_saved_break - saved_heap_start;
3397 size_t newsize = current_break - saved_heap_start;
3398
3399 void* addr = mremap (saved_heap_start, oldsize, newsize, 0);
3400 if (addr == NULL) {
3401 mtcp_printf("mtcp finishrestore: mremap failed to map area between "
3402 "mtcp_saved_break (%p) and current_break (%p)\n",
3403 mtcp_saved_break, current_break);
3404 mtcp_abort();
3405 }
3406 }
3407 }
3408
3409 /********************************************************************************************************************************/
3410 /* */
3411 /* The original program's memory and files have been restored */
3412 /* */
3413 /********************************************************************************************************************************/
3414
/* Final stage of a restore, entered once the original program's memory and
 * files are back in place.  In order, it:
 *   - extends the heap mapping if needed (restore_heap),
 *   - adopts mtcp_ckpt_newname as the checkpoint file name if it is
 *     non-empty and differs from perm_checkpointfilename, and derives the
 *     temporary name by appending ".temp",
 *   - reports the elapsed restore time,
 *   - records the new process id in motherpid,
 *   - switches the stack pointer to 128 bytes below motherofall's saved
 *     stack pointer and calls restarthread(motherofall).
 */
static void finishrestore (void)
{
  struct timeval stopped;
  int nnamelen;

  DPRINTF (("mtcp finishrestore*: mtcp_printf works; libc should work\n"));

  restore_heap();

  if ( (nnamelen = strlen(mtcp_ckpt_newname))
       && strcmp(mtcp_ckpt_newname,perm_checkpointfilename) ) {
    // we start from different place - change it!
    DPRINTF(("mtcp finishrestore*: checkpoint file name was changed\n"));
    if (strlen(mtcp_ckpt_newname) >= MAXPATHLEN) {
      mtcp_printf("mtcp finishrestore: new ckpt file name (%s) too long (>=512 bytes)\n",
                  mtcp_ckpt_newname);
      mtcp_abort();
    }
    strncpy(perm_checkpointfilename,mtcp_ckpt_newname,MAXPATHLEN);
    memcpy(temp_checkpointfilename,perm_checkpointfilename,MAXPATHLEN);
    // NOTE(review): the length check above does not reserve room for the
    //   ".temp" suffix; if fewer than 6 bytes remain, strncpy truncates the
    //   suffix and writes no terminating NUL.  Buffer sizes are declared
    //   outside this block -- confirm.
    strncpy(temp_checkpointfilename + nnamelen, ".temp",MAXPATHLEN - nnamelen);
  }

  mtcp_sys_gettimeofday (&stopped, NULL);
  /* Fold the whole-second difference into tv_usec: elapsed microseconds since restorestarted. */
  stopped.tv_usec += (stopped.tv_sec - restorestarted.tv_sec) * 1000000 - restorestarted.tv_usec;
  TPRINTF (("mtcp finishrestore*: time %u uS\n", stopped.tv_usec));

  /* Now we can access all our files and memory that existed at the time of the checkpoint */
  /* We are still on the temporary stack, though */

  /* Fill in the new mother process id */
  motherpid = mtcp_sys_getpid();

  /* Call another routine because our internal stack is whacked and we can't have local vars */

  ///JA: v54b port
  // so restarthread will have a big stack
  asm volatile (CLEAN_FOR_64_BIT(mov %0,%%esp)
                : : "g" (motherofall -> savctx.SAVEDSP - 128 ) : "memory"); // -128 for red zone
  restarthread (motherofall);
}
3456
3457 static int restarthread (void *threadv)
3458 {
3459 int rip;
3460 Thread *child;
3461 Thread *const thread = threadv;
3462 struct MtcpRestartThreadArg mtcpRestartThreadArg;
3463
3464 restore_tls_state (thread);
3465
3466
3467 if (thread == motherofall) {
3468 // Compute the set of signals which was pending for all the threads at the
3469 // time of checkpoint. This is a heuristic to compute the set of signals
3470 // which were pending for the entire process at the time of checkpoint.
3471 sigset_t tmp;
3472 sigfillset ( &tmp );
3473 Thread *th;
3474 for (th = threads; th != NULL; th = th -> next) {
3475 sigandset ( &sigpending_global, &tmp, &(th->sigpending) );
3476 tmp = sigpending_global;
3477 }
3478
3479 setup_sig_handler ();
3480
3481 set_tid_address (&(thread -> child_tid));
3482
3483 if (callback_post_ckpt != NULL) {
3484 DPRINTF(("mtcp finishrestore*: before callback_post_ckpt(1=restarting)"
3485 " (&%x,%x) \n",
3486 &callback_post_ckpt, callback_post_ckpt));
3487 (*callback_post_ckpt)(1);
3488 DPRINTF(("mtcp finishrestore*: after callback_post_ckpt(1=restarting)\n"));
3489 }
3490 /* Do it once only, in motherofall thread. */
3491
3492 restore_term_settings();
3493
3494 if (dmtcp_info_restore_working_directory
3495 && chdir(saved_working_directory) == -1) {
3496 perror("chdir");
3497 mtcp_abort ();
3498 }
3499
3500 /* DMTCP restores signal handlers. But if we are running standalone,
3501 * MTCP must do it.
3502 * Because signal handlers are per-process, we only do this once.
3503 */
3504 if (!dmtcp_exists)
3505 restore_sig_handlers(thread);
3506 }
3507
3508 restore_sig_state (thread);
3509
3510 for (child = thread -> children; child != NULL; child = child -> siblings) {
3511
3512 /* Increment number of threads created but haven't completed their longjmp */
3513
3514 do rip = mtcp_state_value(&restoreinprog);
3515 while (!mtcp_state_set (&restoreinprog, rip + 1, rip));
3516
3517 /* Create the thread so it can finish restoring itself. */
3518 /* Don't do CLONE_SETTLS (it'll puke). We do it later via restore_tls_state. */
3519
3520 ///JA: v54b port
3521 errno = -1;
3522
3523 void *clone_arg = (void *)child;
3524
3525 /*
3526 * DMTCP needs to know original_tid of the thread being created by the
3527 * following clone() call.
3528 *
3529 * Threads are created by using syscall which is intercepted by DMTCP and
3530 * the original_tid is sent to DMTCP as a field of MtcpRestartThreadArg
3531 * structure. DMTCP will automatically extract the actual argument
3532 * (clone_arg -> arg) from clone_arg and will pass it on to the real
3533 * clone call.
3534 * (--Kapil)
3535 */
3536 mtcpRestartThreadArg.arg = (void *)child;
3537 mtcpRestartThreadArg.original_tid = child -> original_tid;
3538 clone_arg = (void *) &mtcpRestartThreadArg;
3539
3540 /*
3541 * syscall is wrapped by DMTCP when configured with PID-Virtualization.
3542 * It calls __clone which goes to DMTCP:__clone which then calls MTCP:__clone.
3543 * DMTCP:__clone checks for tid-conflict with any original tid. If
3544 * conflict, it replaces the thread with a new one with a new tid.
3545 * DMTCP:__clone wrapper calls the glibc:__clone if the computation is not
3546 * in RUNNING state (must be restarting), it calls the mtcp:__clone otherwise.
3547 * IF No PID-Virtualization, call glibc:__clone because threads created
3548 * during mtcp_restart should not go to MTCP:__clone; MTCP remembers those
3549 * threads from the checkpoint image.
3550 */
3551
3552 /* If running under DMTCP */
3553 pid_t tid;
3554 if (dmtcp_info_pid_virtualization_enabled == 1) {
3555 tid = syscall(SYS_clone, restarthread,
3556 (void *)(child -> savctx.SAVEDSP - 128), // -128 for red zone
3557 (child -> clone_flags & ~CLONE_SETTLS) | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
3558 clone_arg, child -> parent_tidptr, NULL, child -> actual_tidptr);
3559 } else {
3560 tid = ((*clone_entry)( restarthread,
3561 (void *)(child -> savctx.SAVEDSP - 128), // -128 for red zone
3562 (child -> clone_flags & ~CLONE_SETTLS) | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
3563 child, child -> parent_tidptr, NULL, child -> actual_tidptr));
3564 }
3565
3566 if (tid < 0) {
3567 mtcp_printf ("mtcp restarthread: error %d recreating thread\n", errno);
3568 mtcp_printf ("mtcp restarthread: clone_flags %X, savedsp %p\n",
3569 child -> clone_flags, child -> savctx.SAVEDSP);
3570 mtcp_abort ();
3571 }
3572 DPRINTF((" Parent:%d, tid of newly created thread:%d\n\n", thread->tid, tid));
3573 }
3574
3575 /* All my children have been created, jump to the stopthisthread routine just after getcontext call */
3576 /* Note that if this is the restored checkpointhread, it jumps to the checkpointhread routine */
3577
3578 if (mtcp_have_thread_sysinfo_offset())
3579 mtcp_set_thread_sysinfo(saved_sysinfo);
3580 ///JA: v54b port
3581 DPRINTF (("mtcp restarthread*: calling setcontext: thread->tid: %d, original_tid:%d\n",
3582 thread->tid, thread->original_tid));
3583 setcontext (&(thread -> savctx)); /* Shouldn't return */
3584 mtcp_abort ();
3585 return (0); /* NOTREACHED : stop compiler warning */
3586 }
3587
3588 /********************************************************************************************************************************/
3589 /* */
3590 /* Restore the GDT entries that are part of a thread's state */
3591 /* */
3592 /* The kernel provides set_thread_area system call for a thread to alter a particular range of GDT entries, and it switches */
3593 /* those entries on a per-thread basis. So from our perspective, this is per-thread state that is saved outside user */
3594 /* addressable memory that must be manually saved. */
3595 /* */
3596 /********************************************************************************************************************************/
3597
3598 static void restore_tls_state (Thread *thisthread)
3599
3600 {
3601 int rc;
3602 #if MTCP__SAVE_MANY_GDT_ENTRIES
3603 int i;
3604 #endif
3605
3606 /* The assumption that this points to the pid was checked by that tls_pid crap near the beginning */
3607
3608 *(pid_t *)(*(unsigned long *)&(thisthread -> gdtentrytls[0].base_addr) + TLS_PID_OFFSET()) = motherpid;
3609
3610 /* Likewise, we must jam the new pid into the mother thread's tid slot (checked by tls_tid carpola) */
3611
3612 if (thisthread == motherofall) {
3613 *(pid_t *)(*(unsigned long *)&(thisthread -> gdtentrytls[0].base_addr) + TLS_TID_OFFSET()) = motherpid;
3614 }
3615
3616 /* Restore all three areas */
3617
3618 #if MTCP__SAVE_MANY_GDT_ENTRIES
3619 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i ++) {
3620 rc = mtcp_sys_set_thread_area (&(thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN]));
3621 if (rc < 0) {
3622 mtcp_printf ("mtcp restore_tls_state: error %d restoring GDT TLS entry[%d]\n", mtcp_sys_errno, i);
3623 mtcp_abort ();
3624 }
3625 }
3626
3627 /* For newer Linuces, we just restore the one GDT entry that was indexed by GS */
3628
3629 #else
3630 rc = mtcp_sys_set_thread_area (&(thisthread -> gdtentrytls[0]));
3631 if (rc < 0) {
3632 mtcp_printf ("mtcp restore_tls_state: error %d restoring GDT TLS entry[%d]\n", mtcp_sys_errno, thisthread -> gdtentrytls[0].entry_number);
3633 mtcp_abort ();
3634 }
3635 #endif
3636
3637 /* Restore the rest of the stuff */
3638
3639 #ifdef __i386__
3640 asm volatile ("movw %0,%%fs" : : "m" (thisthread -> fs));
3641 asm volatile ("movw %0,%%gs" : : "m" (thisthread -> gs));
3642 #endif
3643 #ifdef __x86_64__
3644 /* Don't directly set fs. It would only set 32 bits, and we just
3645 * set the full 64-bit base of fs, using sys_set_thread_area,
3646 * which called arch_prctl.
3647 *asm volatile ("movl %0,%%fs" : : "m" (thisthread -> fs));
3648 *asm volatile ("movl %0,%%gs" : : "m" (thisthread -> gs));
3649 */
3650 #endif
3651
3652 thisthread -> tid = mtcp_sys_kernel_gettid ();
3653 }
3654
3655 /********************************************************************************************************************************/
3656 /* */
/*  Set the thread's STOPSIGNAL handler.  Threads are sent STOPSIGNAL when they are to suspend execution of the application, save */
/*  their state and wait for the checkpointhread to write the checkpoint file.                                                    */
3659 /* */
3660 /* Output: */
3661 /* */
3662 /* Calling thread will call stopthisthread () when sent a STOPSIGNAL */
3663 /* */
3664 /********************************************************************************************************************************/
3665
3666 static void setup_sig_handler (void)
3667 {
3668 struct sigaction act, old_act;
3669
3670 act.sa_handler = &stopthisthread;
3671 sigfillset(&act.sa_mask);
3672 act.sa_flags = SA_RESTART;
3673
3674 if (_real_sigaction(STOPSIGNAL, &act, &old_act) == -1) {
3675 mtcp_printf ("mtcp setupthread: error setting up signal handler: %s\n",
3676 strerror (errno));
3677 mtcp_abort ();
3678 }
3679
3680 if ((old_act.sa_handler != SIG_IGN) && (old_act.sa_handler != SIG_DFL) &&
3681 (old_act.sa_handler != stopthisthread)) {
3682 mtcp_printf ("mtcp setupthread: signal handler %d already in use (%p).\n"
3683 " You may employ a different signal by setting the\n"
3684 " environment variable MTCP_SIGCKPT (or DMTCP_SIGCKPT)"
3685 " to the number\n of the signal MTCP should "
3686 "use for checkpointing.\n", STOPSIGNAL, old_act.sa_handler);
3687 mtcp_abort ();
3688 }
3689 }
3690
3691 /********************************************************************************************************************************/
3692 /* */
3693 /* Sync shared memory pages with backup files on disk */
3694 /* */
3695 /********************************************************************************************************************************/
3696 static void sync_shared_mem(void)
3697 {
3698 int mapsfd;
3699 Area area;
3700
3701 mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
3702 if (mapsfd < 0) {
3703 mtcp_printf ("mtcp sync_shared_memory: error opening /proc/self/maps: %s\n",
3704 strerror (mtcp_sys_errno));
3705 mtcp_abort ();
3706 }
3707
3708 while (readmapsline (mapsfd, &area)) {
3709 /* Skip anything that has no read or execute permission. This occurs on one page in a Linux 2.6.9 installation. No idea why. This code would also take care of kernel sections since we don't have read/execute permission there. */
3710
3711 if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE))) continue;
3712
3713 if (!(area.flags & MAP_SHARED)) continue;
3714
3715 if (strstr(area.name, " (deleted)")) continue;
3716
3717 DPRINTF(("mtcp sync_shared_memory: syncing %X at %p from %s + %X\n", area.size, area.addr, area.name, area.offset));
3718
3719 if ( msync(area.addr, area.size, MS_SYNC) < 0 ){
3720 mtcp_printf ("mtcp sync_shared_memory: error syncing %X at %p from %s + %X\n", area.size, area.addr, area.name, area.offset);
3721 mtcp_abort();
3722 }
3723 }
3724
3725 close (mapsfd);
3726 }