1 /*****************************************************************************
2 * Copyright (C) 2006-2010 by Michael Rieker, Jason Ansel, Kapil Arya, and *
3 * Gene Cooperman *
4 * mrieker@nii.net, jansel@csail.mit.edu, kapil@ccs.neu.edu, and *
5 * gene@ccs.neu.edu *
6 * *
7 * This file is part of the MTCP module of DMTCP (DMTCP:mtcp). *
8 * *
9 * DMTCP:mtcp is free software: you can redistribute it and/or *
10 * modify it under the terms of the GNU Lesser General Public License as *
11 * published by the Free Software Foundation, either version 3 of the *
12 * License, or (at your option) any later version. *
13 * *
14 * DMTCP:dmtcp/src is distributed in the hope that it will be useful, *
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
17 * GNU Lesser General Public License for more details. *
18 * *
19 * You should have received a copy of the GNU Lesser General Public *
20 * License along with DMTCP:dmtcp/src. If not, see *
21 * <http://www.gnu.org/licenses/>. *
22 *****************************************************************************/
23
24 /********************************************************************************************************************************/
25 /* */
26 /* Multi-threaded checkpoint library */
27 /* */
28 /* Link this in as part of your program that you want checkpoints taken */
29 /* Call the mtcp_init routine at the beginning of your program */
30 /* Call the mtcp_ok routine when it's OK to do checkpointing */
31 /* Call the mtcp_no routine when you want checkpointing inhibited */
32 /* */
33 /* This module also contains a __clone wrapper routine */
34 /* */
35 /********************************************************************************************************************************/
36
37
38 // Set _GNU_SOURCE in order to expose glibc-defined sigandset()
39 #define _GNU_SOURCE
40 #include <asm/ldt.h> // for struct user_desc
41 //#include <asm/segment.h> // for GDT_ENTRY_TLS_... stuff
42 #include <dirent.h>
43 #include <dlfcn.h>
44 #include <errno.h>
45 #include <fcntl.h>
46 #include <pthread.h>
47 #include <semaphore.h>
48 #include <sched.h>
49 #include <signal.h>
50 #include <stdarg.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <sys/mman.h>
55 #include <sys/resource.h>
56 #include <sys/sem.h>
57 #include <sys/stat.h>
58 #include <sys/syscall.h>
59 #include <sys/ioctl.h>
60 #include <termios.h> // for tcdrain, tcsetattr, etc.
61 #include <unistd.h>
62 #include <ucontext.h>
63 #include <sys/types.h> // for gettid, tkill, waitpid
64 #include <sys/wait.h> // for waitpid
65 #include <linux/unistd.h> // for gettid, tkill
66 #include <gnu/libc-version.h>
67
68 #define MTCP_SYS_STRCPY
69 #define MTCP_SYS_STRLEN
70 #include "mtcp_internal.h"
71
72 /* required for ptrace sake */
73 #include <sys/user.h>
74 #include "mtcp_ptrace.h"
75
76 static int WAIT=1;
77 // static int WAIT=0;
78
79 #if 0
80 // Force thread to stop, without use of a system call.
81 static int WAIT=1;
82 # define DEBUG_WAIT \
83 if (DEBUG_RESTARTING) \
84 {int i,j; \
85 for (i = 0; WAIT && i < 1000000000; i++) \
86 for (j = 0; j < 1000000000; j++) ; \
87 }
88 #else
89 # define DEBUG_WAIT
90 #endif
91
92 #if defined(GDT_ENTRY_TLS_ENTRIES) && !defined(__x86_64__)
93 #define MTCP__SAVE_MANY_GDT_ENTRIES 1
94 #else
95 #define MTCP__SAVE_MANY_GDT_ENTRIES 0
96 #endif
97
98 /* Retrieve saved stack pointer saved by getcontext () */
99 #ifdef __x86_64__
100 #define MYREG_RSP 15
101 #define SAVEDSP uc_mcontext.gregs[MYREG_RSP]
102 #else
103 #define MYREG_ESP 7
104 #define SAVEDSP uc_mcontext.gregs[MYREG_ESP]
105 #endif
106
107 /* TLS segment registers used differently in i386 and x86_64. - Gene */
108 #ifdef __i386__
109 # define TLSSEGREG gs
110 #endif
111 #ifdef __x86_64__
112 # define TLSSEGREG fs
113 #endif
114
115 /* Offset computed (&x.pid - &x) for
116 * struct pthread x;
117 * as found in: glibc-2.5/nptl/descr.h
118 * It was 0x4c and 0x48 for pid and tid for i386.
119 * Roughly, the definition is:
120 *glibc-2.5/nptl/descr.h:
121 * struct pthread
122 * {
123 * union {
124 * tcbheader_t tcbheader;
125 * void *__padding[16];
126 * };
127 * list_t list;
128 * pid_t tid;
129 * pid_t pid;
130 * ...
131 * } __attribute ((aligned (TCB_ALIGNMENT)));
132 *
133 *glibc-2.5/nptl/sysdeps/pthread/list.h:
134 * typedef struct list_head
135 * {
136 * struct list_head *next;
137 * struct list_head *prev;
138 * } list_t;
139 *
140 * NOTE: glibc-2.10 changes the size of __padding from 16 to 24. --KAPIL
141 *
142 * NOTE: glibc-2.10 further changes the size of tcbhead_t without updating the
143 * size of __padding in struct pthread. We need to add an extra 512 bytes
144 * to accommodate this. -- KAPIL
145 */
146 #if __GLIBC_PREREQ (2,12)
147 /* WHEN WE HAVE CONFIDENCE IN THIS VERSION, REMOVE ALL OTHER __GLIBC_PREREQ
148 * AND MAKE THIS THE ONLY VERSION. IT SHOULD BE BACKWARDS COMPATIBLE.
149 */
150 /* These function definitions should succeed independently of the glibc version.
151 * They use get_thread_area() to match (tid, pid) and find offset.
152 * In other code, on restart, that offset is used to set (tid,pid) to
153 * the latest tid and pid of the new thread, instead of the (tid,pid)
154 * of the original thread.
155 * SEE: "struct pthread" in glibc-2.XX/nptl/descr.h for 'struct pthread'.
156 */
157 static int TLS_TID_OFFSET(void);
158
159 /* Can remove the unused attribute when this __GLIBC_PREREQ is the only one. */
160 static char *memsubarray (char *array, char *subarray, int len)
161 __attribute__ ((unused));
162 static int mtcp_get_tls_segreg(void);
163 static void *mtcp_get_tls_base_addr(void);
164
/* Return the byte offset of the 'tid' field inside the calling thread's
 * 'struct pthread' (glibc's thread descriptor).
 *
 * The offset is discovered at run time on the first call and cached in a
 * function-local static.  Discovery works by building the byte pattern
 * {tid, pid} from the kernel's answers and searching for that pattern,
 * via memsubarray(), starting at the address returned by
 * mtcp_get_tls_base_addr().
 *
 * Aborts through mtcp_abort() if the pattern is not found or if the
 * offset found is not a multiple of sizeof(int).  An offset that merely
 * differs from the value expected for this architecture only produces a
 * warning.
 */
static int TLS_TID_OFFSET(void)
{
  static int tid_offset = -1;

  /* Offset we expect to find; a mismatch is only worth a warning. */
#ifdef __x86_64__
  const size_t expected_offset = 512 + 26 * sizeof(void *);
#else
  const size_t expected_offset = 26 * sizeof(void *);
#endif

  /* Fast path: the search has already been done. */
  if (tid_offset != -1) {
    return tid_offset;
  }

  /* 'struct pthread' stores tid and pid next to each other, in that
   * order, so this pair is the bit pattern to look for.
   */
  struct {pid_t tid; pid_t pid;} wanted;
  wanted.tid = mtcp_sys_kernel_gettid();
  wanted.pid = mtcp_sys_getpid();

  /* Start of the current thread's descriptor. */
  char *desc_base = (char *)mtcp_get_tls_base_addr();

  /* A false hit is unlikely: per the original author's note, a new
   * 'struct pthread' is zeroed before tid and pid are stored in it.
   */
  char *hit = memsubarray(desc_base, (char *)&wanted, sizeof(wanted));
  if (hit == NULL) {
    mtcp_printf("MTCP: Couldn't find offsets of tid/pid in thread_area.\n");
    mtcp_abort();
  }

  tid_offset = hit - desc_base;

  if ((size_t)tid_offset != expected_offset) {
    mtcp_printf("MTCP: Warning: tid_offset = %d; different from expected.\n"
                " Continuing anyway. If this fails, please try again.\n",
                tid_offset);
  }
  DPRINTF(("tid_offset: %d\n", tid_offset));

  /* The field is read through a pid_t pointer later, so it must be
   * int-aligned relative to the descriptor base.
   */
  if (tid_offset % sizeof(int) != 0) {
    mtcp_printf("MTCP: tid_offset is not divisible by sizeof(int).\n");
    mtcp_abort();
  }

  /* A stronger check would spawn a thread and confirm that its TID shows
   * up at this offset; the original author judged that unnecessary.
   */
  return tid_offset;
}
/* Return the byte offset of the 'pid' field inside the calling thread's
 * 'struct pthread'.
 *
 * 'pid' directly follows 'tid' in that structure, so the result is
 * TLS_TID_OFFSET() plus the distance between the two members of a local
 * {tid, pid} pair.  The value is computed once and cached.
 */
static int TLS_PID_OFFSET(void)
{
  static int pid_offset = -1;
  struct {pid_t tid; pid_t pid;} tid_pid;

  if (pid_offset == -1) {
    int tid_offset = TLS_TID_OFFSET();
    /* Compute the member distance FIRST and only then add tid_offset.
     * Without the parentheses the expression would be evaluated as
     * (tid_offset + pointer) - pointer, forming an intermediate pointer
     * far beyond the end of the small local struct, which is undefined
     * behavior in C even though the final difference looks harmless.
     */
    pid_offset = tid_offset
                 + (int)((char *)&(tid_pid.pid) - (char *)&tid_pid);
    DPRINTF(("pid_offset: %d\n", pid_offset));
  }
  return pid_offset;
}
226 #elif __GLIBC_PREREQ (2,11)
227 # ifdef __x86_64__
228 # define TLS_PID_OFFSET() \
229 (512+26*sizeof(void *)+sizeof(pid_t)) // offset of pid in pthread struct
230 # define TLS_TID_OFFSET() (512+26*sizeof(void *)) // offset of tid in pthread struct
231 # else
232 # define TLS_PID_OFFSET() \
233 (26*sizeof(void *)+sizeof(pid_t)) // offset of pid in pthread struct
234 # define TLS_TID_OFFSET() (26*sizeof(void *)) // offset of tid in pthread struct
235 # endif
236 #elif __GLIBC_PREREQ (2,10)
237 # define TLS_PID_OFFSET() \
238 (26*sizeof(void *)+sizeof(pid_t)) // offset of pid in pthread struct
239 # define TLS_TID_OFFSET() (26*sizeof(void *)) // offset of tid in pthread struct
240 #else
241 # define TLS_PID_OFFSET() \
242 (18*sizeof(void *)+sizeof(pid_t)) // offset of pid in pthread struct
243 # define TLS_TID_OFFSET() (18*sizeof(void *)) // offset of tid in pthread struct
244 #endif
245
246 /* this call to gettid is hijacked by DMTCP for PID/TID-Virtualization */
247 #define GETTID() (int)syscall(SYS_gettid)
248
249 sem_t sem_start;
250
251 typedef struct Thread Thread;
252
253 struct Thread { Thread *next; // next thread in 'threads' list
254 Thread **prev; // prev thread in 'threads' list
255 int tid; // this thread's id as returned by mtcp_sys_kernel_gettid ()
256 int original_tid; // this is the the thread's "original" tid
257 MtcpState state; // see ST_... below
258 Thread *parent; // parent thread (or NULL if top-level thread)
259 Thread *children; // one of this thread's child threads
260 Thread *siblings; // one of this thread's sibling threads
261
262 int clone_flags; // parameters to __clone that created this thread
263 int *parent_tidptr;
264 int *given_tidptr; // (this is what __clone caller passed in)
265 int *actual_tidptr; // (this is what we passed to the system call, either given_tidptr or &child_tid)
266 int child_tid; // this is used for child_tidptr if the original call did not
267 // ... have both CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID
268 int (*fn) (void *arg); // thread's initial function entrypoint and argument
269 void *arg;
270
271 sigset_t sigblockmask; // blocked signals
272 sigset_t sigpending; // pending signals
273
274 ///JA: new code ported from v54b
275 ucontext_t savctx; // context saved on suspend
276
277 mtcp_segreg_t fs, gs; // thread local storage pointers
278 pthread_t pth; // added for pthread_join
279 #if MTCP__SAVE_MANY_GDT_ENTRIES
280 struct user_desc gdtentrytls[GDT_ENTRY_TLS_ENTRIES];
281 #else
282 struct user_desc gdtentrytls[1];
283 #endif
284 };
285
286 /*
287 * struct MtcpRestartThreadArg
288 *
289 * DMTCP requires the original_tids of the threads being created during
290 * the RESTARTING phase. We use the MtcpRestartThreadArg structure to pass
291 * the original_tid of the thread being created from MTCP to DMTCP.
292 *
293 * actual clone call: clone (fn, child_stack, flags, void *, ... )
294 * new clone call : clone (fn, child_stack, flags, (struct MtcpRestartThreadArg *), ...)
295 *
296 * DMTCP automatically extracts arg from this structure and passes that
297 * to the _real_clone call.
298 *
299 * IMPORTANT NOTE: While updating, this structure must be kept in sync
300 * with the structure defined with the same name in mtcpinterface.cpp
301 */
/* Argument wrapper handed to clone() while threads are re-created on
 * restart.  The layout must stay identical to the structure of the same
 * name in mtcpinterface.cpp.
 */
struct MtcpRestartThreadArg {
  void  *arg;           /* argument originally meant for the thread function */
  pid_t  original_tid;  /* original tid of the thread being re-created */
};
306
307 #define ST_RUNDISABLED 0 // thread is running normally but with checkpointing disabled
308 #define ST_RUNENABLED 1 // thread is running normally and has checkpointing enabled
309 #define ST_SIGDISABLED 2 // thread is running normally with cp disabled, but checkpoint thread is waiting for it to enable
310 #define ST_SIGENABLED 3 // thread is running normally with cp enabled, and checkpoint thread has signalled it to stop
311 #define ST_SUSPINPROG 4 // thread context being saved (very brief)
312 #define ST_SUSPENDED 5 // thread is suspended waiting for checkpoint to complete
313 #define ST_CKPNTHREAD 6 // thread is the checkpointing thread (special state just for that thread)
314
315 /* Global data */
316
317 void *mtcp_libc_dl_handle = NULL; // dlopen handle for whatever libc.so is loaded with application program
318 Area mtcp_libc_area; // some area of that libc.so
319
320 /* DMTCP Info Variables */
321
322 /* These are reset by dmtcphijack.so at initialization. */
323 int dmtcp_exists = 0; /* Are we running under DMTCP? */
324 int dmtcp_info_pid_virtualization_enabled = 0;
325 /* The following two DMTCP Info variables are defined in mtcp_printf.c */
326 //int dmtcp_info_stderr_fd = 2;
327 //int dmtcp_info_jassertlog_fd = -1;
328 int dmtcp_info_restore_working_directory = -1;
329
330 /* Static data */
331
332 static sigset_t sigpending_global; // pending signals for the process
333 static char const *nscd_mmap_str = "/var/run/nscd/"; // OpenSUSE
334 static char const *nscd_mmap_str2 = "/var/cache/nscd"; // Debian / Ubuntu
335 static char const *nscd_mmap_str3 = "/var/db/nscd"; // RedHat (Linux 2.6.9)
336 static char const *dev_zero_deleted_str = "/dev/zero (deleted)";
337 static char const *dev_null_deleted_str = "/dev/null (deleted)";
338 static char const *sys_v_shmem_file = "/SYSV";
339 //static char const *perm_checkpointfilename = NULL;
340 //static char const *temp_checkpointfilename = NULL;
341 static char perm_checkpointfilename[MAXPATHLEN];
342 static char temp_checkpointfilename[MAXPATHLEN];
343 static size_t checkpointsize;
344 static int intervalsecs;
345 static pid_t motherpid;
346 static size_t restore_size;
347 static int showtiming;
348 static int threadenabledefault;
349 static int verify_count; // number of checkpoints to go
350 static int verify_total; // value given by envar
351 static pid_t mtcp_ckpt_gzip_child_pid = -1;
352 static int volatile checkpointhreadstarting = 0;
353 static MtcpState restoreinprog = MTCP_STATE_INITIALIZER;
354 static MtcpState threadslocked = MTCP_STATE_INITIALIZER;
355 static pthread_t checkpointhreadid;
356 static struct timeval restorestarted;
357 static int DEBUG_RESTARTING = 0;
358 static Thread *motherofall = NULL;
359 static Thread *ckpthread = NULL;
360 static Thread *threads = NULL;
361 struct sigaction sigactions[NSIG]; // signal handlers
362 static VA restore_begin, restore_end;
363 static void *restore_start; /* will be bound to fnc, mtcp_restore_start */
364 static void *saved_sysinfo;
365 static void *saved_heap_start = NULL;
366 static char saved_working_directory[MTCP_MAX_PATH];
367 static void (*callback_sleep_between_ckpt)(int sec) = NULL;
368 static void (*callback_pre_ckpt)() = NULL;
369 static void (*callback_post_ckpt)(int is_restarting) = NULL;
370 static int (*callback_ckpt_fd)(int fd) = NULL;
371 static void (*callback_write_dmtcp_header)(int fd) = NULL;
372 static void (*callback_restore_virtual_pid_table)() = NULL;
373
374 static int (*clone_entry) (int (*fn) (void *arg),
375 void *child_stack,
376 int flags,
377 void *arg,
378 int *parent_tidptr,
379 struct user_desc *newtls,
380 int *child_tidptr);
381
382 /* temp stack used internally by restore so we don't go outside the
383 * libmtcp.so address range for anything;
384 * including "+ 1" since will set %esp/%rsp to tempstack+STACKSIZE
385 */
386 static long long tempstack[STACKSIZE + 1];
387
388 /* Internal routines */
389
390 static long set_tid_address (int *tidptr);
391
392 static char *memsubarray (char *array, char *subarray, int len)
393 __attribute__ ((unused));
394 static int mtcp_get_tls_segreg(void);
395 static void *mtcp_get_tls_base_addr(void);
396 static int threadcloned (void *threadv);
397 static void setupthread (Thread *thread);
398 static void setup_clone_entry (void);
399 static void threadisdead (Thread *thread);
400 static void *checkpointhread (void *dummy);
401 static int test_use_compression(void);
402 static int open_ckpt_to_write(int fd, int pipe_fds[2], char *gzip_path);
403 static void checkpointeverything (void);
404 static void writefiledescrs (int fd);
405 static void writememoryarea (int fd, Area *area,
406 int stack_was_seen, int vsyscall_exists);
407 static void writecs (int fd, char cs);
408 static void writefile (int fd, void const *buff, size_t size);
409 static void preprocess_special_segments(int *vsyscall_exists);
410 static void stopthisthread (int signum);
411 static void wait_for_all_restored (void);
412 static void save_sig_state (Thread *thisthread);
413 static void restore_sig_state (Thread *thisthread);
414 static void save_sig_handlers (void);
415 static void restore_sig_handlers (Thread *thisthread);
416 static void save_tls_state (Thread *thisthread);
417 static void renametempoverperm (void);
418 static Thread *getcurrenthread (void);
419 static void lock_threads (void);
420 static void unlk_threads (void);
421 static int readmapsline (int mapsfd, Area *area);
422 static void restore_heap(void);
423 static void finishrestore (void);
424 static int restarthread (void *threadv);
425 static void restore_tls_state (Thread *thisthread);
426 static void setup_sig_handler (void);
427 static void sync_shared_mem(void);
428
429 /* FIXME:
430 * dmtcp/src/syscallsreal.c has wrappers around signal, sigaction, sigprocmask
431 * The wrappers go to these mtcp_real_XXX versions so that MTCP can call
432 * the actual system calls and avoid the wrappers. But if that is still
433 * an issue, then we can create mtcp_sys_signal(), etc., for direct calls.
434 *
435 * Update:
436 * mtcp_real_XXX versions have been renamed to _real_XXX in DMTCP.
437 * sigprocmask should not be used in multi-threaded process, use
438 * pthread_sigmask instead.
439 */
440 int _real_sigaction(int signum, const struct sigaction *act,
441 struct sigaction *oldact){
442 if (dmtcp_exists) {
443 mtcp_printf("mtcp %s: This function mustn't be called when working under DMTCP\n",
444 __FUNCTION__);
445 mtcp_abort();
446 }
447 return sigaction(signum, act, oldact);
448 }
449
450
451 /********************************************************************************************************************************/
452 /* */
453 /* This routine must be called at startup time to initiate checkpointing */
454 /* */
455 /* Input: */
456 /* */
457 /* checkpointfilename = name to give the checkpoint file */
458 /* interval = interval, in seconds, to write the checkpoint file */
459 /* clonenabledefault = 0 : clone checkpointing blocked by default (call mtcp_ok in the thread to enable) */
460 /* 1 : clone checkpointing enabled by default (call mtcp_no in the thread to block if you want) */
461 /* */
462 /* envar MTCP_WRAPPER_LIBC_SO = what library to use for inner wrappers (default libc.??.so) */
463 /* envar MTCP_VERIFY_CHECKPOINT = every n checkpoints, verify by doing a restore to resume */
464 /* default is 0, ie, don't ever verify */
465 /* */
466 /********************************************************************************************************************************/
467 /* These hook functions provide an alternative to DMTCP callbacks, using
468 * weak symbols. While MTCP is immature, let's allow both, in case
469 * the flexibility of a second hook mechanism is useful in the future.
470 * The mechanism is invisible unless end user compiles w/ -Wl,-export-dynamic
471 */
/* Weak, do-nothing default for the pre-checkpoint hook.  A strong
 * definition of the same name elsewhere overrides it.
 * (Call site is not visible in this part of the file.)
 */
__attribute__ ((weak)) void mtcpHookPreCheckpoint(void)
{
  /* intentionally empty */
}
473
/* Weak, do-nothing default for the post-checkpoint hook.  A strong
 * definition of the same name elsewhere overrides it.
 * (Call site is not visible in this part of the file.)
 */
__attribute__ ((weak)) void mtcpHookPostCheckpoint(void)
{
  /* intentionally empty */
}
475
/* Weak, do-nothing default for the restart hook.  A strong definition of
 * the same name elsewhere overrides it.
 * (Call site is not visible in this part of the file.)
 */
__attribute__ ((weak)) void mtcpHookRestart(void)
{
  /* intentionally empty */
}
477
478 /* Statically allocate this. Malloc is dangerous here if application is
479 * defining its own (possibly not thread-safe) malloc routine.
480 */
481 static Thread ckptThreadStorage;
482
483 void mtcp_init (char const *checkpointfilename, int interval, int clonenabledefault)
484 {
485 char *p, *tmp, *endp;
486 int len;
487 Thread *ckptThreadDescriptor = & ckptThreadStorage;
488 mtcp_segreg_t TLSSEGREG;
489 #ifdef PTRACE
490 init_thread_local();
491 #endif
492
493 if (sizeof(void *) != sizeof(long)) {
494 mtcp_printf("ERROR: sizeof(void *) != sizeof(long) on this architecture.\n"
495 " This code assumes they are equal.\n");
496 mtcp_abort ();
497 }
498
499 #ifndef __x86_64__
500 /* Nobody else has a right to preload on internal processes generated
501 * by mtcp_check_XXX() -- not even DMTCP, if it's currently operating.
502 *
503 * Saving LD_PRELOAD in a temp env var and restoring it later --Kapil.
504 *
505 * TODO: To insert some sort of error checking to make sure that we
506 * are correctly setting LD_PRELOAD after we are done with
507 * vdso check.
508 */
509
510 // Shouldn't this removal of LD_PRELOAD be around fork/exec of gzip ?
511 // setenv( "MTCP_TMP_LD_PRELOAD", getenv("LD_PRELOAD"), 1);
512 // unsetenv("LD_PRELOAD");
513 // Allow user program to run with randomize_va
514 // mtcp_check_vdso_enabled();
515 // setenv("LD_PRELOAD", getenv("MTCP_TMP_LD_PRELOAD"), 1);
516 // unsetenv("MTCP_TMP_LD_PRELOAD");
517 #endif
518
519 #if 0
520 { struct user_desc u_info;
521 u_info.entry_number = 12;
522 if (-1 == mtcp_sys_get_thread_area(&u_info) && mtcp_sys_errno == ENOSYS)
523 mtcp_printf(
524 "Apparently, get_thread_area is not implemented in your kernel.\n"
525 " If this doesn't work, please try on a more recent kernel,\n"
526 " or one configured to support get_thread_area.\n"
527 );
528 }
529 #endif
530
531 intervalsecs = interval;
532
533 if (strlen(mtcp_ckpt_newname) >= MAXPATHLEN) {
534 mtcp_printf("mtcp mtcp_init: new ckpt file name (%s) too long (>=512 bytes)\n",
535 mtcp_ckpt_newname);
536 mtcp_abort();
537 }
538 strncpy(perm_checkpointfilename,checkpointfilename,MAXPATHLEN); // this is what user wants the checkpoint file called
539 len = strlen (perm_checkpointfilename); // make up another name, same as that, with ".temp" on the end
540 memcpy(temp_checkpointfilename, perm_checkpointfilename, len);
541 strncpy(temp_checkpointfilename + len, ".temp",MAXPATHLEN-len);
542 // ... we use it to write to in case we crash while writing
543 // we will leave the previous good one intact
544
545 #ifdef PTRACE
546 /* TODO: USE flock WHEN WRITING TO THESE THREE FILES (NOT YET DONE FOR ptrace_setoptions_file? */
547 memset(ptrace_shared_file, '\0', MAXPATHLEN);
|
Event secure_coding: |
[VERY RISKY]. Using "sprintf" can cause a buffer overflow when done incorrectly. Because sprintf() assumes an arbitrarily long string, callers must be careful not to overflow the actual space of the destination. Use snprintf() instead, or correct precision specifiers. |
| Also see events: |
[secure_coding][secure_coding] |
548 sprintf(ptrace_shared_file, "%s/ptrace_shared_file.txt", dir);
549 memset(ptrace_setoptions_file, '\0', MAXPATHLEN);
|
Event secure_coding: |
[VERY RISKY]. Using "sprintf" can cause a buffer overflow when done incorrectly. Because sprintf() assumes an arbitrarily long string, callers must be careful not to overflow the actual space of the destination. Use snprintf() instead, or correct precision specifiers. |
| Also see events: |
[secure_coding][secure_coding] |
550 sprintf(ptrace_setoptions_file, "%s/ptrace_setoptions_file.txt", dir);
551 memset(checkpoint_threads_file, '\0', MAXPATHLEN);
|
Event secure_coding: |
[VERY RISKY]. Using "sprintf" can cause a buffer overflow when done incorrectly. Because sprintf() assumes an arbitrarily long string, callers must be careful not to overflow the actual space of the destination. Use snprintf() instead, or correct precision specifiers. |
| Also see events: |
[secure_coding][secure_coding] |
552 sprintf(checkpoint_threads_file, "%s/checkpoint_threads_file.txt", dir);
553 #endif
554
555 DPRINTF (("mtcp_init*: main tid %d\n", mtcp_sys_kernel_gettid ()));
556 /* If MTCP_INIT_PAUSE set, sleep 15 seconds and allow for gdb attach. */
557 if (getenv("MTCP_INIT_PAUSE")) {
558 mtcp_printf("Pausing 15 seconds. Do: gdb attach %d\n", mtcp_sys_getpid());
559 sleep(15);
560 }
561
562 threadenabledefault = clonenabledefault; // save this away where it's easy to get
563
564 p = getenv ("MTCP_SHOWTIMING");
565 showtiming = ((p != NULL) && (*p & 1));
566
567 /* Maybe dump out some stuff about the TLS */
568
569 mtcp_dump_tls (__FILE__, __LINE__);
570
571 /* Save this process's pid. Then verify that the TLS has it where it should be. */
572 /* When we do a restore, we will have to modify each thread's TLS with the new motherpid. */
573 /* We also assume that GS uses the first GDT entry for its descriptor. */
574
575 motherpid = mtcp_sys_getpid (); /* libc/getpid can lie if we had
576 * used kernel fork() instead of libc fork().
577 */
578 {
579 pid_t tls_pid, tls_tid;
580 tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_PID_OFFSET());
581 tls_tid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_TID_OFFSET());
582
583 if ((tls_pid != motherpid) || (tls_tid != motherpid)) {
584 mtcp_printf ("mtcp_init: getpid %d, tls pid %d, tls tid %d, must all match\n",
585 motherpid, tls_pid, tls_tid);
586 mtcp_abort ();
587 }
588 }
589
590 /* Get verify envar */
591
592 tmp = getenv ("MTCP_VERIFY_CHECKPOINT");
593 verify_total = 0;
594 if (tmp != NULL) {
595 verify_total = strtol (tmp, &p, 0);
596 if ((*p != '\0') || (verify_total < 0)) {
597 mtcp_printf ("mtcp_init: bad MTCP_VERIFY_CHECKPOINT %s\n", tmp);
598 mtcp_abort ();
599 }
600 }
601
602 /* If the user has defined a signal, use that to suspend. Otherwise, use MTCP_DEFAULT_SIGNAL */
603
604 tmp = getenv("MTCP_SIGCKPT");
605 if (tmp == NULL)
606 STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
607 else
608 {
609 errno = 0;
610 STOPSIGNAL = strtol(tmp, &endp, 0);
611
612 if ((errno != 0) || (tmp == endp))
613 {
614 mtcp_printf("mtcp_init: Your chosen SIGCKPT of \"%s\" does not "
615 "translate to a number,\n"
616 " and cannot be used. Signal %d "
617 "will be used instead.\n", tmp, MTCP_DEFAULT_SIGNAL);
618 STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
619 }
620 else if (STOPSIGNAL < 1 || STOPSIGNAL > 31)
621 {
622 mtcp_printf("mtcp_init: Your chosen SIGCKPT of \"%d\" is not a valid "
623 "signal, and cannot be used.\n"
624 " Signal %d will be used instead.\n",
625 STOPSIGNAL, MTCP_DEFAULT_SIGNAL);
626 STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
627 }
628 }
629
630 /* Set up signal handler so we can interrupt the thread for checkpointing */
631 setup_sig_handler ();
632
633 /* Get size and address of the shareable - used to separate it from the rest of the stuff */
634 /* All routines needed to perform restore must be within this address range */
635
636 restore_begin = (((VA)mtcp_shareable_begin) & -MTCP_PAGE_SIZE);
637 restore_size = ((VA)mtcp_shareable_end - restore_begin + MTCP_PAGE_SIZE - 1) & -MTCP_PAGE_SIZE;
638 restore_end = restore_begin + restore_size;
639 restore_start = mtcp_restore_start;
640
641 /* Setup clone_entry to point to glibc's __clone routine */
642
643 setup_clone_entry ();
644
645 /* Set up caller as one of our threads so we can work on it */
646
647 memset (ckptThreadDescriptor, 0, sizeof *ckptThreadDescriptor);
648 setupthread (ckptThreadDescriptor);
649 ckptThreadDescriptor -> child_tid = mtcp_sys_kernel_gettid (); // need to set this up so the checkpointhread can see we haven't exited
650 set_tid_address (&(ckptThreadDescriptor -> child_tid)); // we are assuming mtcp_init has been called before application may have called set_tid_address
651 // ... or else we will end up overwriting that set_tid_address value
652 motherofall = ckptThreadDescriptor;
653
654 /* Spawn off a thread that will perform the checkpoints from time to time */
655
656 checkpointhreadstarting = 1;
657 /* If we return from a fork(), we don't know what is the semaphore value. */
658 errno = 0;
659 while (sem_trywait(&sem_start) == -1 && (errno == EAGAIN || errno == EINTR)) {
660 if ( errno == EAGAIN )
661 sem_post(&sem_start);
662 errno = 0;
663 }
664 if (errno != 0)
665 perror("ERROR: continue anyway from " __FILE__ ":mtcp_init:sem_trywait()");
666 /* Now we successfully locked it. The sempaphore value is zero. */
667 if (pthread_create (&checkpointhreadid, NULL, checkpointhread, NULL) < 0) {
668 mtcp_printf ("mtcp_init: error creating checkpoint thread: %s\n", strerror (errno));
669 mtcp_abort ();
670 }
671 if (checkpointhreadstarting) mtcp_abort (); // make sure the clone wrapper executed (ie, not just the standard clone)
672 /* Stop until checkpoint thread has finished initializing.
673 * Some programs (like gcl) implement their own glibc functions in
674 * a non-thread-safe manner. In case we're using non-thread-safe glibc,
675 * don't run the checkpoint thread and user thread at the same time.
676 */
677 errno = 0;
678 while (-1 == sem_wait(&sem_start) && errno == EINTR)
679 errno = 0;
680 /* The child thread checkpointhread will now wake us. */
681 }
682
683 /********************************************************************************************************************************
684 *
685 * The routine mtcp_set_callbacks below may be called BEFORE the first
686 * MTCP checkpoint, to add special functionality to checkpointing
687 *
688 * Its arguments (callback functions) are:
689 *
690 * sleep_between_ckpt: Called in between checkpoints to replace the default "sleep(sec)" functionality,
691 * when this function returns checkpoint will start
692 * pre_ckpt: Called after all user threads are suspended, but BEFORE checkpoint written
693 * post_ckpt: Called after checkpoint, and after restore. is_restarting will be 1 for restore 0 for after checkpoint
694 * ckpt_fd: Called to test if mtcp should checkpoint a given FD returns 1 if it should
695 *
696 *******************************************************************************************************************************/
697
698 void mtcp_set_callbacks(void (*sleep_between_ckpt)(int sec),
699 void (*pre_ckpt)(),
700 void (*post_ckpt)(int is_restarting),
701 int (*ckpt_fd)(int fd),
702 void (*write_dmtcp_header)(int fd),
703 void (*restore_virtual_pid_table)())
704 {
705 callback_sleep_between_ckpt = sleep_between_ckpt;
706 callback_pre_ckpt = pre_ckpt;
707 callback_post_ckpt = post_ckpt;
708 callback_ckpt_fd = ckpt_fd;
709 callback_write_dmtcp_header = write_dmtcp_header;
710 callback_restore_virtual_pid_table = restore_virtual_pid_table;
711 }
712
713 /*************************************************************************/
714 /* */
715 /* Dump out the TLS stuff pointed to by %gs */
716 /* */
717 /*************************************************************************/
718
/* Debug helper intended to print the TLS descriptor selected by %gs.
 *
 * The entire body is compiled out by the '#if 000' below, so as built
 * this function does nothing and ignores both arguments.
 *
 *   file, line - source location of the caller; used only by the
 *                disabled code, which prints them.
 */
void mtcp_dump_tls (char const *file, int line)
{
#if 000
  /* ---- disabled code: kept for reference only ---- */
  int i, j, mypid;
  sigset_t blockall, oldsigmask;
  struct user_desc gdtentry;
  unsigned char byt;
  unsigned short gs;

  static int mutex = 0;

  /* Block all signals whilst we have the futex */

  memset (&blockall, -1, sizeof blockall);
  if (sigprocmask (SIG_SETMASK, &blockall, &oldsigmask) < 0) {
    abort ();
  }

  /* Block other threads from doing this so the output doesn't mix */

  while (!atomic_setif_int (&mutex, 1, 0)) {
    mtcp_sys_futex (&mutex, FUTEX_WAIT, 1, NULL, NULL, 0);
  }

  /* Get the segment for the TLS stuff */

  asm volatile ("movw %%gs,%0" : "=g" (gs));
  mtcp_printf("mtcp_init: gs=%X at %s:%d\n", gs, file, line);
  if (gs != 0) {

    /* We only handle GDT based stuff */

    if (gs & 4) mtcp_printf(" *** part of LDT\n");

    /* It's in the GDT */

    else {

      /* Read the TLS descriptor */

      gdtentry.entry_number = gs / 8;
      i = mtcp_sys_get_thread_area (&gdtentry);
      if (i < 0) mtcp_printf(" error getting GDT entry %d: %d\n", gdtentry.entry_number, mtcp_sys_errno);
      else {

        /* Print out descriptor and first 80 bytes of data */

        mtcp_printf(" limit %X, baseaddr %X\n", gdtentry.limit, gdtentry.base_addr);
        /* First pass: read the bytes through the %gs segment override. */
        for (i = 0; i < 80; i += 16) {
          for (j = 16; -- j >= 0;) {
            if ((j & 3) == 3) fputc (' ', stderr);
            asm volatile ("movb %%gs:(%1),%0" : "=r" (byt) : "r" (i + j));
            mtcp_printf("%2.2X", byt);
          }
          mtcp_printf(" : gs+%2.2X\n", i);
        }
        /* Second pass: read the same range through the base address. */
        for (i = 0; i < 80; i += 16) {
          for (j = 16; -- j >= 0;) {
            if ((j & 3) == 3) fputc (' ', stderr);
            byt = ((unsigned char *)gdtentry.base_addr)[i+j];
            mtcp_printf("%2.2X", byt);
          }
          mtcp_printf(" : %8.8X\n", gdtentry.base_addr + i);
        }

        /* Offset 4C should be the process id */

        asm volatile ("mov %%gs:0x4C,%0" : "=r" (i));
        mtcp_printf("mtcp_init: getpid=%d, gettid=%d, tls=%d\n", getpid (), mtcp_sys_kernel_gettid (), i);
      }
    }
  }

  /* Release mutex and restore signal delivery */

  mutex = 0;
  mtcp_sys_futex (&mutex, FUTEX_WAKE, 1, NULL, NULL, 0);
  /* NOTE(review): the mask is saved with sigprocmask() above but restored
   * with _real_sigprocmask() here -- confirm which is intended before
   * re-enabling this code.
   */
  if (_real_sigprocmask (SIG_SETMASK, &oldsigmask, NULL) < 0) {
    abort ();
  }
#endif
}
801
802 /*****************************************************************************/
803 /* */
804 /* This is our clone system call wrapper */
805 /* */
806 /* Note: */
807 /* */
808 /* pthread_create eventually calls __clone to create threads */
809 /* It uses flags = 0x3D0F00: */
810 /* CLONE_VM = VM shared between processes */
811 /* CLONE_FS = fs info shared between processes (root, cwd, umask) */
812 /* CLONE_FILES = open files shared between processes (fd table) */
813 /* CLONE_SIGHAND = signal handlers and blocked signals shared */
814 /* (sigaction common to parent and child) */
815 /* CLONE_THREAD = add to same thread group */
816 /* CLONE_SYSVSEM = share system V SEM_UNDO semantics */
817 /* CLONE_SETTLS = create a new TLS for the child from newtls parameter*/
818 /* CLONE_PARENT_SETTID = set the TID in the parent (before MM copy) */
819 /* CLONE_CHILD_CLEARTID = clear the TID in the child and do */
820 /* futex wake at that address */
821 /* CLONE_DETACHED = create clone detached */
822 /* */
823 /*****************************************************************************/
824
825 int __clone (int (*fn) (void *arg), void *child_stack, int flags, void *arg,
826 int *parent_tidptr, struct user_desc *newtls, int *child_tidptr)
827 {
828 int rc;
829 Thread *thread;
830 #ifdef PTRACE
831 int i;
832 #endif
833
834 /* Maybe they decided not to call mtcp_init */
835 if (motherofall != NULL) {
836
837 /* They called mtcp_init meaning we are to do checkpointing.
838 * So we are going to track this thread.
839 */
840
841 thread = malloc (sizeof *thread);
842 memset (thread, 0, sizeof *thread);
843 thread -> fn = fn; // this is the user's function
844 thread -> arg = arg; // ... and the parameter
845 thread -> parent = getcurrenthread ();
846 if (checkpointhreadstarting) {
847 checkpointhreadstarting = 0;
848 mtcp_state_init(&thread->state, ST_CKPNTHREAD);
849 } else {
850 mtcp_state_init(&thread->state, ST_RUNDISABLED);
851 }
852
853 DPRINTF (("mtcp wrapper clone*: calling clone thread=%p,"
854 " fn=%p, flags=0x%X\n", thread, fn, flags));
855 DPRINTF (("mtcp wrapper clone*: parent_tidptr=%p, newtls=%p,"
856 " child_tidptr=%p\n", parent_tidptr, newtls, child_tidptr));
857 //asm volatile ("int3");
858
859 /* Save exactly what the caller is supplying */
860
861 thread -> clone_flags = flags;
862 thread -> parent_tidptr = parent_tidptr;
863 thread -> given_tidptr = child_tidptr;
864
865 /* We need the CLEARTID feature so we can detect */
866 /* when the thread has exited */
867 /* So if the caller doesn't want it, we enable it */
868 /* Retain what the caller originally gave us so we can pass the tid back */
869
870 if (!(flags & CLONE_CHILD_CLEARTID)) {
871 child_tidptr = &(thread -> child_tid);
872 }
873 thread -> actual_tidptr = child_tidptr;
874 DPRINTF (("mtcp wrapper clone*: thread %p -> actual_tidptr %p\n",
875 thread, thread -> actual_tidptr));
876
877 /* Alter call parameters, forcing CLEARTID and make it call the wrapper routine */
878
879 flags |= CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID;
880 fn = threadcloned;
881 arg = thread;
882 }
883
884 /* mtcp_init not called, no checkpointing, but make sure clone_entry is */
885 /* set up so we can call the real clone */
886
887 else if (clone_entry == NULL) setup_clone_entry ();
888
889 /* Now create the thread */
890
891 DPRINTF (("mtcp wrapper clone*: clone fn=%p, child_stack=%p, flags=%X, arg=%p\n", fn, child_stack, flags, arg));
892 DPRINTF (("mtcp wrapper clone*: parent_tidptr=%p, newtls=%p, child_tidptr=%p\n", parent_tidptr, newtls, child_tidptr));
893 rc = (*clone_entry) (fn, child_stack, flags, arg, parent_tidptr, newtls, child_tidptr);
894 if (rc < 0) {
895 DPRINTF (("mtcp wrapper clone*: clone rc=%d, errno=%d\n", rc, errno));
896 } else {
897 DPRINTF (("mtcp wrapper clone*: clone rc=%d\n", rc));
898 }
899
900 #ifdef PTRACE
901 /*************************************************************************/
902 /* Code added to keep record of new tasks and processes in a file */
903 /*************************************************************************/
904
905 // initialize the ptrace_tid_pairs array
906 if (!init_ptrace_pairs) {
907 for (i = 0; i < MAX_PTRACE_PAIRS_COUNT; i++) {
908 ptrace_pairs[i].last_command = PTRACE_UNSPECIFIED_COMMAND;
909 ptrace_pairs[i].singlestep_waited_on = FALSE;
910 ptrace_pairs[i].free = TRUE;
911 ptrace_pairs[i].inferior_st = 'u'; // undefined
912 }
913 init_ptrace_pairs = 1;
914 }
915
916 // initialize the semaphore used when motherofall reads the ptrace shared file
917 if (!init_ptrace_read_pairs_sem) {
918 sem_init(&ptrace_read_pairs_sem, 0, 0);
919 init_ptrace_read_pairs_sem = 1;
920 }
921
922 if (!init__sem) {
923 sem_init(&__sem, 0, 1);
924 init__sem = 1;
925 }
926
927 if (is_ptrace_setoptions == TRUE) writeptraceinfo (setoptions_superior, rc);
928 else {
929 // read from file
930 int setoptions_fd = -1;
931 pid_t inferior;
932 pid_t superior;
933
934 setoptions_fd = open(ptrace_setoptions_file, O_RDONLY);
935
936 if (setoptions_fd != -1) {
937 while (readall(setoptions_fd, &superior, sizeof(pid_t)) > 0) {
938 readall(setoptions_fd, &inferior, sizeof(pid_t));
939 if (inferior == GETTID()) {
940 setoptions_superior = superior;
941 is_ptrace_setoptions = TRUE;
942 writeptraceinfo (setoptions_superior, rc);
943 }
944 }
945 if ( close(setoptions_fd) != 0 ) {
946 mtcp_printf("__clone: Error closing file: %s\n",
947 strerror(errno));
948 mtcp_abort();
949 }
950 }
951 }
952 /* the structure of checkpoint_threads_file is pairs of pid and tid */
953 write_info_to_file (2, getpid(), rc);
954 /*************************************************************************/
955 /* Done recording new tasks and processes. */
956 /*************************************************************************/
957 #endif
958
959 return (rc);
960 }
961
962 void fill_in_pthread (pid_t tid, pthread_t pth) {
963 struct Thread *thread;
964 for (thread = threads; thread != NULL; thread = thread -> next) {
965 if (thread -> tid == tid) {
966 thread -> pth = pth;
967 break;
968 }
969 }
970 }
971
972 void delete_thread_on_pthread_join (pthread_t pth) {
973 struct Thread *thread;
974 for (thread = threads; thread != NULL; thread = thread -> next) {
975 if (thread -> pth == pth) {
976 threadisdead (thread);
977 break;
978 }
979 }
980 }
981
/* Emit 'clone' as a global function symbol that is an alias for '__clone',
 * so both names resolve to the same code.
 */
asm (".global clone ; .type clone,@function ; clone = __clone");
983
984 /*****************************************************************************/
985 /* */
986 /* This routine is called (via clone) as the top-level routine of a thread */
987 /* that we are tracking. */
988 /* */
989 /* It fills in remaining items of our thread struct, calls the user function,*/
990 /* then cleans up the thread struct before exiting. */
991 /* */
992 /*****************************************************************************/
993
994 static int threadcloned (void *threadv)
995
996 {
997 int rc;
998 Thread *const thread = threadv;
999
1000 DPRINTF (("mtcp threadcloned*: starting thread %p\n", thread));
1001
1002 setupthread (thread);
1003
1004 /* The new TLS should have the process ID in place at TLS_PID_OFFSET() */
1005 /* This is a verification step and is therefore optional as such */
1006 {
1007 pid_t tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_PID_OFFSET());
1008 if ((tls_pid != motherpid) && (tls_pid != (pid_t)-1)) {
1009 mtcp_printf ("mtcp threadcloned: getpid %d, tls pid %d at offset %d, must match\n",
1010 motherpid, tls_pid, TLS_PID_OFFSET());
1011 mtcp_printf (" %X\n", motherpid);
1012 for (rc = 0; rc < 256; rc += 4) {
1013 tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + rc);
1014 mtcp_printf (" %d: %X", rc, tls_pid);
1015 if ((rc & 31) == 28) mtcp_printf ("\n");
1016 }
1017 mtcp_abort ();
1018 }
1019 }
1020
1021 /* If the caller wants the child tid but didn't have CLEARTID, pass the tid back to it */
1022
1023 if ((thread -> clone_flags & (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) == CLONE_CHILD_SETTID) {
1024 *(thread -> given_tidptr) = thread -> child_tid;
1025 }
1026
1027 /* Maybe enable checkpointing by default */
1028
1029 if (threadenabledefault) mtcp_ok ();
1030
1031 #ifdef PTRACE
1032 init_thread_local();
1033 #endif
1034
1035 /* Call the user's function for whatever processing they want done */
1036
1037 DPRINTF (("mtcp threadcloned*: calling %p (%p)\n", thread -> fn, thread -> arg));
1038 rc = (*(thread -> fn)) (thread -> arg);
1039 DPRINTF (("mtcp threadcloned*: returned %d\n", rc));
1040
1041 /* Make sure checkpointing is inhibited while we clean up and exit */
1042 /* Otherwise, checkpointer might wait forever for us to re-enable */
1043
1044 mtcp_no ();
1045
1046 /* Do whatever to unlink and free thread block */
1047
1048 threadisdead (thread);
1049
1050 /* Return the user's status as the exit code */
1051
1052 return (rc);
1053 }
1054
1055 /*****************************************************************************/
1056 /* */
1057 /* set_tid_address wrapper routine */
1058 /* */
1059 /* We save the new address of the tidptr that will get cleared when the */
1060 /* thread exits */
1061 /* */
1062 /*****************************************************************************/
1063
1064 static long set_tid_address (int *tidptr)
1065
1066 {
1067 long rc;
1068 Thread *thread;
1069
1070 thread = getcurrenthread ();
1071 DPRINTF (("set_tid_address wrapper*: thread %p -> tid %d, tidptr %p\n",
1072 thread, thread -> tid, tidptr));
1073 thread -> actual_tidptr = tidptr; // save new tidptr so subsequent restore will create with new pointer
1074 rc = mtcp_sys_set_tid_address(tidptr);
1075 return (rc); // now we tell kernel to change it for the current thread
1076 }
1077
1078 /*****************************************************************************/
1079 /* */
1080 /* Link thread struct to the lists and finish filling it in */
1081 /* */
1082 /* Input: */
1083 /* */
1084 /* thread = thread to set up */
1085 /* */
1086 /* Output: */
1087 /* */
1088 /* thread linked to 'threads' list and 'motherofall' tree */
1089 /* thread -> tid = filled in with thread id */
/*    NOTE: this routine itself neither changes thread -> state nor sets up  */
/*    a signal handler; on the __clone path the state is initialized         */
/*    (to ST_RUNDISABLED or ST_CKPNTHREAD) before the thread is started      */
1093 /* */
1094 /*****************************************************************************/
1095
1096 static void setupthread (Thread *thread)
1097
1098 {
1099 Thread *parent;
1100
1101 /* Save the thread's ID number and put in threads list so we can look it up */
1102 /* Set state to disable checkpointing so checkpointer won't race between adding to list and setting up handler */
1103
1104 thread -> tid = mtcp_sys_kernel_gettid ();
1105 thread -> original_tid = GETTID ();
1106
1107 DPRINTF (("mtcp setupthread*: thread %p -> tid %d\n", thread, thread->tid));
1108
1109 lock_threads ();
1110
1111 if ((thread -> next = threads) != NULL) {
1112 thread -> next -> prev = &(thread -> next);
1113 }
1114 thread -> prev = &threads;
1115 threads = thread;
1116
1117 parent = thread -> parent;
1118 if (parent != NULL) {
1119 thread -> siblings = parent -> children;
1120 parent -> children = thread;
1121 }
1122
1123 unlk_threads ();
1124 }
1125
1126 /*****************************************************************************/
1127 /* */
1128 /* Set up 'clone_entry' variable */
1129 /* */
1130 /* Output: */
1131 /* */
1132 /* clone_entry = points to clone routine within libc.so */
1133 /* */
1134 /*****************************************************************************/
1135
1136 static void setup_clone_entry (void)
1137
1138 {
1139 char *p, *tmp;
1140 int mapsfd;
1141
1142 /* Get name of whatever concoction we have for a libc shareable image */
1143 /* This is used by the wrapper routines */
1144
1145 tmp = getenv ("MTCP_WRAPPER_LIBC_SO");
1146 if (tmp != NULL) {
1147 if (strlen(tmp) >= sizeof(mtcp_libc_area.name)) {
1148 mtcp_printf("mtcp setup_clone_entry: libc area name (%s) too long (>=1024 chars)\n",
1149 tmp);
1150 mtcp_abort();
1151 }
1152 strncpy (mtcp_libc_area.name, tmp, sizeof mtcp_libc_area.name);
1153 } else {
1154 mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
1155 if (mapsfd < 0) {
1156 mtcp_printf ("mtcp_init: error opening /proc/self/maps: %s\n", strerror (mtcp_sys_errno));
1157 mtcp_abort ();
1158 }
1159 p = NULL;
1160 while (readmapsline (mapsfd, &mtcp_libc_area)) {
1161 p = strstr (mtcp_libc_area.name, "/libc");
1162 if ((p != NULL) && ((p[5] == '-') || (p[5] == '.'))) break;
1163 }
1164 close (mapsfd);
1165 if (p == NULL) {
1166 mtcp_printf ("mtcp_init: cannot find */libc[-.]* in /proc/self/maps\n");
1167 mtcp_abort ();
1168 }
1169 }
1170 mtcp_libc_dl_handle = dlopen (mtcp_libc_area.name, RTLD_LAZY | RTLD_GLOBAL);
1171 if (mtcp_libc_dl_handle == NULL) {
1172 mtcp_printf ("mtcp_init: error opening libc shareable %s: %s\n", mtcp_libc_area.name, dlerror ());
1173 mtcp_abort ();
1174 }
1175
1176 /* Find the clone routine therein */
1177
1178 clone_entry = mtcp_get_libc_symbol ("__clone");
1179 }
1180
1181 /********************************************************************************************************************************/
1182 /* */
1183 /* Thread has exited - unlink it from lists and free struct */
1184 /* */
1185 /* Input: */
1186 /* */
1187 /* thread = thread that has exited */
1188 /* */
1189 /* Output: */
1190 /* */
1191 /* thread removed from 'threads' list and motherofall tree */
1192 /* thread pointer no longer valid */
1193 /* checkpointer woken if waiting for this thread */
1194 /* */
1195 /********************************************************************************************************************************/
1196
1197 static void threadisdead (Thread *thread)
1198
1199 {
1200 Thread **lthread, *parent, *xthread;
1201
1202 lock_threads ();
1203
1204 DPRINTF (("mtcp threadisdead*: thread %p -> tid %d\n", thread, thread -> tid));
1205
1206 /* Remove thread block from 'threads' list */
1207
1208 if ((*(thread -> prev) = thread -> next) != NULL) {
1209 thread -> next -> prev = thread -> prev;
1210 }
1211
1212 /* Remove thread block from parent's list of children */
1213
1214 parent = thread -> parent;
1215 if (parent != NULL) {
1216 for (lthread = &(parent -> children); (xthread = *lthread) != thread; lthread = &(xthread -> siblings)) {}
1217 *lthread = xthread -> siblings;
1218 }
1219
1220 /* If this thread has children, give them to its parent */
1221
1222 if (parent != NULL) {
1223 while ((xthread = thread -> children) != NULL) {
1224 thread -> children = xthread -> siblings;
1225 xthread -> siblings = parent -> children;
1226 parent -> children = xthread;
1227 }
1228 } else {
1229 while ((xthread = thread -> children) != NULL) {
1230 thread -> children = xthread -> siblings;
1231 xthread -> siblings = motherofall;
1232 motherofall = xthread;
1233 }
1234 }
1235
1236 unlk_threads ();
1237
1238 /* If checkpointer is waiting for us, wake it to see this thread no longer in list */
1239
1240 mtcp_state_futex (&(thread -> state), FUTEX_WAKE, 1, NULL);
1241
1242 mtcp_state_destroy( &(thread -> state) );
1243
1244 free (thread);
1245 }
1246
1247 void *mtcp_get_libc_symbol (char const *name)
1248
1249 {
1250 void *temp;
1251
1252 temp = dlsym (mtcp_libc_dl_handle, name);
1253 if (temp == NULL) {
1254 mtcp_printf ("mtcp_get_libc_symbol: error getting %s from %s: %s\n",
1255 name, mtcp_libc_area.name, dlerror ());
1256 mtcp_abort ();
1257 }
1258 return (temp);
1259 }
1260
1261 /********************************************************************************************************************************/
1262 /* */
1263 /* Call this when it's OK to checkpoint */
1264 /* */
1265 /********************************************************************************************************************************/
1266
1267 int mtcp_ok (void)
1268
1269 {
1270 Thread *thread;
1271
1272 if (getenv("MTCP_NO_CHECKPOINT"))
1273 return 0;
1274 thread = getcurrenthread ();
1275
1276 again:
1277 switch (mtcp_state_value(&thread -> state)) {
1278
1279 /* Thread was running normally with checkpointing disabled. Enable checkpointing then just return. */
1280
1281 case ST_RUNDISABLED: {
1282 if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_RUNDISABLED)) goto again;
1283 return (0);
1284 }
1285
1286 /* Thread was running normally with checkpointing already enabled. So just return as is. */
1287
1288 case ST_RUNENABLED: {
1289 return (1);
1290 }
1291
1292 /* Thread was running with checkpointing disabled, but the checkpointhread wants to write a checkpoint. So mark the thread */
1293 /* as having checkpointing enabled, then just 'manually' call the signal handler as if the signal to suspend were just sent. */
1294
1295 case ST_SIGDISABLED: {
1296 if (!mtcp_state_set (&(thread -> state), ST_SIGENABLED, ST_SIGDISABLED)) goto again;
1297 stopthisthread (0);
1298 return (0);
1299 }
1300
1301 /* Thread is running with checkpointing enabled, but the checkpointhread wants to write a checkpoint and has sent a signal */
1302 /* telling the thread to call 'stopthisthread'. So we'll just keep going as is until the signal is actually delivered. */
1303
1304 case ST_SIGENABLED: {
1305 return (1);
1306 }
1307
1308 /* Thread is the checkpointhread so we just ignore the call (from threadcloned routine). */
1309
1310 case ST_CKPNTHREAD: {
1311 return (-1);
1312 }
1313
1314 /* How'd we get here? */
1315
1316 default: {
1317 mtcp_abort ();
1318 return (0); /* NOTREACHED : stop compiler warning */
1319 }
1320 }
1321 }
1322
1323 /* Likewise, disable checkpointing */
1324
1325 int mtcp_no (void)
1326 {
1327 Thread *thread;
1328
1329 if (getenv("MTCP_NO_CHECKPOINT"))
1330 return 0;
1331 thread = getcurrenthread ();
1332
1333 again:
1334 switch (mtcp_state_value(&thread -> state)) {
1335 case ST_RUNDISABLED: {
1336 return (0);
1337 }
1338
1339 case ST_RUNENABLED: {
1340 if (!mtcp_state_set (&(thread -> state), ST_RUNDISABLED, ST_RUNENABLED)) goto again;
1341 return (1);
1342 }
1343
1344 case ST_SIGDISABLED: {
1345 return (0);
1346 }
1347
1348 case ST_SIGENABLED: {
1349 stopthisthread (0);
1350 goto again;
1351 }
1352
1353 default: {
1354 mtcp_abort ();
1355 return (0); /* NOTREACHED : stop compiler warning */
1356 }
1357 }
1358 }
1359
1360 /* This is used by ../dmtcp/src/mtcpinterface.cpp */
1361 void mtcp_kill_ckpthread (void)
1362 {
1363 Thread *thread;
1364
1365 lock_threads ();
1366 for (thread = threads; thread != NULL; thread = thread -> next) {
1367 if ( mtcp_state_value(&thread -> state) == ST_CKPNTHREAD ) {
1368 unlk_threads ();
1369 DPRINTF(("mtcp_kill_ckpthread: Kill checkpinthread, tid=%d\n",thread->tid));
1370 mtcp_sys_kernel_tkill(thread -> tid, STOPSIGNAL);
1371 return;
1372 }
1373 }
1374 unlk_threads ();
1375 }
1376
1377
1378 /*************************************************************************/
1379 /* */
1380 /* Save and restore terminal settings. */
1381 /* */
1382 /*************************************************************************/
1383
/* Terminal state captured by save_term_settings():                       */
/*   saved_termios_exists = 1 when saved_termios holds valid settings     */
/*   saved_termios        = attributes of stdin                           */
/*   win                  = window size of stdin                          */
static int saved_termios_exists = 0;
static struct termios saved_termios;
static struct winsize win;

/* Capture the attributes and window size of stdin, provided stdin is a
 * terminal whose attributes can be read; otherwise just record that nothing
 * was saved.
 */
static void save_term_settings() {
  saved_termios_exists = 0;

  if (!isatty(STDIN_FILENO)) {
    return;
  }
  if (tcgetattr(STDIN_FILENO, &saved_termios) < 0) {
    return;
  }

  saved_termios_exists = 1;
  ioctl (STDIN_FILENO, TIOCGWINSZ, (char *) &win);
}
/* Apply '*termios_p' to 'fd', repeating until the settings read back stop
 * changing.
 *
 *   fd               = terminal file descriptor
 *   optional_actions = when to apply (TCSANOW, TCSADRAIN, TCSAFLUSH);
 *                      passed through to tcsetattr on every attempt
 *   termios_p        = settings to apply
 *
 * Returns 0 on success, -1 as soon as tcsetattr or tcgetattr fails (errno is
 * left as set by the failing call).
 */
int safe_tcsetattr(int fd, int optional_actions,
                   const struct termios *termios_p) {
  struct termios old_termios, new_termios;
  /* We will compare old and new, and we don't want uninitialized data */
  memset(&new_termios, 0, sizeof(new_termios));
  /* tcsetattr returns success as long as at least one of requested
   * changes was executed.  So, repeat until no more changes.
   */
  do {
    memcpy(&old_termios, &new_termios, sizeof(new_termios));
    if (tcsetattr(fd, optional_actions, termios_p) == -1) return -1;
    if (tcgetattr(fd, &new_termios) == -1) return -1;
  } while (memcmp(&new_termios, &old_termios, sizeof(new_termios)) != 0);
  return 0;
}
1409 static void restore_term_settings() {
1410 if (saved_termios_exists){
1411 /* First check if we are in foreground. If not, skip this and print
1412 * warning. If we try to call tcsetattr in background, we will hang up.
1413 */
1414 int foreground = (tcgetpgrp(STDIN_FILENO) == getpgrp());
1415 DPRINTF(("restore terminal attributes, check foreground status first: %d\n",
1416 foreground));
1417 if (foreground) {
1418 if ( ( ! isatty(STDIN_FILENO)
1419 || safe_tcsetattr(STDIN_FILENO, TCSANOW, &saved_termios) == -1) )
1420 DPRINTF(("WARNING: mtcp finishrestore*: failed to restore terminal\n"));
1421 else {
1422 struct winsize cur_win;
1423 DPRINTF(("mtcp finishrestore*: restored terminal\n"));
1424 ioctl (STDIN_FILENO, TIOCGWINSZ, (char *) &cur_win);
1425 /* ws_row/ws_col was probably not 0/0 prior to checkpoint. We change
1426 * it back to last known row/col prior to checkpoint, and then send a
1427 * SIGWINCH (see below) to notify process that window might have changed
1428 */
1429 if (cur_win.ws_row == 0 && cur_win.ws_col == 0)
1430 ioctl (STDIN_FILENO, TIOCSWINSZ, (char *) &win);
1431 }
1432 } else {
1433 DPRINTF(("WARNING: mtcp finishrestore*: skip restore terminal step\n"
1434 " -- we are in BACKGROUND\n"));
1435 }
1436 }
1437 if (kill(getpid(), SIGWINCH) == -1) {} /* No remedy if error */
1438 }
1439
1440
1441 /*************************************************************************/
1442 /* */
1443 /* This executes as a thread. It sleeps for the checkpoint interval */
1444 /* seconds, then wakes to write the checkpoint file. */
1445 /* */
1446 /*************************************************************************/
1447
1448 static void *checkpointhread (void *dummy)
1449 {
1450 int needrescan;
1451 struct timespec sleeperiod;
1452 struct timeval started, stopped;
1453 Thread *thread;
1454 char * dmtcp_checkpoint_filename = NULL;
1455
1456 /* This is the start function of the checkpoint thread.
1457 * We also call getcontext to get a snapshot of this call frame,
1458 * since we will never exit this call frame. We always return
1459 * to this call frame at time of startup, on restart. Hence, restart
1460 * will forget any modifications to our local variables since restart.
1461 */
1462 static int originalstartup = 1;
1463
1464 #ifdef PTRACE
1465 init_thread_local();
1466 check_size_for_ptrace_file (ptrace_shared_file);
1467 check_size_for_ptrace_file (ptrace_setoptions_file);
1468 check_size_for_ptrace_file (checkpoint_threads_file);
1469 #endif
1470
1471 /* We put a timeout in case the thread being waited for exits whilst we are waiting */
1472
1473 static struct timespec const enabletimeout = { 10, 0 };
1474
1475 DPRINTF (("mtcp checkpointhread*: %d started\n", mtcp_sys_kernel_gettid ()));
1476
1477 /* Set up our restart point, ie, we get jumped to here after a restore */
1478
1479 ckpthread = getcurrenthread ();
1480
1481 save_sig_state( ckpthread );
1482 save_tls_state (ckpthread);
1483 /* Release user thread after we've initialized. */
1484 sem_post(&sem_start);
1485 if (getcontext (&(ckpthread -> savctx)) < 0) mtcp_abort ();
1486
1487 DPRINTF (("mtcp checkpointhread*: after getcontext. current_tid %d, original_tid:%d\n",
1488 mtcp_sys_kernel_gettid(), ckpthread->original_tid));
1489 if (originalstartup)
1490 originalstartup = 0;
1491 else {
1492
1493 /* We are being restored. Wait for all other threads to finish being restored before resuming checkpointing. */
1494
1495 DPRINTF (("mtcp checkpointhread*: waiting for other threads after restore\n"));
1496 wait_for_all_restored ();
1497 #ifdef PTRACE
1498 create_file (GETTID());
1499 #endif
1500 DPRINTF (("mtcp checkpointhread*: resuming after restore\n"));
1501 }
1502
1503 /* Reset the verification counter - on init, this will set it to it's start value. */
1504 /* After a verification, it will reset it to its start value. After a normal */
1505 /* restore, it will set it to its start value. So this covers all cases. */
1506
1507 verify_count = verify_total;
1508 DPRINTF (("After verify count mtcp checkpointhread*: %d started\n",
1509 mtcp_sys_kernel_gettid ()));
1510
1511 while (1) {
1512 #ifdef PTRACE
1513 int ptraced_by = 0;
1514 #endif
1515
1516 /* Wait a while between writing checkpoint files */
1517
1518 if (callback_sleep_between_ckpt == NULL)
1519 {
1520 memset (&sleeperiod, 0, sizeof sleeperiod);
1521 sleeperiod.tv_sec = intervalsecs;
1522 while ((nanosleep (&sleeperiod, &sleeperiod) < 0) && (errno == EINTR)) {}
1523 }
1524 else
1525 {
1526 DPRINTF(("mtcp checkpointhread*: before callback_sleep_between_ckpt(%d)\n",intervalsecs));
1527 (*callback_sleep_between_ckpt)(intervalsecs);
1528 DPRINTF(("mtcp checkpointhread*: after callback_sleep_between_ckpt(%d)\n",intervalsecs));
1529 }
1530
1531 mtcp_sys_gettimeofday (&started, NULL);
1532 checkpointsize = 0;
1533
1534 #ifdef PTRACE
1535 // Refresh ptrace information
1536 has_ptrace_file = 0;
1537 delete_ptrace_leader = -1;
1538 has_setoptions_file = 0;
1539 delete_setoptions_leader = -1;
1540 has_checkpoint_file = 0;
1541 delete_checkpoint_leader = -1;
1542 process_ptrace_info( &delete_ptrace_leader, &has_ptrace_file,
1543 &delete_setoptions_leader, &has_setoptions_file,
1544 &delete_checkpoint_leader, &has_checkpoint_file);
1545
1546 for (thread = threads; thread != NULL; thread = thread -> next) {
1547 int i;
1548 for (i = 0; i < ptrace_pairs_count; i++) {
1549 DPRINTF(("COMPARE: intf=%d, tid=%d\n",
1550 ptrace_pairs[i].inferior, thread->original_tid));
1551 if( ptrace_pairs[i].inferior == thread->original_tid ){
1552 ptraced_by = ptrace_pairs[i].superior;
1553 break;
1554 }
1555 }
1556 if( ptraced_by )
1557 break;
1558 }
1559
1560 DPRINTF(("\n\n%d ptraced by %d\n\n",(thread) ? thread->tid : 0,ptraced_by));
1561 if( ptraced_by ){
1562 DPRINTF(("\n\n%d Wait for superior %d\n\n",thread->tid,ptraced_by));
1563 ptrace_wait4(ptraced_by);
1564 //sleep(1);
1565 DPRINTF(("\n\n%d Wait for superior %d - SUCCESS\n\n",thread->tid,ptraced_by));
1566 }
1567 #endif
1568
1569 /* Halt all other threads - force them to call stopthisthread */
1570 /* If any have blocked checkpointing, wait for them to unblock before signalling */
1571
1572 rescan:
1573 needrescan = 0;
1574 lock_threads ();
1575 for (thread = threads; thread != NULL; thread = thread -> next) {
1576
1577 /* If thread no longer running, remove it from thread list */
1578
1579 again:
1580 if (*(thread -> actual_tidptr) == 0) {
1581 DPRINTF (("mtcp checkpointhread*: thread %d disappeared\n", thread -> tid));
1582 unlk_threads ();
1583 threadisdead (thread);
1584 goto rescan;
1585 }
1586
1587 /* Do various things based on thread's state */
1588
1589 switch (mtcp_state_value (&thread -> state) ) {
1590
1591 /* Thread is running but has checkpointing disabled */
1592 /* Tell the mtcp_ok routine that we are waiting for it */
1593 /* We will need to rescan so we will see it suspended */
1594
1595 case ST_RUNDISABLED: {
1596 if (!mtcp_state_set (&(thread -> state), ST_SIGDISABLED, ST_RUNDISABLED)) goto again;
1597 needrescan = 1;
1598 break;
1599 }
1600
1601 /* Thread is running and has checkpointing enabled */
1602 /* Send it a signal so it will call stopthisthread */
1603 /* We will need to rescan (hopefully it will be suspended by then) */
1604
1605 case ST_RUNENABLED: {
1606 if (!mtcp_state_set (&(thread -> state), ST_SIGENABLED, ST_RUNENABLED)) goto again;
1607 #ifdef PTRACE
1608 ptrace_save_threads_state ();
1609 int index;
1610 char inferior_st = 'N';
1611 char inf_st;
1612 for (index = 0; index < ptrace_pairs_count; index++) {
1613 inf_st = procfs_state(ptrace_pairs[index].inferior);
1614 DPRINTF(("tid = %d now=%c stored=%c superior = %d inferior = %d\n",
1615 GETTID(), inf_st, ptrace_pairs[index].inferior_st,
1616 ptrace_pairs[index].superior, ptrace_pairs[index].inferior));
1617 if (ptrace_pairs[index].inferior == thread -> original_tid) {
1618 inferior_st = ptrace_pairs[index].inferior_st;
1619 break;
1620 }
1621 }
1622 DPRINTF(("%d %c\n", GETTID(), inferior_st));
1623 if (inferior_st == 'N') {
1624 // superior
1625 if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1626 if (mtcp_sys_errno != ESRCH) {
1627 mtcp_printf("mtcp checkpointhread: error signalling thread %d: %s\n",
1628 thread -> tid, strerror (mtcp_sys_errno));
1629 }
1630 unlk_threads ();
1631 threadisdead (thread);
1632 goto rescan;
1633 }
1634 }
1635 else {
1636 // inferior
1637 DPRINTF(("++++++++++++++++++++++++++++++++%c %d\n", inferior_st, thread -> original_tid));
1638 if (inferior_st != 'T') {
1639 if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1640 if (mtcp_sys_errno != ESRCH) {
1641 mtcp_printf ("mtcp checkpointhread: error signalling thread %d: %s\n",
1642 thread -> tid, strerror (mtcp_sys_errno));
1643 }
1644 unlk_threads ();
1645 threadisdead (thread);
1646 goto rescan;
1647 }
1648 }
1649 create_file( thread -> original_tid );
1650 }
1651 #else
1652 if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1653 if (mtcp_sys_errno != ESRCH) {
1654 mtcp_printf ("mtcp checkpointhread: error signalling thread %d: %s\n",
1655 thread -> tid, strerror (mtcp_sys_errno));
1656 }
1657 unlk_threads ();
1658 threadisdead (thread);
1659 goto rescan;
1660 }
1661 #endif
1662 needrescan = 1;
1663 break;
1664 }
1665
1666 /* Thread is running, we have signalled it to stop, but it has
1667 * checkpointing disabled. So we wait for it to change state.
1668 * We have to unlock because it may need lock to change state.
1669 */
1670
1671 case ST_SIGDISABLED: {
1672 unlk_threads ();
1673 mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SIGDISABLED,
1674 &enabletimeout);
1675 goto rescan;
1676 }
1677
1678 /* Thread is running and we have sent signal to stop it */
1679 /* So we have to wait for it to change state (enter signal handler) */
1680 /* We have to unlock because it may try to use lock meanwhile */
1681
1682 case ST_SIGENABLED: {
1683 unlk_threads ();
1684 mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SIGENABLED,
1685 &enabletimeout);
1686 goto rescan;
1687 }
1688
1689 /* Thread has entered signal handler and is saving its context.
1690 * So we have to wait for it to finish doing so. We don't need
1691 * to unlock because it won't use lock before changing state.
1692 */
1693
1694 case ST_SUSPINPROG: {
1695 mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SUSPINPROG,
1696 &enabletimeout);
1697 goto again;
1698 }
1699
1700 /* Thread is suspended and all ready for us to write checkpoint file */
1701
1702 case ST_SUSPENDED: {
1703 break;
1704 }
1705
1706 /* Don't do anything to the checkpointhread (this) thread */
1707
1708 case ST_CKPNTHREAD: {
1709 break;
1710 }
1711
1712 /* Who knows? */
1713
1714 default: {
1715 mtcp_abort ();
1716 }
1717 }
1718 }
1719 unlk_threads ();
1720
1721 /* If need to rescan (ie, some thread possibly not in ST_SUSPENDED STATE),
1722 * check them all again
1723 */
1724
1725 if (needrescan) goto rescan;
1726 RMB; // matched by WMB in stopthisthread
1727 DPRINTF (("mtcp checkpointhread*: everything suspended\n"));
1728
1729 /* If no threads, we're all done */
1730
1731 if (threads == NULL) {
1732 DPRINTF (("mtcp checkpointhread*: exiting (no threads)\n"));
1733 return (NULL);
1734 }
1735
1736 /* Call weak symbol of this file, possibly overridden by user's strong symbol.
1737 * User must compile his/her code with -Wl,-export-dynamic to make it visible.
1738 */
1739 mtcpHookPreCheckpoint();
1740
1741 if (!dmtcp_exists) {
1742 save_sig_handlers();
1743 }
1744
1745 /* All other threads halted in 'stopthisthread' routine (they are all
1746 * in state ST_SUSPENDED). It's safe to write checkpoint file now.
1747 */
1748 if (callback_pre_ckpt != NULL){
1749 // Here we want to synchronize the shared memory pages with the backup files
1750 DPRINTF(("mtcp checkpointhread*: syncing shared memory with backup files\n"));
1751 sync_shared_mem();
1752
1753 DPRINTF(("mtcp checkpointhread*: before callback_pre_ckpt() (&%x,%x) \n",
1754 &callback_pre_ckpt, callback_pre_ckpt));
1755 (*callback_pre_ckpt)(&dmtcp_checkpoint_filename);
1756 if (dmtcp_checkpoint_filename &&
1757 strcmp(dmtcp_checkpoint_filename, "/dev/null") != 0) {
1758 mtcp_sys_strcpy(perm_checkpointfilename, dmtcp_checkpoint_filename);
1759 DPRINTF(("mtcp checkpointhread*: Checkpoint filename changed to %s\n",
1760 perm_checkpointfilename));
1761 }
1762 }
1763
1764 #ifdef PTRACE
1765 /* If old stale files of these names exist, we append, with big problems
1766 * It's okay if files don't exist and unlink fails.
1767 * Pre_ckpt is a barrier from coordinator. So, all processes finished
1768 * reading ptrace pairs from files prior to this barrier.
1769 */
1770 unlink(ptrace_shared_file);
1771 unlink(ptrace_setoptions_file);
1772 unlink(checkpoint_threads_file);
1773 #endif
1774
1775 mtcp_saved_break = (void*) mtcp_sys_brk(NULL); // kernel returns mm->brk when passed zero
1776 /* Do this once, same for all threads. But restore for each thread. */
1777 if (mtcp_have_thread_sysinfo_offset())
1778 saved_sysinfo = mtcp_get_thread_sysinfo();
1779 /* Do this once. It's the same for all threads. */
1780 save_term_settings();
1781
1782 if (getcwd(saved_working_directory, MTCP_MAX_PATH) == NULL) {
1783 // buffer wasn't large enough
1784 perror("getcwd");
1785 mtcp_printf ("getcwd failed.");
1786 mtcp_abort ();
1787 }
1788
1789 DPRINTF (("mtcp checkpointhread*: mtcp_saved_break=%p\n", mtcp_saved_break));
1790
1791 if ( dmtcp_checkpoint_filename == NULL ||
1792 strcmp (dmtcp_checkpoint_filename, "/dev/null") != 0) {
1793 checkpointeverything ();
1794 } else {
1795 mtcp_printf("mtcp checkpointhread*: received \'/dev/null\'" \
1796 " as ckpt filename.\n*** Skipping checkpoint. ***\n");
1797 }
1798
1799 if (callback_post_ckpt != NULL){
1800 DPRINTF(("mtcp checkpointhread*: before callback_post_ckpt() (&%x,%x) \n",
1801 &callback_post_ckpt, callback_post_ckpt));
1802 (*callback_post_ckpt)(0);
1803 }
1804 if (showtiming) {
1805 mtcp_sys_gettimeofday (&stopped, NULL);
1806 stopped.tv_usec += (stopped.tv_sec - started.tv_sec) * 1000000 - started.tv_usec;
1807 mtcp_printf ("mtcp checkpoint: time %u uS, size %u megabytes," \
1808 " avg rate %u MB/s\n",
1809 stopped.tv_usec, (unsigned int)(checkpointsize / 1000000),
1810 (unsigned int)(checkpointsize / stopped.tv_usec));
1811 }
1812
1813 /* Call weak symbol of this file, possibly overridden by user's strong symbol.
1814 * User must compile his/her code with -Wl,-export-dynamic to make it visible.
1815 */
1816 mtcpHookPostCheckpoint();
1817
1818 /* Resume all threads. But if we're doing a checkpoint verify,
1819 * abort all threads except the main thread, as we don't want them
1820 * running when we exec the mtcp_restore program.
1821 */
1822
1823 DPRINTF (("mtcp checkpointhread*: resuming everything\n"));
1824 lock_threads();
1825 for (thread = threads; thread != NULL; thread = thread -> next) {
1826 if (mtcp_state_value(&(thread -> state)) != ST_CKPNTHREAD) {
1827 if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_SUSPENDED))
1828 mtcp_abort();
1829 mtcp_state_futex(&(thread -> state), FUTEX_WAKE, 1, NULL);
1830 }
1831 }
1832 unlk_threads ();
1833 DPRINTF (("mtcp checkpointhread*: everything resumed\n"));
1834 /* But if we're doing a restore verify, just exit. The main thread is doing the exec to start the restore. */
1835 #ifdef PTRACE
1836 create_file (GETTID());
1837 #endif
1838 if ((verify_total != 0) && (verify_count == 0)) return (NULL);
1839 }
1840 }
1841
/**
 * Decide whether the checkpoint image should be piped through gzip.
 *
 * The decision is taken from the environment variable MTCP_GZIP, or from
 * DMTCP_GZIP if MTCP_GZIP is not set.  The value is parsed as an integer
 * (strtol with base 0, so decimal, octal and hex forms are accepted):
 *   - variable unset        -> compression enabled (the default)
 *   - numeric value of zero -> compression disabled
 *   - any other number      -> compression enabled
 *   - empty / not a number  -> a warning is printed, compression disabled
 *
 * Returns 1 if compression should be used, 0 otherwise.
 */
static int test_use_compression(void)
{
  const char *setting;
  char *endptr;
  long value;

  setting = getenv("MTCP_GZIP");
  // allow alternate name for env var
  if (setting == NULL)
    setting = getenv("DMTCP_GZIP");
  // env var is unset, let's default to enabled
  // to disable compression, run with MTCP_GZIP=0
  if (setting == NULL)
    return 1;

  value = strtol(setting, &endptr, 0);
  if ( *setting == '\0' || *endptr != '\0' ) {
    mtcp_printf("WARNING: MTCP_GZIP/DMTCP_GZIP defined as %s (not a number)\n"
                " Checkpoint image will not be compressed.\n",
                setting);
    return 0;
  }

  /* Use the parsed value, not the spelling: "0", "00", "0x0" and " 0"
   * all mean "disabled".
   */
  return value != 0;
}
1873
1874 static int open_ckpt_to_write(int fd, int pipe_fds[2], char *gzip_path)
1875 {
1876 pid_t cpid;
1877 char *gzip_args[] = { "gzip", "-1", "-", NULL };
1878
1879 gzip_args[0] = gzip_path;
1880
1881 cpid = mtcp_sys_fork();
1882 if (cpid == -1) {
1883 mtcp_printf("WARNING: error forking child process `%s`. Compression will "
1884 "not be used [%s].\n", gzip_path, strerror(mtcp_sys_errno));
1885 close(pipe_fds[0]);
1886 close(pipe_fds[1]);
1887 //fall through to return fd
1888 } else if (cpid > 0) { /* parent process */
1889 //Before running gzip in child process, we must not use LD_PRELOAD.
1890 // See revision log 342 for details concerning bash.
1891 mtcp_ckpt_gzip_child_pid = cpid;
1892 if (close(pipe_fds[0]) == -1)
1893 mtcp_printf("WARNING: (in open_ckpt_to_write) close failed: %s\n",
1894 strerror(errno));
1895 if (close(fd) == -1)
1896 mtcp_printf("WARNING: (in open_ckpt_to_write) close failed: %s\n",
1897 strerror(errno));
1898 fd=pipe_fds[1];//change return value
1899 } else { /* child process */
1900 static int (*libc_unsetenv) (const char *name);
1901 static int (*libc_execvp) (const char *path, char *const argv[]);
1902
1903 close(pipe_fds[1]);
1904 dup2(pipe_fds[0], STDIN_FILENO);
1905 close(pipe_fds[0]);
1906 dup2(fd, STDOUT_FILENO);
1907 close(fd);
1908
1909 // Don't load dmtcphijack.so, etc. in exec.
1910 unsetenv("LD_PRELOAD"); // If in bash, this is bash env. var. version
1911 libc_unsetenv = mtcp_get_libc_symbol("unsetenv");
1912 (*libc_unsetenv)("LD_PRELOAD");
1913
1914 libc_execvp = mtcp_get_libc_symbol("execvp");
1915 (*libc_execvp)(gzip_path, gzip_args);
1916
1917 /* should not arrive here */
1918 mtcp_printf("ERROR: compression failed! No checkpointing will be"
1919 "performed! Cancel now!\n");
1920 mtcp_sys_exit(1);
1921 }
1922
1923 return fd;
1924 }
1925
1926
1927 /********************************************************************************************************************************/
1928 /* */
1929 /* This routine is called from time-to-time to write a new checkpoint file. */
1930 /* It assumes all the threads are suspended. */
1931 /* */
1932 /********************************************************************************************************************************/
1933
1934 static void checkpointeverything (void)
1935 {
1936 Area area;
1937 int fd, mapsfd;
1938 VA area_begin, area_end;
1939 int stack_was_seen = 0;
1940 int vsyscall_exists = 0;
1941 int forked_checkpointing = 0;
1942 int forked_cpid;
1943 int use_compression = -1; /* decide later */
1944 int pipe_fds[2]; /* for potential piping */
1945 char *gzip_cmd = "gzip";
1946 char gzip_path[MTCP_MAX_PATH];
1947 char tmpDMTCPHeaderBuf[] = "/tmp/dmtcp.XXXXXX";
1948 char *tmpDMTCPHeaderFileName = tmpDMTCPHeaderBuf;
1949 int tmpDMTCPHeaderFd = -1;
1950
1951 static void *const frpointer = finishrestore;
1952
1953 DPRINTF (("mtcp checkpointeverything*: tid %d\n", mtcp_sys_kernel_gettid ()));
1954
1955 if (getenv("MTCP_FORKED_CHECKPOINT") != NULL)
1956 forked_checkpointing = 1;
1957 #ifdef TEST_FORKED_CHECKPOINTING
1958 forked_checkpointing = 1;
1959 #endif
1960
1961 if (callback_write_dmtcp_header != 0) {
1962 /* Temp file for DMTCP header; will be written into the checkpoint file. */
1963 tmpDMTCPHeaderFd = mkstemp(tmpDMTCPHeaderFileName);
1964 if (tmpDMTCPHeaderFd < 0) {
1965 mtcp_printf("error %d creating temp file: %s\n", errno, strerror(errno));
1966 mtcp_abort();
1967 }
1968
1969 if (unlink(tmpDMTCPHeaderFileName) == -1) {
1970 mtcp_printf("NOTE: error %d unlinking temp file: %s\n", errno,
1971 strerror(errno));
1972 }
1973
1974 /* Better to do this in parent, not child, for most accurate header info */
1975 (*callback_write_dmtcp_header)(tmpDMTCPHeaderFd);
1976 }
1977
1978 if (forked_checkpointing) {
1979 forked_cpid = mtcp_sys_fork();
1980 if (forked_cpid == -1) {
1981 mtcp_printf("WARNING: Failed to do forked checkpointing,"
1982 " trying normal checkpoint\n");
1983 } else if (forked_cpid > 0) {
1984 /* Parent process*/
1985 if (tmpDMTCPHeaderFd != -1)
1986 close(tmpDMTCPHeaderFd);
1987 // Calling waitpid here, but on 32-bit Linux, libc:waitpid() calls wait4()
1988 if ( waitpid(forked_cpid, NULL, 0) == -1 )
1989 DPRINTF (("mtcp restoreverything*: error waitpid: errno: %d",
1990 mtcp_sys_errno));
1991 DPRINTF (("mtcp checkpointeverything*: checkpoint complete\n"));
1992 return;
1993 } else {
1994 pid_t grandchild_pid = mtcp_sys_fork();
1995 if (grandchild_pid == -1) {
1996 mtcp_printf("WARNING: Forked checkpoint failed, no checkpoint available\n");
1997 } else if (grandchild_pid > 0) {
1998 mtcp_sys_exit(0); /* child exits */
1999 }
2000 /* grandchild continues; no need now to waitpid() on grandchild */
2001 DPRINTF (("mtcp checkpointeverything*: inside grandchild process\n"));
2002 }
2003 }
2004
2005 /* 1. Test if using compression */
2006 use_compression = test_use_compression();
2007 /* 2. Get gzip path */
2008 if (use_compression && mtcp_find_executable(gzip_cmd, gzip_path) == NULL) {
2009 mtcp_printf("WARNING: gzip cannot be executed. Compression will "
2010 "not be used.\n");
2011 use_compression = 0;
2012 }
2013 /* 3. Create pipe */
2014 /* Note: Must use mtcp_sys_pipe(), to go to kernel, since
2015 * DMTCP has a wrapper around glibc promoting pipes to socketpairs,
2016 * DMTCP doesn't directly checkpoint/restart pipes.
2017 */
2018 if ( use_compression && mtcp_sys_pipe(pipe_fds) == -1 ) {
2019 mtcp_printf("WARNING: error creating pipe. Compression will "
2020 "not be used.\n");
2021 use_compression = 0;
2022 }
2023 /* 4. Open fd to checkpoint image on disk */
2024 /* Create temp checkpoint file and write magic number to it */
2025 /* This is a callback to DMTCP. DMTCP writes header and returns fd. */
2026 fd = mtcp_safe_open(temp_checkpointfilename,
2027 O_CREAT | O_TRUNC | O_WRONLY, 0600);
2028 if (fd < 0) {
2029 mtcp_printf("mtcp.c: checkpointeverything: error creating %s: %s\n",
2030 temp_checkpointfilename, strerror(mtcp_sys_errno));
2031 mtcp_abort();
2032 }
2033 /* 5. We now have the information to pipe to gzip, or directly to fd
2034 * We do it this way, so that gzip will be direct child of forked process
2035 * when using forked checkpointing.
2036 */
2037
2038 #if 1
2039 /* Temporary fix, until DMTCP uses its own separate allocator.
2040 * The else code should really go lower down, just before we checkpoint
2041 * the heap.
2042 */
2043 #else
2044 if (mtcp_sys_break(0) != mtcp_saved_break)
2045 mtcp_printf("\n\n*** ERROR: End of heap grew."
2046 " Continue at your own risk. ***\n\n\n");
2047 #endif
2048
2049 /* Drain stdin and stdout before checkpoint */
2050 tcdrain(STDOUT_FILENO);
2051 tcdrain(STDERR_FILENO);
2052
2053 if (use_compression) /* if use_compression, fork a gzip process */
2054 fd = open_ckpt_to_write(fd, pipe_fds, gzip_path);
2055
2056 if (tmpDMTCPHeaderFd != -1 ) {
2057 char tmpBuff[1024];
2058 int retval = -1;
2059 lseek(tmpDMTCPHeaderFd, 0, SEEK_SET);
2060
2061 while (retval != 0) {
2062 retval = read (tmpDMTCPHeaderFd, tmpBuff, 1024);
2063 if (retval == -1 && (errno == EAGAIN || errno == EINTR))
2064 continue;
2065 if (retval == -1) {
2066 mtcp_printf("Error writing checkpoint file: %s\n", strerror(errno));
2067 mtcp_abort();
2068 }
2069 writefile(fd, tmpBuff, retval);
2070 }
2071 close(tmpDMTCPHeaderFd);
2072 }
2073
2074 // Preprocess special segments like vsyscall, stack, heap etc.
2075 preprocess_special_segments(&vsyscall_exists);
2076
2077 writefile (fd, MAGIC, MAGIC_LEN);
2078
2079 DPRINTF (("mtcp checkpointeverything*: restore_begin %X at %p from [libmtcp.so]\n",
2080 restore_size, restore_begin));
2081
2082 struct rlimit stack_rlimit;
2083 getrlimit(RLIMIT_STACK, &stack_rlimit);
2084
2085 DPRINTF (("mtcp_restart: saved stack resource limit: soft_lim:%p, hard_lim:%p\n",
2086 stack_rlimit.rlim_cur, stack_rlimit.rlim_max));
2087
2088 writecs (fd, CS_STACKRLIMIT);
2089 writefile (fd, &stack_rlimit, sizeof stack_rlimit);
2090
2091 DPRINTF (("mtcp checkpointeverything*: [libmtcp.so] image of size %X at %p\n",
2092 restore_size, restore_begin));
2093
2094 writecs (fd, CS_RESTOREBEGIN);
2095 writefile (fd, &restore_begin, sizeof restore_begin);
2096 writecs (fd, CS_RESTORESIZE);
2097 writefile (fd, &restore_size, sizeof restore_size);
2098 writecs (fd, CS_RESTORESTART);
2099 writefile (fd, &restore_start, sizeof restore_start);
2100 writecs (fd, CS_RESTOREIMAGE);
2101 writefile (fd, (void *)restore_begin, restore_size);
2102 writecs (fd, CS_FINISHRESTORE);
2103 writefile (fd, &frpointer, sizeof frpointer);
2104
2105 /* Write out file descriptors */
2106
2107 writefiledescrs (fd);
2108
2109 /* Finally comes the memory contents */
2110
2111 /**************************************************************************/
2112 /* We can't do any more mallocing at this point because malloc stuff is */
2113 /* outside the limits of the libmtcp.so image, so it won't get */
2114 /* checkpointed, and it's possible that we would checkpoint an */
2115 /* inconsistent state. See note in restoreverything routine. */
2116 /**************************************************************************/
2117
2118 mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
2119
2120 while (readmapsline (mapsfd, &area)) {
2121 area_begin = (VA)area.addr;
2122 area_end = area_begin + area.size;
2123
2124 /* Original comment: Skip anything in kernel address space ---
2125 * beats me what's at FFFFE000..FFFFFFFF - we can't even read it;
2126 * Added: That's the vdso section for earlier Linux 2.6 kernels. For later
2127 * 2.6 kernels, vdso occurs at an earlier address. If it's unreadable,
2128 * then we simply won't copy it. But let's try to read all areas, anyway.
2129 * **COMMENTED OUT:** if (area_begin >= HIGHEST_VA) continue;
2130 */
2131 /* If it's readable, but it's VDSO, it will be dangerous to restore it.
2132 * In 32-bit mode later Red Hat RHEL Linux 2.6.9 releases use 0xffffe000,
2133 * the last page of virtual memory. Note 0xffffe000 >= HIGHEST_VA
2134 * implies we're in 32-bit mode.
2135 */
2136 if (area_begin >= HIGHEST_VA && area_begin == 0xffffe000) continue;
2137 #ifdef __x86_64__
2138 /* And in 64-bit mode later Red Hat RHEL Linux 2.6.9 releases
2139 * use 0xffffffffff600000 for VDSO.
2140 */
2141 if (area_begin >= HIGHEST_VA && area_begin == 0xffffffffff600000) continue;
2142 #endif
2143
2144 /* Skip anything that has no read or execute permission. This occurs
2145 * on one page in a Linux 2.6.9 installation. No idea why. This code
2146 * would also take care of kernel sections since we don't have read/execute
2147 * permission there.
2148 */
2149
2150 if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE))) continue;
2151
2152 // If the process has an area labelled as "/dev/zero (deleted)", we mark
2153 // the area as Anonymous and save the contents to the ckpt image file.
2154 // IF this area has a MAP_SHARED attribute, it should be replaced with
2155 // MAP_PRIVATE and we won't do any harm because, the /dev/zero file is an
2156 // absolute source and sink. Anything written to it will be discarded and
2157 // anything read from it will be all zeros.
2158 // The following call to mmap will create "/dev/zero (deleted)" area
2159 // mmap(addr, size, protection, MAP_SHARED | MAP_ANONYMOUS, 0, 0)
2160 //
2161 // The above explanation also applies to "/dev/null (deleted)"
2162
2163 if ( mtcp_strstartswith(area.name, dev_zero_deleted_str) ||
2164 mtcp_strstartswith(area.name, dev_null_deleted_str) ) {
2165 DPRINTF(("mtcp checkpointeverything: saving area \"%s\" as Anonymous\n",
2166 area.name));
2167 area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2168 area.name[0] = '\0';
2169 }
2170
2171 if (mtcp_strstartswith(area.name, sys_v_shmem_file)) {
2172 DPRINTF(("mtcp checkpointeverything: saving area \"%s\" as Anonymous\n",
2173 area.name));
2174 area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2175 area.name[0] = '\0';
2176 }
2177
2178 /* Special Case Handling: nscd is enabled*/
2179 if ( mtcp_strstartswith(area.name, nscd_mmap_str) ||
2180 mtcp_strstartswith(area.name, nscd_mmap_str2) ||
2181 mtcp_strstartswith(area.name, nscd_mmap_str3) ) {
2182 DPRINTF(("mtcp checkpointeverything: NSCD daemon shared memory area present. MTCP will now try to remap\n" \
2183 " this area in read/write mode and then will fill it with zeros so that\n" \
2184 " glibc will automatically ask NSCD daemon for new shared area\n\n"));
2185 area.prot = PROT_READ | PROT_WRITE;
2186 area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2187
2188 if ( munmap(area.addr, area.size) == -1) {
2189 mtcp_printf ("mtcp checkpointeverything: error unmapping NSCD shared area: %s\n",
2190 strerror (mtcp_sys_errno));
2191 mtcp_abort();
2192 }
2193
2194 if ( mmap(area.addr, area.size, area.prot, area.flags, 0, 0)
2195 == MAP_FAILED ){
2196 mtcp_printf ("mtcp checkpointeverything: error remapping NSCD shared area: %s\n",
2197 strerror (mtcp_sys_errno));
2198 mtcp_abort();
2199 }
2200
2201 memset(area.addr, 0, area.size);
2202 }
2203
2204 /* Force the anonymous flag if it's a private writeable section, as the
2205 * data has probably changed from the contents of the original images.
2206 */
2207
2208 /* We also do this for read-only private sections as it's possible
2209 * to modify a page there, too (via mprotect).
2210 */
2211
2212 if ((area.flags & MAP_PRIVATE) /*&& (area.prot & PROT_WRITE)*/) {
2213 area.flags |= MAP_ANONYMOUS;
2214 }
2215
2216 if ( area.flags & MAP_SHARED ) {
2217 /* invalidate shared memory pages so that the next read to it (when we are
2218 * writing them to ckpt file) will cause them to be reloaded from the disk.
2219 */
2220 if ( msync(area.addr, area.size, MS_INVALIDATE) < 0 ){
2221 mtcp_printf ("mtcp sync_shared_memory: error %d Invalidating %X"
2222 " at %p from %s + %X\n", mtcp_sys_errno, area.size,
2223 area.addr, area.name, area.offset);
2224 mtcp_abort();
2225 }
2226 }
2227
2228
2229 /* Skip any mapping for this image - it got saved as CS_RESTOREIMAGE
2230 * at the beginning.
2231 */
2232
2233 if (area_begin < restore_begin) {
2234 if (area_end <= restore_begin) {
2235 writememoryarea (fd, &area, 0, vsyscall_exists); // the whole thing is before the restore image
2236 } else if (area_end <= restore_end) {
2237 area.size = restore_begin - area_begin; // we just have to chop the end part off
2238 writememoryarea (fd, &area, 0, vsyscall_exists);
2239 } else {
2240 area.size = restore_begin - area_begin; // we have to write stuff that comes before restore image
2241 writememoryarea (fd, &area, 0, vsyscall_exists);
2242 area.offset += restore_end - area_begin; // ... and we have to write stuff that comes after restore image
2243 area.size = area_end - restore_end;
2244 area.addr = (void *)restore_end;
2245 writememoryarea (fd, &area, 0, vsyscall_exists);
2246 }
2247 } else if (area_begin < restore_end) {
2248 if (area_end > restore_end) {
2249 area.offset += restore_end - area_begin; // we have to write stuff that comes after restore image
2250 area.size = area_end - restore_end;
2251 area.addr = (void *)restore_end;
2252 writememoryarea (fd, &area, 0, vsyscall_exists);
2253 }
2254 } else {
2255 if ( strstr (area.name, "[stack]") )
2256 stack_was_seen = 1;
2257 writememoryarea (fd, &area, stack_was_seen, vsyscall_exists); // the whole thing comes after the restore image
2258 }
2259 }
2260
2261 close (mapsfd);
2262
2263 /* That's all folks */
2264
2265 writecs (fd, CS_THEEND);
2266 if (close (fd) < 0) {
2267 mtcp_printf ("mtcp checkpointeverything(grandchild):"
2268 " error closing checkpoint file: %s\n", strerror (errno));
2269 mtcp_abort ();
2270 }
2271 if (use_compression) {
2272 /* IF OUT OF DISK SPACE, REPORT IT HERE. */
2273 if ( waitpid(mtcp_ckpt_gzip_child_pid, NULL, 0 ) == -1 )
2274 mtcp_printf ("mtcp checkpointeverything(grandchild): waitpid: %s\n",
2275 strerror (errno));
2276 mtcp_ckpt_gzip_child_pid = -1;
2277 }
2278
2279 /* Maybe it's time to verify the checkpoint.
2280 * If so, exec an mtcp_restore with the temp file (in case temp file is bad,
2281 * we'll still have the last one).
2282 * If the new file is good, mtcp_restore will rename it over the last one.
2283 */
2284
2285 if (verify_total != 0) -- verify_count;
2286
2287 /* Now that temp checkpoint file is complete, rename it over old permanent
2288 * checkpoint file. Uses rename() syscall, which doesn't change i-nodes.
2289 * So, gzip process can continue to write to file even after renaming.
2290 */
2291
2292 else renametempoverperm ();
2293
2294 if (forked_checkpointing)
2295 mtcp_sys_exit (0); /* grandchild exits */
2296
2297 DPRINTF (("mtcp checkpointeverything*: checkpoint complete\n"));
2298 }
2299
2300 /* True if the given FD should be checkpointed */
2301 static int should_ckpt_fd (int fd)
2302 {
2303 if ( callback_ckpt_fd!=NULL )
2304 return (*callback_ckpt_fd)(fd); //delegate to callback
2305 else if (fd > 2)
2306 return 1;
2307 else
2308 {
2309 /* stdin/stdout/stderr */
2310 /* we only want to checkpoint these if they are from a file */
2311 struct stat statbuf;
2312 fstat(fd, &statbuf);
2313 return S_ISREG(statbuf.st_mode);
2314 }
2315 }
2316
2317 /* Write list of open files to the checkpoint file */
2318
2319 static void writefiledescrs (int fd)
2320
2321 {
2322 char dbuf[BUFSIZ], linkbuf[FILENAMESIZE], *p, procfdname[64];
2323 int doff, dsiz, fddir, fdnum, linklen, rc;
2324 off_t offset;
2325 struct linux_dirent *dent;
2326 struct stat lstatbuf, statbuf;
2327
2328 writecs (fd, CS_FILEDESCRS);
2329
2330 /* Open /proc/self/fd directory - it contains a list of files I have open */
2331
2332 fddir = mtcp_sys_open ("/proc/self/fd", O_RDONLY, 0);
2333 if (fddir < 0) {
2334 mtcp_printf ("mtcp writefiledescrs: error opening directory /proc/self/fd: %s\n", strerror (errno));
2335 mtcp_abort ();
2336 }
2337
2338 /* Check each entry */
2339
2340 while (1) {
2341 dsiz = -1;
2342 if (sizeof dent -> d_ino == 4) dsiz = mtcp_sys_getdents (fddir, dbuf, sizeof dbuf);
2343 if (sizeof dent -> d_ino == 8) dsiz = mtcp_sys_getdents64 (fddir, dbuf, sizeof dbuf);
2344 if (dsiz <= 0) break;
2345
2346 for (doff = 0; doff < dsiz; doff += dent -> d_reclen) {
2347 dent = (struct linux_dirent *) (dbuf + doff);
2348
2349 /* The filename should just be a decimal number = the fd it represents.
2350 * Also, skip the entry for the checkpoint and directory files
2351 * as we don't want the restore to know about them.
2352 */
2353
2354 fdnum = strtol (dent -> d_name, &p, 10);
2355 if ((*p == '\0') && (fdnum >= 0) && (fdnum != fd) && (fdnum != fddir)
2356 && (should_ckpt_fd (fdnum) > 0)) {
2357
2358 /* Read the symbolic link so we get the filename that's open on the fd */
2359
2360 sprintf (procfdname, "/proc/self/fd/%d", fdnum);
2361 linklen = readlink (procfdname, linkbuf, sizeof linkbuf - 1);
2362 if ((linklen >= 0) || (errno != ENOENT)) { // probably was the proc/self/fd directory itself
2363 if (linklen < 0) {
2364 mtcp_printf ("mtcp writefiledescrs: error reading %s: %s\n",
2365 procfdname, strerror (errno));
2366 mtcp_abort ();
2367 }
2368 linkbuf[linklen] = '\0';
2369
2370 DPRINTF (("mtcp writefiledescrs*: checkpointing fd %d -> %s\n",
2371 fdnum, linkbuf));
2372
2373 /* Read about the link itself so we know read/write open flags */
2374
2375 rc = lstat (procfdname, &lstatbuf);
2376 if (rc < 0) {
2377 mtcp_printf ("mtcp writefiledescrs: error statting %s -> %s: %s\n",
2378 procfdname, linkbuf, strerror (-rc));
2379 mtcp_abort ();
2380 }
2381
2382 /* Read about the actual file open on the fd */
2383
2384 rc = stat (linkbuf, &statbuf);
2385 if (rc < 0) {
2386 mtcp_printf ("mtcp writefiledescrs: error statting %s -> %s: %s\n",
2387 procfdname, linkbuf, strerror (-rc));
2388 }
2389
2390 /* Write state information to checkpoint file.
2391 * Replace file's permissions with current access flags
2392 * so restore will know how to open it.
2393 */
2394
2395 else {
2396 offset = 0;
2397 if (S_ISREG (statbuf.st_mode))
2398 offset = mtcp_sys_lseek (fdnum, 0, SEEK_CUR);
2399 statbuf.st_mode = (statbuf.st_mode & ~0777)
2400 | (lstatbuf.st_mode & 0777);
2401 writefile (fd, &fdnum, sizeof fdnum);
2402 writefile (fd, &statbuf, sizeof statbuf);
2403 writefile (fd, &offset, sizeof offset);
2404 writefile (fd, &linklen, sizeof linklen);
2405 writefile (fd, linkbuf, linklen);
2406 }
2407 }
2408 }
2409 }
2410 }
2411 if (dsiz < 0) {
2412 mtcp_printf ("mtcp writefiledescrs: error reading /proc/self/fd: %s\n",
2413 strerror (mtcp_sys_errno));
2414 mtcp_abort ();
2415 }
2416
2417 mtcp_sys_close (fddir);
2418
2419 /* Write end-of-fd-list marker to checkpoint file */
2420
2421 fdnum = -1;
2422 writefile (fd, &fdnum, sizeof fdnum);
2423 }
2424
2425 static void writememoryarea (int fd, Area *area, int stack_was_seen,
2426 int vsyscall_exists)
2427
2428 { static void * orig_stack = NULL;
2429
2430 /* Write corresponding descriptor to the file */
2431
2432 if (orig_stack == NULL && 0 == strcmp(area -> name, "[stack]"))
2433 orig_stack = area -> addr + area -> size;
2434
2435 if (0 == strcmp(area -> name, "[vdso]") && !stack_was_seen)
2436 DPRINTF (("mtcp checkpointeverything*: skipping over [vdso] section"
2437 " %p at %p\n", area -> size, area -> addr));
2438 else if (0 == strcmp(area -> name, "[vsyscall]") && !stack_was_seen)
2439 DPRINTF (("mtcp checkpointeverything*: skipping over [vsyscall] section"
2440 " %p at %p\n", area -> size, area -> addr));
2441 else if (0 == strcmp(area -> name, "[stack]") &&
2442 orig_stack != area -> addr + area -> size)
2443 /* Kernel won't let us munmap this. But we don't need to restore it. */
2444 DPRINTF (("mtcp checkpointeverything*: skipping over [stack] segment"
2445 " %X at %pi (not the orig stack)\n", area -> size, area -> addr));
2446 else if (!(area -> flags & MAP_ANONYMOUS))
2447 DPRINTF (("mtcp checkpointeverything*: save %p at %p from %s + %X\n",
2448 area -> size, area -> addr, area -> name, area -> offset));
2449 else if (area -> name[0] == '\0')
2450 DPRINTF (("mtcp checkpointeverything*: save anonymous %p at %p\n",
2451 area -> size, area -> addr));
2452 else DPRINTF (("mtcp checkpointeverything*: save anonymous %p at %p"
2453 " from %s + %X\n",
2454 area -> size, area -> addr, area -> name, area -> offset));
2455
2456 if ((area -> name[0]) == '\0') {
2457 void *brk = mtcp_sys_brk(NULL);
2458 if (brk > area -> addr && brk <= area -> addr + area -> size)
2459 mtcp_sys_strcpy(area -> name, "[heap]");
2460 }
2461
2462 if ( 0 != strcmp(area -> name, "[vsyscall]")
2463 && ( (0 != strcmp(area -> name, "[vdso]")
2464 || vsyscall_exists /* which implies vdso can be overwritten */
2465 || !stack_was_seen ))) /* If vdso appeared before stack, it can be replaced */
2466 {
2467 writecs (fd, CS_AREADESCRIP);
2468 writefile (fd, area, sizeof *area);
2469
2470 /* Anonymous sections need to have their data copied to the file,
2471 * as there is no file that contains their data
2472 * We also save shared files to checkpoint file to handle shared memory
2473 * implemented with backing files
2474 */
2475 if (area -> flags & MAP_ANONYMOUS || area -> flags & MAP_SHARED) {
2476 writecs (fd, CS_AREACONTENTS);
2477 writefile (fd, area -> addr, area -> size);
2478 }
2479 }
2480 }
2481
/* Emit a one-byte checkpoint section code (CS_...) to the checkpoint file */

static void writecs (int fd, char cs)
{
  const char section_code = cs;

  writefile (fd, &section_code, sizeof section_code);
}
2489
2490 /* Write something to checkpoint file */
2491
2492 static char zeroes[MTCP_PAGE_SIZE] = { 0 };
2493 static void writefile (int fd, void const *buff, size_t size)
2494
2495 {
2496 char const *bf;
2497 ssize_t rc;
2498 size_t sz, wt;
2499
2500 checkpointsize += size;
2501
2502 bf = buff;
2503 sz = size;
2504 while (sz > 0) {
2505 for (wt = sz; wt > 0; wt /= 2) {
2506 rc = write (fd, bf, wt);
2507 if ((rc >= 0) || (errno != EFAULT)) break;
2508 }
2509
2510 /* Sometimes image page alignment will leave a hole in the middle of an image */
2511 /* ... but the idiot proc/self/maps will include it anyway */
2512
2513 if (wt == 0) {
2514 rc = (sz > sizeof zeroes ? sizeof zeroes : sz);
2515 checkpointsize -= rc; /* Correct now, since writefile will add rc back */
2516 writefile (fd, zeroes, rc);
2517 }
2518
2519 /* Otherwise, check for real error */
2520
2521 else {
2522 if (rc == 0) errno = EPIPE;
2523 if (rc <= 0) {
2524 mtcp_printf ("mtcp writefile: error writing from %p to %s: %s\n",
2525 bf, temp_checkpointfilename, strerror (errno));
2526 mtcp_abort ();
2527 }
2528 }
2529
2530 /* It's ok, we're on to next part */
2531
2532 sz -= rc;
2533 bf += rc;
2534 }
2535 }
2536
2537 static void preprocess_special_segments(int *vsyscall_exists)
2538 {
2539 Area area;
2540 int mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
2541 if (mapsfd < 0) {
2542 mtcp_printf ("mtcp checkpointeverything: error opening"
2543 " /proc/self/maps: %s\n", strerror (mtcp_sys_errno));
2544 mtcp_abort ();
2545 }
2546
2547 while (readmapsline (mapsfd, &area)) {
2548 if (0 == strcmp(area.name, "[vsyscall]")) {
2549 /* Determine if [vsyscall] exists. If [vdso] and [vsyscall] exist,
2550 * [vdso] will be saved and restored.
2551 * NOTE: [vdso] is relocated if /proc/sys/kernel/randomize_va_space == 2.
2552 * We must restore old [vdso] and also keep [vdso] in that case.
2553 * On Linux 2.6.25, 32-bit Linux has: [heap], /lib/ld-2.7.so, [vdso], libs, [stack].
2554 * On Linux 2.6.25, 64-bit Linux has: [stack], [vdso], [vsyscall].
2555 * and at least for gcl, [stack], libmtcp.so, [vsyscall] seen.
2556 * If 32-bit process in 64-bit Linux: [stack] (0xffffd000), [vdso] (0xffffe0000)
2557 * On 32-bit Linux, mtcp_restart has [vdso], /lib/ld-2.7.so, [stack]
2558 * Need to restore old [vdso] into mtcp_restart, to restart.
2559 * With randomize_va_space turned off, libraries start at high address
2560 * 0xb8000000 and are loaded progressively at lower addresses.
2561 * mtcp_restart loads vdso (which looks like a shared library) first.
2562 * But libpthread/libdl/libc libraries are loaded above vdso in user image.
2563 * So, we must use the opposite of the user's setting (no randomization if
2564 * user turned it on, and vice versa). We must also keep the
2565 * new vdso segment, provided by mtcp_restart.
2566 */
2567 *vsyscall_exists = 1;
2568 } else if (!saved_heap_start && strcmp(area.name, "[heap]") == 0) {
2569 // Record start of heap which will later be used in finishrestore()
2570 saved_heap_start = area.addr;
2571 } else if (strcmp(area.name, "[stack]") == 0) {
2572 /*
2573 * When using Matlab with dmtcp_checkpoint, sometimes the bottom most
2574 * page of stack (the page with highest address) which contains the
2575 * environment strings and the argv[] was not shown in /proc/self/maps.
2576 * This happens on some odd combination of environment passed on to
2577 * Matlab process. As a result, the page was not checkpointed and hence
2578 * the process segfaulted on restart. The fix is to try to mprotect this
2579 * page with RWX permission to make the page visible again. This call
2580 * will fail if no stack page was invisible to begin with.
2581 */
2582 int ret = mprotect(area.addr + area.size, 0x1000,
2583 PROT_READ | PROT_WRITE | PROT_EXEC);
2584 if (ret == 0) {
2585 mtcp_printf("mtcp checkpointeverything: bottom-most page of stack\n"
2586 "(page with highest address) was invisible in /proc/self/maps.\n"
2587 "It is made visible again now.\n");
2588 }
2589 }
2590 }
2591 close(mapsfd);
2592 }
2593
2594 /********************************************************************************************************************************/
2595 /* */
2596 /* This signal handler is forced by the main thread doing a 'mtcp_sys_kernel_tkill' to stop these threads so it can do a */
2597 /* checkpoint */
2598 /* */
2599 /********************************************************************************************************************************/
/* growstack() grows the stack by roughly kbStack*1024 bytes so that a large
 * stack is allocated on restart.  The kernel won't do it automatically for us
 * any more, since it thinks the stack is in a different place after restart.
 *
 * growstackValue is volatile so the compiler doesn't optimize growstack away;
 * maybe it's not needed given __attribute__ ((optimize(0))).
 */
static volatile unsigned int growstackValue = 0;
__attribute__ ((optimize(0))) static void growstack (int kbStack);

/* Recurses with a 1024 KB scratch array in every frame until kbStack (the
 * number of kilobytes still requested) is no longer positive; the innermost
 * call increments growstackValue.
 */
static void growstack (int kbStack) {
  const int kbPerFrame = 1024;
  char scratch[kbPerFrame * 1024] __attribute__ ((unused));
  volatile int anchor __attribute__ ((unused)) = 1; /* again, try to prevent compiler optimization */

  if (kbStack <= 0) {
    growstackValue++;
    return;
  }
  growstack(kbStack - kbPerFrame);
}
2618
/* Handler run by a thread that is being stopped for a checkpoint.
 *
 * The thread saves its signal state, TLS state and register context, marks
 * itself ST_SUSPENDED, wakes anybody waiting on its state and then sleeps on
 * its state futex until the state changes.  The code following getcontext()
 * is written to be reached in two situations, told apart by restoreinprog:
 *   - restoreinprog == 0: original process, checkpoint being taken;
 *   - otherwise:          restart path (see the "else" branch below).
 *
 * signum - the delivered signal number; it is not referenced in the body.
 */
static void stopthisthread (int signum)

{
  int rc;
  Thread *thread;
/* NOTE(review): these macros are defined inside the function body but, being
 * preprocessor macros, remain visible for the rest of the file.  STDERR_FD and
 * LOG_FD are both 826 -- presumably a descriptor reserved by the surrounding
 * runtime; confirm.
 */
#define BT_SIZE 1024
#define STDERR_FD 826
#define LOG_FD 826

#ifdef PTRACE
  ptrace_unlock_inferiors();
  ptrace_remove_notexisted();
  ptrace_detach_checkpoint_threads ();
  ptrace_detach_user_threads ();
#endif

  DPRINTF (("mtcp stopthisthread*: tid %d returns to %p\n",
            mtcp_sys_kernel_gettid (), __builtin_return_address (0)));

  thread = getcurrenthread (); // see which thread this is

  // If this is checkpoint thread - exit immediately
  if ( mtcp_state_value(&thread -> state) == ST_CKPNTHREAD ) {
    return ;
  }

  /* Disabled debugging aid: the "0 &&" makes this branch unreachable.
   * NOTE(review): <execinfo.h> is included inside the function body here.
   */
  if (0 && thread == motherofall) {
#include <execinfo.h>
    void *buffer[BT_SIZE];
    int nptrs;

    DPRINTF (( "printing stacktrace of the motherofall Thread\n\n" ));
    nptrs = backtrace (buffer, BT_SIZE);
    backtrace_symbols_fd ( buffer, nptrs, STDERR_FD );
    backtrace_symbols_fd ( buffer, nptrs, LOG_FD );
  }
  if (mtcp_state_set (&(thread -> state), ST_SUSPINPROG, ST_SIGENABLED)) { // make sure we don't get called twice for same thread
    static int is_first_checkpoint = 1;

    save_sig_state (thread); // save signal state (and block signal delivery)
    save_tls_state (thread); // save thread local storage state

    /* Grow stack only on first ckpt. Kernel agrees this is main stack and
     * will mmap it. On second ckpt and later, we would segfault if we tried
     * to grow the former stack beyond the portion that is already mmap'ed.
     *
     * NOTE(review): the warning below reports "__LINE__-9" as the line to
     * edit.  That offset depends on the exact number of source lines between
     * the kbStack declaration and the mtcp_printf call, so do not insert or
     * remove lines in between without adjusting it.
     */
    if (thread == motherofall) {
      static char *orig_stack_ptr;
      int kbStack = 2048;
      if (is_first_checkpoint) {
        orig_stack_ptr = (char *)&kbStack;
        is_first_checkpoint = 0;
        DPRINTF(("mtcp_stopthisthread: temp. grow main stack by %d kilobytes\n",
                 kbStack));
        growstack(kbStack);
      } else if (orig_stack_ptr - (char *)&kbStack > 3 * kbStack*1024 / 4) {
        mtcp_printf("WARNING: Stack within %d bytes of end;\n"
                    " Consider increasing 'kbStack' at line %d of mtcp/%s\n",
                    kbStack*1024/4, __LINE__-9, __FILE__);
      }
    }

    ///JA: new code ported from v54b
    /* Capture this thread's register context into thread->savctx. */
    rc = getcontext (&(thread -> savctx));
    if (rc < 0) {
      mtcp_printf ("mtcp stopthisthread: getcontext rc %d errno %d\n",
                   rc, errno);
      mtcp_abort ();
    }
    DPRINTF (("mtcp stopthisthread*: after getcontext\n"));
    if (mtcp_state_value(&restoreinprog) == 0) {

      /* We are the original process and all context is saved
       * restoreinprog is 0 ; wait for ckpt thread to write ckpt, and resume.
       */

      WMB; // matched by RMB in checkpointhread

      /* Next comes the first time we use the old stack. */
      /* Tell the checkpoint thread that we're all saved away */
      if (!mtcp_state_set (&(thread -> state), ST_SUSPENDED, ST_SUSPINPROG))
        mtcp_abort (); // tell checkpointhread all our context is saved
      mtcp_state_futex (&(thread -> state), FUTEX_WAKE, 1, NULL); // wake checkpoint thread if it's waiting for me

      /* Then we wait for the checkpoint thread to write the checkpoint file then wake us up */

      DPRINTF (("mtcp stopthisthread*: thread %d suspending\n", thread -> tid));
      /* Loop because a futex wait may return while the state is still ST_SUSPENDED. */
      while (mtcp_state_value(&thread -> state) == ST_SUSPENDED) {
        mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SUSPENDED, NULL);
      }

#ifdef PTRACE
      DPRINTF (("mtcp stopthisthread*: thread %d after suspending before deleting files\n", thread -> tid));
      delete_file(0, delete_ptrace_leader, has_ptrace_file);
      delete_file(1, delete_setoptions_leader, has_setoptions_file);
      delete_file(2, delete_checkpoint_leader, has_checkpoint_file);
      ptrace_attach_threads(0);
#endif

      /* Maybe there is to be a checkpoint verification.  If so, and we're the main  */
      /* thread, exec the restore program.  If so and we're not the main thread, exit. */

      if ((verify_total != 0) && (verify_count == 0)) {

        /* If not the main thread, exit.  Either normal exit() or _exit()
         * seems to cause other threads to exit.
         */

        if (thread != motherofall) {
          mtcp_sys_exit(0);
        }

        /* This is the main thread, verify checkpoint then restart by doing
         * a restart.
         * The restore will rename the file after it has done the restart.
         */

        DPRINTF (("mtcp checkpointeverything*: verifying checkpoint...\n"));
        /* execlp only returns on failure, in which case we report and abort. */
        execlp ("mtcp_restart", "mtcp_restart", "--verify", temp_checkpointfilename, NULL);
        mtcp_printf ("mtcp checkpointeverything: error execing mtcp_restart %s: %s\n", temp_checkpointfilename, strerror (errno));
        mtcp_abort ();
      }

      /* No verification, resume where we left off */

      DPRINTF (("mtcp stopthisthread*: thread %d resuming\n", thread -> tid));
    }

    /* Else restoreinprog >= 1;  This stuff executes to do a restart */

    else {
      if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_SUSPENDED))
        mtcp_abort (); // checkpoint was written when thread in SUSPENDED state
      /* Rendezvous with the other restored threads before continuing. */
      wait_for_all_restored ();
      DPRINTF (("mtcp stopthisthread*: thread %d restored\n", thread -> tid));

      if (thread == motherofall) {

        /* If we're a restore verification, rename the temp file
         * over the permanent one
         */

        if (mtcp_restore_verify) renametempoverperm ();
      }

#ifdef PTRACE
      ptrace_attach_threads(1);
#endif
    }
  }
  DPRINTF (("mtcp stopthisthread*: tid %d returning to %p\n",
            mtcp_sys_kernel_gettid (), __builtin_return_address (0)));
#ifdef PTRACE
  ptrace_lock_inferiors();
#endif
}
2775
2776 /********************************************************************************************************************************/
2777 /* */
2778 /* Wait for all threads to finish restoring their context, then release them all to continue on their way. */
2779 /* */
2780 /* Input: */
2781 /* */
/*    restoreinprog = number of threads, including this one, that have not called 'wait_for_all_restored' yet                   */
2783 /* thread list locked */
2784 /* */
2785 /* Output: */
2786 /* */
2787 /* restoreinprog = decremented */
2788 /* if now zero, all threads woken and thread list unlocked */
2789 /* */
2790 /********************************************************************************************************************************/
2791
2792 static void wait_for_all_restored (void)
2793
2794 {
2795 int rip;
2796
2797 do rip = mtcp_state_value(&restoreinprog); // dec number of threads cloned but not completed longjmp'ing
2798 while (!mtcp_state_set (&restoreinprog, rip - 1, rip));
2799 if (-- rip == 0) {
2800
2801 /* raise the signals which were pending for the entire process at the time
2802 * of checkpoint. It is assumed that if a signal is pending for all threads
2803 * including the ckpt-thread, then it was sent to the process as opposed to
2804 * sent to individual threads.
2805 */
2806 int i;
2807 for (i = NSIG; i > 0; --i) {
2808 if (sigismember(&sigpending_global, i) == 1) {
2809 kill(getpid(), i);
2810 }
2811 }
2812
2813 if (callback_restore_virtual_pid_table != NULL) {
2814 DPRINTF(("Before callback_restore_virtual_pid_table: Thread:%d \n",
2815 mtcp_sys_kernel_gettid()));
2816 (*callback_restore_virtual_pid_table)();
2817 DPRINTF(("After callback_restore_virtual_pid_table: Thread:%d \n",
2818 mtcp_sys_kernel_gettid()));
2819 }
2820
2821 mtcp_state_futex (&restoreinprog, FUTEX_WAKE, 999999999, NULL); // if this was last of all, wake everyone up
2822
2823 // NOTE: This is last safe moment for hook. All previous threads
2824 // have executed the "else" and are waiting on the futex.
2825 // This last thread has not yet unlocked the threads: unlk_threads()
2826 // So, no race condition occurs.
2827 // By comparison, *callback_post_ckpt() is called before creating
2828 // additional user threads. Only motherofall (checkpoint thread existed)
2829 /* call weak symbol of this file, possibly overridden by the user's strong symbol */
2830 /* user must compile his/her code with -Wl,-export-dynamic to make it visible */
2831 mtcpHookRestart();
2832 unlk_threads (); // ... and release the thread list
2833 } else {
2834 while ((rip = mtcp_state_value(&restoreinprog)) > 0) { // otherwise, wait for last of all to wake this one up
2835 mtcp_state_futex (&restoreinprog, FUTEX_WAIT, rip, NULL);
2836 }
2837 }
2838 }
2839
2840 /********************************************************************************************************************************/
2841 /* */
2842 /* Save signal mask and list of pending signals delivery */
2843 /* */
2844 /********************************************************************************************************************************/
2845
2846 static void save_sig_state (Thread *thisthread)
2847 {
2848 /* For checkpoint thread, we want to block delivery of all but some special signals*/
2849 if (thisthread == ckpthread) {
2850 /*
2851 * For the checkpoint thread, we should not block SIGSETXID which is used
2852 * by the setsid family of system calls to change the session leader. Glibc
2853 * uses this signal to notify the process threads of the change in session
2854 * leader information. This signal is not documented and is used internally
2855 * by glibc. It is defined in <glibc-src-root>/nptl/pthreadP.h
2856 * screen was getting affected by this since it used setsid to change the
2857 * session leaders.
2858 */
2859 #define SIGSETXID (__SIGRTMIN + 1)
2860 sigset_t set;
2861
2862 sigfillset(&set);
2863 sigdelset(&set, SIGSETXID);
2864
2865 if (pthread_sigmask(SIG_SETMASK, &set, NULL) < 0) {
2866 mtcp_printf("mtcp %s: error getting sigal mask: %s\n",
2867 __FUNCTION__, strerror(errno));
2868 mtcp_abort ();
2869 }
2870 }
2871 // Save signal block mask
2872 if (pthread_sigmask (SIG_SETMASK, NULL, &(thisthread -> sigblockmask)) < 0) {
2873 mtcp_printf("mtcp %s: error getting sigal mask: %s\n",
2874 __FUNCTION__, strerror(errno));
2875 mtcp_abort ();
2876 }
2877
2878 // Save pending signals
2879 sigpending ( &(thisthread->sigpending) );
2880 }
2881
2882 /********************************************************************************************************************************/
2883 /* */
2884 /* Restore signal mask and all pending signals */
2885 /* */
2886 /********************************************************************************************************************************/
2887
2888 static void restore_sig_state (Thread *thisthread)
2889 {
2890 int i;
2891 DPRINTF (("mtcp restore_sig_state*: restoring handlers for thread %d\n",
2892 thisthread->original_tid));
2893 if (pthread_sigmask (SIG_SETMASK, &(thisthread -> sigblockmask), NULL) < 0) {
2894 mtcp_printf("mtcp %s: error setting sigal mask: %s\n",
2895 __FUNCTION__, strerror(errno));
2896 mtcp_abort ();
2897 }
2898
2899 // Raise the signals which were pending for only this thread at the time of checkpoint.
2900 for (i = NSIG; i > 0; --i) {
2901 if (sigismember(&(thisthread -> sigpending), i) == 1 &&
2902 sigismember(&(thisthread -> sigblockmask), i) == 1 &&
2903 sigismember(&(sigpending_global), i) == 0) {
2904 raise(i);
2905 }
2906 }
2907 }
2908
2909 /********************************************************************************************************************************/
2910 /* */
2911 /* Save all signal handlers */
2912 /* */
2913 /********************************************************************************************************************************/
2914 static void save_sig_handlers (void)
2915 {
2916 int i;
2917
2918 if (dmtcp_exists) {
2919 mtcp_printf("mtcp:%s Illegal function call when running under DMTCP*****\n",
2920 __FUNCTION__);
2921 // Do a simple return instead of killing the process
2922 return;
2923 //mtcp_abort();
2924 }
2925
2926 /* Now save all the signal handlers */
2927 DPRINTF (("mtcp save_sig_handlers*: saving signal handlers\n"));
2928 for (i = NSIG; i > 0; --i) {
2929 if (_real_sigaction (i, NULL, &sigactions[i]) < 0) {
2930 if (errno == EINVAL)
2931 memset (&sigactions[i], 0, sizeof sigactions[i]);
2932 else {
2933 mtcp_printf ("mtcp save_sig_handlers: error saving signal %d action: %s\n",
2934 i, strerror(errno));
2935 mtcp_abort ();
2936 }
2937 }
2938
2939 DPRINTF (("mtcp save_sig_handlers*: saving signal handler for %d -> %p\n",
2940 i,
2941 (sigactions[i].sa_flags & SA_SIGINFO ?
2942 (void *)(sigactions[i].sa_sigaction) :
2943 (void *)(sigactions[i].sa_handler)) ));
2944 }
2945 }
2946
2947 /********************************************************************************************************************************/
2948 /* */
2949 /* Restore all saved signal handlers */
2950 /* */
2951 /********************************************************************************************************************************/
2952 static void restore_sig_handlers (Thread *thisthread)
2953 {
2954 int i;
2955
2956 if (dmtcp_exists) {
2957 mtcp_printf("mtcp:%s Illegal function when running under DMTCP*****\n",
2958 __FUNCTION__);
2959 // Do a simple return instead of killing the process
2960 return;
2961 //mtcp_abort();
2962 }
2963
2964 DPRINTF (("mtcp restore_sig_handlers*: restoring signal handlers\n"));
2965 #if 0
2966 # define VERBOSE_DEBUG
2967 #endif
2968 for(i = NSIG; i > 0; --i) {
2969 #ifdef VERBOSE_DEBUG
2970 DPRINTF (("mtcp restore_sig_handlers*: restore signal handler for %d -> %p\n",
2971 i,
2972 (sigactions[i].sa_flags & SA_SIGINFO ?
2973 sigactions[i].sa_sigaction :
2974 sigactions[i].sa_handler) ));
2975 #endif
2976
2977 if (_real_sigaction(i, &sigactions[i], NULL) < 0) {
2978 if (errno != EINVAL) {
2979 mtcp_printf ("mtcp restore_sig_handlers:" \
2980 " error restoring signal %d handler: %s\n",
2981 i, strerror(errno));
2982 mtcp_abort ();
2983 }
2984 }
2985 }
2986 }
2987
2988 /********************************************************************************************************************************/
2989 /* */
2990 /* Save state necessary for TLS restore */
2991 /* Linux saves stuff in the GDT, switching it on a per-thread basis */
2992 /* */
2993 /********************************************************************************************************************************/
2994
2995 static void save_tls_state (Thread *thisthread)
2996
2997 {
2998 int i, rc;
2999
3000 #ifdef __i386__
3001 asm volatile ("movw %%fs,%0" : "=m" (thisthread -> fs));
3002 asm volatile ("movw %%gs,%0" : "=m" (thisthread -> gs));
3003 #endif
3004 #ifdef __x86_64__
3005 //asm volatile ("movl %%fs,%0" : "=m" (thisthread -> fs));
3006 //asm volatile ("movl %%gs,%0" : "=m" (thisthread -> gs));
3007 #endif
3008
3009 memset (thisthread -> gdtentrytls, 0, sizeof thisthread -> gdtentrytls);
3010
3011 /* On older Linuxes, we must save several GDT entries available to threads. */
3012
3013 #if MTCP__SAVE_MANY_GDT_ENTRIES
3014 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i ++) {
3015 thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN].entry_number = i;
3016 rc = mtcp_sys_get_thread_area (&(thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN]));
3017 if (rc < 0) {
3018 mtcp_printf ("mtcp checkpointeverything: error saving GDT TLS entry[%d]: %s\n", i, strerror (mtcp_sys_errno));
3019 mtcp_abort ();
3020 }
3021 }
3022
3023 /* With newer Linuxes, we just save the one GDT entry indexed by GS so we don't need the GDT_ENTRY_TLS_... definitions. */
3024 /* We get the particular index of the GDT entry to save by reading GS. */
3025
3026 #else
3027 i = thisthread -> TLSSEGREG / 8;
3028 thisthread -> gdtentrytls[0].entry_number = i;
3029 rc = mtcp_sys_get_thread_area (&(thisthread -> gdtentrytls[0]));
3030 if (rc < 0) {
3031 mtcp_printf ("mtcp checkpointeverything: error saving GDT TLS entry[%d]: %s\n", i, strerror (mtcp_sys_errno));
3032 mtcp_abort ();
3033 }
3034 #endif
3035 }
3036
/* Find the first occurrence of a byte sequence near the start of a buffer.
 *
 * array    - buffer to search; candidate start positions are array[0] through
 *            array[2047], so array must be readable for 2047 + len bytes.
 * subarray - the len bytes to look for.
 * len      - number of bytes in subarray; must be at least sizeof(int),
 *            otherwise mtcp_abort() is called.
 *
 * Returns a pointer to the first match inside array, or NULL if none of the
 * 2048 start positions matches.
 *
 * The length is validated before subarray is read, the comparison against
 * sizeof(int) is done in signed arithmetic so a negative len is rejected, and
 * bytes are compared one at a time so no unaligned int loads are performed.
 */
static char *memsubarray (char *array, char *subarray, int len) {
  char *start;
  int j;

  // Assume subarray length is at least sizeof(int) and < 2048.
  if (len < (int)sizeof(int))
    mtcp_abort();

  for (start = array; start < array + 2048; start++) {
    for (j = 0; j < len; j++)
      if (start[j] != subarray[j])
        break;
    if (j == len)
      return start;
  }
  return NULL;
}
3055 static int mtcp_get_tls_segreg(void)
3056 { mtcp_segreg_t tlssegreg;
3057 #ifdef __i386__
3058 asm volatile ("movw %%gs,%0" : "=g" (tlssegreg)); /* any general register */
3059 #endif
3060 #ifdef __x86_64__
3061 asm volatile ("movl %%fs,%0" : "=q" (tlssegreg)); /* q = a,b,c,d for i386; 8 low bits of r class reg for x86_64 */
3062 #endif
3063 return (int)tlssegreg;
3064 }
3065 static void *mtcp_get_tls_base_addr(void)
3066 {
3067 struct user_desc gdtentrytls;
3068
3069 #if MTCP__SAVE_MANY_GDT_ENTRIES
3070 if (mtcp_get_tls_segreg() / 8 != GDT_ENTRY_TLS_MIN) {
3071 mtcp_printf ("mtcp_init: gs %X not set to first TLS GDT ENTRY %X\n",
3072 gs, GDT_ENTRY_TLS_MIN * 8 + 3);
3073 mtcp_abort ();
3074 }
3075 #endif
3076
3077 gdtentrytls.entry_number = mtcp_get_tls_segreg() / 8;
3078 if ( mtcp_sys_get_thread_area ( &gdtentrytls ) < 0 ) {
3079 mtcp_printf ("mtcp_init: error getting GDT TLS entry: %s\n",
3080 strerror (mtcp_sys_errno));
3081 mtcp_abort ();
3082 }
3083 return (void *)(*(unsigned long *)&(gdtentrytls.base_addr));
3084 }
3085
3086 static void renametempoverperm (void)
3087
3088 {
3089 if (rename (temp_checkpointfilename, perm_checkpointfilename) < 0) {
3090 mtcp_printf ("mtcp checkpointeverything: error renaming %s to %s: %s\n", temp_checkpointfilename, perm_checkpointfilename,
3091 strerror (errno));
3092 mtcp_abort ();
3093 }
3094 }
3095
3096 /********************************************************************************************************************************/
3097 /* */
3098 /* Get current thread struct pointer */
3099 /* It is keyed by the calling thread's gettid value */
3100 /* Maybe improve someday by using TLS */
3101 /* */
3102 /********************************************************************************************************************************/
3103
3104 static Thread *getcurrenthread (void)
3105
3106 {
3107 int tid;
3108 Thread *thread;
3109
3110 tid = mtcp_sys_kernel_gettid ();
3111 lock_threads ();
3112 for (thread = threads; thread != NULL; thread = thread -> next) {
3113 if (thread -> tid == tid) {
3114 unlk_threads ();
3115 return (thread);
3116 }
3117 }
3118 mtcp_printf ("mtcp getcurrenthread: can't find thread id %d\n", tid);
3119 mtcp_abort ();
3120 return thread; /* NOTREACHED : stop compiler warning */
3121 }
3122
3123 /********************************************************************************************************************************/
3124 /* */
3125 /* Lock and unlock the 'threads' list */
3126 /* */
3127 /********************************************************************************************************************************/
3128
3129 static void lock_threads (void)
3130
3131 {
3132 while (!mtcp_state_set (&threadslocked, 1, 0)) {
3133 mtcp_state_futex (&threadslocked, FUTEX_WAIT, 1, NULL);
3134 }
3135 RMB; // don't prefetch anything until we have the lock
3136 }
3137
3138 static void unlk_threads (void)
3139
3140 {
3141 WMB; // flush data written before unlocking
3142 // FIXME: Should we be checking return value of mtcp_state_set? Can it ever fail?
3143 mtcp_state_set(&threadslocked , 0, 1);
3144 mtcp_state_futex (&threadslocked, FUTEX_WAKE, 1, NULL);
3145 }
3146
3147 /********************************************************************************************************************************/
3148 /* */
3149 /* Read /proc/self/maps line, converting it to an Area descriptor struct */
3150 /* */
3151 /* Input: */
3152 /* */
3153 /* mapsfd = /proc/self/maps file, positioned to beginning of a line */
3154 /* */
3155 /* Output: */
3156 /* */
3157 /* readmapsline = 0 : was at end-of-file, nothing read */
3158 /* 1 : read and processed one line */
3159 /* *area = filled in */
3160 /* */
3161 /* Note: */
3162 /* */
/*    Line from /proc/self/maps is in form:                                                                                     */
3164 /* */
3165 /* <startaddr>-<endaddrexclusive> rwxs <fileoffset> <devmaj>:<devmin> <inode> <filename>\n */
3166 /* all numbers in hexadecimal except inode is in decimal */
3167 /* anonymous will be shown with offset=devmaj=devmin=inode=0 and no ' filename' */
3168 /* */
3169 /********************************************************************************************************************************/
3170
3171 static int readmapsline (int mapsfd, Area *area)
3172
3173 {
3174 char c, rflag, sflag, wflag, xflag;
3175 int i, rc;
3176 struct stat statbuf;
3177 VA devmajor, devminor, devnum, endaddr, inodenum, startaddr;
3178
3179 c = mtcp_readhex (mapsfd, &startaddr);
3180 if (c != '-') {
3181 if ((c == 0) && (startaddr == 0)) return (0);
3182 goto skipeol;
3183 }
3184 c = mtcp_readhex (mapsfd, &endaddr);
3185 if (c != ' ') goto skipeol;
3186 if (endaddr < startaddr) goto skipeol;
3187
3188 rflag = c = mtcp_readchar (mapsfd);
3189 if ((c != 'r') && (c != '-')) goto skipeol;
3190 wflag = c = mtcp_readchar (mapsfd);
3191 if ((c != 'w') && (c != '-')) goto skipeol;
3192 xflag = c = mtcp_readchar (mapsfd);
3193 if ((c != 'x') && (c != '-')) goto skipeol;
3194 sflag = c = mtcp_readchar (mapsfd);
3195 if ((c != 's') && (c != 'p')) goto skipeol;
3196
3197 c = mtcp_readchar (mapsfd);
3198 if (c != ' ') goto skipeol;
3199
3200 c = mtcp_readhex (mapsfd, &devmajor);
3201 if (c != ' ') goto skipeol;
3202 area -> offset = devmajor;
3203
3204 c = mtcp_readhex (mapsfd, &devmajor);
3205 if (c != ':') goto skipeol;
3206 c = mtcp_readhex (mapsfd, &devminor);
3207 if (c != ' ') goto skipeol;
3208 c = mtcp_readdec (mapsfd, &inodenum);
3209 area -> name[0] = '\0';
3210 while (c == ' ') c = mtcp_readchar (mapsfd);
3211 if (c == '/' || c == '[') { /* absolute pathname, or [stack], [vdso], etc. */
3212 i = 0;
3213 do {
3214 area -> name[i++] = c;
3215 if (i == sizeof area -> name) goto skipeol;
3216 c = mtcp_readchar (mapsfd);
3217 } while (c != '\n');
3218 area -> name[i] = '\0';
3219 }
3220 if (mtcp_strstartswith(area -> name, nscd_mmap_str) ||
3221 mtcp_strstartswith(area -> name, nscd_mmap_str2) ||
3222 mtcp_strstartswith(area -> name, nscd_mmap_str3)) {
3223 /* if nscd is active */
3224 } else if ( mtcp_strstartswith(area -> name, sys_v_shmem_file) ) {
3225 /* System V Shared-Memory segments are handled by DMTCP. */
3226 } else if ( mtcp_strendswith(area -> name, " (deleted)") ) {
3227 /* Deleted File */
3228 } else if (area -> name[0] == '/') { /* if an absolute pathname */
3229 rc = stat (area -> name, &statbuf);
3230 if (rc < 0) {
3231 mtcp_printf ("ERROR: mtcp readmapsline: error %d statting %s\n",
3232 -rc, area -> name);
3233 return (1); /* 0 would mean last line of maps; could do mtcp_abort() */
3234 }
3235 devnum = makedev (devmajor, devminor);
3236 if ((devnum != statbuf.st_dev) || (inodenum != statbuf.st_ino)) {
3237 mtcp_printf ("ERROR: mtcp readmapsline: image %s dev:inode %X:%u"
3238 " not eq maps %X:%u\n",
3239 area -> name, statbuf.st_dev, statbuf.st_ino,
3240 devnum, inodenum);
3241 return (1); /* 0 would mean last line of maps; could do mtcp_abort() */
3242 }
3243 } else {
3244 /* Special area like [heap] or anonymous area. */
3245 }
3246
3247 if (c != '\n') goto skipeol;
3248
3249 area -> addr = (void *)startaddr;
3250 area -> size = endaddr - startaddr;
3251 area -> prot = 0;
3252 if (rflag == 'r') area -> prot |= PROT_READ;
3253 if (wflag == 'w') area -> prot |= PROT_WRITE;
3254 if (xflag == 'x') area -> prot |= PROT_EXEC;
3255 area -> flags = MAP_FIXED;
3256 if (sflag == 's') area -> flags |= MAP_SHARED;
3257 if (sflag == 'p') area -> flags |= MAP_PRIVATE;
3258 if (area -> name[0] == '\0') area -> flags |= MAP_ANONYMOUS;
3259
3260 return (1);
3261
3262 skipeol:
3263 DPRINTF (("ERROR: mtcp readmapsline*: bad maps line <%c", c));
3264 while ((c != '\n') && (c != '\0')) {
3265 c = mtcp_readchar (mapsfd);
3266 mtcp_printf ("%c", c);
3267 }
3268 mtcp_printf (">\n");
3269 mtcp_abort ();
3270 return (0); /* NOTREACHED : stop compiler warning */
3271 }
3272
3273 /********************************************************************************************************************************/
3274 /* */
3275 /* Do restore from checkpoint file */
3276 /* This routine is called from the mtcp_restore program to perform the restore */
3277 /* It resides in the libmtcp.so image in exactly the same spot that the checkpointed process had its libmtcp.so loaded at, so this */
3278 /* can't possibly interfere with restoring the checkpointed process */
3279 /* The restore can't use malloc because that might create memory sections. */
3280 /* Strerror seems to mess up with its Locale stuff in here too. */
3281 /* */
3282 /* Input: */
3283 /* */
3284 /* fd = checkpoint file, positioned just after the CS_RESTOREIMAGE data */
3285 /* */
3286 /********************************************************************************************************************************/
3287
/* STRINGS is only referenced by the non-x86_64 code in mtcp_restore_start(),
 * so it is marked unused on x86_64 to keep the compiler quiet.
 */
#ifdef __x86_64__
# define UNUSED_IN_64_BIT __attribute__ ((unused))
#else
# define UNUSED_IN_64_BIT
#endif

/* Static storage that receives copies of cmd_file, argv[] and envp[] strings. */
#define STRINGS_LEN 10000
static char UNUSED_IN_64_BIT STRINGS[STRINGS_LEN];

/* Entry point of the restore: stash the arguments in static storage, switch
 * to a private stack and call mtcp_restoreverything(), which should not
 * return.
 *
 * fd             - checkpoint file descriptor; saved in mtcp_restore_cpfd.
 * verify         - saved in mtcp_restore_verify.
 * gzip_child_pid - saved in mtcp_restore_gzip_child_pid.
 * ckpt_newname   - NUL-terminated string copied into mtcp_ckpt_newname.
 * cmd_file, argv, envp
 *                - on non-x86_64 builds, copied into STRINGS and recorded in
 *                  mtcp_restore_cmd_file / mtcp_restore_argv /
 *                  mtcp_restore_envp; not referenced on x86_64.
 *
 * NOTE(review): the copy of ckpt_newname is not bounded by the size of
 * mtcp_ckpt_newname, and the argv/envp loops are not bounded by the sizes of
 * mtcp_restore_argv / mtcp_restore_envp -- those declarations are not visible
 * here; confirm the callers cannot exceed them.
 */
void mtcp_restore_start (int fd, int verify, pid_t gzip_child_pid,char *ckpt_newname,
                         char *cmd_file, char *argv[], char *envp[] )

{
#ifndef __x86_64__
  int i;
  char *strings = STRINGS;
#endif

  DEBUG_RESTARTING = 1;
  /* If we just replace extendedStack by (tempstack+STACKSIZE) in "asm"
   * below, the optimizer generates non-PIC code if it's not -O0 - Gene
   */
  long long * extendedStack = tempstack + STACKSIZE;

  /* Not used until we do longjmps, but get it out of the way now */

  // FIXME: Should we be checking return value of mtcp_state_set?  Can it ever fail?
  mtcp_state_set(&restoreinprog ,1, 0);

  mtcp_sys_gettimeofday (&restorestarted, NULL);

  /* Save parameter away in a static memory location as we're about to wipe the stack */

  mtcp_restore_cpfd   = fd;
  mtcp_restore_verify = verify;
  mtcp_restore_gzip_child_pid = gzip_child_pid;
  // Copy newname to save it too
  {
    int i;
    for(i=0;ckpt_newname[i];i++){
      mtcp_ckpt_newname[i] = ckpt_newname[i];
    }
    mtcp_ckpt_newname[i] = '\0';
  }


#ifndef __x86_64__
  // Copy command line to libmtcp.so, so that we can re-exec if randomized vdso
  //   steps on us.  This won't be needed when we use the linker to map areas.
  strings = STRINGS;
  // This version of STRCPY copies source string into STRINGS,
  //   and sets destination string to point there.
  // NOTE(review): the space check only guarantees 256 free bytes, while the
  //   copy itself is as long as the source string.
# define STRCPY(x,y) \
  if (strings + 256 < STRINGS + STRINGS_LEN) { \
    mtcp_sys_strcpy(strings,y); \
    x = strings; \
    strings += mtcp_sys_strlen(y) + 1; \
  } else { \
    DPRINTF(("MTCP:  ran out of string space." \
             "  Trying to continue anyway\n")); \
  }
  STRCPY(mtcp_restore_cmd_file, cmd_file);
  for (i = 0; argv[i] != NULL; i++) {
    STRCPY(mtcp_restore_argv[i], argv[i]);
  }
  mtcp_restore_argv[i] = NULL;
  for (i = 0; envp[i] != NULL; i++) {
    STRCPY(mtcp_restore_envp[i], envp[i]);
  }
  mtcp_restore_envp[i] = NULL;
#endif

  /* Switch to a stack area that's part of the shareable's memory address range
   * and thus not used by the checkpointed program
   */

  /* Loads extendedStack into the stack pointer and zeroes the frame pointer
   * (CLEAN_FOR_64_BIT presumably rewrites the register names for 64-bit
   * builds -- confirm against its definition).
   */
  asm volatile (CLEAN_FOR_64_BIT(mov %0,%%esp\n\t)
                /* This next assembly language confuses gdb,
                   but seems to work fine anyway */
                CLEAN_FOR_64_BIT(xor %%ebp,%%ebp\n\t)
                : : "g" (extendedStack) : "memory");

  /* Once we're on the new stack, we can't access any local variables or parameters */
  /* Call the restoreverything to restore files and memory areas                    */

  /* This should never return */
  mtcp_restoreverything();
  asm volatile ("hlt");
}
3376
3377
3378 /********************************************************************************************************************************/
3379 /* */
3380 /* Restore proper heap */
3381 /* */
3382 /********************************************************************************************************************************/
3383 static void restore_heap()
3384 {
3385 /*
3386 * If the original start of heap is lower than the current end of heap, we
3387 * want to mmap the area between mtcp_saved_break and current break. This
3388 * happens when the size of checkpointed program is smaller then the size of
3389 * mtcp_restart program.
3390 */
3391 void* current_break = mtcp_sys_brk (NULL);
3392 if (current_break > mtcp_saved_break) {
3393 DPRINTF(("mtcp finishrestore: Area between mtcp_saved_break:%p and "
3394 "Current_break:%p not mapped, mapping it now\n",
3395 mtcp_saved_break, current_break));
3396 size_t oldsize = mtcp_saved_break - saved_heap_start;
3397 size_t newsize = current_break - saved_heap_start;
3398
3399 void* addr = mremap (saved_heap_start, oldsize, newsize, 0);
3400 if (addr == NULL) {
3401 mtcp_printf("mtcp finishrestore: mremap failed to map area between "
3402 "mtcp_saved_break (%p) and current_break (%p)\n",
3403 mtcp_saved_break, current_break);
3404 mtcp_abort();
3405 }
3406 }
3407 }
3408
3409 /********************************************************************************************************************************/
3410 /* */
3411 /* The original program's memory and files have been restored */
3412 /* */
3413 /********************************************************************************************************************************/
3414
3415 static void finishrestore (void)
3416 {
3417 struct timeval stopped;
3418 int nnamelen;
3419
3420 DPRINTF (("mtcp finishrestore*: mtcp_printf works; libc should work\n"));
3421
3422 restore_heap();
3423
3424 if ( (nnamelen = strlen(mtcp_ckpt_newname))
3425 && strcmp(mtcp_ckpt_newname,perm_checkpointfilename) ) {
3426 // we start from different place - change it!
3427 DPRINTF(("mtcp finishrestore*: checkpoint file name was changed\n"));
3428 if (strlen(mtcp_ckpt_newname) >= MAXPATHLEN) {
3429 mtcp_printf("mtcp finishrestore: new ckpt file name (%s) too long (>=512 bytes)\n",
3430 mtcp_ckpt_newname);
3431 mtcp_abort();
3432 }
3433 strncpy(perm_checkpointfilename,mtcp_ckpt_newname,MAXPATHLEN);
3434 memcpy(temp_checkpointfilename,perm_checkpointfilename,MAXPATHLEN);
3435 strncpy(temp_checkpointfilename + nnamelen, ".temp",MAXPATHLEN - nnamelen);
3436 }
3437
3438 mtcp_sys_gettimeofday (&stopped, NULL);
3439 stopped.tv_usec += (stopped.tv_sec - restorestarted.tv_sec) * 1000000 - restorestarted.tv_usec;
3440 TPRINTF (("mtcp finishrestore*: time %u uS\n", stopped.tv_usec));
3441
3442 /* Now we can access all our files and memory that existed at the time of the checkpoint */
3443 /* We are still on the temporary stack, though */
3444
3445 /* Fill in the new mother process id */
3446 motherpid = mtcp_sys_getpid();
3447
3448 /* Call another routine because our internal stack is whacked and we can't have local vars */
3449
3450 ///JA: v54b port
3451 // so restarthread will have a big stack
3452 asm volatile (CLEAN_FOR_64_BIT(mov %0,%%esp)
3453 : : "g" (motherofall -> savctx.SAVEDSP - 128 ) : "memory"); // -128 for red zone
3454 restarthread (motherofall);
3455 }
3456
3457 static int restarthread (void *threadv)
3458 {
3459 int rip;
3460 Thread *child;
3461 Thread *const thread = threadv;
3462 struct MtcpRestartThreadArg mtcpRestartThreadArg;
3463
3464 restore_tls_state (thread);
3465
3466
3467 if (thread == motherofall) {
3468 // Compute the set of signals which was pending for all the threads at the
3469 // time of checkpoint. This is a heuristic to compute the set of signals
3470 // which were pending for the entire process at the time of checkpoint.
3471 sigset_t tmp;
3472 sigfillset ( &tmp );
3473 Thread *th;
3474 for (th = threads; th != NULL; th = th -> next) {
3475 sigandset ( &sigpending_global, &tmp, &(th->sigpending) );
3476 tmp = sigpending_global;
3477 }
3478
3479 setup_sig_handler ();
3480
3481 set_tid_address (&(thread -> child_tid));
3482
3483 if (callback_post_ckpt != NULL) {
3484 DPRINTF(("mtcp finishrestore*: before callback_post_ckpt(1=restarting)"
3485 " (&%x,%x) \n",
3486 &callback_post_ckpt, callback_post_ckpt));
3487 (*callback_post_ckpt)(1);
3488 DPRINTF(("mtcp finishrestore*: after callback_post_ckpt(1=restarting)\n"));
3489 }
3490 /* Do it once only, in motherofall thread. */
3491
3492 restore_term_settings();
3493
3494 if (dmtcp_info_restore_working_directory
3495 && chdir(saved_working_directory) == -1) {
3496 perror("chdir");
3497 mtcp_abort ();
3498 }
3499
3500 /* DMTCP restores signal handlers. But if we are running standalone,
3501 * MTCP must do it.
3502 * Because signal handlers are per-process, we only do this once.
3503 */
3504 if (!dmtcp_exists)
3505 restore_sig_handlers(thread);
3506 }
3507
3508 restore_sig_state (thread);
3509
3510 for (child = thread -> children; child != NULL; child = child -> siblings) {
3511
3512 /* Increment number of threads created but haven't completed their longjmp */
3513
3514 do rip = mtcp_state_value(&restoreinprog);
3515 while (!mtcp_state_set (&restoreinprog, rip + 1, rip));
3516
3517 /* Create the thread so it can finish restoring itself. */
3518 /* Don't do CLONE_SETTLS (it'll puke). We do it later via restore_tls_state. */
3519
3520 ///JA: v54b port
3521 errno = -1;
3522
3523 void *clone_arg = (void *)child;
3524
3525 /*
3526 * DMTCP needs to know original_tid of the thread being created by the
3527 * following clone() call.
3528 *
3529 * Threads are created by using syscall which is intercepted by DMTCP and
3530 * the original_tid is sent to DMTCP as a field of MtcpRestartThreadArg
3531 * structure. DMTCP will automatically extract the actual argument
3532 * (clone_arg -> arg) from clone_arg and will pass it on to the real
3533 * clone call.
3534 * (--Kapil)
3535 */
3536 mtcpRestartThreadArg.arg = (void *)child;
3537 mtcpRestartThreadArg.original_tid = child -> original_tid;
3538 clone_arg = (void *) &mtcpRestartThreadArg;
3539
3540 /*
3541 * syscall is wrapped by DMTCP when configured with PID-Virtualization.
3542 * It calls __clone which goes to DMTCP:__clone which then calls MTCP:__clone.
3543 * DMTCP:__clone checks for tid-conflict with any original tid. If
3544 * conflict, it replaces the thread with a new one with a new tid.
3545 * DMTCP:__clone wrapper calls the glibc:__clone if the computation is not
3546 * in RUNNING state (must be restarting), it calls the mtcp:__clone otherwise.
3547 * IF No PID-Virtualization, call glibc:__clone because threads created
3548 * during mtcp_restart should not go to MTCP:__clone; MTCP remembers those
3549 * threads from the checkpoint image.
3550 */
3551
3552 /* If running under DMTCP */
3553 pid_t tid;
3554 if (dmtcp_info_pid_virtualization_enabled == 1) {
3555 tid = syscall(SYS_clone, restarthread,
3556 (void *)(child -> savctx.SAVEDSP - 128), // -128 for red zone
3557 (child -> clone_flags & ~CLONE_SETTLS) | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
3558 clone_arg, child -> parent_tidptr, NULL, child -> actual_tidptr);
3559 } else {
3560 tid = ((*clone_entry)( restarthread,
3561 (void *)(child -> savctx.SAVEDSP - 128), // -128 for red zone
3562 (child -> clone_flags & ~CLONE_SETTLS) | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
3563 child, child -> parent_tidptr, NULL, child -> actual_tidptr));
3564 }
3565
3566 if (tid < 0) {
3567 mtcp_printf ("mtcp restarthread: error %d recreating thread\n", errno);
3568 mtcp_printf ("mtcp restarthread: clone_flags %X, savedsp %p\n",
3569 child -> clone_flags, child -> savctx.SAVEDSP);
3570 mtcp_abort ();
3571 }
3572 DPRINTF((" Parent:%d, tid of newly created thread:%d\n\n", thread->tid, tid));
3573 }
3574
3575 /* All my children have been created, jump to the stopthisthread routine just after getcontext call */
3576 /* Note that if this is the restored checkpointhread, it jumps to the checkpointhread routine */
3577
3578 if (mtcp_have_thread_sysinfo_offset())
3579 mtcp_set_thread_sysinfo(saved_sysinfo);
3580 ///JA: v54b port
3581 DPRINTF (("mtcp restarthread*: calling setcontext: thread->tid: %d, original_tid:%d\n",
3582 thread->tid, thread->original_tid));
3583 setcontext (&(thread -> savctx)); /* Shouldn't return */
3584 mtcp_abort ();
3585 return (0); /* NOTREACHED : stop compiler warning */
3586 }
3587
3588 /********************************************************************************************************************************/
3589 /* */
3590 /* Restore the GDT entries that are part of a thread's state */
3591 /* */
3592 /* The kernel provides set_thread_area system call for a thread to alter a particular range of GDT entries, and it switches */
3593 /* those entries on a per-thread basis. So from our perspective, this is per-thread state that is saved outside user */
3594 /* addressable memory that must be manually saved. */
3595 /* */
3596 /********************************************************************************************************************************/
3597
3598 static void restore_tls_state (Thread *thisthread)
3599
3600 {
3601 int rc;
3602 #if MTCP__SAVE_MANY_GDT_ENTRIES
3603 int i;
3604 #endif
3605
3606 /* The assumption that this points to the pid was checked by that tls_pid crap near the beginning */
3607
3608 *(pid_t *)(*(unsigned long *)&(thisthread -> gdtentrytls[0].base_addr) + TLS_PID_OFFSET()) = motherpid;
3609
3610 /* Likewise, we must jam the new pid into the mother thread's tid slot (checked by tls_tid carpola) */
3611
3612 if (thisthread == motherofall) {
3613 *(pid_t *)(*(unsigned long *)&(thisthread -> gdtentrytls[0].base_addr) + TLS_TID_OFFSET()) = motherpid;
3614 }
3615
3616 /* Restore all three areas */
3617
3618 #if MTCP__SAVE_MANY_GDT_ENTRIES
3619 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i ++) {
3620 rc = mtcp_sys_set_thread_area (&(thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN]));
3621 if (rc < 0) {
3622 mtcp_printf ("mtcp restore_tls_state: error %d restoring GDT TLS entry[%d]\n", mtcp_sys_errno, i);
3623 mtcp_abort ();
3624 }
3625 }
3626
3627 /* For newer Linuces, we just restore the one GDT entry that was indexed by GS */
3628
3629 #else
3630 rc = mtcp_sys_set_thread_area (&(thisthread -> gdtentrytls[0]));
3631 if (rc < 0) {
3632 mtcp_printf ("mtcp restore_tls_state: error %d restoring GDT TLS entry[%d]\n", mtcp_sys_errno, thisthread -> gdtentrytls[0].entry_number);
3633 mtcp_abort ();
3634 }
3635 #endif
3636
3637 /* Restore the rest of the stuff */
3638
3639 #ifdef __i386__
3640 asm volatile ("movw %0,%%fs" : : "m" (thisthread -> fs));
3641 asm volatile ("movw %0,%%gs" : : "m" (thisthread -> gs));
3642 #endif
3643 #ifdef __x86_64__
3644 /* Don't directly set fs. It would only set 32 bits, and we just
3645 * set the full 64-bit base of fs, using sys_set_thread_area,
3646 * which called arch_prctl.
3647 *asm volatile ("movl %0,%%fs" : : "m" (thisthread -> fs));
3648 *asm volatile ("movl %0,%%gs" : : "m" (thisthread -> gs));
3649 */
3650 #endif
3651
3652 thisthread -> tid = mtcp_sys_kernel_gettid ();
3653 }
3654
3655 /********************************************************************************************************************************/
3656 /* */
/*  Set the thread's STOPSIGNAL handler.  Threads are sent STOPSIGNAL when they are to suspend execution of the application, save */
3658 /* their state and wait for the checkpointhread to write the checkpoint file. */
3659 /* */
3660 /* Output: */
3661 /* */
3662 /* Calling thread will call stopthisthread () when sent a STOPSIGNAL */
3663 /* */
3664 /********************************************************************************************************************************/
3665
3666 static void setup_sig_handler (void)
3667 {
3668 struct sigaction act, old_act;
3669
3670 act.sa_handler = &stopthisthread;
3671 sigfillset(&act.sa_mask);
3672 act.sa_flags = SA_RESTART;
3673
3674 if (_real_sigaction(STOPSIGNAL, &act, &old_act) == -1) {
3675 mtcp_printf ("mtcp setupthread: error setting up signal handler: %s\n",
3676 strerror (errno));
3677 mtcp_abort ();
3678 }
3679
3680 if ((old_act.sa_handler != SIG_IGN) && (old_act.sa_handler != SIG_DFL) &&
3681 (old_act.sa_handler != stopthisthread)) {
3682 mtcp_printf ("mtcp setupthread: signal handler %d already in use (%p).\n"
3683 " You may employ a different signal by setting the\n"
3684 " environment variable MTCP_SIGCKPT (or DMTCP_SIGCKPT)"
3685 " to the number\n of the signal MTCP should "
3686 "use for checkpointing.\n", STOPSIGNAL, old_act.sa_handler);
3687 mtcp_abort ();
3688 }
3689 }
3690
3691 /********************************************************************************************************************************/
3692 /* */
3693 /* Sync shared memory pages with backup files on disk */
3694 /* */
3695 /********************************************************************************************************************************/
3696 static void sync_shared_mem(void)
3697 {
3698 int mapsfd;
3699 Area area;
3700
3701 mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
3702 if (mapsfd < 0) {
3703 mtcp_printf ("mtcp sync_shared_memory: error opening /proc/self/maps: %s\n",
3704 strerror (mtcp_sys_errno));
3705 mtcp_abort ();
3706 }
3707
3708 while (readmapsline (mapsfd, &area)) {
3709 /* Skip anything that has no read or execute permission. This occurs on one page in a Linux 2.6.9 installation. No idea why. This code would also take care of kernel sections since we don't have read/execute permission there. */
3710
3711 if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE))) continue;
3712
3713 if (!(area.flags & MAP_SHARED)) continue;
3714
3715 if (strstr(area.name, " (deleted)")) continue;
3716
3717 DPRINTF(("mtcp sync_shared_memory: syncing %X at %p from %s + %X\n", area.size, area.addr, area.name, area.offset));
3718
3719 if ( msync(area.addr, area.size, MS_SYNC) < 0 ){
3720 mtcp_printf ("mtcp sync_shared_memory: error syncing %X at %p from %s + %X\n", area.size, area.addr, area.name, area.offset);
3721 mtcp_abort();
3722 }
3723 }
3724
3725 close (mapsfd);
3726 }