1    	/*****************************************************************************
2    	 *   Copyright (C) 2006-2010 by Michael Rieker, Jason Ansel, Kapil Arya, and *
3    	 *                                                            Gene Cooperman *
4    	 *   mrieker@nii.net, jansel@csail.mit.edu, kapil@ccs.neu.edu, and           *
5    	 *                                                          gene@ccs.neu.edu *
6    	 *                                                                           *
7    	 *   This file is part of the MTCP module of DMTCP (DMTCP:mtcp).             *
8    	 *                                                                           *
9    	 *  DMTCP:mtcp is free software: you can redistribute it and/or              *
10   	 *  modify it under the terms of the GNU Lesser General Public License as    *
11   	 *  published by the Free Software Foundation, either version 3 of the       *
12   	 *  License, or (at your option) any later version.                          *
13   	 *                                                                           *
14   	 *  DMTCP:dmtcp/src is distributed in the hope that it will be useful,       *
15   	 *  but WITHOUT ANY WARRANTY; without even the implied warranty of           *
16   	 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
17   	 *  GNU Lesser General Public License for more details.                      *
18   	 *                                                                           *
19   	 *  You should have received a copy of the GNU Lesser General Public         *
20   	 *  License along with DMTCP:dmtcp/src.  If not, see                         *
21   	 *  <http://www.gnu.org/licenses/>.                                          *
22   	 *****************************************************************************/
23   	
24   	/********************************************************************************************************************************/
25   	/*																*/
26   	/*  Multi-threaded checkpoint library												*/
27   	/*																*/
28   	/*  Link this in as part of your program that you want checkpoints taken							*/
29   	/*  Call the mtcp_init routine at the beginning of your program									*/
30   	/*  Call the mtcp_ok routine when it's OK to do checkpointing									*/
31   	/*  Call the mtcp_no routine when you want checkpointing inhibited								*/
32   	/*																*/
33   	/*  This module also contains a __clone wrapper routine										*/
34   	/*																*/
35   	/********************************************************************************************************************************/
36   	
37   	
38   	// Set _GNU_SOURCE in order to expose glibc-defined sigandset()
39   	#define _GNU_SOURCE
40   	#include <asm/ldt.h>      // for struct user_desc
41   	//#include <asm/segment.h>  // for GDT_ENTRY_TLS_... stuff
42   	#include <dirent.h>
43   	#include <dlfcn.h>
44   	#include <errno.h>
45   	#include <fcntl.h>
46   	#include <pthread.h>
47   	#include <semaphore.h>
48   	#include <sched.h>
49   	#include <signal.h>
50   	#include <stdarg.h>
51   	#include <stdio.h>
52   	#include <stdlib.h>
53   	#include <string.h>
54   	#include <sys/mman.h>
55   	#include <sys/resource.h>
56   	#include <sys/sem.h>
57   	#include <sys/stat.h>
58   	#include <sys/syscall.h>
59   	#include <sys/ioctl.h>
60   	#include <termios.h>       // for tcdrain, tcsetattr, etc.
61   	#include <unistd.h>
62   	#include <ucontext.h>
63   	#include <sys/types.h>     // for gettid, tkill, waitpid
64   	#include <sys/wait.h>	   // for waitpid
65   	#include <linux/unistd.h>  // for gettid, tkill
66   	#include <gnu/libc-version.h>
67   	
68   	#define MTCP_SYS_STRCPY
69   	#define MTCP_SYS_STRLEN
70   	#include "mtcp_internal.h"
71   	
72   	/* required for ptrace sake */
73   	#include <sys/user.h>
74   	#include "mtcp_ptrace.h" 
75   	
/* Flag tested only by the busy-wait form of DEBUG_WAIT, which is currently
 * compiled out (see the "#if 0" branch below).
 */
static int WAIT = 1;
/* static int WAIT = 0; */

#if 0
/* Force thread to stop, without use of a system call. */
static int WAIT = 1;
# define DEBUG_WAIT \
  if (DEBUG_RESTARTING) { \
    int i, j; \
    for (i = 0; WAIT && i < 1000000000; i++) \
      for (j = 0; j < 1000000000; j++) \
        ; \
  }
#else
/* Active definition: DEBUG_WAIT expands to nothing. */
# define DEBUG_WAIT
#endif
91   	
/* MTCP__SAVE_MANY_GDT_ENTRIES is 1 only when GDT_ENTRY_TLS_ENTRIES is known
 * and we are not building for x86_64; otherwise it is 0.
 */
#if !defined(__x86_64__) && defined(GDT_ENTRY_TLS_ENTRIES)
# define MTCP__SAVE_MANY_GDT_ENTRIES 1
#else
# define MTCP__SAVE_MANY_GDT_ENTRIES 0
#endif

/* SAVEDSP names the slot of a ucontext_t that holds the stack pointer
 * saved by getcontext ().
 */
#if defined(__x86_64__)
# define MYREG_RSP 15
# define SAVEDSP uc_mcontext.gregs[MYREG_RSP]
#else
# define MYREG_ESP 7
# define SAVEDSP uc_mcontext.gregs[MYREG_ESP]
#endif

/* TLS segment registers used differently in i386 and x86_64. - Gene
 * TLSSEGREG is gs on i386 and fs on x86_64.
 */
#if defined(__i386__)
# define TLSSEGREG gs
#endif
#if defined(__x86_64__)
# define TLSSEGREG fs
#endif
114  	
115  	/* Offset computed (&x.pid - &x) for
116  	 *   struct pthread x;
117  	 * as found in:  glibc-2.5/nptl/descr.h
118  	 * It was 0x4c and 0x48 for pid and tid for i386.
119  	 * Roughly, the definition is:
120  	 *glibc-2.5/nptl/descr.h:
121  	 * struct pthread
122  	 * {
123  	 *  union {
124  	 *   tcbheader_t tcbheader;
125  	 *   void *__padding[16];
126  	 *  };
127  	 *  list_t list;
128  	 *  pid_t tid;
129  	 *  pid_t pid;
130  	 *  ...
131  	 * } __attribute ((aligned (TCB_ALIGNMENT)));
132  	 *
133  	 *glibc-2.5/nptl/sysdeps/pthread/list.h:
134  	 * typedef struct list_head
135  	 * {
136  	 *  struct list_head *next;
137  	 *  struct list_head *prev;
138  	 * } list_t;
139  	 *
140  	 * NOTE: glibc-2.10 changes the size of __padding from 16 to 24.  --KAPIL
141  	 *
 * NOTE: glibc-2.10 further changes the size of tcbhead_t without updating the
 *       size of __padding in struct pthread. We need to add an extra 512 bytes
 *       to accommodate this.                                     -- KAPIL
145  	 */
146  	#if __GLIBC_PREREQ (2,12)
147  	/* WHEN WE HAVE CONFIDENCE IN THIS VERSION, REMOVE ALL OTHER __GLIBC_PREREQ
148  	 * AND MAKE THIS THE ONLY VERSION.  IT SHOULD BE BACKWARDS COMPATIBLE.
149  	 */
150  	/* These function definitions should succeed independently of the glibc version.
151  	 * They use get_thread_area() to match (tid, pid) and find offset.
152  	 * In other code, on restart, that offset is used to set (tid,pid) to
153  	 *   the latest tid and pid of the new thread, instead of the (tid,pid)
154  	 *   of the original thread.
155  	 * SEE: "struct pthread" in glibc-2.XX/nptl/descr.h for 'struct pthread'.
156  	 */
157  	static int TLS_TID_OFFSET(void);
158  	
159  	/* Can remove the unused attribute when this __GLIBC_PREREQ is the only one. */
160  	static char *memsubarray (char *array, char *subarray, int len)
161  						 __attribute__ ((unused));
162  	static int mtcp_get_tls_segreg(void);
163  	static void *mtcp_get_tls_base_addr(void);
164  	
/* Return the byte offset of the 'tid' field inside the calling thread's
 * 'struct pthread'.  The offset is discovered on the first call and cached
 * in a function-local static for every later call.
 *
 * Discovery: struct pthread has adjacent fields, tid and pid, in that order,
 * so we build that two-field bit pattern from the current tid/pid and search
 * for it starting at the thread descriptor's base address.
 *
 * Aborts (mtcp_abort) if the pattern is not found or if the resulting offset
 * is not a multiple of sizeof(int); only warns if the offset differs from the
 * value expected for this architecture.
 */
static int TLS_TID_OFFSET(void) {
  static int tid_offset = -1;
  struct {pid_t tid; pid_t pid;} needle;
  char *desc_base;
  char *match;
#ifdef __x86_64__
  const size_t expected_offset = 512+26*sizeof(void *);
#else
  const size_t expected_offset = 26*sizeof(void *);
#endif

  /* Already computed on an earlier call. */
  if (tid_offset != -1)
    return tid_offset;

  needle.tid = mtcp_sys_kernel_gettid();
  needle.pid = mtcp_sys_getpid();

  /* Get entry number of current thread descriptor from its segment register:
   * Segment register / 8 is the entry_number for the "thread area", which
   * is of type 'struct user_desc'.   The base_addr field of that struct
   * points to the struct pthread for the thread with that entry_number.
   * The tid and pid are contained in the 'struct pthread'.
   *   So, to access the tid/pid fields, first find the entry number.
   * Then fill in the entry_number field of an empty 'struct user_desc', and
   * get_thread_area(struct user_desc *uinfo) will fill in the rest.
   * Then use the filled in base_address field to get the 'struct pthread'.
   * The function mtcp_get_tls_base_addr() returns this 'struct pthread' addr.
   */
  desc_base = (char *)mtcp_get_tls_base_addr();

  /* A false hit for tid_offset probably can't happen since a new
   * 'struct pthread' is zeroed out before adding tid and pid.
   */
  match = memsubarray(desc_base, (char *)&needle, sizeof(needle));
  if (match == NULL) {
    mtcp_printf("MTCP:  Couldn't find offsets of tid/pid in thread_area.\n");
    mtcp_abort();
  }
  tid_offset = match - desc_base;

  if ((size_t)tid_offset != expected_offset)
    mtcp_printf("MTCP:  Warning:  tid_offset = %d; different from expected.\n"
                "  Continuing anyway.  If this fails, please try again.\n",
                tid_offset);
  DPRINTF(("tid_offset: %d\n", tid_offset));

  if (tid_offset % sizeof(int) != 0) {
    mtcp_printf("MTCP:  tid_offset is not divisible by sizeof(int).\n");
    mtcp_abort();
  }
  /* Should we do a double-check, and spawn a new thread and see
   *  if its TID matches at this tid_offset?  This would give greater
   *  confidence, but for the reasons above, it's probably not necessary.
   */
  return tid_offset;
}
/* Return the byte offset of the 'pid' field inside 'struct pthread'.
 *
 * 'pid' directly follows 'tid' in struct pthread, so the result is
 * TLS_TID_OFFSET() plus the distance between the two members of a local
 * {tid, pid} pair.  The value is computed on the first call and cached in a
 * function-local static.
 */
static int TLS_PID_OFFSET(void) {
  static int pid_offset = -1;
  if (pid_offset == -1) {
    struct {pid_t tid; pid_t pid;} tid_pid;
    /* Take the member-to-member distance first, entirely inside the local
     * object, and only then add it to the integer tid offset.  Adding the
     * offset to the member address first would form a pointer outside
     * 'tid_pid' before the subtraction.
     */
    int pid_delta = (int)((char *)&(tid_pid.pid) - (char *)&tid_pid);
    pid_offset = TLS_TID_OFFSET() + pid_delta;
    DPRINTF(("pid_offset: %d\n", pid_offset));
  }
  return pid_offset;
}
226  	#elif __GLIBC_PREREQ (2,11)
227  	# ifdef __x86_64__
228  	#  define TLS_PID_OFFSET() \
229  	           (512+26*sizeof(void *)+sizeof(pid_t))  // offset of pid in pthread struct
230  	#  define TLS_TID_OFFSET() (512+26*sizeof(void *))  // offset of tid in pthread struct
231  	# else
232  	#  define TLS_PID_OFFSET() \
233  	           (26*sizeof(void *)+sizeof(pid_t))  // offset of pid in pthread struct
234  	#  define TLS_TID_OFFSET() (26*sizeof(void *))  // offset of tid in pthread struct
235  	# endif
236  	#elif __GLIBC_PREREQ (2,10)
237  	# define TLS_PID_OFFSET() \
238  		  (26*sizeof(void *)+sizeof(pid_t))  // offset of pid in pthread struct
239  	# define TLS_TID_OFFSET() (26*sizeof(void *))  // offset of tid in pthread struct
240  	#else
241  	# define TLS_PID_OFFSET() \
242  		  (18*sizeof(void *)+sizeof(pid_t))  // offset of pid in pthread struct
243  	# define TLS_TID_OFFSET() (18*sizeof(void *))  // offset of tid in pthread struct
244  	#endif
245  	
/* this call to gettid is hijacked by DMTCP for PID/TID-Virtualization.
 * The whole expansion is parenthesized so the macro behaves as a single
 * operand wherever it is used (e.g. 'sizeof GETTID()').
 */
#define GETTID() ((int)syscall(SYS_gettid))

/* Semaphore shared between mtcp_init and the checkpoint thread it starts;
 * mtcp_init waits on it until the checkpoint thread has finished
 * initializing.
 */
sem_t sem_start;
250  	
251  	typedef struct Thread Thread;
252  	
253  	struct Thread { Thread *next;                       // next thread in 'threads' list
254  	                Thread **prev;                      // prev thread in 'threads' list
255  	                int tid;                            // this thread's id as returned by mtcp_sys_kernel_gettid ()
256  	                int original_tid;                   // this is the the thread's "original" tid
257  	                MtcpState state;                    // see ST_... below
258  	                Thread *parent;                     // parent thread (or NULL if top-level thread)
259  	                Thread *children;                   // one of this thread's child threads
260  	                Thread *siblings;                   // one of this thread's sibling threads
261  	
262  	                int clone_flags;                    // parameters to __clone that created this thread
263  	                int *parent_tidptr;
264  	                int *given_tidptr;                  // (this is what __clone caller passed in)
265  	                int *actual_tidptr;                 // (this is what we passed to the system call, either given_tidptr or &child_tid)
266  	                int child_tid;                      // this is used for child_tidptr if the original call did not
267  	                                                    // ... have both CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID
268  	                int (*fn) (void *arg);              // thread's initial function entrypoint and argument
269  	                void *arg;
270  	
271  	                sigset_t sigblockmask;              // blocked signals
272  	                sigset_t sigpending;                // pending signals
273  	
274  	                ///JA: new code ported from v54b
275  	                ucontext_t savctx;                  // context saved on suspend
276  	
277  	                mtcp_segreg_t fs, gs;               // thread local storage pointers
278  	                pthread_t pth;                      // added for pthread_join
279  	#if MTCP__SAVE_MANY_GDT_ENTRIES
280  	                struct user_desc gdtentrytls[GDT_ENTRY_TLS_ENTRIES];
281  	#else
282  	                struct user_desc gdtentrytls[1];
283  	#endif
284  	              };
285  	
286  	/*
287  	 * struct MtcpRestartThreadArg
288  	 *
 * DMTCP requires the original_tids of the threads being created during
 *  the RESTARTING phase. We use the MtcpRestartThreadArg structure to pass
 *  the original_tid of the thread being created from MTCP to DMTCP.
292  	 *
293  	 * actual clone call: clone (fn, child_stack, flags, void *, ... )
294  	 * new clone call   : clone (fn, child_stack, flags, (struct MtcpRestartThreadArg *), ...)
295  	 *
296  	 * DMTCP automatically extracts arg from this structure and passes that
297  	 * to the _real_clone call.
298  	 *
299  	 * IMPORTANT NOTE: While updating, this structure must be kept in sync
300  	 * with the structure defined with the same name in mtcpinterface.cpp
301  	 */
/* Argument wrapper handed to clone() while restarting.
 * Layout must stay in sync with the same-named structure in
 * mtcpinterface.cpp.
 */
struct MtcpRestartThreadArg
{
  void  *arg;           /* the real argument; DMTCP extracts it and passes it to _real_clone */
  pid_t  original_tid;  /* original tid of the thread being created */
};
306  	
/* Thread states (values stored in Thread.state). */

// thread is running normally but with checkpointing disabled
#define ST_RUNDISABLED  0
// thread is running normally and has checkpointing enabled
#define ST_RUNENABLED   1
// thread is running normally with cp disabled, but checkpoint thread is waiting for it to enable
#define ST_SIGDISABLED  2
// thread is running normally with cp enabled, and checkpoint thread has signalled it to stop
#define ST_SIGENABLED   3
// thread context being saved (very brief)
#define ST_SUSPINPROG   4
// thread is suspended waiting for checkpoint to complete
#define ST_SUSPENDED    5
// thread is the checkpointing thread (special state just for that thread)
#define ST_CKPNTHREAD   6
314  	
315  		/* Global data */
316  	
317  	void *mtcp_libc_dl_handle = NULL;  // dlopen handle for whatever libc.so is loaded with application program
318  	Area mtcp_libc_area;               // some area of that libc.so
319  	
320  	/* DMTCP Info Variables */
321  	
322  	/* These are reset by dmtcphijack.so at initialization. */
323  	int dmtcp_exists = 0; /* Are we running under DMTCP? */
324  	int dmtcp_info_pid_virtualization_enabled = 0;
325  	/* The following two DMTCP Info variables are defined in mtcp_printf.c */
326  	//int dmtcp_info_stderr_fd = 2;
327  	//int dmtcp_info_jassertlog_fd = -1;
328  	int dmtcp_info_restore_working_directory = -1;
329  	
330  		/* Static data */
331  	
332  	static sigset_t sigpending_global;                // pending signals for the process
333  	static char const *nscd_mmap_str = "/var/run/nscd/";    // OpenSUSE
334  	static char const *nscd_mmap_str2 = "/var/cache/nscd";  // Debian / Ubuntu
335  	static char const *nscd_mmap_str3 = "/var/db/nscd";     // RedHat (Linux 2.6.9)
336  	static char const *dev_zero_deleted_str = "/dev/zero (deleted)";
337  	static char const *dev_null_deleted_str = "/dev/null (deleted)";
338  	static char const *sys_v_shmem_file = "/SYSV";
339  	//static char const *perm_checkpointfilename = NULL;
340  	//static char const *temp_checkpointfilename = NULL;
341  	static char perm_checkpointfilename[MAXPATHLEN];
342  	static char temp_checkpointfilename[MAXPATHLEN];
343  	static size_t checkpointsize;
344  	static int intervalsecs;
345  	static pid_t motherpid;
346  	static size_t restore_size;
347  	static int showtiming;
348  	static int threadenabledefault;
349  	static int verify_count;  // number of checkpoints to go
350  	static int verify_total;  // value given by envar
351  	static pid_t mtcp_ckpt_gzip_child_pid = -1;
352  	static int volatile checkpointhreadstarting = 0;
353  	static MtcpState restoreinprog = MTCP_STATE_INITIALIZER;
354  	static MtcpState threadslocked = MTCP_STATE_INITIALIZER;
355  	static pthread_t checkpointhreadid;
356  	static struct timeval restorestarted;
357  	static int DEBUG_RESTARTING = 0;
358  	static Thread *motherofall = NULL;
359  	static Thread *ckpthread = NULL;
360  	static Thread *threads = NULL;
361  	struct sigaction sigactions[NSIG];  // signal handlers
362  	static VA restore_begin, restore_end;
363  	static void *restore_start; /* will be bound to fnc, mtcp_restore_start */
364  	static void *saved_sysinfo;
365  	static void *saved_heap_start = NULL;
366  	static char saved_working_directory[MTCP_MAX_PATH];
367  	static void (*callback_sleep_between_ckpt)(int sec) = NULL;
368  	static void (*callback_pre_ckpt)() = NULL;
369  	static void (*callback_post_ckpt)(int is_restarting) = NULL;
370  	static int  (*callback_ckpt_fd)(int fd) = NULL;
371  	static void (*callback_write_dmtcp_header)(int fd) = NULL;
372  	static void (*callback_restore_virtual_pid_table)() = NULL;
373  	
374  	static int (*clone_entry) (int (*fn) (void *arg),
375  	                           void *child_stack,
376  	                           int flags,
377  	                           void *arg,
378  	                           int *parent_tidptr,
379  	                           struct user_desc *newtls,
380  	                           int *child_tidptr);
381  	
382  	/* temp stack used internally by restore so we don't go outside the
383  	 *   libmtcp.so address range for anything;
384  	 * including "+ 1" since will set %esp/%rsp to tempstack+STACKSIZE
385  	 */
386  	static long long tempstack[STACKSIZE + 1];
387  	
388  		/* Internal routines */
389  	
390  	static long set_tid_address (int *tidptr);
391  	
392  	static char *memsubarray (char *array, char *subarray, int len)
393  						 __attribute__ ((unused));
394  	static int mtcp_get_tls_segreg(void);
395  	static void *mtcp_get_tls_base_addr(void);
396  	static int threadcloned (void *threadv);
397  	static void setupthread (Thread *thread);
398  	static void setup_clone_entry (void);
399  	static void threadisdead (Thread *thread);
400  	static void *checkpointhread (void *dummy);
401  	static int test_use_compression(void);
402  	static int open_ckpt_to_write(int fd, int pipe_fds[2], char *gzip_path);
403  	static void checkpointeverything (void);
404  	static void writefiledescrs (int fd);
405  	static void writememoryarea (int fd, Area *area,
406  				     int stack_was_seen, int vsyscall_exists);
407  	static void writecs (int fd, char cs);
408  	static void writefile (int fd, void const *buff, size_t size);
409  	static void preprocess_special_segments(int *vsyscall_exists);
410  	static void stopthisthread (int signum);
411  	static void wait_for_all_restored (void);
412  	static void save_sig_state (Thread *thisthread);
413  	static void restore_sig_state (Thread *thisthread);
414  	static void save_sig_handlers (void);
415  	static void restore_sig_handlers (Thread *thisthread);
416  	static void save_tls_state (Thread *thisthread);
417  	static void renametempoverperm (void);
418  	static Thread *getcurrenthread (void);
419  	static void lock_threads (void);
420  	static void unlk_threads (void);
421  	static int readmapsline (int mapsfd, Area *area);
422  	static void restore_heap(void);
423  	static void finishrestore (void);
424  	static int restarthread (void *threadv);
425  	static void restore_tls_state (Thread *thisthread);
426  	static void setup_sig_handler (void);
427  	static void sync_shared_mem(void);
428  	
429  	/* FIXME:
430  	 * dmtcp/src/syscallsreal.c has wrappers around signal, sigaction, sigprocmask
431  	 * The wrappers go to these mtcp_real_XXX versions so that MTCP can call
432  	 * the actual system calls and avoid the wrappers.  But if that is still
433  	 * an issue, then we can create mtcp_sys_signal(), etc., for direct calls.
434  	 *
435  	 * Update: 
436  	 * mtcp_real_XXX versions have been renamed to _real_XXX in DMTCP.
437  	 * sigprocmask should not be used in multi-threaded process, use
438  	 * pthread_sigmask instead.
439  	 */
440  	int _real_sigaction(int signum, const struct sigaction *act,
441  				struct sigaction *oldact){
442  	  if (dmtcp_exists) {
443  	    mtcp_printf("mtcp %s: This function mustn't be called when working under DMTCP\n",
444  	                __FUNCTION__);
445  	    mtcp_abort();
446  	  }
447  	  return sigaction(signum, act, oldact);
448  	}
449  	
450  	
451  	/********************************************************************************************************************************/
452  	/*																*/
453  	/*  This routine must be called at startup time to initiate checkpointing							*/
454  	/*																*/
455  	/*    Input:															*/
456  	/*																*/
457  	/*	checkpointfilename = name to give the checkpoint file									*/
458  	/*	interval = interval, in seconds, to write the checkpoint file								*/
459  	/*	clonenabledefault = 0 : clone checkpointing blocked by default (call mtcp_ok in the thread to enable)			*/
460  	/*	                    1 : clone checkpointing enabled by default (call mtcp_no in the thread to block if you want)	*/
461  	/*																*/
462  	/*	envar MTCP_WRAPPER_LIBC_SO = what library to use for inner wrappers (default libc.??.so)				*/
463  	/*	envar MTCP_VERIFY_CHECKPOINT = every n checkpoints, verify by doing a restore to resume					*/
464  	/*	                               default is 0, ie, don't ever verify							*/
465  	/*																*/
466  	/********************************************************************************************************************************/
467  	/* These hook functions provide an alternative to DMTCP callbacks, using
468  	 * weak symbols.  While MTCP is immature, let's allow both, in case
469  	 * the flexibility of a second hook mechanism is useful in the future.
470  	 * The mechanism is invisible unless end user compiles w/ -Wl,-export-dynamic
471  	 */
/* Default pre-checkpoint hook: does nothing.  Declared weak. */
__attribute__ ((weak)) void mtcpHookPreCheckpoint (void)
{
}

/* Default post-checkpoint hook: does nothing.  Declared weak. */
__attribute__ ((weak)) void mtcpHookPostCheckpoint (void)
{
}

/* Default restart hook: does nothing.  Declared weak. */
__attribute__ ((weak)) void mtcpHookRestart (void)
{
}
477  	
478  	/* Statically allocate this.  Malloc is dangerous here if application is
479  	 *   defining its own (possibly not thread-safe) malloc routine.
480  	 */
481  	static Thread ckptThreadStorage;
482  	
483  	void mtcp_init (char const *checkpointfilename, int interval, int clonenabledefault)
484  	{
485  	  char *p, *tmp, *endp;
486  	  int len;
487  	  Thread *ckptThreadDescriptor = & ckptThreadStorage;
488  	  mtcp_segreg_t TLSSEGREG;
489  	#ifdef PTRACE 
490  	  init_thread_local();
491  	#endif
492  	
493  	  if (sizeof(void *) != sizeof(long)) {
494  	    mtcp_printf("ERROR: sizeof(void *) != sizeof(long) on this architecture.\n"
495  		   "       This code assumes they are equal.\n");
496  	    mtcp_abort ();
497  	  }
498  	
499  	#ifndef __x86_64__
500  	  /* Nobody else has a right to preload on internal processes generated
501  	   * by mtcp_check_XXX() -- not even DMTCP, if it's currently operating.
502  	   *
503  	   * Saving LD_PRELOAD in a temp env var and restoring it later --Kapil.
504  	   *
505  	   * TODO: To insert some sort of error checking to make sure that we
506  	   *       are correctly setting LD_PRELOAD after we are done with
507  	   *       vdso check.
508  	   */
509  	
510  	  // Shouldn't this removal of LD_PRELOAD be around fork/exec of gzip ?
511  	  // setenv( "MTCP_TMP_LD_PRELOAD", getenv("LD_PRELOAD"), 1);
512  	  // unsetenv("LD_PRELOAD");
513  	  // Allow user program to run with randomize_va
514  	  // mtcp_check_vdso_enabled();
515  	  // setenv("LD_PRELOAD", getenv("MTCP_TMP_LD_PRELOAD"), 1);
516  	  // unsetenv("MTCP_TMP_LD_PRELOAD");
517  	#endif
518  	
519  	#if 0
520  	  { struct user_desc u_info;
521  	    u_info.entry_number = 12;
522  	    if (-1 == mtcp_sys_get_thread_area(&u_info) && mtcp_sys_errno == ENOSYS)
523  	      mtcp_printf(
524  	        "Apparently, get_thread_area is not implemented in your kernel.\n"
525  	        "  If this doesn't work, please try on a more recent kernel,\n"
526  	        "  or one configured to support get_thread_area.\n"
527  	      );
528  	  }
529  	#endif
530  	
531  	  intervalsecs = interval;
532  	
533  	  if (strlen(mtcp_ckpt_newname) >= MAXPATHLEN) {
534  	    mtcp_printf("mtcp mtcp_init: new ckpt file name (%s) too long (>=512 bytes)\n",
535  	                mtcp_ckpt_newname);
536  	    mtcp_abort();
537  	  }
538  	  strncpy(perm_checkpointfilename,checkpointfilename,MAXPATHLEN);  // this is what user wants the checkpoint file called
539  	  len = strlen (perm_checkpointfilename);        // make up another name, same as that, with ".temp" on the end
540  	  memcpy(temp_checkpointfilename, perm_checkpointfilename, len);
541  	  strncpy(temp_checkpointfilename + len, ".temp",MAXPATHLEN-len);
542  	                                                 // ... we use it to write to in case we crash while writing
543  	                                                 //     we will leave the previous good one intact
544  	
545  	#ifdef PTRACE
546  	  /* TODO:  USE flock WHEN WRITING TO THESE THREE FILES (NOT YET DONE FOR ptrace_setoptions_file? */
547  	  memset(ptrace_shared_file, '\0', MAXPATHLEN);
548  	  sprintf(ptrace_shared_file, "%s/ptrace_shared_file.txt", dir);
549  	  memset(ptrace_setoptions_file, '\0', MAXPATHLEN);
550  	  sprintf(ptrace_setoptions_file, "%s/ptrace_setoptions_file.txt", dir);
551  	  memset(checkpoint_threads_file, '\0', MAXPATHLEN);
552  	  sprintf(checkpoint_threads_file, "%s/checkpoint_threads_file.txt", dir);
553  	#endif
554  	
555  	  DPRINTF (("mtcp_init*: main tid %d\n", mtcp_sys_kernel_gettid ()));
556  	  /* If MTCP_INIT_PAUSE set, sleep 15 seconds and allow for gdb attach. */
557  	  if (getenv("MTCP_INIT_PAUSE")) {
558  	    mtcp_printf("Pausing 15 seconds.  Do:  gdb attach %d\n", mtcp_sys_getpid());
559  	    sleep(15);
560  	  }
561  	
562  	  threadenabledefault = clonenabledefault;       // save this away where it's easy to get
563  	
564  	  p = getenv ("MTCP_SHOWTIMING");
565  	  showtiming = ((p != NULL) && (*p & 1));
566  	
567  	  /* Maybe dump out some stuff about the TLS */
568  	
569  	  mtcp_dump_tls (__FILE__, __LINE__);
570  	
571  	  /* Save this process's pid.  Then verify that the TLS has it where it should be.           */
572  	  /* When we do a restore, we will have to modify each thread's TLS with the new motherpid. */
573  	  /* We also assume that GS uses the first GDT entry for its descriptor.                    */
574  	
575  	  motherpid = mtcp_sys_getpid (); /* libc/getpid can lie if we had
576  					   * used kernel fork() instead of libc fork().
577  					   */
578  	  {
579  	    pid_t tls_pid, tls_tid;
580  	    tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_PID_OFFSET());
581  	    tls_tid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_TID_OFFSET());
582  	
583  	    if ((tls_pid != motherpid) || (tls_tid != motherpid)) {
584  	      mtcp_printf ("mtcp_init: getpid %d, tls pid %d, tls tid %d, must all match\n",
585  	                    motherpid, tls_pid, tls_tid);
586  	      mtcp_abort ();
587  	    }
588  	  }
589  	
590  	  /* Get verify envar */
591  	
592  	  tmp = getenv ("MTCP_VERIFY_CHECKPOINT");
593  	  verify_total = 0;
594  	  if (tmp != NULL) {
595  	    verify_total = strtol (tmp, &p, 0);
596  	    if ((*p != '\0') || (verify_total < 0)) {
597  	      mtcp_printf ("mtcp_init: bad MTCP_VERIFY_CHECKPOINT %s\n", tmp);
598  	      mtcp_abort ();
599  	    }
600  	  }
601  	
602  	  /* If the user has defined a signal, use that to suspend.  Otherwise, use MTCP_DEFAULT_SIGNAL */
603  	
604  	  tmp = getenv("MTCP_SIGCKPT");
605  	  if (tmp == NULL)
606  	      STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
607  	  else
608  	  {
609  	      errno = 0;
610  	      STOPSIGNAL = strtol(tmp, &endp, 0);
611  	
612  	      if ((errno != 0) || (tmp == endp))
613  	      {
614  	          mtcp_printf("mtcp_init: Your chosen SIGCKPT of \"%s\" does not "
615  	                        "translate to a number,\n"
616  				"  and cannot be used.  Signal %d "
617  	                        "will be used instead.\n", tmp, MTCP_DEFAULT_SIGNAL);
618  	          STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
619  	      }
620  	      else if (STOPSIGNAL < 1 || STOPSIGNAL > 31)
621  	      {
622  	          mtcp_printf("mtcp_init: Your chosen SIGCKPT of \"%d\" is not a valid "
623  	                        "signal, and cannot be used.\n"
624  				"  Signal %d will be used instead.\n",
625  			       STOPSIGNAL, MTCP_DEFAULT_SIGNAL);
626  	          STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
627  	      }
628  	  }
629  	
630  	  /* Set up signal handler so we can interrupt the thread for checkpointing */
631  	  setup_sig_handler ();
632  	
633  	  /* Get size and address of the shareable - used to separate it from the rest of the stuff */
634  	  /* All routines needed to perform restore must be within this address range               */
635  	
636  	  restore_begin = (((VA)mtcp_shareable_begin) & -MTCP_PAGE_SIZE);
637  	  restore_size  = ((VA)mtcp_shareable_end - restore_begin + MTCP_PAGE_SIZE - 1) & -MTCP_PAGE_SIZE;
638  	  restore_end   = restore_begin + restore_size;
639  	  restore_start = mtcp_restore_start;
640  	
641  	  /* Setup clone_entry to point to glibc's __clone routine */
642  	
643  	  setup_clone_entry ();
644  	
645  	  /* Set up caller as one of our threads so we can work on it */
646  	
647  	  memset (ckptThreadDescriptor, 0, sizeof *ckptThreadDescriptor);
648  	  setupthread (ckptThreadDescriptor);
649  	  ckptThreadDescriptor -> child_tid = mtcp_sys_kernel_gettid (); // need to set this up so the checkpointhread can see we haven't exited
650  	  set_tid_address (&(ckptThreadDescriptor -> child_tid));  // we are assuming mtcp_init has been called before application may have called set_tid_address
651  	                                             // ... or else we will end up overwriting that set_tid_address value
652  	  motherofall = ckptThreadDescriptor;
653  	
654  	  /* Spawn off a thread that will perform the checkpoints from time to time */
655  	
656  	  checkpointhreadstarting = 1;
657  	  /* If we return from a fork(), we don't know what is the semaphore value. */
658  	  errno = 0;
659  	  while (sem_trywait(&sem_start) == -1 && (errno == EAGAIN || errno == EINTR)) {
660  	    if ( errno == EAGAIN )
661  	      sem_post(&sem_start);
662  	    errno = 0;
663  	  }
664  	  if (errno != 0)
665  	    perror("ERROR: continue anyway from " __FILE__ ":mtcp_init:sem_trywait()");
666  	  /* Now we successfully locked it.  The sempaphore value is zero. */
667  	  if (pthread_create (&checkpointhreadid, NULL, checkpointhread, NULL) < 0) {
668  	    mtcp_printf ("mtcp_init: error creating checkpoint thread: %s\n", strerror (errno));
669  	    mtcp_abort ();
670  	  }
671  	  if (checkpointhreadstarting) mtcp_abort ();  // make sure the clone wrapper executed (ie, not just the standard clone)
672  	  /* Stop until checkpoint thread has finished initializing.
673  	   * Some programs (like gcl) implement their own glibc functions in
674  	   * a non-thread-safe manner.  In case we're using non-thread-safe glibc,
675  	   * don't run the checkpoint thread and user thread at the same time.
676  	   */
677  	  errno = 0;
678  	  while (-1 == sem_wait(&sem_start) && errno == EINTR)
679  	    errno = 0;
680  	  /* The child thread checkpointhread will now wake us. */
681  	}
682  	
683  	/********************************************************************************************************************************
684  	 *
685  	 *  The routine mtcp_set_callbacks below may be called BEFORE the first
686  	 *  MTCP checkpoint, to add special functionality to checkpointing
687  	 *
688  	 *    Its arguments (callback functions) are:
689  	 *
690  	 * sleep_between_ckpt:  Called in between checkpoints to replace the default "sleep(sec)" functionality,
691  	 *                      when this function returns checkpoint will start
692  	 * pre_ckpt:            Called after all user threads are suspended, but BEFORE checkpoint written
693  	 * post_ckpt:           Called after checkpoint, and after restore.  is_restarting will be 1 for restore 0 for after checkpoint
694  	 * ckpt_fd:             Called to test if mtcp should checkpoint a given FD returns 1 if it should
695  	 *
696  	 *******************************************************************************************************************************/
697  	
698  	void mtcp_set_callbacks(void (*sleep_between_ckpt)(int sec),
699  	                        void (*pre_ckpt)(),
700  	                        void (*post_ckpt)(int is_restarting),
701  	                        int  (*ckpt_fd)(int fd),
702  	                        void (*write_dmtcp_header)(int fd),
703  	                        void (*restore_virtual_pid_table)())
704  	{
705  	    callback_sleep_between_ckpt = sleep_between_ckpt;
706  	    callback_pre_ckpt = pre_ckpt;
707  	    callback_post_ckpt = post_ckpt;
708  	    callback_ckpt_fd = ckpt_fd;
709  	    callback_write_dmtcp_header = write_dmtcp_header;
710  	    callback_restore_virtual_pid_table = restore_virtual_pid_table;
711  	}
712  	
713  	/*************************************************************************/
714  	/*						                         */
715  	/*  Dump out the TLS stuff pointed to by %gs	                         */
716  	/*						                         */
717  	/*************************************************************************/
718  	
/* Debugging aid intended to print the %gs selector, the matching GDT
 * descriptor and the first 80 bytes of the TLS area, tagged with the caller's
 * file and line.
 *
 * NOTE: the entire body is wrapped in `#if 000`, so as compiled this function
 * does nothing and both parameters are unused.
 *
 *   file = source file name of the caller (only used in the disabled code)
 *   line = source line of the caller (only used in the disabled code)
 */
void mtcp_dump_tls (char const *file, int line)
{
#if 000
  int i, j, mypid;
  sigset_t blockall, oldsigmask;
  struct user_desc gdtentry;
  unsigned char byt;
  unsigned short gs;

  /* 0 = free, 1 = held; shared by all callers to serialize the output */
  static int mutex = 0;

  /* Block all signals whilst we have the futex */

  memset (&blockall, -1, sizeof blockall);
  if (sigprocmask (SIG_SETMASK, &blockall, &oldsigmask) < 0) {
    abort ();
  }

  /* Block other threads from doing this so the output doesn't mix */

  while (!atomic_setif_int (&mutex, 1, 0)) {
    mtcp_sys_futex (&mutex, FUTEX_WAIT, 1, NULL, NULL, 0);
  }

  /* Get the segment for the TLS stuff */

  asm volatile ("movw %%gs,%0" : "=g" (gs));
  mtcp_printf("mtcp_init: gs=%X at %s:%d\n", gs, file, line);
  if (gs != 0) {

    /* We only handle GDT based stuff */

    if (gs & 4) mtcp_printf("   *** part of LDT\n");

    /* It's in the GDT */

    else {

      /* Read the TLS descriptor */

      gdtentry.entry_number = gs / 8;
      i = mtcp_sys_get_thread_area (&gdtentry);
      if (i < 0) mtcp_printf("  error getting GDT entry %d: %d\n", gdtentry.entry_number, mtcp_sys_errno);
      else {

        /* Print out descriptor and first 80 bytes of data */

        mtcp_printf("  limit %X, baseaddr %X\n", gdtentry.limit, gdtentry.base_addr);
        /* First pass: read each byte through the %gs segment override */
        for (i = 0; i < 80; i += 16) {
          for (j = 16; -- j >= 0;) {
            if ((j & 3) == 3) fputc (' ', stderr);
            asm volatile ("movb %%gs:(%1),%0" : "=r" (byt) : "r" (i + j));
            mtcp_printf("%2.2X", byt);
          }
          mtcp_printf(" : gs+%2.2X\n", i);
        }
        /* Second pass: read the same bytes through the descriptor's base address */
        for (i = 0; i < 80; i += 16) {
          for (j = 16; -- j >= 0;) {
            if ((j & 3) == 3) fputc (' ', stderr);
            byt = ((unsigned char *)gdtentry.base_addr)[i+j];
            mtcp_printf("%2.2X", byt);
          }
          mtcp_printf(" : %8.8X\n", gdtentry.base_addr + i);
        }

        /* Offset 4C should be the process id */

        asm volatile ("mov %%gs:0x4C,%0" : "=r" (i));
        mtcp_printf("mtcp_init: getpid=%d, gettid=%d, tls=%d\n", getpid (), mtcp_sys_kernel_gettid (), i);
      }
    }
  }

  /* Release mutex and restore signal delivery */

  mutex = 0;
  mtcp_sys_futex (&mutex, FUTEX_WAKE, 1, NULL, NULL, 0);
  /* NOTE(review): the mask was saved with sigprocmask() above but is restored
   * with _real_sigprocmask() here -- confirm which one is intended before
   * re-enabling this code. */
  if (_real_sigprocmask (SIG_SETMASK, &oldsigmask, NULL) < 0) {
    abort ();
  }
#endif
}
801  	
802  	/*****************************************************************************/
803  	/*									     */
804  	/*  This is our clone system call wrapper				     */
805  	/*									     */
806  	/*    Note:								     */
807  	/*									     */
808  	/*      pthread_create eventually calls __clone to create threads	     */
809  	/*      It uses flags = 0x3D0F00:					     */
810  	/*	      CLONE_VM = VM shared between processes			     */
811  	/*	      CLONE_FS = fs info shared between processes (root, cwd, umask) */
812  	/*	   CLONE_FILES = open files shared between processes (fd table)	     */
813  	/*	 CLONE_SIGHAND = signal handlers and blocked signals shared	     */
814  	/*	 			 (sigaction common to parent and child)	     */
815  	/*	  CLONE_THREAD = add to same thread group			     */
816  	/*	 CLONE_SYSVSEM = share system V SEM_UNDO semantics		     */
817  	/*	  CLONE_SETTLS = create a new TLS for the child from newtls parameter*/
818  	/*	 CLONE_PARENT_SETTID = set the TID in the parent (before MM copy)    */
819  	/*	CLONE_CHILD_CLEARTID = clear the TID in the child and do	     */
820  	/*				 futex wake at that address		     */
821  	/*	      CLONE_DETACHED = create clone detached			     */
822  	/*									     */
823  	/*****************************************************************************/
824  	
825  	int __clone (int (*fn) (void *arg), void *child_stack, int flags, void *arg,
826  		     int *parent_tidptr, struct user_desc *newtls, int *child_tidptr)
827  	{
828  	  int rc;
829  	  Thread *thread;
830  	#ifdef PTRACE
831  	  int i;
832  	#endif
833  	
834  	  /* Maybe they decided not to call mtcp_init */
835  	  if (motherofall != NULL) {
836  	
837  	    /* They called mtcp_init meaning we are to do checkpointing.
838  	     * So we are going to track this thread.
839  	     */
840  	
841  	    thread = malloc (sizeof *thread);
842  	    memset (thread, 0, sizeof *thread);
843  	    thread -> fn     = fn;   // this is the user's function
844  	    thread -> arg    = arg;  // ... and the parameter
845  	    thread -> parent = getcurrenthread ();
846  	    if (checkpointhreadstarting) {
847  	      checkpointhreadstarting = 0;
848  	      mtcp_state_init(&thread->state, ST_CKPNTHREAD);
849  	    } else {
850  	      mtcp_state_init(&thread->state, ST_RUNDISABLED);
851  	    }
852  	
853  	    DPRINTF (("mtcp wrapper clone*: calling clone thread=%p,"
854  		      " fn=%p, flags=0x%X\n", thread, fn, flags));
855  	    DPRINTF (("mtcp wrapper clone*: parent_tidptr=%p, newtls=%p,"
856  		      " child_tidptr=%p\n", parent_tidptr, newtls, child_tidptr));
857  	    //asm volatile ("int3");
858  	
859  	    /* Save exactly what the caller is supplying */
860  	
861  	    thread -> clone_flags   = flags;
862  	    thread -> parent_tidptr = parent_tidptr;
863  	    thread -> given_tidptr  = child_tidptr;
864  	
865  	    /* We need the CLEARTID feature so we can detect			     */
866  	    /*   when the thread has exited					     */
867  	    /* So if the caller doesn't want it, we enable it                        */
868  	    /* Retain what the caller originally gave us so we can pass the tid back */
869  	
870  	    if (!(flags & CLONE_CHILD_CLEARTID)) {
871  	      child_tidptr = &(thread -> child_tid);
872  	    }
873  	    thread -> actual_tidptr = child_tidptr;
874  	    DPRINTF (("mtcp wrapper clone*: thread %p -> actual_tidptr %p\n",
875  		      thread, thread -> actual_tidptr));
876  	
877  	    /* Alter call parameters, forcing CLEARTID and make it call the wrapper routine */
878  	
879  	    flags |= CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID;
880  	    fn = threadcloned;
881  	    arg = thread;
882  	  }
883  	
884  	  /* mtcp_init not called, no checkpointing, but make sure clone_entry is */
885  	  /* set up so we can call the real clone                                 */
886  	
887  	  else if (clone_entry == NULL) setup_clone_entry ();
888  	
889  	  /* Now create the thread */
890  	
891  	  DPRINTF (("mtcp wrapper clone*: clone fn=%p, child_stack=%p, flags=%X, arg=%p\n", fn, child_stack, flags, arg));
892  	  DPRINTF (("mtcp wrapper clone*: parent_tidptr=%p, newtls=%p, child_tidptr=%p\n", parent_tidptr, newtls, child_tidptr));
893  	  rc = (*clone_entry) (fn, child_stack, flags, arg, parent_tidptr, newtls, child_tidptr);
894  	  if (rc < 0) {
895  	    DPRINTF (("mtcp wrapper clone*: clone rc=%d, errno=%d\n", rc, errno));
896  	  } else {
897  	    DPRINTF (("mtcp wrapper clone*: clone rc=%d\n", rc));
898  	  }
899  	
900  	#ifdef PTRACE
901  	 /*************************************************************************/
902  	  /*  Code added to keep record of new tasks and processes in a file       */
903  	  /*************************************************************************/
904  	
905  	  // initialize the ptrace_tid_pairs array  
906  	  if (!init_ptrace_pairs) {
907  	    for (i = 0; i < MAX_PTRACE_PAIRS_COUNT; i++) {
908  	      ptrace_pairs[i].last_command = PTRACE_UNSPECIFIED_COMMAND;
909  	      ptrace_pairs[i].singlestep_waited_on = FALSE;
910  	      ptrace_pairs[i].free = TRUE;
911  	      ptrace_pairs[i].inferior_st = 'u'; // undefined
912  	    }
913  	    init_ptrace_pairs = 1;
914  	  }
915  	
916  	  // initialize the semaphore used when motherofall reads the ptrace shared file  
917  	  if (!init_ptrace_read_pairs_sem) {
918  	    sem_init(&ptrace_read_pairs_sem, 0, 0);
919  	    init_ptrace_read_pairs_sem = 1;
920  	  }
921  	
922  	  if (!init__sem) {
923  	    sem_init(&__sem, 0, 1);
924  	    init__sem = 1;
925  	  }
926  	
927  	  if (is_ptrace_setoptions == TRUE) writeptraceinfo (setoptions_superior, rc);
928  	  else {
929  	    // read from file
930  	    int setoptions_fd = -1;
931  	    pid_t inferior;
932  	    pid_t superior;
933  	
934  	    setoptions_fd = open(ptrace_setoptions_file, O_RDONLY);
935  	
936  	    if (setoptions_fd != -1) {
937  	      while (readall(setoptions_fd, &superior, sizeof(pid_t)) > 0) {
938  	        readall(setoptions_fd, &inferior, sizeof(pid_t));
939  	  if (inferior == GETTID()) {
940  	    setoptions_superior = superior;
941  	    is_ptrace_setoptions = TRUE;
942  	    writeptraceinfo (setoptions_superior, rc);
943  	  }
944  	      }
945  	      if ( close(setoptions_fd) != 0 ) {
946  	        mtcp_printf("__clone: Error closing file: %s\n",
947  	                    strerror(errno));
948  	  mtcp_abort();
949  	      }
950  	    }
951  	  }
952  	  /* the structure of checkpoint_threads_file is pairs of pid and tid */
953  	  write_info_to_file (2, getpid(), rc);
954  	  /*************************************************************************/
955  	  /*  Done recording new tasks and processes.                              */
956  	  /*************************************************************************/
957  	#endif
958  	
959  	  return (rc);
960  	}
961  	
962  	void fill_in_pthread (pid_t tid, pthread_t pth) {
963  	  struct Thread *thread;
964  	  for (thread = threads; thread != NULL; thread = thread -> next) {
965  	    if (thread -> tid == tid) {
966  	      thread -> pth = pth;
967  	      break;
968  	    }
969  	  }
970  	}
971  	
972  	void delete_thread_on_pthread_join (pthread_t pth) {
973  	  struct Thread *thread;
974  	  for (thread = threads; thread != NULL; thread = thread -> next) {
975  	    if (thread -> pth == pth) {
976  	      threadisdead (thread);
977  	      break;
978  	    }
979  	  }
980  	}
981  	
/* Define 'clone' as a global function symbol that is an alias for the
 * __clone wrapper in this file, so references to either name resolve to the
 * same code. */
asm (".global clone ; .type clone,@function ; clone = __clone");
983  	
984  	/*****************************************************************************/
985  	/*									     */
986  	/*  This routine is called (via clone) as the top-level routine of a thread  */
987  	/*  that we are tracking.						     */
988  	/*									     */
989  	/*  It fills in remaining items of our thread struct, calls the user function,*/
990  	/*  then cleans up the thread struct before exiting.			     */
991  	/*									     */
992  	/*****************************************************************************/
993  	
994  	static int threadcloned (void *threadv)
995  	
996  	{
997  	  int rc;
998  	  Thread *const thread = threadv;
999  	
1000 	  DPRINTF (("mtcp threadcloned*: starting thread %p\n", thread));
1001 	
1002 	  setupthread (thread);
1003 	
1004 	  /* The new TLS should have the process ID in place at TLS_PID_OFFSET() */
1005 	  /* This is a verification step and is therefore optional as such     */
1006 	  {
1007 	    pid_t  tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_PID_OFFSET());
1008 	    if ((tls_pid != motherpid) && (tls_pid != (pid_t)-1)) {
1009 	      mtcp_printf ("mtcp threadcloned: getpid %d, tls pid %d at offset %d, must match\n",
1010 	                    motherpid, tls_pid, TLS_PID_OFFSET());
1011 	      mtcp_printf ("      %X\n", motherpid);
1012 	      for (rc = 0; rc < 256; rc += 4) {
1013 	        tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + rc);
1014 	        mtcp_printf ("   %d: %X", rc, tls_pid);
1015 	        if ((rc & 31) == 28) mtcp_printf ("\n");
1016 	      }
1017 	      mtcp_abort ();
1018 	    }
1019 	  }
1020 	
1021 	  /* If the caller wants the child tid but didn't have CLEARTID, pass the tid back to it */
1022 	
1023 	  if ((thread -> clone_flags & (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) == CLONE_CHILD_SETTID) {
1024 	    *(thread -> given_tidptr) = thread -> child_tid;
1025 	  }
1026 	
1027 	  /* Maybe enable checkpointing by default */
1028 	
1029 	  if (threadenabledefault) mtcp_ok ();
1030 	
1031 	#ifdef PTRACE
1032 	  init_thread_local();
1033 	#endif
1034 	
1035 	  /* Call the user's function for whatever processing they want done */
1036 	
1037 	  DPRINTF (("mtcp threadcloned*: calling %p (%p)\n", thread -> fn, thread -> arg));
1038 	  rc = (*(thread -> fn)) (thread -> arg);
1039 	  DPRINTF (("mtcp threadcloned*: returned %d\n", rc));
1040 	
1041 	  /* Make sure checkpointing is inhibited while we clean up and exit */
1042 	  /* Otherwise, checkpointer might wait forever for us to re-enable  */
1043 	
1044 	  mtcp_no ();
1045 	
1046 	  /* Do whatever to unlink and free thread block */
1047 	
1048 	  threadisdead (thread);
1049 	
1050 	  /* Return the user's status as the exit code */
1051 	
1052 	  return (rc);
1053 	}
1054 	
1055 	/*****************************************************************************/
1056 	/*									     */
1057 	/*  set_tid_address wrapper routine					     */
1058 	/*									     */
1059 	/*  We save the new address of the tidptr that will get cleared when the     */
1060 	/*  thread exits							     */
1061 	/*									     */
1062 	/*****************************************************************************/
1063 	
1064 	static long set_tid_address (int *tidptr)
1065 	
1066 	{
1067 	  long rc;
1068 	  Thread *thread;
1069 	
1070 	  thread = getcurrenthread ();
1071 	  DPRINTF (("set_tid_address wrapper*: thread %p -> tid %d, tidptr %p\n",
1072 		    thread, thread -> tid, tidptr));
1073 	  thread -> actual_tidptr = tidptr;  // save new tidptr so subsequent restore will create with new pointer
1074 	  rc = mtcp_sys_set_tid_address(tidptr);
1075 	  return (rc);                       // now we tell kernel to change it for the current thread
1076 	}
1077 	
1078 	/*****************************************************************************/
1079 	/*									     */
1080 	/*  Link thread struct to the lists and finish filling it in		     */
1081 	/*									     */
1082 	/*    Input:								     */
1083 	/*									     */
1084 	/*	thread = thread to set up					     */
1085 	/*									     */
1086 	/*    Output:								     */
1087 	/*									     */
1088 	/*	thread linked to 'threads' list and 'motherofall' tree		     */
1089 	/*	thread -> tid = filled in with thread id			     */
1090 	/*	thread -> state = ST_RUNDISABLED (thread initially has checkpointing */
1091 	/*        disabled)							     */
1092 	/*	signal handler set up						     */
1093 	/*									     */
1094 	/*****************************************************************************/
1095 	
1096 	static void setupthread (Thread *thread)
1097 	
1098 	{
1099 	  Thread *parent;
1100 	
1101 	  /* Save the thread's ID number and put in threads list so we can look it up                                    */
1102 	  /* Set state to disable checkpointing so checkpointer won't race between adding to list and setting up handler */
1103 	
1104 	  thread -> tid = mtcp_sys_kernel_gettid ();
1105 	  thread -> original_tid = GETTID ();
1106 	
1107 	  DPRINTF (("mtcp setupthread*: thread %p -> tid %d\n", thread, thread->tid));
1108 	
1109 	  lock_threads ();
1110 	
1111 	  if ((thread -> next = threads) != NULL) {
1112 	    thread -> next -> prev = &(thread -> next);
1113 	  }
1114 	  thread -> prev = &threads;
1115 	  threads = thread;
1116 	
1117 	  parent = thread -> parent;
1118 	  if (parent != NULL) {
1119 	    thread -> siblings = parent -> children;
1120 	    parent -> children = thread;
1121 	  }
1122 	
1123 	  unlk_threads ();
1124 	}
1125 	
1126 	/*****************************************************************************/
1127 	/*									     */
1128 	/*  Set up 'clone_entry' variable					     */
1129 	/*									     */
1130 	/*    Output:								     */
1131 	/*									     */
1132 	/*	clone_entry = points to clone routine within libc.so		     */
1133 	/*									     */
1134 	/*****************************************************************************/
1135 	
1136 	static void setup_clone_entry (void)
1137 	
1138 	{
1139 	  char *p, *tmp;
1140 	  int mapsfd;
1141 	
1142 	  /* Get name of whatever concoction we have for a libc shareable image */
1143 	  /* This is used by the wrapper routines                               */
1144 	
1145 	  tmp = getenv ("MTCP_WRAPPER_LIBC_SO");
1146 	  if (tmp != NULL) {
1147 	    if (strlen(tmp) >= sizeof(mtcp_libc_area.name)) {
1148 	      mtcp_printf("mtcp setup_clone_entry: libc area name (%s) too long (>=1024 chars)\n",
1149 	                  tmp);
1150 	      mtcp_abort();
1151 	    }
1152 	    strncpy (mtcp_libc_area.name, tmp, sizeof mtcp_libc_area.name);
1153 	  } else {
1154 	    mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
1155 	    if (mapsfd < 0) {
1156 	      mtcp_printf ("mtcp_init: error opening /proc/self/maps: %s\n", strerror (mtcp_sys_errno));
1157 	      mtcp_abort ();
1158 	    }
1159 	    p = NULL;
1160 	    while (readmapsline (mapsfd, &mtcp_libc_area)) {
1161 	      p = strstr (mtcp_libc_area.name, "/libc");
1162 	      if ((p != NULL) && ((p[5] == '-') || (p[5] == '.'))) break;
1163 	    }
1164 	    close (mapsfd);
1165 	    if (p == NULL) {
1166 	      mtcp_printf ("mtcp_init: cannot find */libc[-.]* in /proc/self/maps\n");
1167 	      mtcp_abort ();
1168 	    }
1169 	  }
1170 	  mtcp_libc_dl_handle = dlopen (mtcp_libc_area.name, RTLD_LAZY | RTLD_GLOBAL);
1171 	  if (mtcp_libc_dl_handle == NULL) {
1172 	    mtcp_printf ("mtcp_init: error opening libc shareable %s: %s\n", mtcp_libc_area.name, dlerror ());
1173 	    mtcp_abort ();
1174 	  }
1175 	
1176 	  /* Find the clone routine therein */
1177 	
1178 	  clone_entry = mtcp_get_libc_symbol ("__clone");
1179 	}
1180 	
1181 	/********************************************************************************************************************************/
1182 	/*																*/
1183 	/*  Thread has exited - unlink it from lists and free struct									*/
1184 	/*																*/
1185 	/*    Input:															*/
1186 	/*																*/
1187 	/*	thread = thread that has exited												*/
1188 	/*																*/
1189 	/*    Output:															*/
1190 	/*																*/
1191 	/*	thread removed from 'threads' list and motherofall tree									*/
1192 	/*	thread pointer no longer valid												*/
1193 	/*	checkpointer woken if waiting for this thread										*/
1194 	/*																*/
1195 	/********************************************************************************************************************************/
1196 	
1197 	static void threadisdead (Thread *thread)
1198 	
1199 	{
1200 	  Thread **lthread, *parent, *xthread;
1201 	
1202 	  lock_threads ();
1203 	
1204 	  DPRINTF (("mtcp threadisdead*: thread %p -> tid %d\n", thread, thread -> tid));
1205 	
1206 	  /* Remove thread block from 'threads' list */
1207 	
1208 	  if ((*(thread -> prev) = thread -> next) != NULL) {
1209 	    thread -> next -> prev = thread -> prev;
1210 	  }
1211 	
1212 	  /* Remove thread block from parent's list of children */
1213 	
1214 	  parent = thread -> parent;
1215 	  if (parent != NULL) {
1216 	    for (lthread = &(parent -> children); (xthread = *lthread) != thread; lthread = &(xthread -> siblings)) {}
1217 	    *lthread = xthread -> siblings;
1218 	  }
1219 	
1220 	  /* If this thread has children, give them to its parent */
1221 	
1222 	  if (parent != NULL) {
1223 	    while ((xthread = thread -> children) != NULL) {
1224 	      thread -> children = xthread -> siblings;
1225 	      xthread -> siblings = parent -> children;
1226 	      parent -> children = xthread;
1227 	    }
1228 	  } else {
1229 	    while ((xthread = thread -> children) != NULL) {
1230 	      thread -> children = xthread -> siblings;
1231 	      xthread -> siblings = motherofall;
1232 	      motherofall = xthread;
1233 	    }
1234 	  }
1235 	
1236 	  unlk_threads ();
1237 	
1238 	  /* If checkpointer is waiting for us, wake it to see this thread no longer in list */
1239 	
1240 	  mtcp_state_futex (&(thread -> state), FUTEX_WAKE, 1, NULL);
1241 	
1242 	  mtcp_state_destroy( &(thread -> state) );
1243 	
1244 	  free (thread);
1245 	}
1246 	
1247 	void *mtcp_get_libc_symbol (char const *name)
1248 	
1249 	{
1250 	  void *temp;
1251 	
1252 	  temp = dlsym (mtcp_libc_dl_handle, name);
1253 	  if (temp == NULL) {
1254 	    mtcp_printf ("mtcp_get_libc_symbol: error getting %s from %s: %s\n",
1255 	                 name, mtcp_libc_area.name, dlerror ());
1256 	    mtcp_abort ();
1257 	  }
1258 	  return (temp);
1259 	}
1260 	
1261 	/********************************************************************************************************************************/
1262 	/*																*/
1263 	/*  Call this when it's OK to checkpoint											*/
1264 	/*																*/
1265 	/********************************************************************************************************************************/
1266 	
1267 	int mtcp_ok (void)
1268 	
1269 	{
1270 	  Thread *thread;
1271 	
1272 	  if (getenv("MTCP_NO_CHECKPOINT"))
1273 	    return 0;
1274 	  thread = getcurrenthread ();
1275 	
1276 	again:
1277 	  switch (mtcp_state_value(&thread -> state)) {
1278 	
1279 	    /* Thread was running normally with checkpointing disabled.  Enable checkpointing then just return. */
1280 	
1281 	    case ST_RUNDISABLED: {
1282 	      if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_RUNDISABLED)) goto again;
1283 	      return (0);
1284 	    }
1285 	
1286 	    /* Thread was running normally with checkpointing already enabled.  So just return as is. */
1287 	
1288 	    case ST_RUNENABLED: {
1289 	      return (1);
1290 	    }
1291 	
1292 	    /* Thread was running with checkpointing disabled, but the checkpointhread wants to write a checkpoint.  So mark the thread  */
1293 	    /* as having checkpointing enabled, then just 'manually' call the signal handler as if the signal to suspend were just sent. */
1294 	
1295 	    case ST_SIGDISABLED: {
1296 	      if (!mtcp_state_set (&(thread -> state), ST_SIGENABLED, ST_SIGDISABLED)) goto again;
1297 	      stopthisthread (0);
1298 	      return (0);
1299 	    }
1300 	
1301 	    /* Thread is running with checkpointing enabled, but the checkpointhread wants to write a checkpoint and has sent a signal */
1302 	    /* telling the thread to call 'stopthisthread'.  So we'll just keep going as is until the signal is actually delivered.    */
1303 	
1304 	    case ST_SIGENABLED: {
1305 	      return (1);
1306 	    }
1307 	
1308 	    /* Thread is the checkpointhread so we just ignore the call (from threadcloned routine). */
1309 	
1310 	    case ST_CKPNTHREAD: {
1311 	      return (-1);
1312 	    }
1313 	
1314 	    /* How'd we get here? */
1315 	
1316 	    default: {
1317 	      mtcp_abort ();
1318 	      return (0); /* NOTREACHED : stop compiler warning */
1319 	    }
1320 	  }
1321 	}
1322 	
1323 	/* Likewise, disable checkpointing */
1324 	
1325 	int mtcp_no (void)
1326 	{
1327 	  Thread *thread;
1328 	
1329 	  if (getenv("MTCP_NO_CHECKPOINT"))
1330 	    return 0;
1331 	  thread = getcurrenthread ();
1332 	
1333 	again:
1334 	  switch (mtcp_state_value(&thread -> state)) {
1335 	    case ST_RUNDISABLED: {
1336 	      return (0);
1337 	    }
1338 	
1339 	    case ST_RUNENABLED: {
1340 	      if (!mtcp_state_set (&(thread -> state), ST_RUNDISABLED, ST_RUNENABLED)) goto again;
1341 	      return (1);
1342 	    }
1343 	
1344 	    case ST_SIGDISABLED: {
1345 	      return (0);
1346 	    }
1347 	
1348 	    case ST_SIGENABLED: {
1349 	      stopthisthread (0);
1350 	      goto again;
1351 	    }
1352 	
1353 	    default: {
1354 	      mtcp_abort ();
1355 	      return (0); /* NOTREACHED : stop compiler warning */
1356 	    }
1357 	  }
1358 	}
1359 	
1360 	/* This is used by ../dmtcp/src/mtcpinterface.cpp */
1361 	void mtcp_kill_ckpthread (void)
1362 	{
1363 	  Thread *thread;
1364 	
1365 	  lock_threads ();
1366 	  for (thread = threads; thread != NULL; thread = thread -> next) {
1367 	    if ( mtcp_state_value(&thread -> state) == ST_CKPNTHREAD ) {
1368 	      unlk_threads ();
1369 	      DPRINTF(("mtcp_kill_ckpthread: Kill checkpinthread, tid=%d\n",thread->tid));
1370 	      mtcp_sys_kernel_tkill(thread -> tid, STOPSIGNAL);
1371 	      return;
1372 	    }
1373 	  }
1374 	  unlk_threads ();
1375 	}
1376 	
1377 	
1378 	/*************************************************************************/
1379 	/*						                         */
1380 	/*  Save and restore terminal settings.		                         */
1381 	/*						                         */
1382 	/*************************************************************************/
1383 	
/* Nonzero when save_term_settings() found stdin to be a terminal and
 * successfully read its attributes into saved_termios. */
static int saved_termios_exists = 0;
/* Attributes of stdin as read by tcgetattr() in save_term_settings() */
static struct termios saved_termios;
/* Window size of stdin as read by the TIOCGWINSZ ioctl in save_term_settings() */
static struct winsize win;
1387 	
1388 	static void save_term_settings() {
1389 	  saved_termios_exists = ( isatty(STDIN_FILENO)
1390 	  		           && tcgetattr(STDIN_FILENO, &saved_termios) >= 0 );
1391 	  if (saved_termios_exists)
1392 	    ioctl (STDIN_FILENO, TIOCGWINSZ, (char *) &win);
1393 	}
/* Apply '*termios_p' to 'fd', repeating until the attributes read back stop
 * changing.
 *
 *   fd               = terminal file descriptor
 *   optional_actions = when the change takes effect (TCSANOW, TCSADRAIN or
 *                      TCSAFLUSH); passed straight through to tcsetattr
 *   termios_p        = attributes to apply
 *
 * Returns 0 on success, -1 as soon as tcsetattr or tcgetattr fails (errno is
 * left as that call set it).
 */
int safe_tcsetattr(int fd, int optional_actions,
                   const struct termios *termios_p) {
  struct termios old_termios, new_termios;

  /* We will compare old and new, and we don't want uninitialized data */
  memset(&new_termios, 0, sizeof(new_termios));

  /* tcsetattr returns success as long as at least one of the requested
   * changes was executed.  So, repeat until no more changes.
   */
  do {
    memcpy(&old_termios, &new_termios, sizeof(new_termios));
    /* Honour the caller's optional_actions instead of forcing TCSANOW */
    if (tcsetattr(fd, optional_actions, termios_p) == -1) return -1;
    if (tcgetattr(fd, &new_termios) == -1) return -1;
  } while (memcmp(&new_termios, &old_termios, sizeof(new_termios)) != 0);
  return 0;
}
1409 	static void restore_term_settings() {
1410 	  if (saved_termios_exists){
1411 	    /* First check if we are in foreground. If not, skip this and print
1412 	     *   warning.  If we try to call tcsetattr in background, we will hang up.
1413 	     */
1414 	    int foreground = (tcgetpgrp(STDIN_FILENO) == getpgrp());
1415 	    DPRINTF(("restore terminal attributes, check foreground status first: %d\n",
1416 	             foreground));
1417 	    if (foreground) {
1418 	      if ( ( ! isatty(STDIN_FILENO)
1419 	             || safe_tcsetattr(STDIN_FILENO, TCSANOW, &saved_termios) == -1) )
1420 	        DPRINTF(("WARNING: mtcp finishrestore*: failed to restore terminal\n"));
1421 	      else {
1422 	        struct winsize cur_win;
1423 	        DPRINTF(("mtcp finishrestore*: restored terminal\n"));
1424 	        ioctl (STDIN_FILENO, TIOCGWINSZ, (char *) &cur_win);
1425 		/* ws_row/ws_col was probably not 0/0 prior to checkpoint.  We change
1426 		 * it back to last known row/col prior to checkpoint, and then send a
1427 		 * SIGWINCH (see below) to notify process that window might have changed
1428 		 */
1429 	        if (cur_win.ws_row == 0 && cur_win.ws_col == 0)
1430 	          ioctl (STDIN_FILENO, TIOCSWINSZ, (char *) &win);
1431 	      }
1432 	    } else {
1433 	      DPRINTF(("WARNING: mtcp finishrestore*: skip restore terminal step\n"
1434 		       " -- we are in BACKGROUND\n"));
1435 	    }
1436 	  }
1437 	  if (kill(getpid(), SIGWINCH) == -1) {}  /* No remedy if error */
1438 	}
1439 	
1440 	
1441 	/*************************************************************************/
1442 	/*						                         */
1443 	/*  This executes as a thread.  It sleeps for the checkpoint interval    */
1444 	/*    seconds, then wakes to write the checkpoint file.			 */
1445 	/*						                         */
1446 	/*************************************************************************/
1447 	
1448 	static void *checkpointhread (void *dummy)
1449 	{
1450 	  int needrescan;
1451 	  struct timespec sleeperiod;
1452 	  struct timeval started, stopped;
1453 	  Thread *thread;
1454 	  char * dmtcp_checkpoint_filename = NULL;
1455 	
1456 	  /* This is the start function of the checkpoint thread.
1457 	   * We also call getcontext to get a snapshot of this call frame,
1458 	   * since we will never exit this call frame.  We always return
1459 	   * to this call frame at time of startup, on restart.  Hence, restart
1460 	   * will forget any modifications to our local variables since restart.
1461 	   */
1462 	  static int originalstartup = 1;
1463 	
1464 	#ifdef PTRACE
1465 	  init_thread_local();
1466 	  check_size_for_ptrace_file (ptrace_shared_file);
1467 	  check_size_for_ptrace_file (ptrace_setoptions_file);
1468 	  check_size_for_ptrace_file (checkpoint_threads_file);
1469 	#endif
1470 	
1471 	  /* We put a timeout in case the thread being waited for exits whilst we are waiting */
1472 	
1473 	  static struct timespec const enabletimeout = { 10, 0 };
1474 	
1475 	  DPRINTF (("mtcp checkpointhread*: %d started\n", mtcp_sys_kernel_gettid ()));
1476 	
1477 	  /* Set up our restart point, ie, we get jumped to here after a restore */
1478 	
1479 	  ckpthread = getcurrenthread ();
1480 	
1481 	  save_sig_state( ckpthread );
1482 	  save_tls_state (ckpthread);
1483 	  /* Release user thread after we've initialized. */
1484 	  sem_post(&sem_start);
1485 	  if (getcontext (&(ckpthread -> savctx)) < 0) mtcp_abort ();
1486 	
1487 	  DPRINTF (("mtcp checkpointhread*: after getcontext. current_tid %d, original_tid:%d\n",
1488 	        mtcp_sys_kernel_gettid(), ckpthread->original_tid));
1489 	  if (originalstartup)
1490 	    originalstartup = 0;
1491 	  else {
1492 	
1493 	    /* We are being restored.  Wait for all other threads to finish being restored before resuming checkpointing. */
1494 	
1495 	    DPRINTF (("mtcp checkpointhread*: waiting for other threads after restore\n"));
1496 	    wait_for_all_restored ();
1497 	#ifdef PTRACE
1498 	    create_file (GETTID());
1499 	#endif
1500 	    DPRINTF (("mtcp checkpointhread*: resuming after restore\n"));
1501 	  }
1502 	
1503 	  /* Reset the verification counter - on init, this will set it to it's start value. */
1504 	  /* After a verification, it will reset it to its start value.  After a normal      */
1505 	  /* restore, it will set it to its start value.  So this covers all cases.          */
1506 	
1507 	  verify_count = verify_total;
1508 	  DPRINTF (("After verify count mtcp checkpointhread*: %d started\n",
1509 		    mtcp_sys_kernel_gettid ()));
1510 	
1511 	  while (1) {
1512 	#ifdef PTRACE
1513 	    int ptraced_by = 0;
1514 	#endif
1515 	
1516 	    /* Wait a while between writing checkpoint files */
1517 	
1518 	    if (callback_sleep_between_ckpt == NULL)
1519 	    {
1520 	        memset (&sleeperiod, 0, sizeof sleeperiod);
1521 	        sleeperiod.tv_sec = intervalsecs;
1522 	        while ((nanosleep (&sleeperiod, &sleeperiod) < 0) && (errno == EINTR)) {}
1523 	    }
1524 	    else
1525 	    {
1526 	        DPRINTF(("mtcp checkpointhread*: before callback_sleep_between_ckpt(%d)\n",intervalsecs));
1527 	        (*callback_sleep_between_ckpt)(intervalsecs);
1528 	        DPRINTF(("mtcp checkpointhread*: after callback_sleep_between_ckpt(%d)\n",intervalsecs));
1529 	    }
1530 	
1531 	    mtcp_sys_gettimeofday (&started, NULL);
1532 	    checkpointsize = 0;
1533 	
1534 	#ifdef PTRACE
1535 	    // Refresh ptrace information
1536 	    has_ptrace_file = 0;
1537 	    delete_ptrace_leader = -1;
1538 	    has_setoptions_file = 0;
1539 	    delete_setoptions_leader = -1;
1540 	    has_checkpoint_file = 0;
1541 	    delete_checkpoint_leader = -1;
1542 	    process_ptrace_info( &delete_ptrace_leader, &has_ptrace_file,
1543 	                         &delete_setoptions_leader, &has_setoptions_file,
1544 	                         &delete_checkpoint_leader, &has_checkpoint_file);
1545 	
1546 	    for (thread = threads; thread != NULL; thread = thread -> next) {
1547 	      int i;
1548 	      for (i = 0; i < ptrace_pairs_count; i++) {
1549 	        DPRINTF(("COMPARE: intf=%d, tid=%d\n",
1550 	                 ptrace_pairs[i].inferior, thread->original_tid));
1551 	        if( ptrace_pairs[i].inferior == thread->original_tid ){
1552 	          ptraced_by = ptrace_pairs[i].superior;
1553 	          break;
1554 	        }
1555 	      }
1556 	      if( ptraced_by )
1557 	        break;
1558 	    }
1559 	
1560 	    DPRINTF(("\n\n%d ptraced by %d\n\n",(thread) ? thread->tid : 0,ptraced_by));
1561 	    if( ptraced_by ){
1562 	      DPRINTF(("\n\n%d Wait for superior %d\n\n",thread->tid,ptraced_by));
1563 	      ptrace_wait4(ptraced_by);
1564 	      //sleep(1);
1565 	      DPRINTF(("\n\n%d Wait for superior %d - SUCCESS\n\n",thread->tid,ptraced_by));
1566 	    }
1567 	#endif 
1568 	
1569 	    /* Halt all other threads - force them to call stopthisthread                    */
1570 	    /* If any have blocked checkpointing, wait for them to unblock before signalling */
1571 	
1572 	rescan:
1573 	    needrescan = 0;
1574 	    lock_threads ();
1575 	    for (thread = threads; thread != NULL; thread = thread -> next) {
1576 	
1577 	      /* If thread no longer running, remove it from thread list */
1578 	
1579 	again:
1580 	      if (*(thread -> actual_tidptr) == 0) {
1581 	        DPRINTF (("mtcp checkpointhread*: thread %d disappeared\n", thread -> tid));
1582 	        unlk_threads ();
1583 	        threadisdead (thread);
1584 	        goto rescan;
1585 	      }
1586 	
1587 	      /* Do various things based on thread's state */
1588 	
1589 	      switch (mtcp_state_value (&thread -> state) ) {
1590 	
1591 	        /* Thread is running but has checkpointing disabled    */
1592 	        /* Tell the mtcp_ok routine that we are waiting for it */
1593 	        /* We will need to rescan so we will see it suspended  */
1594 	
1595 	        case ST_RUNDISABLED: {
1596 	          if (!mtcp_state_set (&(thread -> state), ST_SIGDISABLED, ST_RUNDISABLED)) goto again;
1597 	          needrescan = 1;
1598 	          break;
1599 	        }
1600 	
1601 	        /* Thread is running and has checkpointing enabled                 */
1602 	        /* Send it a signal so it will call stopthisthread                 */
1603 	        /* We will need to rescan (hopefully it will be suspended by then) */
1604 	
1605 	        case ST_RUNENABLED: {
1606 	          if (!mtcp_state_set (&(thread -> state), ST_SIGENABLED, ST_RUNENABLED)) goto again;
1607 	#ifdef PTRACE
1608 	          ptrace_save_threads_state ();
1609 	          int index;  
1610 	          char inferior_st = 'N';
1611 	          char inf_st;
1612 	          for (index = 0; index < ptrace_pairs_count; index++) {
1613 	            inf_st = procfs_state(ptrace_pairs[index].inferior);
1614 	            DPRINTF(("tid = %d now=%c stored=%c superior = %d inferior = %d\n",
1615 	                     GETTID(), inf_st, ptrace_pairs[index].inferior_st,
1616 	                     ptrace_pairs[index].superior, ptrace_pairs[index].inferior));
1617 	            if (ptrace_pairs[index].inferior == thread -> original_tid) {
1618 	              inferior_st = ptrace_pairs[index].inferior_st;
1619 	              break;
1620 	            }
1621 	          }
1622 	          DPRINTF(("%d %c\n", GETTID(), inferior_st));
1623 	          if (inferior_st == 'N') {
1624 	            // superior 
1625 	            if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1626 	              if (mtcp_sys_errno != ESRCH) {
1627 	                mtcp_printf("mtcp checkpointhread: error signalling thread %d: %s\n",
1628 	                            thread -> tid, strerror (mtcp_sys_errno));
1629 	              }
1630 	              unlk_threads ();
1631 	              threadisdead (thread);
1632 	              goto rescan;
1633 	            }
1634 	          }
1635 	          else {
1636 	            // inferior 
1637 	            DPRINTF(("++++++++++++++++++++++++++++++++%c %d\n", inferior_st, thread -> original_tid));
1638 	            if (inferior_st != 'T') {
1639 	            if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1640 	                if (mtcp_sys_errno != ESRCH) {
1641 	                  mtcp_printf ("mtcp checkpointhread: error signalling thread %d: %s\n",
1642 	                               thread -> tid, strerror (mtcp_sys_errno));
1643 	                }
1644 	                unlk_threads ();
1645 	                threadisdead (thread);
1646 	                goto rescan;
1647 	              }
1648 	            }
1649 	            create_file( thread -> original_tid );
1650 	          }
1651 	#else
1652 	          if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1653 	            if (mtcp_sys_errno != ESRCH) {
1654 	              mtcp_printf ("mtcp checkpointhread: error signalling thread %d: %s\n",
1655 	                           thread -> tid, strerror (mtcp_sys_errno));
1656 	            }
1657 	            unlk_threads ();
1658 	            threadisdead (thread);
1659 	            goto rescan;
1660 	          }
1661 	#endif
1662 	          needrescan = 1;
1663 	          break;
1664 	        }
1665 	
1666 	        /* Thread is running, we have signalled it to stop, but it has
1667 		 * checkpointing disabled.  So we wait for it to change state.
1668 	         * We have to unlock because it may need lock to change state.
1669 		 */
1670 	
1671 	        case ST_SIGDISABLED: {
1672 	          unlk_threads ();
1673 	          mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SIGDISABLED,
1674 				    &enabletimeout);
1675 	          goto rescan;
1676 	        }
1677 	
1678 	        /* Thread is running and we have sent signal to stop it             */
1679 	        /* So we have to wait for it to change state (enter signal handler) */
1680 	        /* We have to unlock because it may try to use lock meanwhile       */
1681 	
1682 	        case ST_SIGENABLED: {
1683 	          unlk_threads ();
1684 	          mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SIGENABLED,
1685 				    &enabletimeout);
1686 	          goto rescan;
1687 	        }
1688 	
1689 	        /* Thread has entered signal handler and is saving its context.
1690 	         * So we have to wait for it to finish doing so.  We don't need
1691 		 * to unlock because it won't use lock before changing state.
1692 		 */
1693 	
1694 	        case ST_SUSPINPROG: {
1695 	          mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SUSPINPROG,
1696 				    &enabletimeout);
1697 	          goto again;
1698 	        }
1699 	
1700 	        /* Thread is suspended and all ready for us to write checkpoint file */
1701 	
1702 	        case ST_SUSPENDED: {
1703 	          break;
1704 	        }
1705 	
1706 	        /* Don't do anything to the checkpointhread (this) thread */
1707 	
1708 	        case ST_CKPNTHREAD: {
1709 	          break;
1710 	        }
1711 	
1712 	        /* Who knows? */
1713 	
1714 	        default: {
1715 	          mtcp_abort ();
1716 	        }
1717 	      }
1718 	    }
1719 	    unlk_threads ();
1720 	
1721 	    /* If need to rescan (ie, some thread possibly not in ST_SUSPENDED STATE),
1722 	     * check them all again
1723 	     */
1724 	
1725 	    if (needrescan) goto rescan;
1726 	    RMB; // matched by WMB in stopthisthread
1727 	    DPRINTF (("mtcp checkpointhread*: everything suspended\n"));
1728 	
1729 	    /* If no threads, we're all done */
1730 	
1731 	    if (threads == NULL) {
1732 	      DPRINTF (("mtcp checkpointhread*: exiting (no threads)\n"));
1733 	      return (NULL);
1734 	    }
1735 	
1736 	    /* Call weak symbol of this file, possibly overridden by user's strong symbol.
1737 	     * User must compile his/her code with -Wl,-export-dynamic to make it visible.
1738 	     */
1739 	    mtcpHookPreCheckpoint();
1740 	
1741 	    if (!dmtcp_exists) {
1742 	      save_sig_handlers();
1743 	    }
1744 	
1745 	    /* All other threads halted in 'stopthisthread' routine (they are all
1746 	     * in state ST_SUSPENDED).  It's safe to write checkpoint file now.
1747 	     */
1748 	    if (callback_pre_ckpt != NULL){
1749 	      // Here we want to synchronize the shared memory pages with the backup files
1750 	      DPRINTF(("mtcp checkpointhread*: syncing shared memory with backup files\n"));
1751 	      sync_shared_mem();
1752 	
1753 	      DPRINTF(("mtcp checkpointhread*: before callback_pre_ckpt() (&%x,%x) \n",
1754 		       &callback_pre_ckpt, callback_pre_ckpt));
1755 	      (*callback_pre_ckpt)(&dmtcp_checkpoint_filename);
1756 	      if (dmtcp_checkpoint_filename &&
1757 	          strcmp(dmtcp_checkpoint_filename, "/dev/null") != 0) {
1758 	        mtcp_sys_strcpy(perm_checkpointfilename, dmtcp_checkpoint_filename);
1759 	        DPRINTF(("mtcp checkpointhread*: Checkpoint filename changed to %s\n",
1760 			perm_checkpointfilename));
1761 	      }
1762 	    }
1763 	
1764 	#ifdef PTRACE
1765 	    /* If old stale files of these names exist, we append, with big problems
1766 	     * It's okay if files don't exist and unlink fails.
1767 	     * Pre_ckpt is a barrier from coordinator.  So, all processes finished
1768 	     *  reading ptrace pairs from files prior to this barrier.
1769 	     */
1770 	    unlink(ptrace_shared_file);
1771 	    unlink(ptrace_setoptions_file);
1772 	    unlink(checkpoint_threads_file);
1773 	#endif
1774 	
1775 	    mtcp_saved_break = (void*) mtcp_sys_brk(NULL);  // kernel returns mm->brk when passed zero
1776 	    /* Do this once, same for all threads.  But restore for each thread. */
1777 	    if (mtcp_have_thread_sysinfo_offset())
1778 	      saved_sysinfo = mtcp_get_thread_sysinfo();
1779 	    /* Do this once.  It's the same for all threads. */
1780 	    save_term_settings();
1781 	
1782 	    if (getcwd(saved_working_directory, MTCP_MAX_PATH) == NULL) {
1783 	      // buffer wasn't large enough
1784 	      perror("getcwd");
1785 	      mtcp_printf ("getcwd failed.");
1786 	      mtcp_abort ();
1787 	    }
1788 	
1789 	    DPRINTF (("mtcp checkpointhread*: mtcp_saved_break=%p\n", mtcp_saved_break));
1790 	
1791 	    if ( dmtcp_checkpoint_filename == NULL ||
1792 	         strcmp (dmtcp_checkpoint_filename, "/dev/null") != 0) {
1793 	      checkpointeverything ();
1794 	    } else {
1795 	      mtcp_printf("mtcp checkpointhread*:  received \'/dev/null\'" \
1796 			  " as ckpt filename.\n*** Skipping checkpoint. ***\n");
1797 	    }
1798 	
1799 	    if (callback_post_ckpt != NULL){
1800 	        DPRINTF(("mtcp checkpointhread*: before callback_post_ckpt() (&%x,%x) \n",
1801 			 &callback_post_ckpt, callback_post_ckpt));
1802 	        (*callback_post_ckpt)(0);
1803 	    }
1804 	    if (showtiming) {
1805 	      mtcp_sys_gettimeofday (&stopped, NULL);
1806 	      stopped.tv_usec += (stopped.tv_sec - started.tv_sec) * 1000000 - started.tv_usec;
1807 	      mtcp_printf ("mtcp checkpoint: time %u uS, size %u megabytes," \
1808 			   " avg rate %u MB/s\n",
1809 	                   stopped.tv_usec, (unsigned int)(checkpointsize / 1000000),
1810 	                   (unsigned int)(checkpointsize / stopped.tv_usec));
1811 	    }
1812 	
1813 	    /* Call weak symbol of this file, possibly overridden by user's strong symbol.
1814 	     * User must compile his/her code with -Wl,-export-dynamic to make it visible.
1815 	     */
1816 	    mtcpHookPostCheckpoint();
1817 	
1818 	    /* Resume all threads.  But if we're doing a checkpoint verify,
1819 	     * abort all threads except the main thread, as we don't want them
1820 	     * running when we exec the mtcp_restore program.
1821 	     */
1822 	
1823 	    DPRINTF (("mtcp checkpointhread*: resuming everything\n"));
1824 	    lock_threads();
1825 	    for (thread = threads; thread != NULL; thread = thread -> next) {
1826 	      if (mtcp_state_value(&(thread -> state)) != ST_CKPNTHREAD) {
1827 	        if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_SUSPENDED))
1828 		  mtcp_abort();
1829 	        mtcp_state_futex(&(thread -> state), FUTEX_WAKE, 1, NULL);
1830 	      }
1831 	    }
1832 	    unlk_threads ();
1833 	    DPRINTF (("mtcp checkpointhread*: everything resumed\n"));
1834 	    /* But if we're doing a restore verify, just exit.  The main thread is doing the exec to start the restore. */
1835 	#ifdef PTRACE
1836 	    create_file (GETTID());
1837 	#endif
1838 	    if ((verify_total != 0) && (verify_count == 0)) return (NULL);
1839 	  }
1840 	}
1841 	
/**
 * Decide whether the checkpoint image should be compressed with gzip.
 *
 * The setting is read from the environment variable MTCP_GZIP, or from
 * DMTCP_GZIP when MTCP_GZIP is unset.  When neither is set, compression
 * defaults to enabled; run with MTCP_GZIP=0 to disable it.
 *
 * The value must be a complete integer as parsed by strtol() with base 0
 * (decimal, octal with a leading 0, or hex with a leading 0x).  Any spelling
 * of zero ("0", "00", "0x0", ...) disables compression and any other number
 * enables it.  An empty or non-numeric value prints a warning and disables
 * compression.
 *
 * @return 1 if the checkpoint image should be compressed, 0 otherwise.
 */
static int test_use_compression(void)
{
  const char *do_we_compress;
  char *endptr;
  long value;

  do_we_compress = getenv("MTCP_GZIP");
  // allow alternate name for env var
  if (do_we_compress == NULL)
    do_we_compress = getenv("DMTCP_GZIP");
  // env var is unset, let's default to enabled
  if (do_we_compress == NULL)
    return 1;

  value = strtol(do_we_compress, &endptr, 0);
  // Reject an empty string and anything with trailing non-numeric characters
  if ( *do_we_compress == '\0' || *endptr != '\0' ) {
    mtcp_printf("WARNING: MTCP_GZIP/DMTCP_GZIP defined as %s (not a number)\n"
	        "  Checkpoint image will not be compressed.\n",
	        do_we_compress);
    return 0;
  }

  // Decide on the parsed number, not on the spelling of the string
  return (value != 0) ? 1 : 0;
}
1873 	
1874 	static int open_ckpt_to_write(int fd, int pipe_fds[2], char *gzip_path)
1875 	{
1876 	  pid_t cpid;
1877 	  char *gzip_args[] = { "gzip", "-1", "-", NULL };
1878 	
1879 	  gzip_args[0] = gzip_path;
1880 	
1881 	  cpid = mtcp_sys_fork();
1882 	  if (cpid == -1) {
1883 	    mtcp_printf("WARNING: error forking child process `%s`.  Compression will "
1884 	                "not be used [%s].\n", gzip_path, strerror(mtcp_sys_errno));
1885 	    close(pipe_fds[0]);
1886 	    close(pipe_fds[1]);
1887 	    //fall through to return fd
1888 	  } else if (cpid > 0) { /* parent process */
1889 	    //Before running gzip in child process, we must not use LD_PRELOAD.
1890 	    // See revision log 342 for details concerning bash.
1891 	    mtcp_ckpt_gzip_child_pid = cpid;
1892 	    if (close(pipe_fds[0]) == -1)
1893 	      mtcp_printf("WARNING: (in open_ckpt_to_write) close failed: %s\n",
1894 			  strerror(errno));
1895 	    if (close(fd) == -1)
1896 	      mtcp_printf("WARNING: (in open_ckpt_to_write) close failed: %s\n",
1897 			  strerror(errno));
1898 	    fd=pipe_fds[1];//change return value
1899 	  } else { /* child process */
1900 	    static int (*libc_unsetenv) (const char *name);
1901 	    static int (*libc_execvp) (const char *path, char *const argv[]);
1902 	
1903 	    close(pipe_fds[1]);
1904 	    dup2(pipe_fds[0], STDIN_FILENO);
1905 	    close(pipe_fds[0]);
1906 	    dup2(fd, STDOUT_FILENO);
1907 	    close(fd);
1908 	
1909 	    // Don't load dmtcphijack.so, etc. in exec.
1910 	    unsetenv("LD_PRELOAD"); // If in bash, this is bash env. var. version
1911 	    libc_unsetenv = mtcp_get_libc_symbol("unsetenv");
1912 	    (*libc_unsetenv)("LD_PRELOAD");
1913 	
1914 	    libc_execvp = mtcp_get_libc_symbol("execvp");
1915 	    (*libc_execvp)(gzip_path, gzip_args);
1916 	
1917 	    /* should not arrive here */
1918 	    mtcp_printf("ERROR: compression failed!  No checkpointing will be"
1919 	                "performed!  Cancel now!\n");
1920 	    mtcp_sys_exit(1);
1921 	  }
1922 	
1923 	  return fd;
1924 	}
1925 	
1926 	
1927 	/********************************************************************************************************************************/
1928 	/*																*/
1929 	/*  This routine is called from time-to-time to write a new checkpoint file.							*/
1930 	/*  It assumes all the threads are suspended.											*/
1931 	/*																*/
1932 	/********************************************************************************************************************************/
1933 	
1934 	static void checkpointeverything (void)
1935 	{
1936 	  Area area;
1937 	  int fd, mapsfd;
1938 	  VA area_begin, area_end;
1939 	  int stack_was_seen = 0;
1940 	  int vsyscall_exists = 0;
1941 	  int forked_checkpointing = 0;
1942 	  int forked_cpid;
1943 	  int use_compression = -1; /* decide later */
1944 	  int pipe_fds[2]; /* for potential piping */
1945 	  char *gzip_cmd = "gzip";
1946 	  char gzip_path[MTCP_MAX_PATH];
1947 	  char tmpDMTCPHeaderBuf[] = "/tmp/dmtcp.XXXXXX";
1948 	  char *tmpDMTCPHeaderFileName = tmpDMTCPHeaderBuf;
1949 	  int tmpDMTCPHeaderFd = -1;
1950 	
1951 	  static void *const frpointer = finishrestore;
1952 	
1953 	  DPRINTF (("mtcp checkpointeverything*: tid %d\n", mtcp_sys_kernel_gettid ()));
1954 	
1955 	  if (getenv("MTCP_FORKED_CHECKPOINT") != NULL)
1956 	    forked_checkpointing = 1;
1957 	#ifdef TEST_FORKED_CHECKPOINTING
1958 	  forked_checkpointing = 1;
1959 	#endif
1960 	
1961 	  if (callback_write_dmtcp_header != 0) {
1962 	    /* Temp file for DMTCP header; will be written into the checkpoint file. */
1963 	    tmpDMTCPHeaderFd = mkstemp(tmpDMTCPHeaderFileName);
1964 	    if (tmpDMTCPHeaderFd < 0) {
1965 	      mtcp_printf("error %d creating temp file: %s\n", errno, strerror(errno));
1966 	      mtcp_abort();
1967 	    }
1968 	
1969 	    if (unlink(tmpDMTCPHeaderFileName) == -1) {
1970 	      mtcp_printf("NOTE: error %d unlinking temp file: %s\n", errno,
1971 			  strerror(errno));
1972 	    }
1973 	
1974 	    /* Better to do this in parent, not child, for most accurate header info */
1975 	    (*callback_write_dmtcp_header)(tmpDMTCPHeaderFd);
1976 	  }
1977 	
1978 	  if (forked_checkpointing) {
1979 	    forked_cpid = mtcp_sys_fork();
1980 	    if (forked_cpid == -1) {
1981 	      mtcp_printf("WARNING: Failed to do forked checkpointing,"
1982 			  " trying normal checkpoint\n");
1983 	    } else if (forked_cpid > 0) {
1984 	      /* Parent process*/
1985 	      if (tmpDMTCPHeaderFd != -1)
1986 	        close(tmpDMTCPHeaderFd);
1987 	      // Calling waitpid here, but on 32-bit Linux, libc:waitpid() calls wait4()
1988 	      if ( waitpid(forked_cpid, NULL, 0) == -1 )
1989 	        DPRINTF (("mtcp restoreverything*: error waitpid: errno: %d",
1990 	              mtcp_sys_errno));
1991 	      DPRINTF (("mtcp checkpointeverything*: checkpoint complete\n"));
1992 	      return;
1993 	    } else {
1994 	      pid_t grandchild_pid = mtcp_sys_fork();
1995 	      if (grandchild_pid == -1) {
1996 	        mtcp_printf("WARNING: Forked checkpoint failed, no checkpoint available\n");
1997 	      } else if (grandchild_pid > 0) {
1998 	        mtcp_sys_exit(0); /* child exits */
1999 	      }
2000 	      /* grandchild continues; no need now to waitpid() on grandchild */
2001 	      DPRINTF (("mtcp checkpointeverything*: inside grandchild process\n"));
2002 	    }
2003 	  }
2004 	
2005 	  /* 1. Test if using compression */
2006 	  use_compression = test_use_compression();
2007 	  /* 2. Get gzip path */
2008 	  if (use_compression && mtcp_find_executable(gzip_cmd, gzip_path) == NULL) {
2009 	    mtcp_printf("WARNING: gzip cannot be executed.  Compression will "
2010 	                "not be used.\n");
2011 	    use_compression = 0;
2012 	  }
2013 	  /* 3. Create pipe */
2014 	  /* Note:  Must use mtcp_sys_pipe(), to go to kernel, since
2015 	   *   DMTCP has a wrapper around glibc promoting pipes to socketpairs,
2016 	   *   DMTCP doesn't directly checkpoint/restart pipes.
2017 	   */
2018 	  if ( use_compression && mtcp_sys_pipe(pipe_fds) == -1 ) {
2019 	    mtcp_printf("WARNING: error creating pipe. Compression will "
2020 	                "not be used.\n");
2021 	    use_compression = 0;
2022 	  }
2023 	  /* 4. Open fd to checkpoint image on disk */
2024 	  /* Create temp checkpoint file and write magic number to it */
2025 	  /* This is a callback to DMTCP.  DMTCP writes header and returns fd. */
2026 	  fd = mtcp_safe_open(temp_checkpointfilename,
2027 			      O_CREAT | O_TRUNC | O_WRONLY, 0600);
2028 	  if (fd < 0) {
2029 	    mtcp_printf("mtcp.c: checkpointeverything: error creating %s: %s\n",
2030 	                temp_checkpointfilename, strerror(mtcp_sys_errno));
2031 	    mtcp_abort();
2032 	  }
2033 	  /* 5. We now have the information to pipe to gzip, or directly to fd
2034 	  *     We do it this way, so that gzip will be direct child of forked process
2035 	  *       when using forked checkpointing.
2036 	  */
2037 	
2038 	#if 1
2039 	  /* Temporary fix, until DMTCP uses its own separate allocator.
2040 	   * The else code should really go lower down, just before we checkpoint
2041 	   * the heap.
2042 	   */
2043 	#else
2044 	  if (mtcp_sys_break(0) != mtcp_saved_break)
2045 	    mtcp_printf("\n\n*** ERROR:  End of heap grew."
2046 			"  Continue at your own risk. ***\n\n\n");
2047 	#endif
2048 	
2049 	  /* Drain stdin and stdout before checkpoint */
2050 	  tcdrain(STDOUT_FILENO);
2051 	  tcdrain(STDERR_FILENO);
2052 	
2053 	  if (use_compression) /* if use_compression, fork a gzip process */
2054 	    fd = open_ckpt_to_write(fd, pipe_fds, gzip_path);
2055 	
2056 	  if (tmpDMTCPHeaderFd != -1 ) {
2057 	    char tmpBuff[1024];
2058 	    int retval = -1;
2059 	    lseek(tmpDMTCPHeaderFd, 0, SEEK_SET);
2060 	
2061 	    while (retval != 0) {
2062 	      retval = read (tmpDMTCPHeaderFd, tmpBuff, 1024);
2063 	      if (retval == -1 && (errno == EAGAIN || errno == EINTR))
2064 	        continue;
2065 	      if (retval == -1) {
2066 	        mtcp_printf("Error writing checkpoint file: %s\n", strerror(errno));
2067 	        mtcp_abort();
2068 	      }
2069 	      writefile(fd, tmpBuff, retval);
2070 	    }
2071 	    close(tmpDMTCPHeaderFd);
2072 	  }
2073 	
2074 	  // Preprocess special segments like vsyscall, stack, heap etc.
2075 	  preprocess_special_segments(&vsyscall_exists);
2076 	
2077 	  writefile (fd, MAGIC, MAGIC_LEN);
2078 	
2079 	  DPRINTF (("mtcp checkpointeverything*: restore_begin %X at %p from [libmtcp.so]\n",
2080 	            restore_size, restore_begin));
2081 	
2082 	  struct rlimit stack_rlimit;
2083 	  getrlimit(RLIMIT_STACK, &stack_rlimit);
2084 	
2085 	  DPRINTF (("mtcp_restart: saved stack resource limit: soft_lim:%p, hard_lim:%p\n",
2086 		    stack_rlimit.rlim_cur, stack_rlimit.rlim_max));
2087 	
2088 	  writecs (fd, CS_STACKRLIMIT);
2089 	  writefile (fd, &stack_rlimit, sizeof stack_rlimit);
2090 	
2091 	  DPRINTF (("mtcp checkpointeverything*: [libmtcp.so] image of size %X at %p\n",
2092 		    restore_size, restore_begin));
2093 	
2094 	  writecs (fd, CS_RESTOREBEGIN);
2095 	  writefile (fd, &restore_begin, sizeof restore_begin);
2096 	  writecs (fd, CS_RESTORESIZE);
2097 	  writefile (fd, &restore_size, sizeof restore_size);
2098 	  writecs (fd, CS_RESTORESTART);
2099 	  writefile (fd, &restore_start, sizeof restore_start);
2100 	  writecs (fd, CS_RESTOREIMAGE);
2101 	  writefile (fd, (void *)restore_begin, restore_size);
2102 	  writecs (fd, CS_FINISHRESTORE);
2103 	  writefile (fd, &frpointer, sizeof frpointer);
2104 	
2105 	  /* Write out file descriptors */
2106 	
2107 	  writefiledescrs (fd);
2108 	
2109 	  /* Finally comes the memory contents */
2110 	
2111 	  /**************************************************************************/
2112 	  /* We can't do any more mallocing at this point because malloc stuff is   */
2113 	  /* outside the limits of the libmtcp.so image, so it won't get            */
2114 	  /* checkpointed, and it's possible that we would checkpoint an            */
2115 	  /* inconsistent state.  See note in restoreverything routine.             */
2116 	  /**************************************************************************/
2117 	
2118 	  mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
2119 	
2120 	  while (readmapsline (mapsfd, &area)) {
2121 	    area_begin = (VA)area.addr;
2122 	    area_end   = area_begin + area.size;
2123 	
2124 	    /* Original comment:  Skip anything in kernel address space ---
2125 	     *   beats me what's at FFFFE000..FFFFFFFF - we can't even read it;
2126 	     * Added: That's the vdso section for earlier Linux 2.6 kernels.  For later
2127 	     *  2.6 kernels, vdso occurs at an earlier address.  If it's unreadable,
2128 	     *  then we simply won't copy it.  But let's try to read all areas, anyway.
2129 	     * **COMMENTED OUT:** if (area_begin >= HIGHEST_VA) continue;
2130 	     */
2131 	    /* If it's readable, but it's VDSO, it will be dangerous to restore it.
2132 	     * In 32-bit mode later Red Hat RHEL Linux 2.6.9 releases use 0xffffe000,
2133 	     * the last page of virtual memory.  Note 0xffffe000 >= HIGHEST_VA
2134 	     * implies we're in 32-bit mode.
2135 	     */
2136 	    if (area_begin >= HIGHEST_VA && area_begin == 0xffffe000) continue;
2137 	#ifdef __x86_64__
2138 	    /* And in 64-bit mode later Red Hat RHEL Linux 2.6.9 releases
2139 	     * use 0xffffffffff600000 for VDSO.
2140 	     */
2141 	    if (area_begin >= HIGHEST_VA && area_begin == 0xffffffffff600000) continue;
2142 	#endif
2143 	
2144 	    /* Skip anything that has no read or execute permission.  This occurs
2145 	     * on one page in a Linux 2.6.9 installation.  No idea why.  This code
2146 	     * would also take care of kernel sections since we don't have read/execute
2147 	     * permission there.
2148 	     */
2149 	
2150 	    if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE))) continue;
2151 	
2152 	    // If the process has an area labelled as "/dev/zero (deleted)", we mark
2153 	    //   the area as Anonymous and save the contents to the ckpt image file.
2154 	    // IF this area has a MAP_SHARED attribute, it should be replaced with
2155 	    //   MAP_PRIVATE and we won't do any harm because, the /dev/zero file is an
2156 	    //   absolute source and sink. Anything written to it will be discarded and
2157 	    //   anything read from it will be all zeros.
2158 	    // The following call to mmap will create "/dev/zero (deleted)" area
2159 	    //         mmap(addr, size, protection, MAP_SHARED | MAP_ANONYMOUS, 0, 0)
2160 	    //
2161 	    // The above explanation also applies to "/dev/null (deleted)"
2162 	
2163 	    if ( mtcp_strstartswith(area.name, dev_zero_deleted_str) ||
2164 	         mtcp_strstartswith(area.name, dev_null_deleted_str) ) {
2165 	      DPRINTF(("mtcp checkpointeverything: saving area \"%s\" as Anonymous\n",
2166 		       area.name));
2167 	      area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2168 	      area.name[0] = '\0';
2169 	    }
2170 	
2171 	    if (mtcp_strstartswith(area.name, sys_v_shmem_file)) {
2172 	      DPRINTF(("mtcp checkpointeverything: saving area \"%s\" as Anonymous\n",
2173 		       area.name));
2174 	      area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2175 	      area.name[0] = '\0';
2176 	    }
2177 	
2178 	    /* Special Case Handling: nscd is enabled*/
2179 	    if ( mtcp_strstartswith(area.name, nscd_mmap_str) ||
2180 	         mtcp_strstartswith(area.name, nscd_mmap_str2) ||
2181 	         mtcp_strstartswith(area.name, nscd_mmap_str3) ) {
2182 	      DPRINTF(("mtcp checkpointeverything: NSCD daemon shared memory area present. MTCP will now try to remap\n" \
2183 	            "                           this area in read/write mode and then will fill it with zeros so that\n" \
2184 	            "                           glibc will automatically ask NSCD daemon for new shared area\n\n"));
2185 	      area.prot = PROT_READ | PROT_WRITE;
2186 	      area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2187 	
2188 	      if ( munmap(area.addr, area.size) == -1) {
2189 	        mtcp_printf ("mtcp checkpointeverything: error unmapping NSCD shared area: %s\n",
2190 	                     strerror (mtcp_sys_errno));
2191 	        mtcp_abort();
2192 	      }
2193 	
2194 	      if ( mmap(area.addr, area.size, area.prot, area.flags, 0, 0)
2195 	           == MAP_FAILED ){
2196 	        mtcp_printf ("mtcp checkpointeverything: error remapping NSCD shared area: %s\n",
2197 	                     strerror (mtcp_sys_errno));
2198 	        mtcp_abort();
2199 	      }
2200 	
2201 	      memset(area.addr, 0, area.size);
2202 	    }
2203 	
2204 	    /* Force the anonymous flag if it's a private writeable section, as the
2205 	     * data has probably changed from the contents of the original images.
2206 	     */
2207 	
2208 	    /* We also do this for read-only private sections as it's possible
2209 	     * to modify a page there, too (via mprotect).
2210 	     */
2211 	
2212 	    if ((area.flags & MAP_PRIVATE) /*&& (area.prot & PROT_WRITE)*/) {
2213 	      area.flags |= MAP_ANONYMOUS;
2214 	    }
2215 	
2216 	    if ( area.flags & MAP_SHARED ) {
2217 	      /* invalidate shared memory pages so that the next read to it (when we are
2218 	       * writing them to ckpt file) will cause them to be reloaded from the disk.
2219 	       */
2220 	      if ( msync(area.addr, area.size, MS_INVALIDATE) < 0 ){
2221 	        mtcp_printf ("mtcp sync_shared_memory: error %d Invalidating %X"
2222 	            " at %p from %s + %X\n", mtcp_sys_errno, area.size,
2223 	            area.addr, area.name, area.offset);
2224 	        mtcp_abort();
2225 	      }
2226 	    }
2227 	
2228 	
2229 	    /* Skip any mapping for this image - it got saved as CS_RESTOREIMAGE
2230 	     * at the beginning.
2231 	     */
2232 	
2233 	    if (area_begin < restore_begin) {
2234 	      if (area_end <= restore_begin) {
2235 	        writememoryarea (fd, &area, 0, vsyscall_exists); // the whole thing is before the restore image
2236 	      } else if (area_end <= restore_end) {
2237 	        area.size = restore_begin - area_begin;    // we just have to chop the end part off
2238 	        writememoryarea (fd, &area, 0, vsyscall_exists);
2239 	      } else {
2240 	        area.size = restore_begin - area_begin;    // we have to write stuff that comes before restore image
2241 	        writememoryarea (fd, &area, 0, vsyscall_exists);
2242 	        area.offset += restore_end - area_begin;   // ... and we have to write stuff that comes after restore image
2243 	        area.size = area_end - restore_end;
2244 	        area.addr = (void *)restore_end;
2245 	        writememoryarea (fd, &area, 0, vsyscall_exists);
2246 	      }
2247 	    } else if (area_begin < restore_end) {
2248 	      if (area_end > restore_end) {
2249 	        area.offset += restore_end - area_begin;   // we have to write stuff that comes after restore image
2250 	        area.size = area_end - restore_end;
2251 	        area.addr = (void *)restore_end;
2252 	        writememoryarea (fd, &area, 0, vsyscall_exists);
2253 	      }
2254 	    } else {
2255 	      if ( strstr (area.name, "[stack]") )
2256 	        stack_was_seen = 1;
2257 	      writememoryarea (fd, &area, stack_was_seen, vsyscall_exists); // the whole thing comes after the restore image
2258 	    }
2259 	  }
2260 	
2261 	  close (mapsfd);
2262 	
2263 	  /* That's all folks */
2264 	
2265 	  writecs (fd, CS_THEEND);
2266 	  if (close (fd) < 0) {
2267 	    mtcp_printf ("mtcp checkpointeverything(grandchild):"
2268 	                 " error closing checkpoint file: %s\n", strerror (errno));
2269 	    mtcp_abort ();
2270 	  }
2271 	  if (use_compression) {
2272 	    /* IF OUT OF DISK SPACE, REPORT IT HERE. */
2273 	    if ( waitpid(mtcp_ckpt_gzip_child_pid, NULL, 0 ) == -1 )
2274 	      mtcp_printf ("mtcp checkpointeverything(grandchild): waitpid: %s\n",
2275 	                   strerror (errno));
2276 	    mtcp_ckpt_gzip_child_pid = -1;
2277 	  }
2278 	
2279 	  /* Maybe it's time to verify the checkpoint.
2280 	   * If so, exec an mtcp_restore with the temp file (in case temp file is bad,
2281 	   *   we'll still have the last one).
2282 	   * If the new file is good, mtcp_restore will rename it over the last one.
2283 	   */
2284 	
2285 	  if (verify_total != 0) -- verify_count;
2286 	
2287 	  /* Now that temp checkpoint file is complete, rename it over old permanent
2288 	   * checkpoint file.  Uses rename() syscall, which doesn't change i-nodes.
2289 	   * So, gzip process can continue to write to file even after renaming.
2290 	   */
2291 	
2292 	  else renametempoverperm ();
2293 	
2294 	  if (forked_checkpointing)
2295 	    mtcp_sys_exit (0); /* grandchild exits */
2296 	
2297 	  DPRINTF (("mtcp checkpointeverything*: checkpoint complete\n"));
2298 	}
2299 	
2300 	/* True if the given FD should be checkpointed */
2301 	static int should_ckpt_fd (int fd)
2302 	{
2303 	   if ( callback_ckpt_fd!=NULL )
2304 	     return (*callback_ckpt_fd)(fd); //delegate to callback
2305 	   else if (fd > 2)
2306 	     return 1;
2307 	   else
2308 	   {
2309 	     /* stdin/stdout/stderr */
2310 	     /* we only want to checkpoint these if they are from a file */
2311 	     struct stat statbuf;
2312 	     fstat(fd, &statbuf);
2313 	     return S_ISREG(statbuf.st_mode);
2314 	   }
2315 	}
2316 	
2317 	/* Write list of open files to the checkpoint file */
2318 	
2319 	static void writefiledescrs (int fd)
2320 	
2321 	{
2322 	  char dbuf[BUFSIZ], linkbuf[FILENAMESIZE], *p, procfdname[64];
2323 	  int doff, dsiz, fddir, fdnum, linklen, rc;
2324 	  off_t offset;
2325 	  struct linux_dirent *dent;
2326 	  struct stat lstatbuf, statbuf;
2327 	
2328 	  writecs (fd, CS_FILEDESCRS);
2329 	
2330 	  /* Open /proc/self/fd directory - it contains a list of files I have open */
2331 	
2332 	  fddir = mtcp_sys_open ("/proc/self/fd", O_RDONLY, 0);
2333 	  if (fddir < 0) {
2334 	    mtcp_printf ("mtcp writefiledescrs: error opening directory /proc/self/fd: %s\n", strerror (errno));
2335 	    mtcp_abort ();
2336 	  }
2337 	
2338 	  /* Check each entry */
2339 	
2340 	  while (1) {
2341 	    dsiz = -1;
2342 	    if (sizeof dent -> d_ino == 4) dsiz = mtcp_sys_getdents (fddir, dbuf, sizeof dbuf);
2343 	    if (sizeof dent -> d_ino == 8) dsiz = mtcp_sys_getdents64 (fddir, dbuf, sizeof dbuf);
2344 	    if (dsiz <= 0) break;
2345 	
2346 	    for (doff = 0; doff < dsiz; doff += dent -> d_reclen) {
2347 	      dent = (struct linux_dirent *) (dbuf + doff);
2348 	
2349 	      /* The filename should just be a decimal number = the fd it represents.
2350 	       * Also, skip the entry for the checkpoint and directory files
2351 	       * as we don't want the restore to know about them.
2352 	       */
2353 	
2354 	      fdnum = strtol (dent -> d_name, &p, 10);
2355 	      if ((*p == '\0') && (fdnum >= 0) && (fdnum != fd) && (fdnum != fddir)
2356 		  && (should_ckpt_fd (fdnum) > 0)) {
2357 	
2358 	        /* Read the symbolic link so we get the filename that's open on the fd */
2359 	
2360 	        sprintf (procfdname, "/proc/self/fd/%d", fdnum);
2361 	        linklen = readlink (procfdname, linkbuf, sizeof linkbuf - 1);
2362 	        if ((linklen >= 0) || (errno != ENOENT)) { // probably was the proc/self/fd directory itself
2363 	          if (linklen < 0) {
2364 	            mtcp_printf ("mtcp writefiledescrs: error reading %s: %s\n",
2365 		                 procfdname, strerror (errno));
2366 	            mtcp_abort ();
2367 	          }
2368 	          linkbuf[linklen] = '\0';
2369 	
2370 	          DPRINTF (("mtcp writefiledescrs*: checkpointing fd %d -> %s\n",
2371 			    fdnum, linkbuf));
2372 	
2373 	          /* Read about the link itself so we know read/write open flags */
2374 	
2375 	          rc = lstat (procfdname, &lstatbuf);
2376 	          if (rc < 0) {
2377 	            mtcp_printf ("mtcp writefiledescrs: error statting %s -> %s: %s\n",
2378 		                 procfdname, linkbuf, strerror (-rc));
2379 	            mtcp_abort ();
2380 	          }
2381 	
2382 	          /* Read about the actual file open on the fd */
2383 	
2384 	          rc = stat (linkbuf, &statbuf);
2385 	          if (rc < 0) {
2386 	            mtcp_printf ("mtcp writefiledescrs: error statting %s -> %s: %s\n",
2387 		                 procfdname, linkbuf, strerror (-rc));
2388 	          }
2389 	
2390 	          /* Write state information to checkpoint file.
2391 	           * Replace file's permissions with current access flags
2392 		   * so restore will know how to open it.
2393 		   */
2394 	
2395 	          else {
2396 	            offset = 0;
2397 	            if (S_ISREG (statbuf.st_mode))
2398 		      offset = mtcp_sys_lseek (fdnum, 0, SEEK_CUR);
2399 	            statbuf.st_mode = (statbuf.st_mode & ~0777)
2400 				       | (lstatbuf.st_mode & 0777);
2401 	            writefile (fd, &fdnum, sizeof fdnum);
2402 	            writefile (fd, &statbuf, sizeof statbuf);
2403 	            writefile (fd, &offset, sizeof offset);
2404 	            writefile (fd, &linklen, sizeof linklen);
2405 	            writefile (fd, linkbuf, linklen);
2406 	          }
2407 	        }
2408 	      }
2409 	    }
2410 	  }
2411 	  if (dsiz < 0) {
2412 	    mtcp_printf ("mtcp writefiledescrs: error reading /proc/self/fd: %s\n",
2413 	                 strerror (mtcp_sys_errno));
2414 	    mtcp_abort ();
2415 	  }
2416 	
2417 	  mtcp_sys_close (fddir);
2418 	
2419 	  /* Write end-of-fd-list marker to checkpoint file */
2420 	
2421 	  fdnum = -1;
2422 	  writefile (fd, &fdnum, sizeof fdnum);
2423 	}
2424 	
2425 	static void writememoryarea (int fd, Area *area, int stack_was_seen,
2426 				     int vsyscall_exists)
2427 	
2428 	{ static void * orig_stack = NULL;
2429 	
2430 	  /* Write corresponding descriptor to the file */
2431 	
2432 	  if (orig_stack == NULL && 0 == strcmp(area -> name, "[stack]"))
2433 	    orig_stack = area -> addr + area -> size;
2434 	
2435 	  if (0 == strcmp(area -> name, "[vdso]") && !stack_was_seen)
2436 	    DPRINTF (("mtcp checkpointeverything*: skipping over [vdso] section"
2437 	              " %p at %p\n", area -> size, area -> addr));
2438 	  else if (0 == strcmp(area -> name, "[vsyscall]") && !stack_was_seen)
2439 	    DPRINTF (("mtcp checkpointeverything*: skipping over [vsyscall] section"
2440 	    	      " %p at %p\n", area -> size, area -> addr));
2441 	  else if (0 == strcmp(area -> name, "[stack]") &&
2442 		   orig_stack != area -> addr + area -> size)
2443 	    /* Kernel won't let us munmap this.  But we don't need to restore it. */
2444 	    DPRINTF (("mtcp checkpointeverything*: skipping over [stack] segment"
2445 	    	      " %X at %pi (not the orig stack)\n", area -> size, area -> addr));
2446 	  else if (!(area -> flags & MAP_ANONYMOUS))
2447 	    DPRINTF (("mtcp checkpointeverything*: save %p at %p from %s + %X\n",
2448 	              area -> size, area -> addr, area -> name, area -> offset));
2449 	  else if (area -> name[0] == '\0')
2450 	    DPRINTF (("mtcp checkpointeverything*: save anonymous %p at %p\n",
2451 	              area -> size, area -> addr));
2452 	  else DPRINTF (("mtcp checkpointeverything*: save anonymous %p at %p"
2453 	                 " from %s + %X\n",
2454 			 area -> size, area -> addr, area -> name, area -> offset));
2455 	
2456 	  if ((area -> name[0]) == '\0') {
2457 	    void *brk = mtcp_sys_brk(NULL);
2458 	    if (brk > area -> addr && brk <= area -> addr + area -> size)
2459 	      mtcp_sys_strcpy(area -> name, "[heap]");
2460 	  }
2461 	
2462 	  if ( 0 != strcmp(area -> name, "[vsyscall]")
2463 	       && ( (0 != strcmp(area -> name, "[vdso]")
2464 	             || vsyscall_exists /* which implies vdso can be overwritten */
2465 	             || !stack_was_seen ))) /* If vdso appeared before stack, it can be replaced */
2466 	  {
2467 	    writecs (fd, CS_AREADESCRIP);
2468 	    writefile (fd, area, sizeof *area);
2469 	
2470 	    /* Anonymous sections need to have their data copied to the file,
2471 	     *   as there is no file that contains their data
2472 	     * We also save shared files to checkpoint file to handle shared memory
2473 	     *   implemented with backing files
2474 	     */
2475 	    if (area -> flags & MAP_ANONYMOUS || area -> flags & MAP_SHARED) {
2476 	      writecs (fd, CS_AREACONTENTS);
2477 	      writefile (fd, area -> addr, area -> size);
2478 	    }
2479 	  }
2480 	}
2481 	
/* Emit a one-byte checkpoint section tag 'cs' to the checkpoint file 'fd'.
 * All error handling is done by writefile().
 */

static void writecs (int fd, char cs)

{
  const char tag = cs;

  writefile (fd, &tag, sizeof tag);
}
2489 	
2490 	/* Write something to checkpoint file */
2491 	
2492 	static char zeroes[MTCP_PAGE_SIZE] = { 0 };
2493 	static void writefile (int fd, void const *buff, size_t size)
2494 	
2495 	{
2496 	  char const *bf;
2497 	  ssize_t rc;
2498 	  size_t sz, wt;
2499 	
2500 	  checkpointsize += size;
2501 	
2502 	  bf = buff;
2503 	  sz = size;
2504 	  while (sz > 0) {
2505 	    for (wt = sz; wt > 0; wt /= 2) {
2506 	      rc = write (fd, bf, wt);
2507 	      if ((rc >= 0) || (errno != EFAULT)) break;
2508 	    }
2509 	
2510 	    /* Sometimes image page alignment will leave a hole in the middle of an image */
2511 	    /* ... but the idiot proc/self/maps will include it anyway                    */
2512 	
2513 	    if (wt == 0) {
2514 	      rc = (sz > sizeof zeroes ? sizeof zeroes : sz);
2515 	      checkpointsize -= rc; /* Correct now, since writefile will add rc back */
2516 	      writefile (fd, zeroes, rc);
2517 	    }
2518 	
2519 	    /* Otherwise, check for real error */
2520 	
2521 	    else {
2522 	      if (rc == 0) errno = EPIPE;
2523 	      if (rc <= 0) {
2524 	        mtcp_printf ("mtcp writefile: error writing from %p to %s: %s\n",
2525 		             bf, temp_checkpointfilename, strerror (errno));
2526 	        mtcp_abort ();
2527 	      }
2528 	    }
2529 	
2530 	    /* It's ok, we're on to next part */
2531 	
2532 	    sz -= rc;
2533 	    bf += rc;
2534 	  }
2535 	}
2536 	
2537 	static void preprocess_special_segments(int *vsyscall_exists)
2538 	{
2539 	  Area area;
2540 	  int mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
2541 	  if (mapsfd < 0) {
2542 	    mtcp_printf ("mtcp checkpointeverything: error opening"
2543 	        " /proc/self/maps: %s\n", strerror (mtcp_sys_errno));
2544 	    mtcp_abort ();
2545 	  }
2546 	
2547 	  while (readmapsline (mapsfd, &area)) {
2548 	    if (0 == strcmp(area.name, "[vsyscall]")) {
2549 	      /* Determine if [vsyscall] exists.  If [vdso] and [vsyscall] exist,
2550 	       * [vdso] will be saved and restored.
2551 	       * NOTE:  [vdso] is relocated if /proc/sys/kernel/randomize_va_space == 2.
2552 	       * We must restore old [vdso] and also keep [vdso] in that case.
2553 	       * On Linux 2.6.25, 32-bit Linux has:  [heap], /lib/ld-2.7.so, [vdso], libs, [stack].
2554 	       * On Linux 2.6.25, 64-bit Linux has:  [stack], [vdso], [vsyscall].
2555 	       *   and at least for gcl, [stack], libmtcp.so, [vsyscall] seen.
2556 	       * If 32-bit process in 64-bit Linux:  [stack] (0xffffd000), [vdso] (0xffffe0000)
2557 	       * On 32-bit Linux, mtcp_restart has [vdso], /lib/ld-2.7.so, [stack]
2558 	       * Need to restore old [vdso] into mtcp_restart, to restart.
2559 	       * With randomize_va_space turned off, libraries start at high address
2560 	       *     0xb8000000 and are loaded progressively at lower addresses.
2561 	       * mtcp_restart loads vdso (which looks like a shared library) first.
2562 	       * But libpthread/libdl/libc libraries are loaded above vdso in user image.
2563 	       * So, we must use the opposite of the user's setting (no randomization if
2564 	       *     user turned it on, and vice versa).  We must also keep the
2565 	       *     new vdso segment, provided by mtcp_restart.
2566 	       */
2567 	      *vsyscall_exists = 1;
2568 	    } else if (!saved_heap_start && strcmp(area.name, "[heap]") == 0) {
2569 	      // Record start of heap which will later be used in finishrestore()
2570 	      saved_heap_start = area.addr;
2571 	    } else if (strcmp(area.name, "[stack]") == 0) {
2572 	      /*
2573 	       * When using Matlab with dmtcp_checkpoint, sometimes the bottom most
2574 	       * page of stack (the page with highest address) which contains the
2575 	       * environment strings and the argv[] was not shown in /proc/self/maps.
2576 	       * This happens on some odd combination of environment passed on to
2577 	       * Matlab process. As a result, the page was not checkpointed and hence
2578 	       * the process segfaulted on restart. The fix is to try to mprotect this
2579 	       * page with RWX permission to make the page visible again. This call
2580 	       * will fail if no stack page was invisible to begin with.
2581 	       */
2582 	      int ret = mprotect(area.addr + area.size, 0x1000, 
2583 	                         PROT_READ | PROT_WRITE | PROT_EXEC);
2584 	      if (ret == 0) {
2585 	        mtcp_printf("mtcp checkpointeverything: bottom-most page of stack\n"
2586 	                 "(page with highest address) was invisible in /proc/self/maps.\n"
2587 	                 "It is made visible again now.\n");
2588 	      }
2589 	    }
2590 	  }
2591 	  close(mapsfd);
2592 	}
2593 	
2594 	/********************************************************************************************************************************/
2595 	/*																*/
2596 	/*  This signal handler is forced by the main thread doing a 'mtcp_sys_kernel_tkill' to stop these threads so it can do a 	*/
2597 	/*  checkpoint															*/
2598 	/*																*/
2599 	/********************************************************************************************************************************/
/* Grow the stack by kbStack*1024 so that large stack is allocated on restart
 * The kernel won't do it automatically for us any more, since it thinks
 * the stack is in a different place after restart.
 *
 * How it works: every invocation declares a local array of
 * kBincrement*1024 bytes and, while kbStack is still positive, calls itself
 * with kbStack reduced by kBincrement.  The function produces no result; the
 * nested frames themselves are the intended effect.
 *
 *   kbStack - amount of stack to grow, in kilobytes
 */
/* growstackValue is volatile so compiler doesn't optimize away growstack
 * Maybe it's not needed if we use ((optimize(0))) .
 */
static volatile unsigned int growstackValue = 0;
/* The separate prototype carries the optimize(0) attribute for the definition
 * that follows.
 */
__attribute__ ((optimize(0))) static void growstack (int kbStack);
static void growstack (int kbStack) {
  const int kBincrement = 1024;  /* kilobytes reserved per recursion level */
  /* Never read or written: it only makes this stack frame large */
  char array[kBincrement * 1024] __attribute__ ((unused));
  volatile int dummy_value __attribute__ ((unused)) = 1; /*Again, try to prevent compiler optimization*/
  if (kbStack > 0)
    growstack(kbStack - kBincrement);
  else
    growstackValue++;  /* deepest level: touch the volatile counter */
}
2618 	
/* Per-thread checkpoint handler (see the banner comment earlier in the file:
 * it is triggered by a signal sent to each thread that must stop).
 *
 *   signum - signal number; not referenced in the body
 *
 * Outline of what the code below does:
 *   1. The checkpoint thread itself returns immediately.
 *   2. The thread's state is moved from ST_SIGENABLED to ST_SUSPINPROG; if
 *      that transition fails the rest is skipped.  (mtcp_state_set appears to
 *      be a compare-and-set taking the new value and then the expected old
 *      value - defined elsewhere, TODO confirm.)
 *   3. Signal state and TLS state are saved; for motherofall the main stack
 *      is grown on the first checkpoint only.
 *   4. The context is saved with getcontext().  Execution continues after
 *      that call both in the original process (restoreinprog == 0) and,
 *      per the existing comments, again on restart (restoreinprog >= 1).
 *   5. Original process: mark ST_SUSPENDED, wake the checkpoint thread, wait
 *      on the futex until the state changes, then either run the optional
 *      checkpoint verification or resume.
 *      Restart: mark ST_RUNENABLED, wait for all threads to be restored, and
 *      for motherofall rename the temp checkpoint if mtcp_restore_verify.
 */
static void stopthisthread (int signum)

{
  int rc;
  Thread *thread;
  /* The three macros below are only used by the disabled backtrace block */
#define BT_SIZE 1024
#define STDERR_FD 826
#define LOG_FD 826

#ifdef PTRACE
  ptrace_unlock_inferiors();
  ptrace_remove_notexisted();
  ptrace_detach_checkpoint_threads ();
  ptrace_detach_user_threads ();
#endif

  DPRINTF (("mtcp stopthisthread*: tid %d returns to %p\n",
            mtcp_sys_kernel_gettid (), __builtin_return_address (0)));

  thread = getcurrenthread ();                                              // see which thread this is

  // If this is checkpoint thread - exit immediately
  if ( mtcp_state_value(&thread -> state) == ST_CKPNTHREAD ) {
    return ;
  }

  /* Debugging aid, disabled by the constant 0 in the condition */
  if (0 && thread == motherofall) {
#include <execinfo.h>
    void *buffer[BT_SIZE];
    int nptrs;

    DPRINTF (( "printing stacktrace of the motherofall Thread\n\n" ));
    nptrs = backtrace (buffer, BT_SIZE);
    backtrace_symbols_fd ( buffer, nptrs, STDERR_FD );
    backtrace_symbols_fd ( buffer, nptrs, LOG_FD );
  }
  if (mtcp_state_set (&(thread -> state), ST_SUSPINPROG, ST_SIGENABLED)) {  // make sure we don't get called twice for same thread
    static int is_first_checkpoint = 1;

    save_sig_state (thread);      // save signal state (and block signal delivery)
    save_tls_state (thread);      // save thread local storage state

    /* Grow stack only on first ckpt.  Kernel agrees this is main stack and
     * will mmap it.  On second ckpt and later, we would segfault if we tried
     * to grow the former stack beyond the portion that is already mmap'ed.
     *
     * NOTE: the warning below reports "__LINE__-9" as the location of the
     * 'kbStack' declaration, so the number of lines between that declaration
     * and the mtcp_printf call must not change without adjusting the offset.
     */
    if (thread == motherofall) {
      static char *orig_stack_ptr;
      int kbStack = 2048;
      if (is_first_checkpoint) {
	orig_stack_ptr = (char *)&kbStack;
        is_first_checkpoint = 0;
        DPRINTF(("mtcp_stopthisthread: temp. grow main stack by %d kilobytes\n",
		 kbStack));
        growstack(kbStack);
      } else if (orig_stack_ptr - (char *)&kbStack > 3 * kbStack*1024 / 4) {
        mtcp_printf("WARNING:  Stack within %d bytes of end;\n"
		    "  Consider increasing 'kbStack' at line %d of mtcp/%s\n",
		    kbStack*1024/4, __LINE__-9, __FILE__);
      }
    }

    ///JA: new code ported from v54b
    rc = getcontext (&(thread -> savctx));
    if (rc < 0) {
      mtcp_printf ("mtcp stopthisthread: getcontext rc %d errno %d\n",
                   rc, errno);
      mtcp_abort ();
    }
    DPRINTF (("mtcp stopthisthread*: after getcontext\n"));
    if (mtcp_state_value(&restoreinprog) == 0) {

      /* We are the original process and all context is saved
       * restoreinprog is 0 ; wait for ckpt thread to write ckpt, and resume.
       */

      WMB; // matched by RMB in checkpointhread

      /* Next comes the first time we use the old stack. */
      /* Tell the checkpoint thread that we're all saved away */
      if (!mtcp_state_set (&(thread -> state), ST_SUSPENDED, ST_SUSPINPROG))
	mtcp_abort ();  // tell checkpointhread all our context is saved
      mtcp_state_futex (&(thread -> state), FUTEX_WAKE, 1, NULL);                            // wake checkpoint thread if it's waiting for me

      /* Then we wait for the checkpoint thread to write the checkpoint file then wake us up */

      DPRINTF (("mtcp stopthisthread*: thread %d suspending\n", thread -> tid));
      /* Loop: re-check the state after every return from the futex wait */
      while (mtcp_state_value(&thread -> state) == ST_SUSPENDED) {
        mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SUSPENDED, NULL);
      }

#ifdef PTRACE
      DPRINTF (("mtcp stopthisthread*: thread %d after suspending before deleting files\n", thread -> tid));
      delete_file(0, delete_ptrace_leader, has_ptrace_file);
      delete_file(1, delete_setoptions_leader, has_setoptions_file);
      delete_file(2, delete_checkpoint_leader, has_checkpoint_file);
      ptrace_attach_threads(0);
#endif

      /* Maybe there is to be a checkpoint verification.  If so, and we're the main    */
      /* thread, exec the restore program.  If so and we're not the main thread, exit. */

      if ((verify_total != 0) && (verify_count == 0)) {

        /* If not the main thread, exit.  Either normal exit() or _exit()
         * seems to cause other threads to exit.
         */

        if (thread != motherofall) {
          mtcp_sys_exit(0);
        }

        /* This is the main thread, verify checkpoint then restart by doing
         * a restart.
         * The restore will rename the file after it has done the restart.
         */

        DPRINTF (("mtcp checkpointeverything*: verifying checkpoint...\n"));
        execlp ("mtcp_restart", "mtcp_restart", "--verify", temp_checkpointfilename, NULL);
        /* Only reached if execlp failed */
        mtcp_printf ("mtcp checkpointeverything: error execing mtcp_restart %s: %s\n", temp_checkpointfilename, strerror (errno));
        mtcp_abort ();
      }

      /* No verification, resume where we left off */

      DPRINTF (("mtcp stopthisthread*: thread %d resuming\n", thread -> tid));
    }

    /* Else restoreinprog >= 1;  This stuff executes to do a restart */

    else {
      if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_SUSPENDED))
	mtcp_abort ();  // checkpoint was written when thread in SUSPENDED state
      wait_for_all_restored ();
      DPRINTF (("mtcp stopthisthread*: thread %d restored\n", thread -> tid));

      if (thread == motherofall) {

        /* If we're a restore verification, rename the temp file
	 * over the permanent one
	 */

        if (mtcp_restore_verify) renametempoverperm ();
      }

#ifdef PTRACE
      ptrace_attach_threads(1);
#endif 
    }
  }
  DPRINTF (("mtcp stopthisthread*: tid %d returning to %p\n",
	    mtcp_sys_kernel_gettid (), __builtin_return_address (0)));
#ifdef PTRACE
  ptrace_lock_inferiors();
#endif
}
2775 	
2776 	/********************************************************************************************************************************/
2777 	/*																*/
2778 	/*  Wait for all threads to finish restoring their context, then release them all to continue on their way.			*/
2779 	/*																*/
2780 	/*    Input:															*/
2781 	/*																*/
2782 	/*	restoreinprog = number of threads, including this, that hasn't called 'wait_for_all_restored' yet			*/
2783 	/*	thread list locked													*/
2784 	/*																*/
2785 	/*    Output:															*/
2786 	/*																*/
2787 	/*	restoreinprog = decremented												*/
2788 	/*	                if now zero, all threads woken and thread list unlocked							*/
2789 	/*																*/
2790 	/********************************************************************************************************************************/
2791 	
2792 	static void wait_for_all_restored (void)
2793 	
2794 	{
2795 	  int rip;
2796 	
2797 	  do rip = mtcp_state_value(&restoreinprog);                         // dec number of threads cloned but not completed longjmp'ing
2798 	  while (!mtcp_state_set (&restoreinprog, rip - 1, rip));
2799 	  if (-- rip == 0) {
2800 	
2801 	    /* raise the signals which were pending for the entire process at the time
2802 	     * of checkpoint. It is assumed that if a signal is pending for all threads
2803 	     * including the ckpt-thread, then it was sent to the process as opposed to
2804 	     * sent to individual threads.
2805 	     */
2806 	    int i;
2807 	    for (i = NSIG; i > 0; --i) {
2808 	      if (sigismember(&sigpending_global, i) == 1) {
2809 	        kill(getpid(), i);
2810 	      }
2811 	    }
2812 	
2813 	    if (callback_restore_virtual_pid_table != NULL) {
2814 	      DPRINTF(("Before callback_restore_virtual_pid_table: Thread:%d \n", 
2815 	               mtcp_sys_kernel_gettid()));
2816 	      (*callback_restore_virtual_pid_table)();
2817 	      DPRINTF(("After callback_restore_virtual_pid_table: Thread:%d \n",
2818 	               mtcp_sys_kernel_gettid()));
2819 	    }
2820 	
2821 	    mtcp_state_futex (&restoreinprog, FUTEX_WAKE, 999999999, NULL);  // if this was last of all, wake everyone up
2822 	
2823 	    // NOTE:  This is last safe moment for hook.  All previous threads
2824 	    //   have executed the "else" and are waiting on the futex.
2825 	    //   This last thread has not yet unlocked the threads: unlk_threads()
2826 	    //   So, no race condition occurs.
2827 	    //   By comparison, *callback_post_ckpt() is called before creating
2828 	    //   additional user threads.  Only motherofall (checkpoint thread existed)
2829 	    /* call weak symbol of this file, possibly overridden by the user's strong symbol  */
2830 	    /* user must compile his/her code with -Wl,-export-dynamic to make it visible */
2831 	    mtcpHookRestart();
2832 	    unlk_threads ();                                                 // ... and release the thread list
2833 	  } else {
2834 	    while ((rip = mtcp_state_value(&restoreinprog)) > 0) {           // otherwise, wait for last of all to wake this one up
2835 	      mtcp_state_futex (&restoreinprog, FUTEX_WAIT, rip, NULL);
2836 	    }
2837 	  }
2838 	}
2839 	
2840 	/********************************************************************************************************************************/
2841 	/*																*/
2842 	/*  Save signal mask and list of pending signals delivery										*/
2843 	/*																*/
2844 	/********************************************************************************************************************************/
2845 	
2846 	static void save_sig_state (Thread *thisthread)
2847 	{
2848 	  /* For checkpoint thread, we want to block delivery of all but some special signals*/
2849 	  if (thisthread == ckpthread) {
2850 	    /* 
2851 	     * For the checkpoint thread, we should not block SIGSETXID which is used
2852 	     * by the setsid family of system calls to change the session leader. Glibc
2853 	     * uses this signal to notify the process threads of the change in session
2854 	     * leader information. This signal is not documented and is used internally
2855 	     * by glibc. It is defined in <glibc-src-root>/nptl/pthreadP.h
2856 	     * screen was getting affected by this since it used setsid to change the
2857 	     * session leaders.
2858 	     */
2859 	#define SIGSETXID (__SIGRTMIN + 1)
2860 	    sigset_t set;
2861 	
2862 	    sigfillset(&set);
2863 	    sigdelset(&set, SIGSETXID);
2864 	
2865 	    if (pthread_sigmask(SIG_SETMASK, &set, NULL) < 0) {
2866 	      mtcp_printf("mtcp %s: error getting sigal mask: %s\n",
2867 	          __FUNCTION__, strerror(errno));
2868 	      mtcp_abort ();
2869 	    }
2870 	  }
2871 	  // Save signal block mask
2872 	  if (pthread_sigmask (SIG_SETMASK, NULL, &(thisthread -> sigblockmask)) < 0) {
2873 	    mtcp_printf("mtcp %s: error getting sigal mask: %s\n",
2874 	                __FUNCTION__, strerror(errno));
2875 	    mtcp_abort ();
2876 	  }
2877 	
2878 	  // Save pending signals
2879 	  sigpending ( &(thisthread->sigpending) );
2880 	}
2881 	
2882 	/********************************************************************************************************************************/
2883 	/*																*/
2884 	/*  Restore signal mask and all pending signals										*/
2885 	/*																*/
2886 	/********************************************************************************************************************************/
2887 	
2888 	static void restore_sig_state (Thread *thisthread)
2889 	{
2890 	  int i;
2891 	  DPRINTF (("mtcp restore_sig_state*: restoring handlers for thread %d\n",
2892 		    thisthread->original_tid));
2893 	  if (pthread_sigmask (SIG_SETMASK, &(thisthread -> sigblockmask), NULL) < 0) {
2894 	    mtcp_printf("mtcp %s: error setting sigal mask: %s\n",
2895 	                __FUNCTION__, strerror(errno));
2896 	    mtcp_abort ();
2897 	  }
2898 	
2899 	  // Raise the signals which were pending for only this thread at the time of checkpoint.
2900 	  for (i = NSIG; i > 0; --i) {
2901 	    if (sigismember(&(thisthread -> sigpending), i)  == 1  &&
2902 	        sigismember(&(thisthread -> sigblockmask), i) == 1 &&
2903 	        sigismember(&(sigpending_global), i) == 0) {
2904 	      raise(i);
2905 	    }
2906 	  }
2907 	}
2908 	
2909 	/********************************************************************************************************************************/
2910 	/*																*/
2911 	/*  Save all signal handlers										*/
2912 	/*																*/
2913 	/********************************************************************************************************************************/
2914 	static void save_sig_handlers (void)
2915 	{
2916 	  int i;
2917 	
2918 	  if (dmtcp_exists) {
2919 	    mtcp_printf("mtcp:%s Illegal function call when running under DMTCP*****\n",
2920 	                __FUNCTION__);
2921 	    // Do a simple return instead of killing the process
2922 	    return;
2923 	    //mtcp_abort();
2924 	  }
2925 	
2926 	  /* Now save all the signal handlers */
2927 	  DPRINTF (("mtcp save_sig_handlers*: saving signal handlers\n"));
2928 	  for (i = NSIG; i > 0; --i) {
2929 	    if (_real_sigaction (i, NULL, &sigactions[i]) < 0) {
2930 	      if (errno == EINVAL)
2931 	         memset (&sigactions[i], 0, sizeof sigactions[i]);
2932 	      else {
2933 	        mtcp_printf ("mtcp save_sig_handlers: error saving signal %d action: %s\n",
2934 	                     i, strerror(errno));
2935 	        mtcp_abort ();
2936 	      }
2937 	    }
2938 	
2939 	    DPRINTF (("mtcp save_sig_handlers*: saving signal handler for %d -> %p\n",
2940 	              i,
2941 	              (sigactions[i].sa_flags & SA_SIGINFO ?
2942 	                 (void *)(sigactions[i].sa_sigaction) :
2943 	                 (void *)(sigactions[i].sa_handler)) ));
2944 	  }
2945 	}
2946 	
2947 	/********************************************************************************************************************************/
2948 	/*																*/
2949 	/*  Restore all saved signal handlers										*/
2950 	/*																*/
2951 	/********************************************************************************************************************************/
2952 	static void restore_sig_handlers (Thread *thisthread)
2953 	{
2954 	  int i;
2955 	
2956 	  if (dmtcp_exists) {
2957 	    mtcp_printf("mtcp:%s Illegal function when running under DMTCP*****\n",
2958 	                __FUNCTION__);
2959 	    // Do a simple return instead of killing the process
2960 	    return;
2961 	    //mtcp_abort();
2962 	  }
2963 	
2964 	  DPRINTF (("mtcp restore_sig_handlers*: restoring signal handlers\n"));
2965 	#if 0
2966 	# define VERBOSE_DEBUG
2967 	#endif
2968 	  for(i = NSIG; i > 0; --i) {
2969 	#ifdef VERBOSE_DEBUG
2970 	    DPRINTF (("mtcp restore_sig_handlers*: restore signal handler for %d -> %p\n",
2971 	              i,
2972 	              (sigactions[i].sa_flags & SA_SIGINFO ?
2973 	                 sigactions[i].sa_sigaction :
2974 	                 sigactions[i].sa_handler) ));
2975 	#endif
2976 	
2977 	    if (_real_sigaction(i, &sigactions[i], NULL) < 0) {
2978 	        if (errno != EINVAL) {
2979 	          mtcp_printf ("mtcp restore_sig_handlers:" \
2980 			       " error restoring signal %d handler: %s\n",
2981 			       i, strerror(errno));
2982 	          mtcp_abort ();
2983 	        }
2984 	    }
2985 	  }
2986 	}
2987 	
2988 	/********************************************************************************************************************************/
2989 	/*																*/
2990 	/*  Save state necessary for TLS restore											*/
2991 	/*  Linux saves stuff in the GDT, switching it on a per-thread basis								*/
2992 	/*																*/
2993 	/********************************************************************************************************************************/
2994 	
2995 	static void save_tls_state (Thread *thisthread)
2996 	
2997 	{
2998 	  int i, rc;
2999 	
3000 	#ifdef __i386__
3001 	  asm volatile ("movw %%fs,%0" : "=m" (thisthread -> fs));
3002 	  asm volatile ("movw %%gs,%0" : "=m" (thisthread -> gs));
3003 	#endif
3004 	#ifdef __x86_64__
3005 	  //asm volatile ("movl %%fs,%0" : "=m" (thisthread -> fs));
3006 	  //asm volatile ("movl %%gs,%0" : "=m" (thisthread -> gs));
3007 	#endif
3008 	
3009 	  memset (thisthread -> gdtentrytls, 0, sizeof thisthread -> gdtentrytls);
3010 	
3011 	  /* On older Linuxes, we must save several GDT entries available to threads. */
3012 	
3013 	#if MTCP__SAVE_MANY_GDT_ENTRIES
3014 	  for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i ++) {
3015 	    thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN].entry_number = i;
3016 	    rc = mtcp_sys_get_thread_area (&(thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN]));
3017 	    if (rc < 0) {
3018 	      mtcp_printf ("mtcp checkpointeverything: error saving GDT TLS entry[%d]: %s\n", i, strerror (mtcp_sys_errno));
3019 	      mtcp_abort ();
3020 	    }
3021 	  }
3022 	
3023 	  /* With newer Linuxes, we just save the one GDT entry indexed by GS so we don't need the GDT_ENTRY_TLS_... definitions. */
3024 	  /* We get the particular index of the GDT entry to save by reading GS.                                                  */
3025 	
3026 	#else
3027 	  i = thisthread -> TLSSEGREG / 8;
3028 	  thisthread -> gdtentrytls[0].entry_number = i;
3029 	  rc = mtcp_sys_get_thread_area (&(thisthread -> gdtentrytls[0]));
3030 	  if (rc < 0) {
3031 	    mtcp_printf ("mtcp checkpointeverything: error saving GDT TLS entry[%d]: %s\n", i, strerror (mtcp_sys_errno));
3032 	    mtcp_abort ();
3033 	  }
3034 	#endif
3035 	}
3036 	
/*
 * Find the first occurrence of the len-byte pattern `subarray` that starts
 * at one of the first 2048 byte offsets of `array`.
 *
 *   array    : buffer to search
 *   subarray : pattern to look for
 *   len      : pattern length in bytes; must be at least sizeof(int)
 *
 * Returns a pointer to the first match inside `array`, or NULL when none of
 * the 2048 candidate offsets matches.  Calls mtcp_abort() when len is smaller
 * than sizeof(int) (this now includes negative len, which previously slipped
 * past the unsigned comparison).
 *
 * The comparison at the last candidate offset may read up to
 * array[2047 + len - 1], so that many bytes must be readable.
 */
static char *memsubarray (char *array, char *subarray, int len) {
  char *candidate;

  /* Validate len before looking at the pattern at all; the old code loaded
   * an int from subarray first.
   */
  if (len < (int) sizeof(int))
    mtcp_abort();

  /* memcmp replaces the unaligned "*(int *)" pre-check plus byte loop: for
   * len >= sizeof(int) a full-length byte comparison accepts exactly the
   * same candidates.
   */
  for (candidate = array; candidate < array + 2048; candidate++) {
    if (memcmp (candidate, subarray, (size_t) len) == 0)
      return candidate;
  }
  return NULL;
}
3055 	static int mtcp_get_tls_segreg(void)
3056 	{ mtcp_segreg_t tlssegreg;
3057 	#ifdef __i386__
3058 	  asm volatile ("movw %%gs,%0" : "=g" (tlssegreg)); /* any general register */
3059 	#endif
3060 	#ifdef __x86_64__
3061 	  asm volatile ("movl %%fs,%0" : "=q" (tlssegreg)); /* q = a,b,c,d for i386; 8 low bits of r class reg for x86_64 */
3062 	#endif
3063 	  return (int)tlssegreg;
3064 	}
3065 	static void *mtcp_get_tls_base_addr(void)
3066 	{
3067 	  struct user_desc gdtentrytls;
3068 	
3069 	#if MTCP__SAVE_MANY_GDT_ENTRIES
3070 	  if (mtcp_get_tls_segreg() / 8 != GDT_ENTRY_TLS_MIN) {
3071 	    mtcp_printf ("mtcp_init: gs %X not set to first TLS GDT ENTRY %X\n",
3072 	                 gs, GDT_ENTRY_TLS_MIN * 8 + 3);
3073 	    mtcp_abort ();
3074 	  }
3075 	#endif
3076 	
3077 	  gdtentrytls.entry_number = mtcp_get_tls_segreg() / 8;
3078 	  if ( mtcp_sys_get_thread_area ( &gdtentrytls ) < 0 ) {
3079 	    mtcp_printf ("mtcp_init: error getting GDT TLS entry: %s\n",
3080 	        strerror (mtcp_sys_errno));
3081 	    mtcp_abort ();
3082 	  }
3083 	  return (void *)(*(unsigned long *)&(gdtentrytls.base_addr));
3084 	}
3085 	
3086 	static void renametempoverperm (void)
3087 	
3088 	{
3089 	  if (rename (temp_checkpointfilename, perm_checkpointfilename) < 0) {
3090 	    mtcp_printf ("mtcp checkpointeverything: error renaming %s to %s: %s\n",  			temp_checkpointfilename, perm_checkpointfilename,
3091 			 strerror (errno));
3092 	    mtcp_abort ();
3093 	  }
3094 	}
3095 	
3096 	/********************************************************************************************************************************/
3097 	/*																*/
3098 	/*  Get current thread struct pointer												*/
3099 	/*  It is keyed by the calling thread's gettid value										*/
3100 	/*  Maybe improve someday by using TLS												*/
3101 	/*																*/
3102 	/********************************************************************************************************************************/
3103 	
3104 	static Thread *getcurrenthread (void)
3105 	
3106 	{
3107 	  int tid;
3108 	  Thread *thread;
3109 	
3110 	  tid = mtcp_sys_kernel_gettid ();
3111 	  lock_threads ();
3112 	  for (thread = threads; thread != NULL; thread = thread -> next) {
3113 	    if (thread -> tid == tid) {
3114 	      unlk_threads ();
3115 	      return (thread);
3116 	    }
3117 	  }
3118 	  mtcp_printf ("mtcp getcurrenthread: can't find thread id %d\n", tid);
3119 	  mtcp_abort ();
3120 	  return thread; /* NOTREACHED : stop compiler warning */
3121 	}
3122 	
3123 	/********************************************************************************************************************************/
3124 	/*																*/
3125 	/*  Lock and unlock the 'threads' list												*/
3126 	/*																*/
3127 	/********************************************************************************************************************************/
3128 	
3129 	static void lock_threads (void)
3130 	
3131 	{
3132 	  while (!mtcp_state_set (&threadslocked, 1, 0)) {
3133 	    mtcp_state_futex (&threadslocked, FUTEX_WAIT, 1, NULL);
3134 	  }
3135 	  RMB; // don't prefetch anything until we have the lock
3136 	}
3137 	
3138 	static void unlk_threads (void)
3139 	
3140 	{
3141 	  WMB; // flush data written before unlocking
3142 	  // FIXME: Should we be checking return value of mtcp_state_set? Can it ever fail?
3143 	  mtcp_state_set(&threadslocked , 0, 1);
3144 	  mtcp_state_futex (&threadslocked, FUTEX_WAKE, 1, NULL);
3145 	}
3146 	
3147 	/********************************************************************************************************************************/
3148 	/*																*/
3149 	/*  Read /proc/self/maps line, converting it to an Area descriptor struct							*/
3150 	/*																*/
3151 	/*    Input:															*/
3152 	/*																*/
3153 	/*	mapsfd = /proc/self/maps file, positioned to beginning of a line							*/
3154 	/*																*/
3155 	/*    Output:															*/
3156 	/*																*/
3157 	/*	readmapsline = 0 : was at end-of-file, nothing read									*/
3158 	/*	               1 : read and processed one line										*/
3159 	/*	*area = filled in													*/
3160 	/*																*/
3161 	/*    Note:															*/
3162 	/*																*/
3163 	/*	Line from /proc/self/maps is in form:											*/
3164 	/*																*/
3165 	/*	<startaddr>-<endaddrexclusive> rwxs <fileoffset> <devmaj>:<devmin> <inode>    <filename>\n				*/
3166 	/*	all numbers in hexadecimal except inode is in decimal									*/
3167 	/*	anonymous will be shown with offset=devmaj=devmin=inode=0 and no '     filename'					*/
3168 	/*																*/
3169 	/********************************************************************************************************************************/
3170 	
3171 	static int readmapsline (int mapsfd, Area *area)
3172 	
3173 	{
3174 	  char c, rflag, sflag, wflag, xflag;
3175 	  int i, rc;
3176 	  struct stat statbuf;
3177 	  VA devmajor, devminor, devnum, endaddr, inodenum, startaddr;
3178 	
3179 	  c = mtcp_readhex (mapsfd, &startaddr);
3180 	  if (c != '-') {
3181 	    if ((c == 0) && (startaddr == 0)) return (0);
3182 	    goto skipeol;
3183 	  }
3184 	  c = mtcp_readhex (mapsfd, &endaddr);
3185 	  if (c != ' ') goto skipeol;
3186 	  if (endaddr < startaddr) goto skipeol;
3187 	
3188 	  rflag = c = mtcp_readchar (mapsfd);
3189 	  if ((c != 'r') && (c != '-')) goto skipeol;
3190 	  wflag = c = mtcp_readchar (mapsfd);
3191 	  if ((c != 'w') && (c != '-')) goto skipeol;
3192 	  xflag = c = mtcp_readchar (mapsfd);
3193 	  if ((c != 'x') && (c != '-')) goto skipeol;
3194 	  sflag = c = mtcp_readchar (mapsfd);
3195 	  if ((c != 's') && (c != 'p')) goto skipeol;
3196 	
3197 	  c = mtcp_readchar (mapsfd);
3198 	  if (c != ' ') goto skipeol;
3199 	
3200 	  c = mtcp_readhex (mapsfd, &devmajor);
3201 	  if (c != ' ') goto skipeol;
3202 	  area -> offset = devmajor;
3203 	
3204 	  c = mtcp_readhex (mapsfd, &devmajor);
3205 	  if (c != ':') goto skipeol;
3206 	  c = mtcp_readhex (mapsfd, &devminor);
3207 	  if (c != ' ') goto skipeol;
3208 	  c = mtcp_readdec (mapsfd, &inodenum);
3209 	  area -> name[0] = '\0';
3210 	  while (c == ' ') c = mtcp_readchar (mapsfd);
3211 	  if (c == '/' || c == '[') { /* absolute pathname, or [stack], [vdso], etc. */
3212 	    i = 0;
3213 	    do {
3214 	      area -> name[i++] = c;
3215 	      if (i == sizeof area -> name) goto skipeol;
3216 	      c = mtcp_readchar (mapsfd);
3217 	    } while (c != '\n');
3218 	    area -> name[i] = '\0';
3219 	  }
3220 	  if (mtcp_strstartswith(area -> name, nscd_mmap_str)  ||
3221 	      mtcp_strstartswith(area -> name, nscd_mmap_str2) ||
3222 	      mtcp_strstartswith(area -> name, nscd_mmap_str3)) {
3223 	    /* if nscd is active */
3224 	  } else if ( mtcp_strstartswith(area -> name, sys_v_shmem_file) ) {
3225 	    /* System V Shared-Memory segments are handled by DMTCP. */
3226 	  } else if ( mtcp_strendswith(area -> name, " (deleted)") ) {
3227 	    /* Deleted File */
3228 	  } else if (area -> name[0] == '/') {                 /* if an absolute pathname */
3229 	    rc = stat (area -> name, &statbuf);
3230 	    if (rc < 0) {
3231 	      mtcp_printf ("ERROR:  mtcp readmapsline: error %d statting %s\n",
3232 	                   -rc, area -> name);
3233 	      return (1); /* 0 would mean last line of maps; could do mtcp_abort() */
3234 	    }
3235 	    devnum = makedev (devmajor, devminor);
3236 	    if ((devnum != statbuf.st_dev) || (inodenum != statbuf.st_ino)) {
3237 	      mtcp_printf ("ERROR:  mtcp readmapsline: image %s dev:inode %X:%u"
3238 			   " not eq maps %X:%u\n",
3239 	                   area -> name, statbuf.st_dev, statbuf.st_ino,
3240 			   devnum, inodenum);
3241 	      return (1); /* 0 would mean last line of maps; could do mtcp_abort() */
3242 	    }
3243 	  } else {
3244 	    /* Special area like [heap] or anonymous area. */
3245 	  }
3246 	
3247 	  if (c != '\n') goto skipeol;
3248 	
3249 	  area -> addr = (void *)startaddr;
3250 	  area -> size = endaddr - startaddr;
3251 	  area -> prot = 0;
3252 	  if (rflag == 'r') area -> prot |= PROT_READ;
3253 	  if (wflag == 'w') area -> prot |= PROT_WRITE;
3254 	  if (xflag == 'x') area -> prot |= PROT_EXEC;
3255 	  area -> flags = MAP_FIXED;
3256 	  if (sflag == 's') area -> flags |= MAP_SHARED;
3257 	  if (sflag == 'p') area -> flags |= MAP_PRIVATE;
3258 	  if (area -> name[0] == '\0') area -> flags |= MAP_ANONYMOUS;
3259 	
3260 	  return (1);
3261 	
3262 	skipeol:
3263 	  DPRINTF (("ERROR:  mtcp readmapsline*: bad maps line <%c", c));
3264 	  while ((c != '\n') && (c != '\0')) {
3265 	    c = mtcp_readchar (mapsfd);
3266 	    mtcp_printf ("%c", c);
3267 	  }
3268 	  mtcp_printf (">\n");
3269 	  mtcp_abort ();
3270 	  return (0);  /* NOTREACHED : stop compiler warning */
3271 	}
3272 	
3273 	/********************************************************************************************************************************/
3274 	/*																*/
3275 	/*  Do restore from checkpoint file												*/
3276 	/*  This routine is called from the mtcp_restore program to perform the restore							*/
3277 	/*  It resides in the libmtcp.so image in exactly the same spot that the checkpointed process had its libmtcp.so loaded at, so this 	*/
3278 	/*    can't possibly interfere with restoring the checkpointed process								*/
3279 	/*  The restore can't use malloc because that might create memory sections.							*/
3280 	/*  Strerror seems to mess up with its Locale stuff in here too.								*/
3281 	/*																*/
3282 	/*    Input:															*/
3283 	/*																*/
3284 	/*	fd = checkpoint file, positioned just after the CS_RESTOREIMAGE data							*/
3285 	/*																*/
3286 	/********************************************************************************************************************************/
3287 	
3288 	#ifdef __x86_64__
3289 	# define UNUSED_IN_64_BIT __attribute__ ((unused))
3290 	#else
3291 	# define UNUSED_IN_64_BIT
3292 	#endif
3293 	
3294 	#define STRINGS_LEN 10000
3295 	static char UNUSED_IN_64_BIT STRINGS[STRINGS_LEN];
3296 	void mtcp_restore_start (int fd, int verify, pid_t gzip_child_pid,char *ckpt_newname,
3297 				 char *cmd_file, char *argv[], char *envp[] )
3298 	
3299 	{
3300 	#ifndef __x86_64__
3301 	  int i;
3302 	  char *strings = STRINGS;
3303 	#endif
3304 	
3305 	  DEBUG_RESTARTING = 1;
3306 	  /* If we just replace extendedStack by (tempstack+STACKSIZE) in "asm"
3307 	   * below, the optimizer generates non-PIC code if it's not -O0 - Gene
3308 	   */
3309 	  long long * extendedStack = tempstack + STACKSIZE;
3310 	
3311 	  /* Not used until we do longjmps, but get it out of the way now */
3312 	
3313 	  // FIXME: Should we be checking return value of mtcp_state_set? Can it ever fail?
3314 	  mtcp_state_set(&restoreinprog ,1, 0);
3315 	
3316 	  mtcp_sys_gettimeofday (&restorestarted, NULL);
3317 	
3318 	  /* Save parameter away in a static memory location as we're about to wipe the stack */
3319 	
3320 	  mtcp_restore_cpfd   = fd;
3321 	  mtcp_restore_verify = verify;
3322 	  mtcp_restore_gzip_child_pid = gzip_child_pid;
3323 	  // Copy newname to save it too
3324 	  {
3325 	    int i;
3326 	    for(i=0;ckpt_newname[i];i++){
3327 	      mtcp_ckpt_newname[i] = ckpt_newname[i];
3328 	    }
3329 	    mtcp_ckpt_newname[i] = '\0';
3330 	  }
3331 	
3332 	
3333 	#ifndef __x86_64__
3334 	  // Copy command line to libmtcp.so, so that we can re-exec if randomized vdso
3335 	  //   steps on us.  This won't be needed when we use the linker to map areas.
3336 	  strings = STRINGS;
3337 	  // This version of STRCPY copies source string into STRINGS,
3338 	  // and sets destination string to point there.
3339 	# define STRCPY(x,y) \
3340 		if (strings + 256 < STRINGS + STRINGS_LEN) { \
3341 		  mtcp_sys_strcpy(strings,y); \
3342 		  x = strings; \
3343 		  strings += mtcp_sys_strlen(y) + 1; \
3344 		} else { \
3345 		  DPRINTF(("MTCP:  ran out of string space." \
3346 			   "  Trying to continue anyway\n")); \
3347 		}
3348 	  STRCPY(mtcp_restore_cmd_file, cmd_file);
3349 	  for (i = 0; argv[i] != NULL; i++) {
3350 	    STRCPY(mtcp_restore_argv[i], argv[i]);
3351 	  }
3352 	  mtcp_restore_argv[i] = NULL;
3353 	  for (i = 0; envp[i] != NULL; i++) {
3354 	    STRCPY(mtcp_restore_envp[i], envp[i]);
3355 	  }
3356 	  mtcp_restore_envp[i] = NULL;
3357 	#endif
3358 	
3359 	  /* Switch to a stack area that's part of the shareable's memory address range
3360 	   * and thus not used by the checkpointed program
3361 	   */
3362 	
3363 	  asm volatile (CLEAN_FOR_64_BIT(mov %0,%%esp\n\t)
3364 	                /* This next assembly language confuses gdb,
3365 			   but seems to work fine anyway */
3366 	                CLEAN_FOR_64_BIT(xor %%ebp,%%ebp\n\t)
3367 	                : : "g" (extendedStack) : "memory");
3368 	
3369 	  /* Once we're on the new stack, we can't access any local variables or parameters */
3370 	  /* Call the restoreverything to restore files and memory areas                    */
3371 	
3372 	  /* This should never return */
3373 	  mtcp_restoreverything();
3374 	  asm volatile ("hlt");
3375 	}
3376 	
3377 	
3378 	/********************************************************************************************************************************/
3379 	/*																*/
3380 	/*  Restore proper heap														*/
3381 	/*																*/
3382 	/********************************************************************************************************************************/
3383 	static void restore_heap()
3384 	{
3385 	  /*
3386 	   * If the original start of heap is lower than the current end of heap, we
3387 	   * want to mmap the area between mtcp_saved_break and current break. This
3388 	   * happens when the size of checkpointed program is smaller then the size of
3389 	   * mtcp_restart program.
3390 	   */
3391 	  void* current_break = mtcp_sys_brk (NULL);
3392 	  if (current_break > mtcp_saved_break) {
3393 	    DPRINTF(("mtcp finishrestore: Area between mtcp_saved_break:%p and "
3394 	             "Current_break:%p not mapped, mapping it now\n", 
3395 	             mtcp_saved_break, current_break));
3396 	    size_t oldsize = mtcp_saved_break - saved_heap_start;
3397 	    size_t newsize = current_break - saved_heap_start;
3398 	
3399 	    void* addr = mremap (saved_heap_start, oldsize, newsize, 0);
3400 	    if (addr == NULL) {
3401 	      mtcp_printf("mtcp finishrestore: mremap failed to map area between "
3402 	                  "mtcp_saved_break (%p) and current_break (%p)\n",
3403 	                  mtcp_saved_break, current_break);
3404 	      mtcp_abort();
3405 	    }
3406 	  }
3407 	}
3408 	
3409 	/********************************************************************************************************************************/
3410 	/*																*/
3411 	/*  The original program's memory and files have been restored									*/
3412 	/*																*/
3413 	/********************************************************************************************************************************/
3414 	
3415 	static void finishrestore (void)
3416 	{
3417 	  struct timeval stopped;
3418 	  int nnamelen;
3419 	
3420 	  DPRINTF (("mtcp finishrestore*: mtcp_printf works; libc should work\n"));
3421 	
3422 	  restore_heap();
3423 	
3424 	  if ( (nnamelen = strlen(mtcp_ckpt_newname))
3425 	       && strcmp(mtcp_ckpt_newname,perm_checkpointfilename) ) {
3426 	    // we start from different place - change it!
3427 	    DPRINTF(("mtcp finishrestore*: checkpoint file name was changed\n"));
3428 	    if (strlen(mtcp_ckpt_newname) >= MAXPATHLEN) {
3429 	      mtcp_printf("mtcp finishrestore: new ckpt file name (%s) too long (>=512 bytes)\n",
3430 	                  mtcp_ckpt_newname);
3431 	      mtcp_abort();
3432 	    }
Event buffer_size_warning: Calling strncpy with a maximum size argument of 512 bytes on destination array "perm_checkpointfilename" of size 512 bytes might leave the destination string unterminated.
3433 	    strncpy(perm_checkpointfilename,mtcp_ckpt_newname,MAXPATHLEN);
3434 	    memcpy(temp_checkpointfilename,perm_checkpointfilename,MAXPATHLEN);
3435 	    strncpy(temp_checkpointfilename + nnamelen, ".temp",MAXPATHLEN - nnamelen);
3436 	  }
3437 	
At conditional (1): "(unsigned long)resultvar >= 18446744073709547521UL": Taking true branch.
At conditional (2): "(unsigned long)resultvar >= 18446744073709547521UL": Taking true branch.
3438 	  mtcp_sys_gettimeofday (&stopped, NULL);
3439 	  stopped.tv_usec += (stopped.tv_sec - restorestarted.tv_sec) * 1000000 - restorestarted.tv_usec;
3440 	  TPRINTF (("mtcp finishrestore*: time %u uS\n", stopped.tv_usec));
3441 	
3442 	  /* Now we can access all our files and memory that existed at the time of the checkpoint  */
3443 	  /* We are still on the temporary stack, though                                            */
3444 	
3445 	  /* Fill in the new mother process id */
At conditional (3): "(unsigned long)resultvar >= 18446744073709547521UL": Taking true branch.
At conditional (4): "(unsigned long)resultvar >= 18446744073709547521UL": Taking true branch.
3446 	  motherpid = mtcp_sys_getpid();
3447 	
3448 	  /* Call another routine because our internal stack is whacked and we can't have local vars */
3449 	
3450 	  ///JA: v54b port
3451 	  // so restarthread will have a big stack
3452 	  asm volatile (CLEAN_FOR_64_BIT(mov %0,%%esp)
3453 			: : "g" (motherofall -> savctx.SAVEDSP - 128 ) : "memory");  // -128 for red zone
3454 	  restarthread (motherofall);
3455 	}
3456 	
3457 	static int restarthread (void *threadv)
3458 	{
     	  /*
     	   * Finish restoring one thread.  threadv is the Thread record of the thread
     	   * being restored (it is used as a Thread pointer below).
     	   *
     	   * Steps, in order:
     	   *   1. restore_tls_state() for this thread
     	   *   2. for motherofall only: compute sigpending_global, install the
     	   *      STOPSIGNAL handler, call set_tid_address, invoke
     	   *      callback_post_ckpt(1) if set, restore terminal settings, optionally
     	   *      chdir to saved_working_directory, and restore the signal handlers
     	   *      when not running under DMTCP
     	   *   3. restore_sig_state() for this thread
     	   *   4. create each child thread with restarthread as its entry function
     	   *   5. setcontext() into the thread's saved context
     	   *
     	   * setcontext() is not expected to return; the trailing mtcp_abort() and
     	   * "return (0)" are only reached if it does.
     	   */
3459 	  int rip;
3460 	  Thread *child;
3461 	  Thread *const thread = threadv;
3462 	  struct MtcpRestartThreadArg mtcpRestartThreadArg;
3463 	
3464 	  restore_tls_state (thread);
3465 	
3466 	
3467 	  if (thread == motherofall) {
3468 	    // Compute the set of signals which was pending for all the threads at the
3469 	    // time of checkpoint. This is a heuristic to compute the set of signals
3470 	    // which were pending for the entire process at the time of checkpoint.
     	    // tmp starts as the full set and is intersected with each thread's
     	    // sigpending in turn, so sigpending_global ends up as the intersection.
3471 	    sigset_t tmp;
3472 	    sigfillset ( &tmp );
3473 	    Thread *th;
3474 	    for (th = threads; th != NULL; th = th -> next) {
3475 	      sigandset ( &sigpending_global, &tmp, &(th->sigpending) );
3476 	      tmp = sigpending_global;
3477 	    }
3478 	
3479 	    setup_sig_handler ();
3480 	
3481 	    set_tid_address (&(thread -> child_tid));
3482 	
3483 	    if (callback_post_ckpt != NULL) {
3484 	        DPRINTF(("mtcp finishrestore*: before callback_post_ckpt(1=restarting)"
3485 			 " (&%x,%x) \n",
3486 			 &callback_post_ckpt, callback_post_ckpt));
3487 	        (*callback_post_ckpt)(1);
3488 	        DPRINTF(("mtcp finishrestore*: after callback_post_ckpt(1=restarting)\n"));
3489 	    }
3490 	    /* Do it once only, in motherofall thread. */
3491 	
3492 	    restore_term_settings();
3493 	
3494 	    if (dmtcp_info_restore_working_directory
3495 	        && chdir(saved_working_directory) == -1) {
3496 	      perror("chdir");
3497 	      mtcp_abort ();
3498 	    }
3499 	
3500 	    /* DMTCP restores signal handlers.  But if we are running standalone,
3501 	     * MTCP must do it.
3502 	     * Because signal handlers are per-process, we only do this once.
3503 	     */
3504 	    if (!dmtcp_exists)
3505 	        restore_sig_handlers(thread);
3506 	  }
3507 	
3508 	  restore_sig_state (thread);
3509 	
3510 	  for (child = thread -> children; child != NULL; child = child -> siblings) {
3511 	
3512 	    /* Increment number of threads created but haven't completed their longjmp */
     	    /* The read-then-set pair is retried until mtcp_state_set reports success. */
3513 	
3514 	    do rip = mtcp_state_value(&restoreinprog);
3515 	    while (!mtcp_state_set (&restoreinprog, rip + 1, rip));
3516 	
3517 	    /* Create the thread so it can finish restoring itself.                       */
3518 	    /* Don't do CLONE_SETTLS (it'll puke).  We do it later via restore_tls_state. */
3519 	
3520 	    ///JA: v54b port
     	    // errno is preset so the error message below prints -1 unless the clone
     	    // call itself stores an error code.
3521 	    errno = -1;
3522 	
3523 	    void *clone_arg = (void *)child;
3524 	
3525 	    /*
3526 	     * DMTCP needs to know original_tid of the thread being created by the
3527 	     *  following clone() call.
3528 	     *
3529 	     * Threads are created by using syscall which is intercepted by DMTCP and
3530 	     *  the original_tid is sent to DMTCP as a field of MtcpRestartThreadArg
3531 	     *  structure. DMTCP will automatically extract the actual argument
3532 	     *  (clone_arg -> arg) from clone_arg and will pass it on to the real
3533 	     *  clone call.
3534 	     *                                                           (--Kapil)
3535 	     */
     	    /* NOTE(review): mtcpRestartThreadArg is a single local that is rewritten
     	     * for every child; presumably the DMTCP wrapper reads it before this
     	     * loop iterates again -- confirm.
     	     */
3536 	    mtcpRestartThreadArg.arg = (void *)child;
3537 	    mtcpRestartThreadArg.original_tid = child -> original_tid;
3538 	    clone_arg = (void *) &mtcpRestartThreadArg;
3539 	
3540 	   /*
3541 	    * syscall is wrapped by DMTCP when configured with PID-Virtualization.
3542 	    * It calls __clone which goes to DMTCP:__clone which then calls MTCP:__clone.
3543 	    * DMTCP:__clone checks for tid-conflict with any original tid. If
3544 	    * conflict, it replaces the thread with a new one with a new tid.
3545 	    * DMTCP:__clone wrapper calls the glibc:__clone if the computation is not
3546 	    * in RUNNING state (must be restarting), it calls the mtcp:__clone otherwise.
3547 	    * IF No PID-Virtualization, call glibc:__clone because threads created
3548 	    * during mtcp_restart should not go to MTCP:__clone; MTCP remembers those
3549 	    * threads from the checkpoint image.
3550 	    */
3551 	
3552 	    /* If running under DMTCP */
     	    /* Only the syscall path passes clone_arg; the clone_entry path passes
     	     * child itself.
     	     */
3553 	    pid_t tid;
3554 	    if (dmtcp_info_pid_virtualization_enabled == 1) {
3555 	      tid = syscall(SYS_clone, restarthread,
3556 	          (void *)(child -> savctx.SAVEDSP - 128),  // -128 for red zone
3557 	          (child -> clone_flags & ~CLONE_SETTLS) | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
3558 	          clone_arg, child -> parent_tidptr, NULL, child -> actual_tidptr);
3559 	    } else {
3560 	      tid = ((*clone_entry)( restarthread,
3561 		    (void *)(child -> savctx.SAVEDSP - 128),  // -128 for red zone
3562 	            (child -> clone_flags & ~CLONE_SETTLS) | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
3563 	            child, child -> parent_tidptr, NULL, child -> actual_tidptr));
3564 	    }
3565 	
3566 	    if (tid < 0) {
3567 	      mtcp_printf ("mtcp restarthread: error %d recreating thread\n", errno);
3568 	      mtcp_printf ("mtcp restarthread:   clone_flags %X, savedsp %p\n",
3569 	                   child -> clone_flags, child -> savctx.SAVEDSP);
3570 	      mtcp_abort ();
3571 	    }
3572 	    DPRINTF((" Parent:%d, tid of newly created thread:%d\n\n", thread->tid, tid));
3573 	  }
3574 	
3575 	  /* All my children have been created, jump to the stopthisthread routine just after getcontext call */
3576 	  /* Note that if this is the restored checkpointhread, it jumps to the checkpointhread routine       */
3577 	
3578 	  if (mtcp_have_thread_sysinfo_offset())
3579 	    mtcp_set_thread_sysinfo(saved_sysinfo);
3580 	  ///JA: v54b port
3581 	  DPRINTF (("mtcp restarthread*: calling setcontext: thread->tid: %d, original_tid:%d\n",
3582 	            thread->tid, thread->original_tid));
3583 	  setcontext (&(thread -> savctx)); /* Shouldn't return */
3584 	  mtcp_abort ();
3585 	  return (0); /* NOTREACHED : stop compiler warning */
3586 	}
3587 	
3588 	/********************************************************************************************************************************/
3589 	/*																*/
3590 	/*  Restore the GDT entries that are part of a thread's state									*/
3591 	/*																*/
3592 	/*  The kernel provides set_thread_area system call for a thread to alter a particular range of GDT entries, and it switches 	*/
3593 	/*  those entries on a per-thread basis.  So from our perspective, this is per-thread state that is saved outside user 		*/
3594 	/*  addressable memory that must be manually saved.										*/
3595 	/*																*/
3596 	/********************************************************************************************************************************/
3597 	
3598 	static void restore_tls_state (Thread *thisthread)
3599 	
3600 	{
3601 	  int rc;
3602 	#if MTCP__SAVE_MANY_GDT_ENTRIES
3603 	  int i;
3604 	#endif
3605 	
3606 	  /* The assumption that this points to the pid was checked by that tls_pid crap near the beginning */
3607 	
3608 	  *(pid_t *)(*(unsigned long *)&(thisthread -> gdtentrytls[0].base_addr) + TLS_PID_OFFSET()) = motherpid;
3609 	
3610 	  /* Likewise, we must jam the new pid into the mother thread's tid slot (checked by tls_tid carpola) */
3611 	
3612 	  if (thisthread == motherofall) {
3613 	    *(pid_t *)(*(unsigned long *)&(thisthread -> gdtentrytls[0].base_addr) + TLS_TID_OFFSET()) = motherpid;
3614 	  }
3615 	
3616 	  /* Restore all three areas */
3617 	
3618 	#if MTCP__SAVE_MANY_GDT_ENTRIES
3619 	  for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i ++) {
3620 	    rc = mtcp_sys_set_thread_area (&(thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN]));
3621 	    if (rc < 0) {
3622 	      mtcp_printf ("mtcp restore_tls_state: error %d restoring GDT TLS entry[%d]\n", mtcp_sys_errno, i);
3623 	      mtcp_abort ();
3624 	    }
3625 	  }
3626 	
3627 	  /* For newer Linuces, we just restore the one GDT entry that was indexed by GS */
3628 	
3629 	#else
3630 	  rc = mtcp_sys_set_thread_area (&(thisthread -> gdtentrytls[0]));
3631 	  if (rc < 0) {
3632 	    mtcp_printf ("mtcp restore_tls_state: error %d restoring GDT TLS entry[%d]\n", mtcp_sys_errno, thisthread -> gdtentrytls[0].entry_number);
3633 	    mtcp_abort ();
3634 	  }
3635 	#endif
3636 	
3637 	  /* Restore the rest of the stuff */
3638 	
3639 	#ifdef __i386__
3640 	  asm volatile ("movw %0,%%fs" : : "m" (thisthread -> fs));
3641 	  asm volatile ("movw %0,%%gs" : : "m" (thisthread -> gs));
3642 	#endif
3643 	#ifdef __x86_64__
3644 	/* Don't directly set fs.  It would only set 32 bits, and we just
3645 	 *  set the full 64-bit base of fs, using sys_set_thread_area,
3646 	 *  which called arch_prctl.
3647 	 *asm volatile ("movl %0,%%fs" : : "m" (thisthread -> fs));
3648 	 *asm volatile ("movl %0,%%gs" : : "m" (thisthread -> gs));
3649 	 */
3650 	#endif
3651 	
3652 	  thisthread -> tid = mtcp_sys_kernel_gettid ();
3653 	}
3654 	
3655 	/********************************************************************************************************************************/
3656 	/*																*/
3657 	/*  Set the thread's STOPSIGNAL handler.  Threads are sent STOPSIGNAL when they are to suspend execution of the application, 	*/
3658 	/*  save their state and wait for the checkpointhread to write the checkpoint file.						*/
3659 	/*																*/
3660 	/*    Output:															*/
3661 	/*																*/
3662 	/*	Calling thread will call stopthisthread () when sent a STOPSIGNAL							*/
3663 	/*																*/
3664 	/********************************************************************************************************************************/
3665 	
3666 	static void setup_sig_handler (void)
3667 	{
3668 	  struct sigaction act, old_act;
3669 	
3670 	  act.sa_handler = &stopthisthread;
3671 	  sigfillset(&act.sa_mask);
3672 	  act.sa_flags = SA_RESTART;
3673 	
3674 	  if (_real_sigaction(STOPSIGNAL, &act, &old_act) == -1) {
3675 	    mtcp_printf ("mtcp setupthread: error setting up signal handler: %s\n",
3676 	                 strerror (errno));
3677 	    mtcp_abort ();
3678 	  }
3679 	
3680 	  if ((old_act.sa_handler != SIG_IGN) && (old_act.sa_handler != SIG_DFL) && 
3681 	      (old_act.sa_handler != stopthisthread)) {
3682 	    mtcp_printf ("mtcp setupthread: signal handler %d already in use (%p).\n"
3683 	                 " You may employ a different signal by setting the\n"
3684 	                 " environment variable MTCP_SIGCKPT (or DMTCP_SIGCKPT)"
3685 			 " to the number\n of the signal MTCP should "
3686 	                 "use for checkpointing.\n", STOPSIGNAL, old_act.sa_handler);
3687 	    mtcp_abort ();
3688 	  }
3689 	}
3690 	
3691 	/********************************************************************************************************************************/
3692 	/*                                                                                                                              */
3693 	/*  Sync shared memory pages with backup files on disk                                                                          */
3694 	/*                                                                                                                              */
3695 	/********************************************************************************************************************************/
3696 	static void sync_shared_mem(void)
3697 	{
3698 	  int mapsfd;
3699 	  Area area;
3700 	
3701 	  mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
3702 	  if (mapsfd < 0) {
3703 	    mtcp_printf ("mtcp sync_shared_memory: error opening /proc/self/maps: %s\n",
3704 	                 strerror (mtcp_sys_errno));
3705 	    mtcp_abort ();
3706 	  }
3707 	
3708 	  while (readmapsline (mapsfd, &area)) {
3709 	    /* Skip anything that has no read or execute permission.  This occurs on one page in a Linux 2.6.9 installation.  No idea why.  This code would also take care of kernel sections since we don't have read/execute permission there.  */
3710 	
3711 	    if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE))) continue;
3712 	
3713 	    if (!(area.flags & MAP_SHARED)) continue;
3714 	
3715 	    if (strstr(area.name, " (deleted)")) continue;
3716 	
3717 	    DPRINTF(("mtcp sync_shared_memory: syncing %X at %p from %s + %X\n", area.size, area.addr, area.name, area.offset));
3718 	
3719 	    if ( msync(area.addr, area.size, MS_SYNC) < 0 ){
3720 	      mtcp_printf ("mtcp sync_shared_memory: error syncing %X at %p from %s + %X\n", area.size, area.addr, area.name, area.offset);
3721 	      mtcp_abort();
3722 	    }
3723 	  }
3724 	
3725 	  close (mapsfd);
3726 	}