1    	/*****************************************************************************
2    	 *   Copyright (C) 2006-2010 by Michael Rieker, Jason Ansel, Kapil Arya, and *
3    	 *                                                            Gene Cooperman *
4    	 *   mrieker@nii.net, jansel@csail.mit.edu, kapil@ccs.neu.edu, and           *
5    	 *                                                          gene@ccs.neu.edu *
6    	 *                                                                           *
7    	 *   This file is part of the MTCP module of DMTCP (DMTCP:mtcp).             *
8    	 *                                                                           *
9    	 *  DMTCP:mtcp is free software: you can redistribute it and/or              *
10   	 *  modify it under the terms of the GNU Lesser General Public License as    *
11   	 *  published by the Free Software Foundation, either version 3 of the       *
12   	 *  License, or (at your option) any later version.                          *
13   	 *                                                                           *
14   	 *  DMTCP:dmtcp/src is distributed in the hope that it will be useful,       *
15   	 *  but WITHOUT ANY WARRANTY; without even the implied warranty of           *
16   	 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
17   	 *  GNU Lesser General Public License for more details.                      *
18   	 *                                                                           *
19   	 *  You should have received a copy of the GNU Lesser General Public         *
20   	 *  License along with DMTCP:dmtcp/src.  If not, see                         *
21   	 *  <http://www.gnu.org/licenses/>.                                          *
22   	 *****************************************************************************/
23   	
24   	/********************************************************************************************************************************/
25   	/*																*/
26   	/*  Multi-threaded checkpoint library												*/
27   	/*																*/
28   	/*  Link this in as part of your program that you want checkpoints taken							*/
29   	/*  Call the mtcp_init routine at the beginning of your program									*/
30   	/*  Call the mtcp_ok routine when it's OK to do checkpointing									*/
31   	/*  Call the mtcp_no routine when you want checkpointing inhibited								*/
32   	/*																*/
33   	/*  This module also contains a __clone wrapper routine										*/
34   	/*																*/
35   	/********************************************************************************************************************************/
36   	
37   	
38   	// Set _GNU_SOURCE in order to expose glibc-defined sigandset()
39   	#define _GNU_SOURCE
40   	#include <asm/ldt.h>      // for struct user_desc
41   	//#include <asm/segment.h>  // for GDT_ENTRY_TLS_... stuff
42   	#include <dirent.h>
43   	#include <dlfcn.h>
44   	#include <errno.h>
45   	#include <fcntl.h>
46   	#include <pthread.h>
47   	#include <semaphore.h>
48   	#include <sched.h>
49   	#include <signal.h>
50   	#include <stdarg.h>
51   	#include <stdio.h>
52   	#include <stdlib.h>
53   	#include <string.h>
54   	#include <sys/mman.h>
55   	#include <sys/resource.h>
56   	#include <sys/sem.h>
57   	#include <sys/stat.h>
58   	#include <sys/syscall.h>
59   	#include <sys/ioctl.h>
60   	#include <termios.h>       // for tcdrain, tcsetattr, etc.
61   	#include <unistd.h>
62   	#include <ucontext.h>
63   	#include <sys/types.h>     // for gettid, tkill, waitpid
64   	#include <sys/wait.h>	   // for waitpid
65   	#include <linux/unistd.h>  // for gettid, tkill
66   	#include <gnu/libc-version.h>
67   	
68   	#define MTCP_SYS_STRCPY
69   	#define MTCP_SYS_STRLEN
70   	#include "mtcp_internal.h"
71   	
72   	/* required for ptrace sake */
73   	#include <sys/user.h>
74   	#include "mtcp_ptrace.h" 
75   	
/* Latch tested by the (currently disabled) DEBUG_WAIT spin loop below.
 * NOTE(review): presumably cleared from a debugger to release a spinning
 * thread -- confirm against other uses of WAIT in this file.
 */
static int WAIT=1;
// static int WAIT=0;

/* DEBUG_WAIT: when enabled, spins (no system call) while DEBUG_RESTARTING
 * and WAIT are both non-zero.  It is compiled out by default, in which
 * case DEBUG_WAIT expands to nothing.
 */
#if 0
// Force thread to stop, without use of a system call.
// WAIT is already defined above; defining it again here would be a
// redefinition error as soon as this branch is enabled.
# define DEBUG_WAIT \
if (DEBUG_RESTARTING) \
  {int i,j; \
    for (i = 0; WAIT && i < 1000000000; i++) \
      for (j = 0; j < 1000000000; j++) ; \
  }
#else
# define DEBUG_WAIT
#endif
91   	
/* Save every TLS GDT entry only when the headers expose
 * GDT_ENTRY_TLS_ENTRIES and we are not building for x86_64;
 * otherwise a single entry is saved.
 */
#if !defined(__x86_64__) && defined(GDT_ENTRY_TLS_ENTRIES)
# define MTCP__SAVE_MANY_GDT_ENTRIES 1
#else
# define MTCP__SAVE_MANY_GDT_ENTRIES 0
#endif

/* SAVEDSP names the slot of a ucontext_t (as filled in by getcontext ())
 * that holds the saved stack pointer: gregs[15] on x86_64, gregs[7]
 * elsewhere.
 */
#if defined(__x86_64__)
# define MYREG_RSP 15
# define SAVEDSP uc_mcontext.gregs[MYREG_RSP]
#else
# define MYREG_ESP 7
# define SAVEDSP uc_mcontext.gregs[MYREG_ESP]
#endif

/* The TLS segment register differs per architecture: gs on i386,
 * fs on x86_64. - Gene
 */
#if defined(__i386__)
# define TLSSEGREG gs
#endif
#if defined(__x86_64__)
# define TLSSEGREG fs
#endif
114  	
115  	/* Offset computed (&x.pid - &x) for
116  	 *   struct pthread x;
117  	 * as found in:  glibc-2.5/nptl/descr.h
118  	 * It was 0x4c and 0x48 for pid and tid for i386.
119  	 * Roughly, the definition is:
120  	 *glibc-2.5/nptl/descr.h:
121  	 * struct pthread
122  	 * {
123  	 *  union {
124  	 *   tcbheader_t tcbheader;
125  	 *   void *__padding[16];
126  	 *  };
127  	 *  list_t list;
128  	 *  pid_t tid;
129  	 *  pid_t pid;
130  	 *  ...
131  	 * } __attribute ((aligned (TCB_ALIGNMENT)));
132  	 *
133  	 *glibc-2.5/nptl/sysdeps/pthread/list.h:
134  	 * typedef struct list_head
135  	 * {
136  	 *  struct list_head *next;
137  	 *  struct list_head *prev;
138  	 * } list_t;
139  	 *
140  	 * NOTE: glibc-2.10 changes the size of __padding from 16 to 24.  --KAPIL
141  	 *
 * NOTE: glibc-2.10 further changes the size of tcbhead_t without updating the
 *       size of __padding in struct pthread. We need to add an extra 512 bytes
 *       to accommodate this.                                    -- KAPIL
145  	 */
146  	#if __GLIBC_PREREQ (2,12)
147  	/* WHEN WE HAVE CONFIDENCE IN THIS VERSION, REMOVE ALL OTHER __GLIBC_PREREQ
148  	 * AND MAKE THIS THE ONLY VERSION.  IT SHOULD BE BACKWARDS COMPATIBLE.
149  	 */
150  	/* These function definitions should succeed independently of the glibc version.
151  	 * They use get_thread_area() to match (tid, pid) and find offset.
152  	 * In other code, on restart, that offset is used to set (tid,pid) to
153  	 *   the latest tid and pid of the new thread, instead of the (tid,pid)
154  	 *   of the original thread.
155  	 * SEE: "struct pthread" in glibc-2.XX/nptl/descr.h for 'struct pthread'.
156  	 */
static int TLS_TID_OFFSET(void);

/* Can remove the unused attribute when this __GLIBC_PREREQ is the only one. */
static char *memsubarray (char *array, char *subarray, int len)
					 __attribute__ ((unused));
static int mtcp_get_tls_segreg(void);
static void *mtcp_get_tls_base_addr(void);

/* TLS_TID_OFFSET:
 *   Return the byte offset of the 'tid' field inside the calling thread's
 *   thread descriptor (glibc's 'struct pthread'), discovered at run time.
 *
 *   struct pthread keeps tid and pid in adjacent fields, in that order.  We
 *   build that (tid, pid) byte pattern from the kernel's answers and search
 *   for it starting at the address returned by mtcp_get_tls_base_addr().
 *   A false hit probably can't happen, since a new 'struct pthread' is
 *   zeroed out before tid and pid are stored in it.
 *
 *   The result is cached in a function-static, so the search runs once.
 *   Calls mtcp_abort() if the pattern is not found or if the offset is not
 *   a multiple of sizeof(int); only warns if the offset differs from the
 *   historically observed value.
 */
static int TLS_TID_OFFSET(void) {
  static int cached_tid_offset = -1;
  struct {pid_t tid; pid_t pid;} needle;
  char *descriptor;
  char *match;
#ifdef __x86_64__
  const size_t expected_offset = 512+26*sizeof(void *);
#else
  const size_t expected_offset = 26*sizeof(void *);
#endif

  /* Already computed by an earlier call. */
  if (cached_tid_offset != -1)
    return cached_tid_offset;

  needle.tid = mtcp_sys_kernel_gettid();
  needle.pid = mtcp_sys_getpid();

  /* The segment register / 8 is the entry_number of the "thread area"
   * (a 'struct user_desc'); its base_addr field points at this thread's
   * 'struct pthread'.  mtcp_get_tls_base_addr() returns that address.
   */
  descriptor = (char *)mtcp_get_tls_base_addr();

  match = memsubarray(descriptor, (char *)&needle, sizeof(needle));
  if (match == NULL) {
    mtcp_printf("MTCP:  Couldn't find offsets of tid/pid in thread_area.\n");
    mtcp_abort();
  }

  cached_tid_offset = match - descriptor;
  if ((size_t)cached_tid_offset != expected_offset)
    mtcp_printf("MTCP:  Warning:  tid_offset = %d; different from expected.\n"
                "  Continuing anyway.  If this fails, please try again.\n",
                cached_tid_offset);
  DPRINTF(("tid_offset: %d\n", cached_tid_offset));

  if ((size_t)cached_tid_offset % sizeof(int) != 0) {
    mtcp_printf("MTCP:  tid_offset is not divisible by sizeof(int).\n");
    mtcp_abort();
  }

  /* Should we do a double-check, and spawn a new thread and see
   *  if its TID matches at this offset?  This would give greater
   *  confidence, but for the reasons above, it's probably not necessary.
   */
  return cached_tid_offset;
}
/* TLS_PID_OFFSET:
 *   Return the byte offset of the 'pid' field inside the thread descriptor.
 *   pid is assumed to follow tid exactly as it does in the local
 *   {tid, pid} struct below, so the result is TLS_TID_OFFSET() plus the
 *   distance between those two members.  Cached after the first call.
 */
static int TLS_PID_OFFSET(void) {
  static int pid_offset = -1;
  if (pid_offset == -1) {
    struct {pid_t tid; pid_t pid;} tid_pid;
    /* Compute the member distance first.  The original expression added
     * tid_offset to a pointer into tid_pid before subtracting, which forms
     * a pointer far outside the object (undefined pointer arithmetic).
     */
    int pid_delta = (int)((char *)&(tid_pid.pid) - (char *)&tid_pid);
    pid_offset = TLS_TID_OFFSET() + pid_delta;
    DPRINTF(("pid_offset: %d\n", pid_offset));
  }
  return pid_offset;
}
226  	#elif __GLIBC_PREREQ (2,11)
227  	# ifdef __x86_64__
228  	#  define TLS_PID_OFFSET() \
229  	           (512+26*sizeof(void *)+sizeof(pid_t))  // offset of pid in pthread struct
230  	#  define TLS_TID_OFFSET() (512+26*sizeof(void *))  // offset of tid in pthread struct
231  	# else
232  	#  define TLS_PID_OFFSET() \
233  	           (26*sizeof(void *)+sizeof(pid_t))  // offset of pid in pthread struct
234  	#  define TLS_TID_OFFSET() (26*sizeof(void *))  // offset of tid in pthread struct
235  	# endif
236  	#elif __GLIBC_PREREQ (2,10)
237  	# define TLS_PID_OFFSET() \
238  		  (26*sizeof(void *)+sizeof(pid_t))  // offset of pid in pthread struct
239  	# define TLS_TID_OFFSET() (26*sizeof(void *))  // offset of tid in pthread struct
240  	#else
241  	# define TLS_PID_OFFSET() \
242  		  (18*sizeof(void *)+sizeof(pid_t))  // offset of pid in pthread struct
243  	# define TLS_TID_OFFSET() (18*sizeof(void *))  // offset of tid in pthread struct
244  	#endif
245  	
/* this call to gettid is hijacked by DMTCP for PID/TID-Virtualization.
 * The expansion is fully parenthesised so that it behaves as a single
 * operand wherever it is used.
 */
#define GETTID() ((int)syscall(SYS_gettid))

/* Semaphore on which mtcp_init() waits until the checkpoint thread has
 * finished initializing.
 */
sem_t sem_start;
250  	
251  	typedef struct Thread Thread;
252  	
253  	struct Thread { Thread *next;                       // next thread in 'threads' list
254  	                Thread **prev;                      // prev thread in 'threads' list
255  	                int tid;                            // this thread's id as returned by mtcp_sys_kernel_gettid ()
256  	                int original_tid;                   // this is the the thread's "original" tid
257  	                MtcpState state;                    // see ST_... below
258  	                Thread *parent;                     // parent thread (or NULL if top-level thread)
259  	                Thread *children;                   // one of this thread's child threads
260  	                Thread *siblings;                   // one of this thread's sibling threads
261  	
262  	                int clone_flags;                    // parameters to __clone that created this thread
263  	                int *parent_tidptr;
264  	                int *given_tidptr;                  // (this is what __clone caller passed in)
265  	                int *actual_tidptr;                 // (this is what we passed to the system call, either given_tidptr or &child_tid)
266  	                int child_tid;                      // this is used for child_tidptr if the original call did not
267  	                                                    // ... have both CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID
268  	                int (*fn) (void *arg);              // thread's initial function entrypoint and argument
269  	                void *arg;
270  	
271  	                sigset_t sigblockmask;              // blocked signals
272  	                sigset_t sigpending;                // pending signals
273  	
274  	                ///JA: new code ported from v54b
275  	                ucontext_t savctx;                  // context saved on suspend
276  	
277  	                mtcp_segreg_t fs, gs;               // thread local storage pointers
278  	                pthread_t pth;                      // added for pthread_join
279  	#if MTCP__SAVE_MANY_GDT_ENTRIES
280  	                struct user_desc gdtentrytls[GDT_ENTRY_TLS_ENTRIES];
281  	#else
282  	                struct user_desc gdtentrytls[1];
283  	#endif
284  	              };
285  	
286  	/*
287  	 * struct MtcpRestartThreadArg
288  	 *
 * DMTCP requires the original_tids of the threads being created during
 *  the RESTARTING phase. The MtcpRestartThreadArg structure is used to pass
 *  the original_tid of the thread being created from MTCP to DMTCP.
292  	 *
293  	 * actual clone call: clone (fn, child_stack, flags, void *, ... )
294  	 * new clone call   : clone (fn, child_stack, flags, (struct MtcpRestartThreadArg *), ...)
295  	 *
296  	 * DMTCP automatically extracts arg from this structure and passes that
297  	 * to the _real_clone call.
298  	 *
299  	 * IMPORTANT NOTE: While updating, this structure must be kept in sync
300  	 * with the structure defined with the same name in mtcpinterface.cpp
301  	 */
struct MtcpRestartThreadArg
{
  void  *arg;           /* the argument that the real clone call should receive */
  pid_t  original_tid;  /* original_tid of the thread being created */
};
306  	
/* Thread states (values stored in Thread.state). */

/* thread is running normally but with checkpointing disabled */
#define ST_RUNDISABLED  0
/* thread is running normally and has checkpointing enabled */
#define ST_RUNENABLED   1
/* thread is running normally with cp disabled, but checkpoint thread is
 * waiting for it to enable */
#define ST_SIGDISABLED  2
/* thread is running normally with cp enabled, and checkpoint thread has
 * signalled it to stop */
#define ST_SIGENABLED   3
/* thread context being saved (very brief) */
#define ST_SUSPINPROG   4
/* thread is suspended waiting for checkpoint to complete */
#define ST_SUSPENDED    5
/* thread is the checkpointing thread (special state just for that thread) */
#define ST_CKPNTHREAD   6
314  	
315  		/* Global data */
316  	
317  	void *mtcp_libc_dl_handle = NULL;  // dlopen handle for whatever libc.so is loaded with application program
318  	Area mtcp_libc_area;               // some area of that libc.so
319  	
320  	/* DMTCP Info Variables */
321  	
322  	/* These are reset by dmtcphijack.so at initialization. */
323  	int dmtcp_exists = 0; /* Are we running under DMTCP? */
324  	int dmtcp_info_pid_virtualization_enabled = 0;
325  	/* The following two DMTCP Info variables are defined in mtcp_printf.c */
326  	//int dmtcp_info_stderr_fd = 2;
327  	//int dmtcp_info_jassertlog_fd = -1;
328  	int dmtcp_info_restore_working_directory = -1;
329  	
330  		/* Static data */
331  	
332  	static sigset_t sigpending_global;                // pending signals for the process
333  	static char const *nscd_mmap_str = "/var/run/nscd/";    // OpenSUSE
334  	static char const *nscd_mmap_str2 = "/var/cache/nscd";  // Debian / Ubuntu
335  	static char const *nscd_mmap_str3 = "/var/db/nscd";     // RedHat (Linux 2.6.9)
336  	static char const *dev_zero_deleted_str = "/dev/zero (deleted)";
337  	static char const *dev_null_deleted_str = "/dev/null (deleted)";
338  	static char const *sys_v_shmem_file = "/SYSV";
339  	//static char const *perm_checkpointfilename = NULL;
340  	//static char const *temp_checkpointfilename = NULL;
341  	static char perm_checkpointfilename[MAXPATHLEN];
342  	static char temp_checkpointfilename[MAXPATHLEN];
343  	static size_t checkpointsize;
344  	static int intervalsecs;
345  	static pid_t motherpid;
346  	static size_t restore_size;
347  	static int showtiming;
348  	static int threadenabledefault;
349  	static int verify_count;  // number of checkpoints to go
350  	static int verify_total;  // value given by envar
351  	static pid_t mtcp_ckpt_gzip_child_pid = -1;
352  	static int volatile checkpointhreadstarting = 0;
353  	static MtcpState restoreinprog = MTCP_STATE_INITIALIZER;
354  	static MtcpState threadslocked = MTCP_STATE_INITIALIZER;
355  	static pthread_t checkpointhreadid;
356  	static struct timeval restorestarted;
357  	static int DEBUG_RESTARTING = 0;
358  	static Thread *motherofall = NULL;
359  	static Thread *ckpthread = NULL;
360  	static Thread *threads = NULL;
361  	struct sigaction sigactions[NSIG];  // signal handlers
362  	static VA restore_begin, restore_end;
363  	static void *restore_start; /* will be bound to fnc, mtcp_restore_start */
364  	static void *saved_sysinfo;
365  	static void *saved_heap_start = NULL;
366  	static char saved_working_directory[MTCP_MAX_PATH];
367  	static void (*callback_sleep_between_ckpt)(int sec) = NULL;
368  	static void (*callback_pre_ckpt)() = NULL;
369  	static void (*callback_post_ckpt)(int is_restarting) = NULL;
370  	static int  (*callback_ckpt_fd)(int fd) = NULL;
371  	static void (*callback_write_dmtcp_header)(int fd) = NULL;
372  	static void (*callback_restore_virtual_pid_table)() = NULL;
373  	
374  	static int (*clone_entry) (int (*fn) (void *arg),
375  	                           void *child_stack,
376  	                           int flags,
377  	                           void *arg,
378  	                           int *parent_tidptr,
379  	                           struct user_desc *newtls,
380  	                           int *child_tidptr);
381  	
382  	/* temp stack used internally by restore so we don't go outside the
383  	 *   libmtcp.so address range for anything;
384  	 * including "+ 1" since will set %esp/%rsp to tempstack+STACKSIZE
385  	 */
386  	static long long tempstack[STACKSIZE + 1];
387  	
388  		/* Internal routines */
389  	
390  	static long set_tid_address (int *tidptr);
391  	
392  	static char *memsubarray (char *array, char *subarray, int len)
393  						 __attribute__ ((unused));
394  	static int mtcp_get_tls_segreg(void);
395  	static void *mtcp_get_tls_base_addr(void);
396  	static int threadcloned (void *threadv);
397  	static void setupthread (Thread *thread);
398  	static void setup_clone_entry (void);
399  	static void threadisdead (Thread *thread);
400  	static void *checkpointhread (void *dummy);
401  	static int test_use_compression(void);
402  	static int open_ckpt_to_write(int fd, int pipe_fds[2], char *gzip_path);
403  	static void checkpointeverything (void);
404  	static void writefiledescrs (int fd);
405  	static void writememoryarea (int fd, Area *area,
406  				     int stack_was_seen, int vsyscall_exists);
407  	static void writecs (int fd, char cs);
408  	static void writefile (int fd, void const *buff, size_t size);
409  	static void preprocess_special_segments(int *vsyscall_exists);
410  	static void stopthisthread (int signum);
411  	static void wait_for_all_restored (void);
412  	static void save_sig_state (Thread *thisthread);
413  	static void restore_sig_state (Thread *thisthread);
414  	static void save_sig_handlers (void);
415  	static void restore_sig_handlers (Thread *thisthread);
416  	static void save_tls_state (Thread *thisthread);
417  	static void renametempoverperm (void);
418  	static Thread *getcurrenthread (void);
419  	static void lock_threads (void);
420  	static void unlk_threads (void);
421  	static int readmapsline (int mapsfd, Area *area);
422  	static void restore_heap(void);
423  	static void finishrestore (void);
424  	static int restarthread (void *threadv);
425  	static void restore_tls_state (Thread *thisthread);
426  	static void setup_sig_handler (void);
427  	static void sync_shared_mem(void);
428  	
429  	/* FIXME:
430  	 * dmtcp/src/syscallsreal.c has wrappers around signal, sigaction, sigprocmask
431  	 * The wrappers go to these mtcp_real_XXX versions so that MTCP can call
432  	 * the actual system calls and avoid the wrappers.  But if that is still
433  	 * an issue, then we can create mtcp_sys_signal(), etc., for direct calls.
434  	 *
435  	 * Update: 
436  	 * mtcp_real_XXX versions have been renamed to _real_XXX in DMTCP.
437  	 * sigprocmask should not be used in multi-threaded process, use
438  	 * pthread_sigmask instead.
439  	 */
440  	int _real_sigaction(int signum, const struct sigaction *act,
441  				struct sigaction *oldact){
442  	  if (dmtcp_exists) {
443  	    mtcp_printf("mtcp %s: This function mustn't be called when working under DMTCP\n",
444  	                __FUNCTION__);
445  	    mtcp_abort();
446  	  }
447  	  return sigaction(signum, act, oldact);
448  	}
449  	
450  	
451  	/********************************************************************************************************************************/
452  	/*																*/
453  	/*  This routine must be called at startup time to initiate checkpointing							*/
454  	/*																*/
455  	/*    Input:															*/
456  	/*																*/
457  	/*	checkpointfilename = name to give the checkpoint file									*/
458  	/*	interval = interval, in seconds, to write the checkpoint file								*/
459  	/*	clonenabledefault = 0 : clone checkpointing blocked by default (call mtcp_ok in the thread to enable)			*/
460  	/*	                    1 : clone checkpointing enabled by default (call mtcp_no in the thread to block if you want)	*/
461  	/*																*/
462  	/*	envar MTCP_WRAPPER_LIBC_SO = what library to use for inner wrappers (default libc.??.so)				*/
463  	/*	envar MTCP_VERIFY_CHECKPOINT = every n checkpoints, verify by doing a restore to resume					*/
464  	/*	                               default is 0, ie, don't ever verify							*/
465  	/*																*/
466  	/********************************************************************************************************************************/
467  	/* These hook functions provide an alternative to DMTCP callbacks, using
468  	 * weak symbols.  While MTCP is immature, let's allow both, in case
469  	 * the flexibility of a second hook mechanism is useful in the future.
470  	 * The mechanism is invisible unless end user compiles w/ -Wl,-export-dynamic
471  	 */
/* Default (empty) weak definitions of the three hook functions. */
__attribute__ ((weak)) void mtcpHookPreCheckpoint( void )
{
}

__attribute__ ((weak)) void mtcpHookPostCheckpoint( void )
{
}

__attribute__ ((weak)) void mtcpHookRestart( void )
{
}
477  	
478  	/* Statically allocate this.  Malloc is dangerous here if application is
479  	 *   defining its own (possibly not thread-safe) malloc routine.
480  	 */
481  	static Thread ckptThreadStorage;
482  	
483  	void mtcp_init (char const *checkpointfilename, int interval, int clonenabledefault)
484  	{
485  	  char *p, *tmp, *endp;
486  	  int len;
487  	  Thread *ckptThreadDescriptor = & ckptThreadStorage;
488  	  mtcp_segreg_t TLSSEGREG;
489  	#ifdef PTRACE 
490  	  init_thread_local();
491  	#endif
492  	
493  	  if (sizeof(void *) != sizeof(long)) {
494  	    mtcp_printf("ERROR: sizeof(void *) != sizeof(long) on this architecture.\n"
495  		   "       This code assumes they are equal.\n");
496  	    mtcp_abort ();
497  	  }
498  	
499  	#ifndef __x86_64__
500  	  /* Nobody else has a right to preload on internal processes generated
501  	   * by mtcp_check_XXX() -- not even DMTCP, if it's currently operating.
502  	   *
503  	   * Saving LD_PRELOAD in a temp env var and restoring it later --Kapil.
504  	   *
505  	   * TODO: To insert some sort of error checking to make sure that we
506  	   *       are correctly setting LD_PRELOAD after we are done with
507  	   *       vdso check.
508  	   */
509  	
510  	  // Shouldn't this removal of LD_PRELOAD be around fork/exec of gzip ?
511  	  // setenv( "MTCP_TMP_LD_PRELOAD", getenv("LD_PRELOAD"), 1);
512  	  // unsetenv("LD_PRELOAD");
513  	  // Allow user program to run with randomize_va
514  	  // mtcp_check_vdso_enabled();
515  	  // setenv("LD_PRELOAD", getenv("MTCP_TMP_LD_PRELOAD"), 1);
516  	  // unsetenv("MTCP_TMP_LD_PRELOAD");
517  	#endif
518  	
519  	#if 0
520  	  { struct user_desc u_info;
521  	    u_info.entry_number = 12;
522  	    if (-1 == mtcp_sys_get_thread_area(&u_info) && mtcp_sys_errno == ENOSYS)
523  	      mtcp_printf(
524  	        "Apparently, get_thread_area is not implemented in your kernel.\n"
525  	        "  If this doesn't work, please try on a more recent kernel,\n"
526  	        "  or one configured to support get_thread_area.\n"
527  	      );
528  	  }
529  	#endif
530  	
531  	  intervalsecs = interval;
532  	
533  	  if (strlen(mtcp_ckpt_newname) >= MAXPATHLEN) {
534  	    mtcp_printf("mtcp mtcp_init: new ckpt file name (%s) too long (>=512 bytes)\n",
535  	                mtcp_ckpt_newname);
536  	    mtcp_abort();
537  	  }
538  	  strncpy(perm_checkpointfilename,checkpointfilename,MAXPATHLEN);  // this is what user wants the checkpoint file called
539  	  len = strlen (perm_checkpointfilename);        // make up another name, same as that, with ".temp" on the end
540  	  memcpy(temp_checkpointfilename, perm_checkpointfilename, len);
541  	  strncpy(temp_checkpointfilename + len, ".temp",MAXPATHLEN-len);
542  	                                                 // ... we use it to write to in case we crash while writing
543  	                                                 //     we will leave the previous good one intact
544  	
545  	#ifdef PTRACE
546  	  /* TODO:  USE flock WHEN WRITING TO THESE THREE FILES (NOT YET DONE FOR ptrace_setoptions_file? */
547  	  memset(ptrace_shared_file, '\0', MAXPATHLEN);
548  	  sprintf(ptrace_shared_file, "%s/ptrace_shared_file.txt", dir);
549  	  memset(ptrace_setoptions_file, '\0', MAXPATHLEN);
550  	  sprintf(ptrace_setoptions_file, "%s/ptrace_setoptions_file.txt", dir);
551  	  memset(checkpoint_threads_file, '\0', MAXPATHLEN);
552  	  sprintf(checkpoint_threads_file, "%s/checkpoint_threads_file.txt", dir);
553  	#endif
554  	
555  	  DPRINTF (("mtcp_init*: main tid %d\n", mtcp_sys_kernel_gettid ()));
556  	  /* If MTCP_INIT_PAUSE set, sleep 15 seconds and allow for gdb attach. */
557  	  if (getenv("MTCP_INIT_PAUSE")) {
558  	    mtcp_printf("Pausing 15 seconds.  Do:  gdb attach %d\n", mtcp_sys_getpid());
559  	    sleep(15);
560  	  }
561  	
562  	  threadenabledefault = clonenabledefault;       // save this away where it's easy to get
563  	
564  	  p = getenv ("MTCP_SHOWTIMING");
565  	  showtiming = ((p != NULL) && (*p & 1));
566  	
567  	  /* Maybe dump out some stuff about the TLS */
568  	
569  	  mtcp_dump_tls (__FILE__, __LINE__);
570  	
571  	  /* Save this process's pid.  Then verify that the TLS has it where it should be.           */
572  	  /* When we do a restore, we will have to modify each thread's TLS with the new motherpid. */
573  	  /* We also assume that GS uses the first GDT entry for its descriptor.                    */
574  	
575  	  motherpid = mtcp_sys_getpid (); /* libc/getpid can lie if we had
576  					   * used kernel fork() instead of libc fork().
577  					   */
578  	  {
579  	    pid_t tls_pid, tls_tid;
580  	    tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_PID_OFFSET());
581  	    tls_tid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_TID_OFFSET());
582  	
583  	    if ((tls_pid != motherpid) || (tls_tid != motherpid)) {
584  	      mtcp_printf ("mtcp_init: getpid %d, tls pid %d, tls tid %d, must all match\n",
585  	                    motherpid, tls_pid, tls_tid);
586  	      mtcp_abort ();
587  	    }
588  	  }
589  	
590  	  /* Get verify envar */
591  	
592  	  tmp = getenv ("MTCP_VERIFY_CHECKPOINT");
593  	  verify_total = 0;
594  	  if (tmp != NULL) {
595  	    verify_total = strtol (tmp, &p, 0);
596  	    if ((*p != '\0') || (verify_total < 0)) {
597  	      mtcp_printf ("mtcp_init: bad MTCP_VERIFY_CHECKPOINT %s\n", tmp);
598  	      mtcp_abort ();
599  	    }
600  	  }
601  	
602  	  /* If the user has defined a signal, use that to suspend.  Otherwise, use MTCP_DEFAULT_SIGNAL */
603  	
604  	  tmp = getenv("MTCP_SIGCKPT");
605  	  if (tmp == NULL)
606  	      STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
607  	  else
608  	  {
609  	      errno = 0;
610  	      STOPSIGNAL = strtol(tmp, &endp, 0);
611  	
612  	      if ((errno != 0) || (tmp == endp))
613  	      {
614  	          mtcp_printf("mtcp_init: Your chosen SIGCKPT of \"%s\" does not "
615  	                        "translate to a number,\n"
616  				"  and cannot be used.  Signal %d "
617  	                        "will be used instead.\n", tmp, MTCP_DEFAULT_SIGNAL);
618  	          STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
619  	      }
620  	      else if (STOPSIGNAL < 1 || STOPSIGNAL > 31)
621  	      {
622  	          mtcp_printf("mtcp_init: Your chosen SIGCKPT of \"%d\" is not a valid "
623  	                        "signal, and cannot be used.\n"
624  				"  Signal %d will be used instead.\n",
625  			       STOPSIGNAL, MTCP_DEFAULT_SIGNAL);
626  	          STOPSIGNAL = MTCP_DEFAULT_SIGNAL;
627  	      }
628  	  }
629  	
630  	  /* Set up signal handler so we can interrupt the thread for checkpointing */
631  	  setup_sig_handler ();
632  	
633  	  /* Get size and address of the shareable - used to separate it from the rest of the stuff */
634  	  /* All routines needed to perform restore must be within this address range               */
635  	
636  	  restore_begin = (((VA)mtcp_shareable_begin) & -MTCP_PAGE_SIZE);
637  	  restore_size  = ((VA)mtcp_shareable_end - restore_begin + MTCP_PAGE_SIZE - 1) & -MTCP_PAGE_SIZE;
638  	  restore_end   = restore_begin + restore_size;
639  	  restore_start = mtcp_restore_start;
640  	
641  	  /* Setup clone_entry to point to glibc's __clone routine */
642  	
643  	  setup_clone_entry ();
644  	
645  	  /* Set up caller as one of our threads so we can work on it */
646  	
647  	  memset (ckptThreadDescriptor, 0, sizeof *ckptThreadDescriptor);
648  	  setupthread (ckptThreadDescriptor);
649  	  ckptThreadDescriptor -> child_tid = mtcp_sys_kernel_gettid (); // need to set this up so the checkpointhread can see we haven't exited
650  	  set_tid_address (&(ckptThreadDescriptor -> child_tid));  // we are assuming mtcp_init has been called before application may have called set_tid_address
651  	                                             // ... or else we will end up overwriting that set_tid_address value
652  	  motherofall = ckptThreadDescriptor;
653  	
654  	  /* Spawn off a thread that will perform the checkpoints from time to time */
655  	
656  	  checkpointhreadstarting = 1;
657  	  /* If we return from a fork(), we don't know what is the semaphore value. */
658  	  errno = 0;
659  	  while (sem_trywait(&sem_start) == -1 && (errno == EAGAIN || errno == EINTR)) {
660  	    if ( errno == EAGAIN )
661  	      sem_post(&sem_start);
662  	    errno = 0;
663  	  }
664  	  if (errno != 0)
665  	    perror("ERROR: continue anyway from " __FILE__ ":mtcp_init:sem_trywait()");
666  	  /* Now we successfully locked it.  The sempaphore value is zero. */
667  	  if (pthread_create (&checkpointhreadid, NULL, checkpointhread, NULL) < 0) {
668  	    mtcp_printf ("mtcp_init: error creating checkpoint thread: %s\n", strerror (errno));
669  	    mtcp_abort ();
670  	  }
671  	  if (checkpointhreadstarting) mtcp_abort ();  // make sure the clone wrapper executed (ie, not just the standard clone)
672  	  /* Stop until checkpoint thread has finished initializing.
673  	   * Some programs (like gcl) implement their own glibc functions in
674  	   * a non-thread-safe manner.  In case we're using non-thread-safe glibc,
675  	   * don't run the checkpoint thread and user thread at the same time.
676  	   */
677  	  errno = 0;
678  	  while (-1 == sem_wait(&sem_start) && errno == EINTR)
679  	    errno = 0;
680  	  /* The child thread checkpointhread will now wake us. */
681  	}
682  	
683  	/********************************************************************************************************************************
684  	 *
685  	 *  The routine mtcp_set_callbacks below may be called BEFORE the first
686  	 *  MTCP checkpoint, to add special functionality to checkpointing
687  	 *
688  	 *    Its arguments (callback functions) are:
689  	 *
690  	 * sleep_between_ckpt:  Called in between checkpoints to replace the default "sleep(sec)" functionality,
691  	 *                      when this function returns checkpoint will start
692  	 * pre_ckpt:            Called after all user threads are suspended, but BEFORE checkpoint written
693  	 * post_ckpt:           Called after checkpoint, and after restore.  is_restarting will be 1 for restore 0 for after checkpoint
694  	 * ckpt_fd:             Called to test if mtcp should checkpoint a given FD returns 1 if it should
695  	 *
696  	 *******************************************************************************************************************************/
697  	
698  	void mtcp_set_callbacks(void (*sleep_between_ckpt)(int sec),
699  	                        void (*pre_ckpt)(),
700  	                        void (*post_ckpt)(int is_restarting),
701  	                        int  (*ckpt_fd)(int fd),
702  	                        void (*write_dmtcp_header)(int fd),
703  	                        void (*restore_virtual_pid_table)())
704  	{
705  	    callback_sleep_between_ckpt = sleep_between_ckpt;
706  	    callback_pre_ckpt = pre_ckpt;
707  	    callback_post_ckpt = post_ckpt;
708  	    callback_ckpt_fd = ckpt_fd;
709  	    callback_write_dmtcp_header = write_dmtcp_header;
710  	    callback_restore_virtual_pid_table = restore_virtual_pid_table;
711  	}
712  	
713  	/*************************************************************************/
714  	/*						                         */
715  	/*  Dump out the TLS stuff pointed to by %gs	                         */
716  	/*						                         */
717  	/*************************************************************************/
718  	
void mtcp_dump_tls (char const *file, int line)
{
  /* Debugging aid.  The entire body below is compiled out by '#if 000', so
   * as built this function does nothing and ignores both arguments.
   * When enabled, 'file' and 'line' are only echoed in the first message
   * so the call site can be identified.
   */
#if 000
  int i, j, mypid;  /* NOTE(review): 'mypid' is not used anywhere below */
  sigset_t blockall, oldsigmask;
  struct user_desc gdtentry;
  unsigned char byt;
  unsigned short gs;

  /* Shared by all callers: 0 = free, 1 = held (see the loop below) */
  static int mutex = 0;

  /* Block all signals whilst we have the futex */

  memset (&blockall, -1, sizeof blockall);
  if (sigprocmask (SIG_SETMASK, &blockall, &oldsigmask) < 0) {
    abort ();
  }

  /* Block other threads from doing this so the output doesn't mix */

  while (!atomic_setif_int (&mutex, 1, 0)) {
    mtcp_sys_futex (&mutex, FUTEX_WAIT, 1, NULL, NULL, 0);
  }

  /* Get the segment for the TLS stuff */

  asm volatile ("movw %%gs,%0" : "=g" (gs));
  mtcp_printf("mtcp_init: gs=%X at %s:%d\n", gs, file, line);
  if (gs != 0) {

    /* We only handle GDT based stuff */

    if (gs & 4) mtcp_printf("   *** part of LDT\n");

    /* It's in the GDT */

    else {

      /* Read the TLS descriptor */

      gdtentry.entry_number = gs / 8;  /* selector value divided by 8 is used as the entry number */
      i = mtcp_sys_get_thread_area (&gdtentry);
      if (i < 0) mtcp_printf("  error getting GDT entry %d: %d\n", gdtentry.entry_number, mtcp_sys_errno);
      else {

        /* Print out descriptor and first 80 bytes of data */

        mtcp_printf("  limit %X, baseaddr %X\n", gdtentry.limit, gdtentry.base_addr);

        /* First dump: read each byte through the %gs segment.  Rows are 16
         * bytes, printed from offset +15 down to +0, with a space before
         * every group of four bytes. */
        for (i = 0; i < 80; i += 16) {
          for (j = 16; -- j >= 0;) {
            if ((j & 3) == 3) fputc (' ', stderr);
            asm volatile ("movb %%gs:(%1),%0" : "=r" (byt) : "r" (i + j));
            mtcp_printf("%2.2X", byt);
          }
          mtcp_printf(" : gs+%2.2X\n", i);
        }

        /* Second dump: same layout, but read through the descriptor's
         * base address instead of through %gs. */
        for (i = 0; i < 80; i += 16) {
          for (j = 16; -- j >= 0;) {
            if ((j & 3) == 3) fputc (' ', stderr);
            byt = ((unsigned char *)gdtentry.base_addr)[i+j];
            mtcp_printf("%2.2X", byt);
          }
          mtcp_printf(" : %8.8X\n", gdtentry.base_addr + i);
        }

        /* Offset 4C should be the process id */

        asm volatile ("mov %%gs:0x4C,%0" : "=r" (i));
        mtcp_printf("mtcp_init: getpid=%d, gettid=%d, tls=%d\n", getpid (), mtcp_sys_kernel_gettid (), i);
      }
    }
  }

  /* Release mutex and restore signal delivery */

  mutex = 0;
  mtcp_sys_futex (&mutex, FUTEX_WAKE, 1, NULL, NULL, 0);
  /* NOTE(review): the mask was saved with sigprocmask() above but is
   * restored with _real_sigprocmask() here -- confirm which is intended. */
  if (_real_sigprocmask (SIG_SETMASK, &oldsigmask, NULL) < 0) {
    abort ();
  }
#endif
}
801  	
802  	/*****************************************************************************/
803  	/*									     */
804  	/*  This is our clone system call wrapper				     */
805  	/*									     */
806  	/*    Note:								     */
807  	/*									     */
808  	/*      pthread_create eventually calls __clone to create threads	     */
809  	/*      It uses flags = 0x3D0F00:					     */
810  	/*	      CLONE_VM = VM shared between processes			     */
811  	/*	      CLONE_FS = fs info shared between processes (root, cwd, umask) */
812  	/*	   CLONE_FILES = open files shared between processes (fd table)	     */
813  	/*	 CLONE_SIGHAND = signal handlers and blocked signals shared	     */
814  	/*	 			 (sigaction common to parent and child)	     */
815  	/*	  CLONE_THREAD = add to same thread group			     */
816  	/*	 CLONE_SYSVSEM = share system V SEM_UNDO semantics		     */
817  	/*	  CLONE_SETTLS = create a new TLS for the child from newtls parameter*/
818  	/*	 CLONE_PARENT_SETTID = set the TID in the parent (before MM copy)    */
819  	/*	CLONE_CHILD_CLEARTID = clear the TID in the child and do	     */
820  	/*				 futex wake at that address		     */
821  	/*	      CLONE_DETACHED = create clone detached			     */
822  	/*									     */
823  	/*****************************************************************************/
824  	
825  	int __clone (int (*fn) (void *arg), void *child_stack, int flags, void *arg,
826  		     int *parent_tidptr, struct user_desc *newtls, int *child_tidptr)
827  	{
828  	  int rc;
829  	  Thread *thread;
830  	#ifdef PTRACE
831  	  int i;
832  	#endif
833  	
834  	  /* Maybe they decided not to call mtcp_init */
835  	  if (motherofall != NULL) {
836  	
837  	    /* They called mtcp_init meaning we are to do checkpointing.
838  	     * So we are going to track this thread.
839  	     */
840  	
841  	    thread = malloc (sizeof *thread);
842  	    memset (thread, 0, sizeof *thread);
843  	    thread -> fn     = fn;   // this is the user's function
844  	    thread -> arg    = arg;  // ... and the parameter
845  	    thread -> parent = getcurrenthread ();
846  	    if (checkpointhreadstarting) {
847  	      checkpointhreadstarting = 0;
848  	      mtcp_state_init(&thread->state, ST_CKPNTHREAD);
849  	    } else {
850  	      mtcp_state_init(&thread->state, ST_RUNDISABLED);
851  	    }
852  	
853  	    DPRINTF (("mtcp wrapper clone*: calling clone thread=%p,"
854  		      " fn=%p, flags=0x%X\n", thread, fn, flags));
855  	    DPRINTF (("mtcp wrapper clone*: parent_tidptr=%p, newtls=%p,"
856  		      " child_tidptr=%p\n", parent_tidptr, newtls, child_tidptr));
857  	    //asm volatile ("int3");
858  	
859  	    /* Save exactly what the caller is supplying */
860  	
861  	    thread -> clone_flags   = flags;
862  	    thread -> parent_tidptr = parent_tidptr;
863  	    thread -> given_tidptr  = child_tidptr;
864  	
865  	    /* We need the CLEARTID feature so we can detect			     */
866  	    /*   when the thread has exited					     */
867  	    /* So if the caller doesn't want it, we enable it                        */
868  	    /* Retain what the caller originally gave us so we can pass the tid back */
869  	
870  	    if (!(flags & CLONE_CHILD_CLEARTID)) {
871  	      child_tidptr = &(thread -> child_tid);
872  	    }
873  	    thread -> actual_tidptr = child_tidptr;
874  	    DPRINTF (("mtcp wrapper clone*: thread %p -> actual_tidptr %p\n",
875  		      thread, thread -> actual_tidptr));
876  	
877  	    /* Alter call parameters, forcing CLEARTID and make it call the wrapper routine */
878  	
879  	    flags |= CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID;
880  	    fn = threadcloned;
881  	    arg = thread;
882  	  }
883  	
884  	  /* mtcp_init not called, no checkpointing, but make sure clone_entry is */
885  	  /* set up so we can call the real clone                                 */
886  	
887  	  else if (clone_entry == NULL) setup_clone_entry ();
888  	
889  	  /* Now create the thread */
890  	
891  	  DPRINTF (("mtcp wrapper clone*: clone fn=%p, child_stack=%p, flags=%X, arg=%p\n", fn, child_stack, flags, arg));
892  	  DPRINTF (("mtcp wrapper clone*: parent_tidptr=%p, newtls=%p, child_tidptr=%p\n", parent_tidptr, newtls, child_tidptr));
893  	  rc = (*clone_entry) (fn, child_stack, flags, arg, parent_tidptr, newtls, child_tidptr);
894  	  if (rc < 0) {
895  	    DPRINTF (("mtcp wrapper clone*: clone rc=%d, errno=%d\n", rc, errno));
896  	  } else {
897  	    DPRINTF (("mtcp wrapper clone*: clone rc=%d\n", rc));
898  	  }
899  	
900  	#ifdef PTRACE
901  	 /*************************************************************************/
902  	  /*  Code added to keep record of new tasks and processes in a file       */
903  	  /*************************************************************************/
904  	
905  	  // initialize the ptrace_tid_pairs array  
906  	  if (!init_ptrace_pairs) {
907  	    for (i = 0; i < MAX_PTRACE_PAIRS_COUNT; i++) {
908  	      ptrace_pairs[i].last_command = PTRACE_UNSPECIFIED_COMMAND;
909  	      ptrace_pairs[i].singlestep_waited_on = FALSE;
910  	      ptrace_pairs[i].free = TRUE;
911  	      ptrace_pairs[i].inferior_st = 'u'; // undefined
912  	    }
913  	    init_ptrace_pairs = 1;
914  	  }
915  	
916  	  // initialize the semaphore used when motherofall reads the ptrace shared file  
917  	  if (!init_ptrace_read_pairs_sem) {
918  	    sem_init(&ptrace_read_pairs_sem, 0, 0);
919  	    init_ptrace_read_pairs_sem = 1;
920  	  }
921  	
922  	  if (!init__sem) {
923  	    sem_init(&__sem, 0, 1);
924  	    init__sem = 1;
925  	  }
926  	
927  	  if (is_ptrace_setoptions == TRUE) writeptraceinfo (setoptions_superior, rc);
928  	  else {
929  	    // read from file
930  	    int setoptions_fd = -1;
931  	    pid_t inferior;
932  	    pid_t superior;
933  	
934  	    setoptions_fd = open(ptrace_setoptions_file, O_RDONLY);
935  	
936  	    if (setoptions_fd != -1) {
937  	      while (readall(setoptions_fd, &superior, sizeof(pid_t)) > 0) {
938  	        readall(setoptions_fd, &inferior, sizeof(pid_t));
939  	  if (inferior == GETTID()) {
940  	    setoptions_superior = superior;
941  	    is_ptrace_setoptions = TRUE;
942  	    writeptraceinfo (setoptions_superior, rc);
943  	  }
944  	      }
945  	      if ( close(setoptions_fd) != 0 ) {
946  	        mtcp_printf("__clone: Error closing file: %s\n",
947  	                    strerror(errno));
948  	  mtcp_abort();
949  	      }
950  	    }
951  	  }
952  	  /* the structure of checkpoint_threads_file is pairs of pid and tid */
953  	  write_info_to_file (2, getpid(), rc);
954  	  /*************************************************************************/
955  	  /*  Done recording new tasks and processes.                              */
956  	  /*************************************************************************/
957  	#endif
958  	
959  	  return (rc);
960  	}
961  	
962  	void fill_in_pthread (pid_t tid, pthread_t pth) {
963  	  struct Thread *thread;
964  	  for (thread = threads; thread != NULL; thread = thread -> next) {
965  	    if (thread -> tid == tid) {
966  	      thread -> pth = pth;
967  	      break;
968  	    }
969  	  }
970  	}
971  	
972  	void delete_thread_on_pthread_join (pthread_t pth) {
973  	  struct Thread *thread;
974  	  for (thread = threads; thread != NULL; thread = thread -> next) {
975  	    if (thread -> pth == pth) {
976  	      threadisdead (thread);
977  	      break;
978  	    }
979  	  }
980  	}
981  	
/* Define 'clone' as a global function-type symbol that is an alias for the
 * __clone wrapper defined in this file. */
asm (".global clone ; .type clone,@function ; clone = __clone");
983  	
984  	/*****************************************************************************/
985  	/*									     */
986  	/*  This routine is called (via clone) as the top-level routine of a thread  */
987  	/*  that we are tracking.						     */
988  	/*									     */
989  	/*  It fills in remaining items of our thread struct, calls the user function,*/
990  	/*  then cleans up the thread struct before exiting.			     */
991  	/*									     */
992  	/*****************************************************************************/
993  	
994  	static int threadcloned (void *threadv)
995  	
996  	{
997  	  int rc;
998  	  Thread *const thread = threadv;
999  	
1000 	  DPRINTF (("mtcp threadcloned*: starting thread %p\n", thread));
1001 	
1002 	  setupthread (thread);
1003 	
1004 	  /* The new TLS should have the process ID in place at TLS_PID_OFFSET() */
1005 	  /* This is a verification step and is therefore optional as such     */
1006 	  {
1007 	    pid_t  tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + TLS_PID_OFFSET());
1008 	    if ((tls_pid != motherpid) && (tls_pid != (pid_t)-1)) {
1009 	      mtcp_printf ("mtcp threadcloned: getpid %d, tls pid %d at offset %d, must match\n",
1010 	                    motherpid, tls_pid, TLS_PID_OFFSET());
1011 	      mtcp_printf ("      %X\n", motherpid);
1012 	      for (rc = 0; rc < 256; rc += 4) {
1013 	        tls_pid = *(pid_t *) (mtcp_get_tls_base_addr() + rc);
1014 	        mtcp_printf ("   %d: %X", rc, tls_pid);
1015 	        if ((rc & 31) == 28) mtcp_printf ("\n");
1016 	      }
1017 	      mtcp_abort ();
1018 	    }
1019 	  }
1020 	
1021 	  /* If the caller wants the child tid but didn't have CLEARTID, pass the tid back to it */
1022 	
1023 	  if ((thread -> clone_flags & (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) == CLONE_CHILD_SETTID) {
1024 	    *(thread -> given_tidptr) = thread -> child_tid;
1025 	  }
1026 	
1027 	  /* Maybe enable checkpointing by default */
1028 	
1029 	  if (threadenabledefault) mtcp_ok ();
1030 	
1031 	#ifdef PTRACE
1032 	  init_thread_local();
1033 	#endif
1034 	
1035 	  /* Call the user's function for whatever processing they want done */
1036 	
1037 	  DPRINTF (("mtcp threadcloned*: calling %p (%p)\n", thread -> fn, thread -> arg));
1038 	  rc = (*(thread -> fn)) (thread -> arg);
1039 	  DPRINTF (("mtcp threadcloned*: returned %d\n", rc));
1040 	
1041 	  /* Make sure checkpointing is inhibited while we clean up and exit */
1042 	  /* Otherwise, checkpointer might wait forever for us to re-enable  */
1043 	
1044 	  mtcp_no ();
1045 	
1046 	  /* Do whatever to unlink and free thread block */
1047 	
1048 	  threadisdead (thread);
1049 	
1050 	  /* Return the user's status as the exit code */
1051 	
1052 	  return (rc);
1053 	}
1054 	
1055 	/*****************************************************************************/
1056 	/*									     */
1057 	/*  set_tid_address wrapper routine					     */
1058 	/*									     */
1059 	/*  We save the new address of the tidptr that will get cleared when the     */
1060 	/*  thread exits							     */
1061 	/*									     */
1062 	/*****************************************************************************/
1063 	
1064 	static long set_tid_address (int *tidptr)
1065 	
1066 	{
1067 	  long rc;
1068 	  Thread *thread;
1069 	
1070 	  thread = getcurrenthread ();
1071 	  DPRINTF (("set_tid_address wrapper*: thread %p -> tid %d, tidptr %p\n",
1072 		    thread, thread -> tid, tidptr));
1073 	  thread -> actual_tidptr = tidptr;  // save new tidptr so subsequent restore will create with new pointer
1074 	  rc = mtcp_sys_set_tid_address(tidptr);
1075 	  return (rc);                       // now we tell kernel to change it for the current thread
1076 	}
1077 	
1078 	/*****************************************************************************/
1079 	/*									     */
1080 	/*  Link thread struct to the lists and finish filling it in		     */
1081 	/*									     */
1082 	/*    Input:								     */
1083 	/*									     */
1084 	/*	thread = thread to set up					     */
1085 	/*									     */
1086 	/*    Output:								     */
1087 	/*									     */
1088 	/*	thread linked to 'threads' list and 'motherofall' tree		     */
1089 	/*	thread -> tid = filled in with thread id			     */
1090 	/*	thread -> state = ST_RUNDISABLED (thread initially has checkpointing */
1091 	/*        disabled)							     */
1092 	/*	signal handler set up						     */
1093 	/*									     */
1094 	/*****************************************************************************/
1095 	
1096 	static void setupthread (Thread *thread)
1097 	
1098 	{
1099 	  Thread *parent;
1100 	
1101 	  /* Save the thread's ID number and put in threads list so we can look it up                                    */
1102 	  /* Set state to disable checkpointing so checkpointer won't race between adding to list and setting up handler */
1103 	
1104 	  thread -> tid = mtcp_sys_kernel_gettid ();
1105 	  thread -> original_tid = GETTID ();
1106 	
1107 	  DPRINTF (("mtcp setupthread*: thread %p -> tid %d\n", thread, thread->tid));
1108 	
1109 	  lock_threads ();
1110 	
1111 	  if ((thread -> next = threads) != NULL) {
1112 	    thread -> next -> prev = &(thread -> next);
1113 	  }
1114 	  thread -> prev = &threads;
1115 	  threads = thread;
1116 	
1117 	  parent = thread -> parent;
1118 	  if (parent != NULL) {
1119 	    thread -> siblings = parent -> children;
1120 	    parent -> children = thread;
1121 	  }
1122 	
1123 	  unlk_threads ();
1124 	}
1125 	
1126 	/*****************************************************************************/
1127 	/*									     */
1128 	/*  Set up 'clone_entry' variable					     */
1129 	/*									     */
1130 	/*    Output:								     */
1131 	/*									     */
1132 	/*	clone_entry = points to clone routine within libc.so		     */
1133 	/*									     */
1134 	/*****************************************************************************/
1135 	
1136 	static void setup_clone_entry (void)
1137 	
1138 	{
1139 	  char *p, *tmp;
1140 	  int mapsfd;
1141 	
1142 	  /* Get name of whatever concoction we have for a libc shareable image */
1143 	  /* This is used by the wrapper routines                               */
1144 	
1145 	  tmp = getenv ("MTCP_WRAPPER_LIBC_SO");
1146 	  if (tmp != NULL) {
1147 	    if (strlen(tmp) >= sizeof(mtcp_libc_area.name)) {
1148 	      mtcp_printf("mtcp setup_clone_entry: libc area name (%s) too long (>=1024 chars)\n",
1149 	                  tmp);
1150 	      mtcp_abort();
1151 	    }
1152 	    strncpy (mtcp_libc_area.name, tmp, sizeof mtcp_libc_area.name);
1153 	  } else {
1154 	    mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
1155 	    if (mapsfd < 0) {
1156 	      mtcp_printf ("mtcp_init: error opening /proc/self/maps: %s\n", strerror (mtcp_sys_errno));
1157 	      mtcp_abort ();
1158 	    }
1159 	    p = NULL;
1160 	    while (readmapsline (mapsfd, &mtcp_libc_area)) {
1161 	      p = strstr (mtcp_libc_area.name, "/libc");
1162 	      if ((p != NULL) && ((p[5] == '-') || (p[5] == '.'))) break;
1163 	    }
1164 	    close (mapsfd);
1165 	    if (p == NULL) {
1166 	      mtcp_printf ("mtcp_init: cannot find */libc[-.]* in /proc/self/maps\n");
1167 	      mtcp_abort ();
1168 	    }
1169 	  }
1170 	  mtcp_libc_dl_handle = dlopen (mtcp_libc_area.name, RTLD_LAZY | RTLD_GLOBAL);
1171 	  if (mtcp_libc_dl_handle == NULL) {
1172 	    mtcp_printf ("mtcp_init: error opening libc shareable %s: %s\n", mtcp_libc_area.name, dlerror ());
1173 	    mtcp_abort ();
1174 	  }
1175 	
1176 	  /* Find the clone routine therein */
1177 	
1178 	  clone_entry = mtcp_get_libc_symbol ("__clone");
1179 	}
1180 	
1181 	/********************************************************************************************************************************/
1182 	/*																*/
1183 	/*  Thread has exited - unlink it from lists and free struct									*/
1184 	/*																*/
1185 	/*    Input:															*/
1186 	/*																*/
1187 	/*	thread = thread that has exited												*/
1188 	/*																*/
1189 	/*    Output:															*/
1190 	/*																*/
1191 	/*	thread removed from 'threads' list and motherofall tree									*/
1192 	/*	thread pointer no longer valid												*/
1193 	/*	checkpointer woken if waiting for this thread										*/
1194 	/*																*/
1195 	/********************************************************************************************************************************/
1196 	
1197 	static void threadisdead (Thread *thread)
1198 	
1199 	{
1200 	  Thread **lthread, *parent, *xthread;
1201 	
1202 	  lock_threads ();
1203 	
1204 	  DPRINTF (("mtcp threadisdead*: thread %p -> tid %d\n", thread, thread -> tid));
1205 	
1206 	  /* Remove thread block from 'threads' list */
1207 	
1208 	  if ((*(thread -> prev) = thread -> next) != NULL) {
1209 	    thread -> next -> prev = thread -> prev;
1210 	  }
1211 	
1212 	  /* Remove thread block from parent's list of children */
1213 	
1214 	  parent = thread -> parent;
1215 	  if (parent != NULL) {
1216 	    for (lthread = &(parent -> children); (xthread = *lthread) != thread; lthread = &(xthread -> siblings)) {}
1217 	    *lthread = xthread -> siblings;
1218 	  }
1219 	
1220 	  /* If this thread has children, give them to its parent */
1221 	
1222 	  if (parent != NULL) {
1223 	    while ((xthread = thread -> children) != NULL) {
1224 	      thread -> children = xthread -> siblings;
1225 	      xthread -> siblings = parent -> children;
1226 	      parent -> children = xthread;
1227 	    }
1228 	  } else {
1229 	    while ((xthread = thread -> children) != NULL) {
1230 	      thread -> children = xthread -> siblings;
1231 	      xthread -> siblings = motherofall;
1232 	      motherofall = xthread;
1233 	    }
1234 	  }
1235 	
1236 	  unlk_threads ();
1237 	
1238 	  /* If checkpointer is waiting for us, wake it to see this thread no longer in list */
1239 	
1240 	  mtcp_state_futex (&(thread -> state), FUTEX_WAKE, 1, NULL);
1241 	
1242 	  mtcp_state_destroy( &(thread -> state) );
1243 	
1244 	  free (thread);
1245 	}
1246 	
1247 	void *mtcp_get_libc_symbol (char const *name)
1248 	
1249 	{
1250 	  void *temp;
1251 	
1252 	  temp = dlsym (mtcp_libc_dl_handle, name);
1253 	  if (temp == NULL) {
1254 	    mtcp_printf ("mtcp_get_libc_symbol: error getting %s from %s: %s\n",
1255 	                 name, mtcp_libc_area.name, dlerror ());
1256 	    mtcp_abort ();
1257 	  }
1258 	  return (temp);
1259 	}
1260 	
1261 	/********************************************************************************************************************************/
1262 	/*																*/
1263 	/*  Call this when it's OK to checkpoint											*/
1264 	/*																*/
1265 	/********************************************************************************************************************************/
1266 	
1267 	int mtcp_ok (void)
1268 	
1269 	{
1270 	  Thread *thread;
1271 	
1272 	  if (getenv("MTCP_NO_CHECKPOINT"))
1273 	    return 0;
1274 	  thread = getcurrenthread ();
1275 	
1276 	again:
1277 	  switch (mtcp_state_value(&thread -> state)) {
1278 	
1279 	    /* Thread was running normally with checkpointing disabled.  Enable checkpointing then just return. */
1280 	
1281 	    case ST_RUNDISABLED: {
1282 	      if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_RUNDISABLED)) goto again;
1283 	      return (0);
1284 	    }
1285 	
1286 	    /* Thread was running normally with checkpointing already enabled.  So just return as is. */
1287 	
1288 	    case ST_RUNENABLED: {
1289 	      return (1);
1290 	    }
1291 	
1292 	    /* Thread was running with checkpointing disabled, but the checkpointhread wants to write a checkpoint.  So mark the thread  */
1293 	    /* as having checkpointing enabled, then just 'manually' call the signal handler as if the signal to suspend were just sent. */
1294 	
1295 	    case ST_SIGDISABLED: {
1296 	      if (!mtcp_state_set (&(thread -> state), ST_SIGENABLED, ST_SIGDISABLED)) goto again;
1297 	      stopthisthread (0);
1298 	      return (0);
1299 	    }
1300 	
1301 	    /* Thread is running with checkpointing enabled, but the checkpointhread wants to write a checkpoint and has sent a signal */
1302 	    /* telling the thread to call 'stopthisthread'.  So we'll just keep going as is until the signal is actually delivered.    */
1303 	
1304 	    case ST_SIGENABLED: {
1305 	      return (1);
1306 	    }
1307 	
1308 	    /* Thread is the checkpointhread so we just ignore the call (from threadcloned routine). */
1309 	
1310 	    case ST_CKPNTHREAD: {
1311 	      return (-1);
1312 	    }
1313 	
1314 	    /* How'd we get here? */
1315 	
1316 	    default: {
1317 	      mtcp_abort ();
1318 	      return (0); /* NOTREACHED : stop compiler warning */
1319 	    }
1320 	  }
1321 	}
1322 	
1323 	/* Likewise, disable checkpointing */
1324 	
1325 	int mtcp_no (void)
1326 	{
1327 	  Thread *thread;
1328 	
1329 	  if (getenv("MTCP_NO_CHECKPOINT"))
1330 	    return 0;
1331 	  thread = getcurrenthread ();
1332 	
1333 	again:
1334 	  switch (mtcp_state_value(&thread -> state)) {
1335 	    case ST_RUNDISABLED: {
1336 	      return (0);
1337 	    }
1338 	
1339 	    case ST_RUNENABLED: {
1340 	      if (!mtcp_state_set (&(thread -> state), ST_RUNDISABLED, ST_RUNENABLED)) goto again;
1341 	      return (1);
1342 	    }
1343 	
1344 	    case ST_SIGDISABLED: {
1345 	      return (0);
1346 	    }
1347 	
1348 	    case ST_SIGENABLED: {
1349 	      stopthisthread (0);
1350 	      goto again;
1351 	    }
1352 	
1353 	    default: {
1354 	      mtcp_abort ();
1355 	      return (0); /* NOTREACHED : stop compiler warning */
1356 	    }
1357 	  }
1358 	}
1359 	
1360 	/* This is used by ../dmtcp/src/mtcpinterface.cpp */
1361 	void mtcp_kill_ckpthread (void)
1362 	{
1363 	  Thread *thread;
1364 	
1365 	  lock_threads ();
1366 	  for (thread = threads; thread != NULL; thread = thread -> next) {
1367 	    if ( mtcp_state_value(&thread -> state) == ST_CKPNTHREAD ) {
1368 	      unlk_threads ();
1369 	      DPRINTF(("mtcp_kill_ckpthread: Kill checkpinthread, tid=%d\n",thread->tid));
1370 	      mtcp_sys_kernel_tkill(thread -> tid, STOPSIGNAL);
1371 	      return;
1372 	    }
1373 	  }
1374 	  unlk_threads ();
1375 	}
1376 	
1377 	
1378 	/*************************************************************************/
1379 	/*						                         */
1380 	/*  Save and restore terminal settings.		                         */
1381 	/*						                         */
1382 	/*************************************************************************/
1383 	
/* Nonzero once save_term_settings() found stdin to be a tty and read its attributes */
static int saved_termios_exists = 0;
/* Attributes of stdin captured by save_term_settings(); valid only if the flag above is set */
static struct termios saved_termios;
/* Window size of stdin as returned by the TIOCGWINSZ ioctl in save_term_settings() */
static struct winsize win;
1387 	
1388 	static void save_term_settings() {
1389 	  saved_termios_exists = ( isatty(STDIN_FILENO)
1390 	  		           && tcgetattr(STDIN_FILENO, &saved_termios) >= 0 );
1391 	  if (saved_termios_exists)
1392 	    ioctl (STDIN_FILENO, TIOCGWINSZ, (char *) &win);
1393 	}
int safe_tcsetattr(int fd, int optional_actions,
		   const struct termios *termios_p) {
  /* Apply *termios_p to 'fd' repeatedly until the attributes read back
   * stop changing.
   *
   *   fd               = terminal file descriptor
   *   optional_actions = passed through to tcsetattr (TCSANOW, TCSADRAIN,
   *                      TCSAFLUSH)
   *   termios_p        = attributes to apply
   *
   * Returns 0 once two consecutive read-backs are identical, or -1 as soon
   * as tcsetattr or tcgetattr fails.
   */
  struct termios old_termios, new_termios;
  /* We will compare old and new, and we don't want uninitialized data */
  memset(&new_termios, 0, sizeof(new_termios));
  /* tcsetattr returns success as long as at least one of requested
   * changes was executed.  So, repeat until no more changes.
   */
  do {
    memcpy(&old_termios, &new_termios, sizeof(new_termios));
    /* Honor the caller's optional_actions rather than forcing TCSANOW */
    if (tcsetattr(fd, optional_actions, termios_p) == -1) return -1;
    if (tcgetattr(fd, &new_termios) == -1) return -1;
  } while (memcmp(&new_termios, &old_termios, sizeof(new_termios)) != 0);
  return 0;
}
1409 	static void restore_term_settings() {
1410 	  if (saved_termios_exists){
1411 	    /* First check if we are in foreground. If not, skip this and print
1412 	     *   warning.  If we try to call tcsetattr in background, we will hang up.
1413 	     */
1414 	    int foreground = (tcgetpgrp(STDIN_FILENO) == getpgrp());
1415 	    DPRINTF(("restore terminal attributes, check foreground status first: %d\n",
1416 	             foreground));
1417 	    if (foreground) {
1418 	      if ( ( ! isatty(STDIN_FILENO)
1419 	             || safe_tcsetattr(STDIN_FILENO, TCSANOW, &saved_termios) == -1) )
1420 	        DPRINTF(("WARNING: mtcp finishrestore*: failed to restore terminal\n"));
1421 	      else {
1422 	        struct winsize cur_win;
1423 	        DPRINTF(("mtcp finishrestore*: restored terminal\n"));
1424 	        ioctl (STDIN_FILENO, TIOCGWINSZ, (char *) &cur_win);
1425 		/* ws_row/ws_col was probably not 0/0 prior to checkpoint.  We change
1426 		 * it back to last known row/col prior to checkpoint, and then send a
1427 		 * SIGWINCH (see below) to notify process that window might have changed
1428 		 */
1429 	        if (cur_win.ws_row == 0 && cur_win.ws_col == 0)
1430 	          ioctl (STDIN_FILENO, TIOCSWINSZ, (char *) &win);
1431 	      }
1432 	    } else {
1433 	      DPRINTF(("WARNING: mtcp finishrestore*: skip restore terminal step\n"
1434 		       " -- we are in BACKGROUND\n"));
1435 	    }
1436 	  }
1437 	  if (kill(getpid(), SIGWINCH) == -1) {}  /* No remedy if error */
1438 	}
1439 	
1440 	
1441 	/*************************************************************************/
1442 	/*						                         */
1443 	/*  This executes as a thread.  It sleeps for the checkpoint interval    */
1444 	/*    seconds, then wakes to write the checkpoint file.			 */
1445 	/*						                         */
1446 	/*************************************************************************/
1447 	
1448 	static void *checkpointhread (void *dummy)
1449 	{
1450 	  int needrescan;
1451 	  struct timespec sleeperiod;
1452 	  struct timeval started, stopped;
1453 	  Thread *thread;
1454 	  char * dmtcp_checkpoint_filename = NULL;
1455 	
1456 	  /* This is the start function of the checkpoint thread.
1457 	   * We also call getcontext to get a snapshot of this call frame,
1458 	   * since we will never exit this call frame.  We always return
1459 	   * to this call frame at time of startup, on restart.  Hence, restart
1460 	   * will forget any modifications to our local variables since restart.
1461 	   */
1462 	  static int originalstartup = 1;
1463 	
1464 	#ifdef PTRACE
1465 	  init_thread_local();
1466 	  check_size_for_ptrace_file (ptrace_shared_file);
1467 	  check_size_for_ptrace_file (ptrace_setoptions_file);
1468 	  check_size_for_ptrace_file (checkpoint_threads_file);
1469 	#endif
1470 	
1471 	  /* We put a timeout in case the thread being waited for exits whilst we are waiting */
1472 	
1473 	  static struct timespec const enabletimeout = { 10, 0 };
1474 	
1475 	  DPRINTF (("mtcp checkpointhread*: %d started\n", mtcp_sys_kernel_gettid ()));
1476 	
1477 	  /* Set up our restart point, ie, we get jumped to here after a restore */
1478 	
1479 	  ckpthread = getcurrenthread ();
1480 	
1481 	  save_sig_state( ckpthread );
1482 	  save_tls_state (ckpthread);
1483 	  /* Release user thread after we've initialized. */
1484 	  sem_post(&sem_start);
1485 	  if (getcontext (&(ckpthread -> savctx)) < 0) mtcp_abort ();
1486 	
1487 	  DPRINTF (("mtcp checkpointhread*: after getcontext. current_tid %d, original_tid:%d\n",
1488 	        mtcp_sys_kernel_gettid(), ckpthread->original_tid));
1489 	  if (originalstartup)
1490 	    originalstartup = 0;
1491 	  else {
1492 	
1493 	    /* We are being restored.  Wait for all other threads to finish being restored before resuming checkpointing. */
1494 	
1495 	    DPRINTF (("mtcp checkpointhread*: waiting for other threads after restore\n"));
1496 	    wait_for_all_restored ();
1497 	#ifdef PTRACE
1498 	    create_file (GETTID());
1499 	#endif
1500 	    DPRINTF (("mtcp checkpointhread*: resuming after restore\n"));
1501 	  }
1502 	
1503 	  /* Reset the verification counter - on init, this will set it to it's start value. */
1504 	  /* After a verification, it will reset it to its start value.  After a normal      */
1505 	  /* restore, it will set it to its start value.  So this covers all cases.          */
1506 	
1507 	  verify_count = verify_total;
1508 	  DPRINTF (("After verify count mtcp checkpointhread*: %d started\n",
1509 		    mtcp_sys_kernel_gettid ()));
1510 	
1511 	  while (1) {
1512 	#ifdef PTRACE
1513 	    int ptraced_by = 0;
1514 	#endif
1515 	
1516 	    /* Wait a while between writing checkpoint files */
1517 	
1518 	    if (callback_sleep_between_ckpt == NULL)
1519 	    {
1520 	        memset (&sleeperiod, 0, sizeof sleeperiod);
1521 	        sleeperiod.tv_sec = intervalsecs;
1522 	        while ((nanosleep (&sleeperiod, &sleeperiod) < 0) && (errno == EINTR)) {}
1523 	    }
1524 	    else
1525 	    {
1526 	        DPRINTF(("mtcp checkpointhread*: before callback_sleep_between_ckpt(%d)\n",intervalsecs));
1527 	        (*callback_sleep_between_ckpt)(intervalsecs);
1528 	        DPRINTF(("mtcp checkpointhread*: after callback_sleep_between_ckpt(%d)\n",intervalsecs));
1529 	    }
1530 	
1531 	    mtcp_sys_gettimeofday (&started, NULL);
1532 	    checkpointsize = 0;
1533 	
1534 	#ifdef PTRACE
1535 	    // Refresh ptrace information
1536 	    has_ptrace_file = 0;
1537 	    delete_ptrace_leader = -1;
1538 	    has_setoptions_file = 0;
1539 	    delete_setoptions_leader = -1;
1540 	    has_checkpoint_file = 0;
1541 	    delete_checkpoint_leader = -1;
1542 	    process_ptrace_info( &delete_ptrace_leader, &has_ptrace_file,
1543 	                         &delete_setoptions_leader, &has_setoptions_file,
1544 	                         &delete_checkpoint_leader, &has_checkpoint_file);
1545 	
1546 	    for (thread = threads; thread != NULL; thread = thread -> next) {
1547 	      int i;
1548 	      for (i = 0; i < ptrace_pairs_count; i++) {
1549 	        DPRINTF(("COMPARE: intf=%d, tid=%d\n",
1550 	                 ptrace_pairs[i].inferior, thread->original_tid));
1551 	        if( ptrace_pairs[i].inferior == thread->original_tid ){
1552 	          ptraced_by = ptrace_pairs[i].superior;
1553 	          break;
1554 	        }
1555 	      }
1556 	      if( ptraced_by )
1557 	        break;
1558 	    }
1559 	
1560 	    DPRINTF(("\n\n%d ptraced by %d\n\n",(thread) ? thread->tid : 0,ptraced_by));
1561 	    if( ptraced_by ){
1562 	      DPRINTF(("\n\n%d Wait for superior %d\n\n",thread->tid,ptraced_by));
1563 	      ptrace_wait4(ptraced_by);
1564 	      //sleep(1);
1565 	      DPRINTF(("\n\n%d Wait for superior %d - SUCCESS\n\n",thread->tid,ptraced_by));
1566 	    }
1567 	#endif 
1568 	
1569 	    /* Halt all other threads - force them to call stopthisthread                    */
1570 	    /* If any have blocked checkpointing, wait for them to unblock before signalling */
1571 	
1572 	rescan:
1573 	    needrescan = 0;
1574 	    lock_threads ();
1575 	    for (thread = threads; thread != NULL; thread = thread -> next) {
1576 	
1577 	      /* If thread no longer running, remove it from thread list */
1578 	
1579 	again:
1580 	      if (*(thread -> actual_tidptr) == 0) {
1581 	        DPRINTF (("mtcp checkpointhread*: thread %d disappeared\n", thread -> tid));
1582 	        unlk_threads ();
1583 	        threadisdead (thread);
1584 	        goto rescan;
1585 	      }
1586 	
1587 	      /* Do various things based on thread's state */
1588 	
1589 	      switch (mtcp_state_value (&thread -> state) ) {
1590 	
1591 	        /* Thread is running but has checkpointing disabled    */
1592 	        /* Tell the mtcp_ok routine that we are waiting for it */
1593 	        /* We will need to rescan so we will see it suspended  */
1594 	
1595 	        case ST_RUNDISABLED: {
1596 	          if (!mtcp_state_set (&(thread -> state), ST_SIGDISABLED, ST_RUNDISABLED)) goto again;
1597 	          needrescan = 1;
1598 	          break;
1599 	        }
1600 	
1601 	        /* Thread is running and has checkpointing enabled                 */
1602 	        /* Send it a signal so it will call stopthisthread                 */
1603 	        /* We will need to rescan (hopefully it will be suspended by then) */
1604 	
1605 	        case ST_RUNENABLED: {
1606 	          if (!mtcp_state_set (&(thread -> state), ST_SIGENABLED, ST_RUNENABLED)) goto again;
1607 	#ifdef PTRACE
1608 	          ptrace_save_threads_state ();
1609 	          int index;  
1610 	          char inferior_st = 'N';
1611 	          char inf_st;
1612 	          for (index = 0; index < ptrace_pairs_count; index++) {
1613 	            inf_st = procfs_state(ptrace_pairs[index].inferior);
1614 	            DPRINTF(("tid = %d now=%c stored=%c superior = %d inferior = %d\n",
1615 	                     GETTID(), inf_st, ptrace_pairs[index].inferior_st,
1616 	                     ptrace_pairs[index].superior, ptrace_pairs[index].inferior));
1617 	            if (ptrace_pairs[index].inferior == thread -> original_tid) {
1618 	              inferior_st = ptrace_pairs[index].inferior_st;
1619 	              break;
1620 	            }
1621 	          }
1622 	          DPRINTF(("%d %c\n", GETTID(), inferior_st));
1623 	          if (inferior_st == 'N') {
1624 	            // superior 
1625 	            if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1626 	              if (mtcp_sys_errno != ESRCH) {
1627 	                mtcp_printf("mtcp checkpointhread: error signalling thread %d: %s\n",
1628 	                            thread -> tid, strerror (mtcp_sys_errno));
1629 	              }
1630 	              unlk_threads ();
1631 	              threadisdead (thread);
1632 	              goto rescan;
1633 	            }
1634 	          }
1635 	          else {
1636 	            // inferior 
1637 	            DPRINTF(("++++++++++++++++++++++++++++++++%c %d\n", inferior_st, thread -> original_tid));
1638 	            if (inferior_st != 'T') {
1639 	            if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1640 	                if (mtcp_sys_errno != ESRCH) {
1641 	                  mtcp_printf ("mtcp checkpointhread: error signalling thread %d: %s\n",
1642 	                               thread -> tid, strerror (mtcp_sys_errno));
1643 	                }
1644 	                unlk_threads ();
1645 	                threadisdead (thread);
1646 	                goto rescan;
1647 	              }
1648 	            }
1649 	            create_file( thread -> original_tid );
1650 	          }
1651 	#else
1652 	          if (mtcp_sys_kernel_tkill (thread -> tid, STOPSIGNAL) < 0) {
1653 	            if (mtcp_sys_errno != ESRCH) {
1654 	              mtcp_printf ("mtcp checkpointhread: error signalling thread %d: %s\n",
1655 	                           thread -> tid, strerror (mtcp_sys_errno));
1656 	            }
1657 	            unlk_threads ();
1658 	            threadisdead (thread);
1659 	            goto rescan;
1660 	          }
1661 	#endif
1662 	          needrescan = 1;
1663 	          break;
1664 	        }
1665 	
1666 	        /* Thread is running, we have signalled it to stop, but it has
1667 		 * checkpointing disabled.  So we wait for it to change state.
1668 	         * We have to unlock because it may need lock to change state.
1669 		 */
1670 	
1671 	        case ST_SIGDISABLED: {
1672 	          unlk_threads ();
1673 	          mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SIGDISABLED,
1674 				    &enabletimeout);
1675 	          goto rescan;
1676 	        }
1677 	
1678 	        /* Thread is running and we have sent signal to stop it             */
1679 	        /* So we have to wait for it to change state (enter signal handler) */
1680 	        /* We have to unlock because it may try to use lock meanwhile       */
1681 	
1682 	        case ST_SIGENABLED: {
1683 	          unlk_threads ();
1684 	          mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SIGENABLED,
1685 				    &enabletimeout);
1686 	          goto rescan;
1687 	        }
1688 	
1689 	        /* Thread has entered signal handler and is saving its context.
1690 	         * So we have to wait for it to finish doing so.  We don't need
1691 		 * to unlock because it won't use lock before changing state.
1692 		 */
1693 	
1694 	        case ST_SUSPINPROG: {
1695 	          mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SUSPINPROG,
1696 				    &enabletimeout);
1697 	          goto again;
1698 	        }
1699 	
1700 	        /* Thread is suspended and all ready for us to write checkpoint file */
1701 	
1702 	        case ST_SUSPENDED: {
1703 	          break;
1704 	        }
1705 	
1706 	        /* Don't do anything to the checkpointhread (this) thread */
1707 	
1708 	        case ST_CKPNTHREAD: {
1709 	          break;
1710 	        }
1711 	
1712 	        /* Who knows? */
1713 	
1714 	        default: {
1715 	          mtcp_abort ();
1716 	        }
1717 	      }
1718 	    }
1719 	    unlk_threads ();
1720 	
1721 	    /* If need to rescan (ie, some thread possibly not in ST_SUSPENDED STATE),
1722 	     * check them all again
1723 	     */
1724 	
1725 	    if (needrescan) goto rescan;
1726 	    RMB; // matched by WMB in stopthisthread
1727 	    DPRINTF (("mtcp checkpointhread*: everything suspended\n"));
1728 	
1729 	    /* If no threads, we're all done */
1730 	
1731 	    if (threads == NULL) {
1732 	      DPRINTF (("mtcp checkpointhread*: exiting (no threads)\n"));
1733 	      return (NULL);
1734 	    }
1735 	
1736 	    /* Call weak symbol of this file, possibly overridden by user's strong symbol.
1737 	     * User must compile his/her code with -Wl,-export-dynamic to make it visible.
1738 	     */
1739 	    mtcpHookPreCheckpoint();
1740 	
1741 	    if (!dmtcp_exists) {
1742 	      save_sig_handlers();
1743 	    }
1744 	
1745 	    /* All other threads halted in 'stopthisthread' routine (they are all
1746 	     * in state ST_SUSPENDED).  It's safe to write checkpoint file now.
1747 	     */
1748 	    if (callback_pre_ckpt != NULL){
1749 	      // Here we want to synchronize the shared memory pages with the backup files
1750 	      DPRINTF(("mtcp checkpointhread*: syncing shared memory with backup files\n"));
1751 	      sync_shared_mem();
1752 	
1753 	      DPRINTF(("mtcp checkpointhread*: before callback_pre_ckpt() (&%x,%x) \n",
1754 		       &callback_pre_ckpt, callback_pre_ckpt));
1755 	      (*callback_pre_ckpt)(&dmtcp_checkpoint_filename);
1756 	      if (dmtcp_checkpoint_filename &&
1757 	          strcmp(dmtcp_checkpoint_filename, "/dev/null") != 0) {
1758 	        mtcp_sys_strcpy(perm_checkpointfilename, dmtcp_checkpoint_filename);
1759 	        DPRINTF(("mtcp checkpointhread*: Checkpoint filename changed to %s\n",
1760 			perm_checkpointfilename));
1761 	      }
1762 	    }
1763 	
1764 	#ifdef PTRACE
1765 	    /* If old stale files of these names exist, we append, with big problems
1766 	     * It's okay if files don't exist and unlink fails.
1767 	     * Pre_ckpt is a barrier from coordinator.  So, all processes finished
1768 	     *  reading ptrace pairs from files prior to this barrier.
1769 	     */
1770 	    unlink(ptrace_shared_file);
1771 	    unlink(ptrace_setoptions_file);
1772 	    unlink(checkpoint_threads_file);
1773 	#endif
1774 	
1775 	    mtcp_saved_break = (void*) mtcp_sys_brk(NULL);  // kernel returns mm->brk when passed zero
1776 	    /* Do this once, same for all threads.  But restore for each thread. */
1777 	    if (mtcp_have_thread_sysinfo_offset())
1778 	      saved_sysinfo = mtcp_get_thread_sysinfo();
1779 	    /* Do this once.  It's the same for all threads. */
1780 	    save_term_settings();
1781 	
1782 	    if (getcwd(saved_working_directory, MTCP_MAX_PATH) == NULL) {
1783 	      // buffer wasn't large enough
1784 	      perror("getcwd");
1785 	      mtcp_printf ("getcwd failed.");
1786 	      mtcp_abort ();
1787 	    }
1788 	
1789 	    DPRINTF (("mtcp checkpointhread*: mtcp_saved_break=%p\n", mtcp_saved_break));
1790 	
1791 	    if ( dmtcp_checkpoint_filename == NULL ||
1792 	         strcmp (dmtcp_checkpoint_filename, "/dev/null") != 0) {
1793 	      checkpointeverything ();
1794 	    } else {
1795 	      mtcp_printf("mtcp checkpointhread*:  received \'/dev/null\'" \
1796 			  " as ckpt filename.\n*** Skipping checkpoint. ***\n");
1797 	    }
1798 	
1799 	    if (callback_post_ckpt != NULL){
1800 	        DPRINTF(("mtcp checkpointhread*: before callback_post_ckpt() (&%x,%x) \n",
1801 			 &callback_post_ckpt, callback_post_ckpt));
1802 	        (*callback_post_ckpt)(0);
1803 	    }
1804 	    if (showtiming) {
1805 	      mtcp_sys_gettimeofday (&stopped, NULL);
1806 	      stopped.tv_usec += (stopped.tv_sec - started.tv_sec) * 1000000 - started.tv_usec;
1807 	      mtcp_printf ("mtcp checkpoint: time %u uS, size %u megabytes," \
1808 			   " avg rate %u MB/s\n",
1809 	                   stopped.tv_usec, (unsigned int)(checkpointsize / 1000000),
1810 	                   (unsigned int)(checkpointsize / stopped.tv_usec));
1811 	    }
1812 	
1813 	    /* Call weak symbol of this file, possibly overridden by user's strong symbol.
1814 	     * User must compile his/her code with -Wl,-export-dynamic to make it visible.
1815 	     */
1816 	    mtcpHookPostCheckpoint();
1817 	
1818 	    /* Resume all threads.  But if we're doing a checkpoint verify,
1819 	     * abort all threads except the main thread, as we don't want them
1820 	     * running when we exec the mtcp_restore program.
1821 	     */
1822 	
1823 	    DPRINTF (("mtcp checkpointhread*: resuming everything\n"));
1824 	    lock_threads();
1825 	    for (thread = threads; thread != NULL; thread = thread -> next) {
1826 	      if (mtcp_state_value(&(thread -> state)) != ST_CKPNTHREAD) {
1827 	        if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_SUSPENDED))
1828 		  mtcp_abort();
1829 	        mtcp_state_futex(&(thread -> state), FUTEX_WAKE, 1, NULL);
1830 	      }
1831 	    }
1832 	    unlk_threads ();
1833 	    DPRINTF (("mtcp checkpointhread*: everything resumed\n"));
1834 	    /* But if we're doing a restore verify, just exit.  The main thread is doing the exec to start the restore. */
1835 	#ifdef PTRACE
1836 	    create_file (GETTID());
1837 	#endif
1838 	    if ((verify_total != 0) && (verify_count == 0)) return (NULL);
1839 	  }
1840 	}
1841 	
1842 	/**
1843 	 * This function returns the fd to which the checkpoint file should be written.
1844 	 * The purpose of using this function over mtcp_sys_open() is that this
1845 	 * function will handle compression and gzipping.
1846 	 */
1847 	static int test_use_compression(void)
1848 	{
1849 	  char *do_we_compress;
1850 	
1851 	  do_we_compress = getenv("MTCP_GZIP");
1852 	  // allow alternate name for env var
1853 	  if (do_we_compress == NULL)
1854 	    do_we_compress = getenv("DMTCP_GZIP");
1855 	  // env var is unset, let's default to enabled
1856 	  // to disable compression, run with MTCP_GZIP=0
1857 	  if (do_we_compress == NULL)
1858 	    do_we_compress = "1";
1859 	
1860 	  char *endptr;
1861 	  strtol(do_we_compress, &endptr, 0);
1862 	  if ( *do_we_compress == '\0' || *endptr != '\0' ) {
1863 	    mtcp_printf("WARNING: MTCP_GZIP/DMTCP_GZIP defined as %s (not a number)\n"
1864 		        "  Checkpoint image will not be compressed.\n",
1865 		        do_we_compress);
1866 	    do_we_compress = "0";
1867 	  }
1868 	  if ( 0 == strcmp(do_we_compress, "0") )
1869 	    return 0;
1870 	  /* If we arrive down here, it's safe to ccompress. */
1871 	  return 1;
1872 	}
1873 	
1874 	static int open_ckpt_to_write(int fd, int pipe_fds[2], char *gzip_path)
1875 	{
1876 	  pid_t cpid;
1877 	  char *gzip_args[] = { "gzip", "-1", "-", NULL };
1878 	
1879 	  gzip_args[0] = gzip_path;
1880 	
1881 	  cpid = mtcp_sys_fork();
1882 	  if (cpid == -1) {
1883 	    mtcp_printf("WARNING: error forking child process `%s`.  Compression will "
1884 	                "not be used [%s].\n", gzip_path, strerror(mtcp_sys_errno));
1885 	    close(pipe_fds[0]);
1886 	    close(pipe_fds[1]);
1887 	    //fall through to return fd
1888 	  } else if (cpid > 0) { /* parent process */
1889 	    //Before running gzip in child process, we must not use LD_PRELOAD.
1890 	    // See revision log 342 for details concerning bash.
1891 	    mtcp_ckpt_gzip_child_pid = cpid;
1892 	    if (close(pipe_fds[0]) == -1)
1893 	      mtcp_printf("WARNING: (in open_ckpt_to_write) close failed: %s\n",
1894 			  strerror(errno));
1895 	    if (close(fd) == -1)
1896 	      mtcp_printf("WARNING: (in open_ckpt_to_write) close failed: %s\n",
1897 			  strerror(errno));
1898 	    fd=pipe_fds[1];//change return value
1899 	  } else { /* child process */
1900 	    static int (*libc_unsetenv) (const char *name);
1901 	    static int (*libc_execvp) (const char *path, char *const argv[]);
1902 	
1903 	    close(pipe_fds[1]);
1904 	    dup2(pipe_fds[0], STDIN_FILENO);
1905 	    close(pipe_fds[0]);
1906 	    dup2(fd, STDOUT_FILENO);
1907 	    close(fd);
1908 	
1909 	    // Don't load dmtcphijack.so, etc. in exec.
1910 	    unsetenv("LD_PRELOAD"); // If in bash, this is bash env. var. version
1911 	    libc_unsetenv = mtcp_get_libc_symbol("unsetenv");
1912 	    (*libc_unsetenv)("LD_PRELOAD");
1913 	
1914 	    libc_execvp = mtcp_get_libc_symbol("execvp");
1915 	    (*libc_execvp)(gzip_path, gzip_args);
1916 	
1917 	    /* should not arrive here */
1918 	    mtcp_printf("ERROR: compression failed!  No checkpointing will be"
1919 	                "performed!  Cancel now!\n");
1920 	    mtcp_sys_exit(1);
1921 	  }
1922 	
1923 	  return fd;
1924 	}
1925 	
1926 	
1927 	/********************************************************************************************************************************/
1928 	/*																*/
1929 	/*  This routine is called from time-to-time to write a new checkpoint file.							*/
1930 	/*  It assumes all the threads are suspended.											*/
1931 	/*																*/
1932 	/********************************************************************************************************************************/
1933 	
1934 	static void checkpointeverything (void)
1935 	{
1936 	  Area area;
1937 	  int fd, mapsfd;
1938 	  VA area_begin, area_end;
1939 	  int stack_was_seen = 0;
1940 	  int vsyscall_exists = 0;
1941 	  int forked_checkpointing = 0;
1942 	  int forked_cpid;
1943 	  int use_compression = -1; /* decide later */
1944 	  int pipe_fds[2]; /* for potential piping */
1945 	  char *gzip_cmd = "gzip";
1946 	  char gzip_path[MTCP_MAX_PATH];
1947 	  char tmpDMTCPHeaderBuf[] = "/tmp/dmtcp.XXXXXX";
1948 	  char *tmpDMTCPHeaderFileName = tmpDMTCPHeaderBuf;
1949 	  int tmpDMTCPHeaderFd = -1;
1950 	
1951 	  static void *const frpointer = finishrestore;
1952 	
1953 	  DPRINTF (("mtcp checkpointeverything*: tid %d\n", mtcp_sys_kernel_gettid ()));
1954 	
1955 	  if (getenv("MTCP_FORKED_CHECKPOINT") != NULL)
1956 	    forked_checkpointing = 1;
1957 	#ifdef TEST_FORKED_CHECKPOINTING
1958 	  forked_checkpointing = 1;
1959 	#endif
1960 	
1961 	  if (callback_write_dmtcp_header != 0) {
1962 	    /* Temp file for DMTCP header; will be written into the checkpoint file. */
1963 	    tmpDMTCPHeaderFd = mkstemp(tmpDMTCPHeaderFileName);
1964 	    if (tmpDMTCPHeaderFd < 0) {
1965 	      mtcp_printf("error %d creating temp file: %s\n", errno, strerror(errno));
1966 	      mtcp_abort();
1967 	    }
1968 	
1969 	    if (unlink(tmpDMTCPHeaderFileName) == -1) {
1970 	      mtcp_printf("NOTE: error %d unlinking temp file: %s\n", errno,
1971 			  strerror(errno));
1972 	    }
1973 	
1974 	    /* Better to do this in parent, not child, for most accurate header info */
1975 	    (*callback_write_dmtcp_header)(tmpDMTCPHeaderFd);
1976 	  }
1977 	
1978 	  if (forked_checkpointing) {
1979 	    forked_cpid = mtcp_sys_fork();
1980 	    if (forked_cpid == -1) {
1981 	      mtcp_printf("WARNING: Failed to do forked checkpointing,"
1982 			  " trying normal checkpoint\n");
1983 	    } else if (forked_cpid > 0) {
1984 	      /* Parent process*/
1985 	      if (tmpDMTCPHeaderFd != -1)
1986 	        close(tmpDMTCPHeaderFd);
1987 	      // Calling waitpid here, but on 32-bit Linux, libc:waitpid() calls wait4()
1988 	      if ( waitpid(forked_cpid, NULL, 0) == -1 )
1989 	        DPRINTF (("mtcp restoreverything*: error waitpid: errno: %d",
1990 	              mtcp_sys_errno));
1991 	      DPRINTF (("mtcp checkpointeverything*: checkpoint complete\n"));
1992 	      return;
1993 	    } else {
1994 	      pid_t grandchild_pid = mtcp_sys_fork();
1995 	      if (grandchild_pid == -1) {
1996 	        mtcp_printf("WARNING: Forked checkpoint failed, no checkpoint available\n");
1997 	      } else if (grandchild_pid > 0) {
1998 	        mtcp_sys_exit(0); /* child exits */
1999 	      }
2000 	      /* grandchild continues; no need now to waitpid() on grandchild */
2001 	      DPRINTF (("mtcp checkpointeverything*: inside grandchild process\n"));
2002 	    }
2003 	  }
2004 	
2005 	  /* 1. Test if using compression */
2006 	  use_compression = test_use_compression();
2007 	  /* 2. Get gzip path */
2008 	  if (use_compression && mtcp_find_executable(gzip_cmd, gzip_path) == NULL) {
2009 	    mtcp_printf("WARNING: gzip cannot be executed.  Compression will "
2010 	                "not be used.\n");
2011 	    use_compression = 0;
2012 	  }
2013 	  /* 3. Create pipe */
2014 	  /* Note:  Must use mtcp_sys_pipe(), to go to kernel, since
2015 	   *   DMTCP has a wrapper around glibc promoting pipes to socketpairs,
2016 	   *   DMTCP doesn't directly checkpoint/restart pipes.
2017 	   */
2018 	  if ( use_compression && mtcp_sys_pipe(pipe_fds) == -1 ) {
2019 	    mtcp_printf("WARNING: error creating pipe. Compression will "
2020 	                "not be used.\n");
2021 	    use_compression = 0;
2022 	  }
2023 	  /* 4. Open fd to checkpoint image on disk */
2024 	  /* Create temp checkpoint file and write magic number to it */
2025 	  /* This is a callback to DMTCP.  DMTCP writes header and returns fd. */
2026 	  fd = mtcp_safe_open(temp_checkpointfilename,
2027 			      O_CREAT | O_TRUNC | O_WRONLY, 0600);
2028 	  if (fd < 0) {
2029 	    mtcp_printf("mtcp.c: checkpointeverything: error creating %s: %s\n",
2030 	                temp_checkpointfilename, strerror(mtcp_sys_errno));
2031 	    mtcp_abort();
2032 	  }
2033 	  /* 5. We now have the information to pipe to gzip, or directly to fd
2034 	  *     We do it this way, so that gzip will be direct child of forked process
2035 	  *       when using forked checkpointing.
2036 	  */
2037 	
2038 	#if 1
2039 	  /* Temporary fix, until DMTCP uses its own separate allocator.
2040 	   * The else code should really go lower down, just before we checkpoint
2041 	   * the heap.
2042 	   */
2043 	#else
2044 	  if (mtcp_sys_break(0) != mtcp_saved_break)
2045 	    mtcp_printf("\n\n*** ERROR:  End of heap grew."
2046 			"  Continue at your own risk. ***\n\n\n");
2047 	#endif
2048 	
  /* Drain stdout and stderr before checkpoint */
2050 	  tcdrain(STDOUT_FILENO);
2051 	  tcdrain(STDERR_FILENO);
2052 	
2053 	  if (use_compression) /* if use_compression, fork a gzip process */
2054 	    fd = open_ckpt_to_write(fd, pipe_fds, gzip_path);
2055 	
2056 	  if (tmpDMTCPHeaderFd != -1 ) {
2057 	    char tmpBuff[1024];
2058 	    int retval = -1;
2059 	    lseek(tmpDMTCPHeaderFd, 0, SEEK_SET);
2060 	
2061 	    while (retval != 0) {
2062 	      retval = read (tmpDMTCPHeaderFd, tmpBuff, 1024);
2063 	      if (retval == -1 && (errno == EAGAIN || errno == EINTR))
2064 	        continue;
2065 	      if (retval == -1) {
2066 	        mtcp_printf("Error writing checkpoint file: %s\n", strerror(errno));
2067 	        mtcp_abort();
2068 	      }
2069 	      writefile(fd, tmpBuff, retval);
2070 	    }
2071 	    close(tmpDMTCPHeaderFd);
2072 	  }
2073 	
2074 	  // Preprocess special segments like vsyscall, stack, heap etc.
2075 	  preprocess_special_segments(&vsyscall_exists);
2076 	
2077 	  writefile (fd, MAGIC, MAGIC_LEN);
2078 	
2079 	  DPRINTF (("mtcp checkpointeverything*: restore_begin %X at %p from [libmtcp.so]\n",
2080 	            restore_size, restore_begin));
2081 	
2082 	  struct rlimit stack_rlimit;
2083 	  getrlimit(RLIMIT_STACK, &stack_rlimit);
2084 	
2085 	  DPRINTF (("mtcp_restart: saved stack resource limit: soft_lim:%p, hard_lim:%p\n",
2086 		    stack_rlimit.rlim_cur, stack_rlimit.rlim_max));
2087 	
2088 	  writecs (fd, CS_STACKRLIMIT);
2089 	  writefile (fd, &stack_rlimit, sizeof stack_rlimit);
2090 	
2091 	  DPRINTF (("mtcp checkpointeverything*: [libmtcp.so] image of size %X at %p\n",
2092 		    restore_size, restore_begin));
2093 	
2094 	  writecs (fd, CS_RESTOREBEGIN);
2095 	  writefile (fd, &restore_begin, sizeof restore_begin);
2096 	  writecs (fd, CS_RESTORESIZE);
2097 	  writefile (fd, &restore_size, sizeof restore_size);
2098 	  writecs (fd, CS_RESTORESTART);
2099 	  writefile (fd, &restore_start, sizeof restore_start);
2100 	  writecs (fd, CS_RESTOREIMAGE);
2101 	  writefile (fd, (void *)restore_begin, restore_size);
2102 	  writecs (fd, CS_FINISHRESTORE);
2103 	  writefile (fd, &frpointer, sizeof frpointer);
2104 	
2105 	  /* Write out file descriptors */
2106 	
2107 	  writefiledescrs (fd);
2108 	
2109 	  /* Finally comes the memory contents */
2110 	
2111 	  /**************************************************************************/
2112 	  /* We can't do any more mallocing at this point because malloc stuff is   */
2113 	  /* outside the limits of the libmtcp.so image, so it won't get            */
2114 	  /* checkpointed, and it's possible that we would checkpoint an            */
2115 	  /* inconsistent state.  See note in restoreverything routine.             */
2116 	  /**************************************************************************/
2117 	
2118 	  mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
2119 	
2120 	  while (readmapsline (mapsfd, &area)) {
2121 	    area_begin = (VA)area.addr;
2122 	    area_end   = area_begin + area.size;
2123 	
2124 	    /* Original comment:  Skip anything in kernel address space ---
2125 	     *   beats me what's at FFFFE000..FFFFFFFF - we can't even read it;
2126 	     * Added: That's the vdso section for earlier Linux 2.6 kernels.  For later
2127 	     *  2.6 kernels, vdso occurs at an earlier address.  If it's unreadable,
2128 	     *  then we simply won't copy it.  But let's try to read all areas, anyway.
2129 	     * **COMMENTED OUT:** if (area_begin >= HIGHEST_VA) continue;
2130 	     */
2131 	    /* If it's readable, but it's VDSO, it will be dangerous to restore it.
2132 	     * In 32-bit mode later Red Hat RHEL Linux 2.6.9 releases use 0xffffe000,
2133 	     * the last page of virtual memory.  Note 0xffffe000 >= HIGHEST_VA
2134 	     * implies we're in 32-bit mode.
2135 	     */
2136 	    if (area_begin >= HIGHEST_VA && area_begin == 0xffffe000) continue;
2137 	#ifdef __x86_64__
2138 	    /* And in 64-bit mode later Red Hat RHEL Linux 2.6.9 releases
2139 	     * use 0xffffffffff600000 for VDSO.
2140 	     */
2141 	    if (area_begin >= HIGHEST_VA && area_begin == 0xffffffffff600000) continue;
2142 	#endif
2143 	
    /* Skip anything that has neither read nor write permission.  This occurs
2145 	     * on one page in a Linux 2.6.9 installation.  No idea why.  This code
2146 	     * would also take care of kernel sections since we don't have read/execute
2147 	     * permission there.
2148 	     */
2149 	
2150 	    if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE))) continue;
2151 	
2152 	    // If the process has an area labelled as "/dev/zero (deleted)", we mark
2153 	    //   the area as Anonymous and save the contents to the ckpt image file.
2154 	    // IF this area has a MAP_SHARED attribute, it should be replaced with
2155 	    //   MAP_PRIVATE and we won't do any harm because, the /dev/zero file is an
2156 	    //   absolute source and sink. Anything written to it will be discarded and
2157 	    //   anything read from it will be all zeros.
2158 	    // The following call to mmap will create "/dev/zero (deleted)" area
2159 	    //         mmap(addr, size, protection, MAP_SHARED | MAP_ANONYMOUS, 0, 0)
2160 	    //
2161 	    // The above explanation also applies to "/dev/null (deleted)"
2162 	
2163 	    if ( mtcp_strstartswith(area.name, dev_zero_deleted_str) ||
2164 	         mtcp_strstartswith(area.name, dev_null_deleted_str) ) {
2165 	      DPRINTF(("mtcp checkpointeverything: saving area \"%s\" as Anonymous\n",
2166 		       area.name));
2167 	      area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2168 	      area.name[0] = '\0';
2169 	    }
2170 	
2171 	    if (mtcp_strstartswith(area.name, sys_v_shmem_file)) {
2172 	      DPRINTF(("mtcp checkpointeverything: saving area \"%s\" as Anonymous\n",
2173 		       area.name));
2174 	      area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2175 	      area.name[0] = '\0';
2176 	    }
2177 	
2178 	    /* Special Case Handling: nscd is enabled*/
2179 	    if ( mtcp_strstartswith(area.name, nscd_mmap_str) ||
2180 	         mtcp_strstartswith(area.name, nscd_mmap_str2) ||
2181 	         mtcp_strstartswith(area.name, nscd_mmap_str3) ) {
2182 	      DPRINTF(("mtcp checkpointeverything: NSCD daemon shared memory area present. MTCP will now try to remap\n" \
2183 	            "                           this area in read/write mode and then will fill it with zeros so that\n" \
2184 	            "                           glibc will automatically ask NSCD daemon for new shared area\n\n"));
2185 	      area.prot = PROT_READ | PROT_WRITE;
2186 	      area.flags = MAP_PRIVATE | MAP_ANONYMOUS;
2187 	
2188 	      if ( munmap(area.addr, area.size) == -1) {
2189 	        mtcp_printf ("mtcp checkpointeverything: error unmapping NSCD shared area: %s\n",
2190 	                     strerror (mtcp_sys_errno));
2191 	        mtcp_abort();
2192 	      }
2193 	
2194 	      if ( mmap(area.addr, area.size, area.prot, area.flags, 0, 0)
2195 	           == MAP_FAILED ){
2196 	        mtcp_printf ("mtcp checkpointeverything: error remapping NSCD shared area: %s\n",
2197 	                     strerror (mtcp_sys_errno));
2198 	        mtcp_abort();
2199 	      }
2200 	
2201 	      memset(area.addr, 0, area.size);
2202 	    }
2203 	
2204 	    /* Force the anonymous flag if it's a private writeable section, as the
2205 	     * data has probably changed from the contents of the original images.
2206 	     */
2207 	
2208 	    /* We also do this for read-only private sections as it's possible
2209 	     * to modify a page there, too (via mprotect).
2210 	     */
2211 	
2212 	    if ((area.flags & MAP_PRIVATE) /*&& (area.prot & PROT_WRITE)*/) {
2213 	      area.flags |= MAP_ANONYMOUS;
2214 	    }
2215 	
2216 	    if ( area.flags & MAP_SHARED ) {
2217 	      /* invalidate shared memory pages so that the next read to it (when we are
2218 	       * writing them to ckpt file) will cause them to be reloaded from the disk.
2219 	       */
2220 	      if ( msync(area.addr, area.size, MS_INVALIDATE) < 0 ){
2221 	        mtcp_printf ("mtcp sync_shared_memory: error %d Invalidating %X"
2222 	            " at %p from %s + %X\n", mtcp_sys_errno, area.size,
2223 	            area.addr, area.name, area.offset);
2224 	        mtcp_abort();
2225 	      }
2226 	    }
2227 	
2228 	
2229 	    /* Skip any mapping for this image - it got saved as CS_RESTOREIMAGE
2230 	     * at the beginning.
2231 	     */
2232 	
2233 	    if (area_begin < restore_begin) {
2234 	      if (area_end <= restore_begin) {
2235 	        writememoryarea (fd, &area, 0, vsyscall_exists); // the whole thing is before the restore image
2236 	      } else if (area_end <= restore_end) {
2237 	        area.size = restore_begin - area_begin;    // we just have to chop the end part off
2238 	        writememoryarea (fd, &area, 0, vsyscall_exists);
2239 	      } else {
2240 	        area.size = restore_begin - area_begin;    // we have to write stuff that comes before restore image
2241 	        writememoryarea (fd, &area, 0, vsyscall_exists);
2242 	        area.offset += restore_end - area_begin;   // ... and we have to write stuff that comes after restore image
2243 	        area.size = area_end - restore_end;
2244 	        area.addr = (void *)restore_end;
2245 	        writememoryarea (fd, &area, 0, vsyscall_exists);
2246 	      }
2247 	    } else if (area_begin < restore_end) {
2248 	      if (area_end > restore_end) {
2249 	        area.offset += restore_end - area_begin;   // we have to write stuff that comes after restore image
2250 	        area.size = area_end - restore_end;
2251 	        area.addr = (void *)restore_end;
2252 	        writememoryarea (fd, &area, 0, vsyscall_exists);
2253 	      }
2254 	    } else {
2255 	      if ( strstr (area.name, "[stack]") )
2256 	        stack_was_seen = 1;
2257 	      writememoryarea (fd, &area, stack_was_seen, vsyscall_exists); // the whole thing comes after the restore image
2258 	    }
2259 	  }
2260 	
2261 	  close (mapsfd);
2262 	
2263 	  /* That's all folks */
2264 	
2265 	  writecs (fd, CS_THEEND);
2266 	  if (close (fd) < 0) {
2267 	    mtcp_printf ("mtcp checkpointeverything(grandchild):"
2268 	                 " error closing checkpoint file: %s\n", strerror (errno));
2269 	    mtcp_abort ();
2270 	  }
2271 	  if (use_compression) {
2272 	    /* IF OUT OF DISK SPACE, REPORT IT HERE. */
2273 	    if ( waitpid(mtcp_ckpt_gzip_child_pid, NULL, 0 ) == -1 )
2274 	      mtcp_printf ("mtcp checkpointeverything(grandchild): waitpid: %s\n",
2275 	                   strerror (errno));
2276 	    mtcp_ckpt_gzip_child_pid = -1;
2277 	  }
2278 	
2279 	  /* Maybe it's time to verify the checkpoint.
2280 	   * If so, exec an mtcp_restore with the temp file (in case temp file is bad,
2281 	   *   we'll still have the last one).
2282 	   * If the new file is good, mtcp_restore will rename it over the last one.
2283 	   */
2284 	
2285 	  if (verify_total != 0) -- verify_count;
2286 	
2287 	  /* Now that temp checkpoint file is complete, rename it over old permanent
2288 	   * checkpoint file.  Uses rename() syscall, which doesn't change i-nodes.
2289 	   * So, gzip process can continue to write to file even after renaming.
2290 	   */
2291 	
2292 	  else renametempoverperm ();
2293 	
2294 	  if (forked_checkpointing)
2295 	    mtcp_sys_exit (0); /* grandchild exits */
2296 	
2297 	  DPRINTF (("mtcp checkpointeverything*: checkpoint complete\n"));
2298 	}
2299 	
2300 	/* True if the given FD should be checkpointed */
2301 	static int should_ckpt_fd (int fd)
2302 	{
2303 	   if ( callback_ckpt_fd!=NULL )
2304 	     return (*callback_ckpt_fd)(fd); //delegate to callback
2305 	   else if (fd > 2)
2306 	     return 1;
2307 	   else
2308 	   {
2309 	     /* stdin/stdout/stderr */
2310 	     /* we only want to checkpoint these if they are from a file */
2311 	     struct stat statbuf;
2312 	     fstat(fd, &statbuf);
2313 	     return S_ISREG(statbuf.st_mode);
2314 	   }
2315 	}
2316 	
2317 	/* Write list of open files to the checkpoint file */
2318 	
2319 	static void writefiledescrs (int fd)
2320 	
2321 	{
2322 	  char dbuf[BUFSIZ], linkbuf[FILENAMESIZE], *p, procfdname[64];
2323 	  int doff, dsiz, fddir, fdnum, linklen, rc;
2324 	  off_t offset;
2325 	  struct linux_dirent *dent;
2326 	  struct stat lstatbuf, statbuf;
2327 	
2328 	  writecs (fd, CS_FILEDESCRS);
2329 	
2330 	  /* Open /proc/self/fd directory - it contains a list of files I have open */
2331 	
2332 	  fddir = mtcp_sys_open ("/proc/self/fd", O_RDONLY, 0);
2333 	  if (fddir < 0) {
2334 	    mtcp_printf ("mtcp writefiledescrs: error opening directory /proc/self/fd: %s\n", strerror (errno));
2335 	    mtcp_abort ();
2336 	  }
2337 	
2338 	  /* Check each entry */
2339 	
2340 	  while (1) {
2341 	    dsiz = -1;
2342 	    if (sizeof dent -> d_ino == 4) dsiz = mtcp_sys_getdents (fddir, dbuf, sizeof dbuf);
2343 	    if (sizeof dent -> d_ino == 8) dsiz = mtcp_sys_getdents64 (fddir, dbuf, sizeof dbuf);
2344 	    if (dsiz <= 0) break;
2345 	
2346 	    for (doff = 0; doff < dsiz; doff += dent -> d_reclen) {
2347 	      dent = (struct linux_dirent *) (dbuf + doff);
2348 	
2349 	      /* The filename should just be a decimal number = the fd it represents.
2350 	       * Also, skip the entry for the checkpoint and directory files
2351 	       * as we don't want the restore to know about them.
2352 	       */
2353 	
2354 	      fdnum = strtol (dent -> d_name, &p, 10);
2355 	      if ((*p == '\0') && (fdnum >= 0) && (fdnum != fd) && (fdnum != fddir)
2356 		  && (should_ckpt_fd (fdnum) > 0)) {
2357 	
2358 	        /* Read the symbolic link so we get the filename that's open on the fd */
2359 	
Event secure_coding: [VERY RISKY]. Using "sprintf" can cause a buffer overflow when done incorrectly. Because sprintf() assumes an arbitrarily long string, callers must be careful not to overflow the actual space of the destination. Use snprintf() instead, or correct precision specifiers.
2360 	        sprintf (procfdname, "/proc/self/fd/%d", fdnum);
2361 	        linklen = readlink (procfdname, linkbuf, sizeof linkbuf - 1);
2362 	        if ((linklen >= 0) || (errno != ENOENT)) { // probably was the proc/self/fd directory itself
2363 	          if (linklen < 0) {
2364 	            mtcp_printf ("mtcp writefiledescrs: error reading %s: %s\n",
2365 		                 procfdname, strerror (errno));
2366 	            mtcp_abort ();
2367 	          }
2368 	          linkbuf[linklen] = '\0';
2369 	
2370 	          DPRINTF (("mtcp writefiledescrs*: checkpointing fd %d -> %s\n",
2371 			    fdnum, linkbuf));
2372 	
2373 	          /* Read about the link itself so we know read/write open flags */
2374 	
2375 	          rc = lstat (procfdname, &lstatbuf);
2376 	          if (rc < 0) {
2377 	            mtcp_printf ("mtcp writefiledescrs: error statting %s -> %s: %s\n",
2378 		                 procfdname, linkbuf, strerror (-rc));
2379 	            mtcp_abort ();
2380 	          }
2381 	
2382 	          /* Read about the actual file open on the fd */
2383 	
2384 	          rc = stat (linkbuf, &statbuf);
2385 	          if (rc < 0) {
2386 	            mtcp_printf ("mtcp writefiledescrs: error statting %s -> %s: %s\n",
2387 		                 procfdname, linkbuf, strerror (-rc));
2388 	          }
2389 	
2390 	          /* Write state information to checkpoint file.
2391 	           * Replace file's permissions with current access flags
2392 		   * so restore will know how to open it.
2393 		   */
2394 	
2395 	          else {
2396 	            offset = 0;
2397 	            if (S_ISREG (statbuf.st_mode))
2398 		      offset = mtcp_sys_lseek (fdnum, 0, SEEK_CUR);
2399 	            statbuf.st_mode = (statbuf.st_mode & ~0777)
2400 				       | (lstatbuf.st_mode & 0777);
2401 	            writefile (fd, &fdnum, sizeof fdnum);
2402 	            writefile (fd, &statbuf, sizeof statbuf);
2403 	            writefile (fd, &offset, sizeof offset);
2404 	            writefile (fd, &linklen, sizeof linklen);
2405 	            writefile (fd, linkbuf, linklen);
2406 	          }
2407 	        }
2408 	      }
2409 	    }
2410 	  }
2411 	  if (dsiz < 0) {
2412 	    mtcp_printf ("mtcp writefiledescrs: error reading /proc/self/fd: %s\n",
2413 	                 strerror (mtcp_sys_errno));
2414 	    mtcp_abort ();
2415 	  }
2416 	
2417 	  mtcp_sys_close (fddir);
2418 	
2419 	  /* Write end-of-fd-list marker to checkpoint file */
2420 	
2421 	  fdnum = -1;
2422 	  writefile (fd, &fdnum, sizeof fdnum);
2423 	}
2424 	
2425 	static void writememoryarea (int fd, Area *area, int stack_was_seen,
2426 				     int vsyscall_exists)
2427 	
2428 	{ static void * orig_stack = NULL;
2429 	
2430 	  /* Write corresponding descriptor to the file */
2431 	
2432 	  if (orig_stack == NULL && 0 == strcmp(area -> name, "[stack]"))
2433 	    orig_stack = area -> addr + area -> size;
2434 	
2435 	  if (0 == strcmp(area -> name, "[vdso]") && !stack_was_seen)
2436 	    DPRINTF (("mtcp checkpointeverything*: skipping over [vdso] section"
2437 	              " %p at %p\n", area -> size, area -> addr));
2438 	  else if (0 == strcmp(area -> name, "[vsyscall]") && !stack_was_seen)
2439 	    DPRINTF (("mtcp checkpointeverything*: skipping over [vsyscall] section"
2440 	    	      " %p at %p\n", area -> size, area -> addr));
2441 	  else if (0 == strcmp(area -> name, "[stack]") &&
2442 		   orig_stack != area -> addr + area -> size)
2443 	    /* Kernel won't let us munmap this.  But we don't need to restore it. */
2444 	    DPRINTF (("mtcp checkpointeverything*: skipping over [stack] segment"
2445 	    	      " %X at %pi (not the orig stack)\n", area -> size, area -> addr));
2446 	  else if (!(area -> flags & MAP_ANONYMOUS))
2447 	    DPRINTF (("mtcp checkpointeverything*: save %p at %p from %s + %X\n",
2448 	              area -> size, area -> addr, area -> name, area -> offset));
2449 	  else if (area -> name[0] == '\0')
2450 	    DPRINTF (("mtcp checkpointeverything*: save anonymous %p at %p\n",
2451 	              area -> size, area -> addr));
2452 	  else DPRINTF (("mtcp checkpointeverything*: save anonymous %p at %p"
2453 	                 " from %s + %X\n",
2454 			 area -> size, area -> addr, area -> name, area -> offset));
2455 	
2456 	  if ((area -> name[0]) == '\0') {
2457 	    void *brk = mtcp_sys_brk(NULL);
2458 	    if (brk > area -> addr && brk <= area -> addr + area -> size)
2459 	      mtcp_sys_strcpy(area -> name, "[heap]");
2460 	  }
2461 	
2462 	  if ( 0 != strcmp(area -> name, "[vsyscall]")
2463 	       && ( (0 != strcmp(area -> name, "[vdso]")
2464 	             || vsyscall_exists /* which implies vdso can be overwritten */
2465 	             || !stack_was_seen ))) /* If vdso appeared before stack, it can be replaced */
2466 	  {
2467 	    writecs (fd, CS_AREADESCRIP);
2468 	    writefile (fd, area, sizeof *area);
2469 	
2470 	    /* Anonymous sections need to have their data copied to the file,
2471 	     *   as there is no file that contains their data
2472 	     * We also save shared files to checkpoint file to handle shared memory
2473 	     *   implemented with backing files
2474 	     */
2475 	    if (area -> flags & MAP_ANONYMOUS || area -> flags & MAP_SHARED) {
2476 	      writecs (fd, CS_AREACONTENTS);
2477 	      writefile (fd, area -> addr, area -> size);
2478 	    }
2479 	  }
2480 	}
2481 	
/* Emit a one-byte checkpoint section marker to the checkpoint file.
 *
 *   fd = checkpoint file descriptor
 *   cs = section code to write
 */

static void writecs (int fd, char cs)

{
  char marker = cs;

  writefile (fd, &marker, sizeof marker);
}
2489 	
2490 	/* Write something to checkpoint file */
2491 	
2492 	static char zeroes[MTCP_PAGE_SIZE] = { 0 };
2493 	static void writefile (int fd, void const *buff, size_t size)
2494 	
2495 	{
2496 	  char const *bf;
2497 	  ssize_t rc;
2498 	  size_t sz, wt;
2499 	
2500 	  checkpointsize += size;
2501 	
2502 	  bf = buff;
2503 	  sz = size;
2504 	  while (sz > 0) {
2505 	    for (wt = sz; wt > 0; wt /= 2) {
2506 	      rc = write (fd, bf, wt);
2507 	      if ((rc >= 0) || (errno != EFAULT)) break;
2508 	    }
2509 	
2510 	    /* Sometimes image page alignment will leave a hole in the middle of an image */
2511 	    /* ... but the idiot proc/self/maps will include it anyway                    */
2512 	
2513 	    if (wt == 0) {
2514 	      rc = (sz > sizeof zeroes ? sizeof zeroes : sz);
2515 	      checkpointsize -= rc; /* Correct now, since writefile will add rc back */
2516 	      writefile (fd, zeroes, rc);
2517 	    }
2518 	
2519 	    /* Otherwise, check for real error */
2520 	
2521 	    else {
2522 	      if (rc == 0) errno = EPIPE;
2523 	      if (rc <= 0) {
2524 	        mtcp_printf ("mtcp writefile: error writing from %p to %s: %s\n",
2525 		             bf, temp_checkpointfilename, strerror (errno));
2526 	        mtcp_abort ();
2527 	      }
2528 	    }
2529 	
2530 	    /* It's ok, we're on to next part */
2531 	
2532 	    sz -= rc;
2533 	    bf += rc;
2534 	  }
2535 	}
2536 	
2537 	static void preprocess_special_segments(int *vsyscall_exists)
2538 	{
2539 	  Area area;
2540 	  int mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
2541 	  if (mapsfd < 0) {
2542 	    mtcp_printf ("mtcp checkpointeverything: error opening"
2543 	        " /proc/self/maps: %s\n", strerror (mtcp_sys_errno));
2544 	    mtcp_abort ();
2545 	  }
2546 	
2547 	  while (readmapsline (mapsfd, &area)) {
2548 	    if (0 == strcmp(area.name, "[vsyscall]")) {
2549 	      /* Determine if [vsyscall] exists.  If [vdso] and [vsyscall] exist,
2550 	       * [vdso] will be saved and restored.
2551 	       * NOTE:  [vdso] is relocated if /proc/sys/kernel/randomize_va_space == 2.
2552 	       * We must restore old [vdso] and also keep [vdso] in that case.
2553 	       * On Linux 2.6.25, 32-bit Linux has:  [heap], /lib/ld-2.7.so, [vdso], libs, [stack].
2554 	       * On Linux 2.6.25, 64-bit Linux has:  [stack], [vdso], [vsyscall].
2555 	       *   and at least for gcl, [stack], libmtcp.so, [vsyscall] seen.
2556 	       * If 32-bit process in 64-bit Linux:  [stack] (0xffffd000), [vdso] (0xffffe0000)
2557 	       * On 32-bit Linux, mtcp_restart has [vdso], /lib/ld-2.7.so, [stack]
2558 	       * Need to restore old [vdso] into mtcp_restart, to restart.
2559 	       * With randomize_va_space turned off, libraries start at high address
2560 	       *     0xb8000000 and are loaded progressively at lower addresses.
2561 	       * mtcp_restart loads vdso (which looks like a shared library) first.
2562 	       * But libpthread/libdl/libc libraries are loaded above vdso in user image.
2563 	       * So, we must use the opposite of the user's setting (no randomization if
2564 	       *     user turned it on, and vice versa).  We must also keep the
2565 	       *     new vdso segment, provided by mtcp_restart.
2566 	       */
2567 	      *vsyscall_exists = 1;
2568 	    } else if (!saved_heap_start && strcmp(area.name, "[heap]") == 0) {
2569 	      // Record start of heap which will later be used in finishrestore()
2570 	      saved_heap_start = area.addr;
2571 	    } else if (strcmp(area.name, "[stack]") == 0) {
2572 	      /*
2573 	       * When using Matlab with dmtcp_checkpoint, sometimes the bottom most
2574 	       * page of stack (the page with highest address) which contains the
2575 	       * environment strings and the argv[] was not shown in /proc/self/maps.
2576 	       * This happens on some odd combination of environment passed on to
2577 	       * Matlab process. As a result, the page was not checkpointed and hence
2578 	       * the process segfaulted on restart. The fix is to try to mprotect this
2579 	       * page with RWX permission to make the page visible again. This call
2580 	       * will fail if no stack page was invisible to begin with.
2581 	       */
2582 	      int ret = mprotect(area.addr + area.size, 0x1000, 
2583 	                         PROT_READ | PROT_WRITE | PROT_EXEC);
2584 	      if (ret == 0) {
2585 	        mtcp_printf("mtcp checkpointeverything: bottom-most page of stack\n"
2586 	                 "(page with highest address) was invisible in /proc/self/maps.\n"
2587 	                 "It is made visible again now.\n");
2588 	      }
2589 	    }
2590 	  }
2591 	  close(mapsfd);
2592 	}
2593 	
2594 	/********************************************************************************************************************************/
2595 	/*																*/
2596 	/*  This signal handler is forced by the main thread doing a 'mtcp_sys_kernel_tkill' to stop these threads so it can do a 	*/
2597 	/*  checkpoint															*/
2598 	/*																*/
2599 	/********************************************************************************************************************************/
2600 	/* Grow the stack by kbStack*1024 so that large stack is allocated on restart
2601 	 * The kernel won't do it automatically for us any more, since it thinks
2602 	 * the stack is in a different place after restart.
2603 	 */
2604 	/* growstackValue is volatile so compiler doesn't optimize away growstack
2605 	 * Maybe it's not needed if we use ((optimize(0))) .
2606 	 */
2607 	static volatile unsigned int growstackValue = 0;
2608 	__attribute__ ((optimize(0))) static void growstack (int kbStack);
2609 	static void growstack (int kbStack) {
2610 	  const int kBincrement = 1024;
2611 	  char array[kBincrement * 1024] __attribute__ ((unused));
2612 	  volatile int dummy_value __attribute__ ((unused)) = 1; /*Again, try to prevent compiler optimization*/
2613 	  if (kbStack > 0)
2614 	    growstack(kbStack - kBincrement);
2615 	  else
2616 	    growstackValue++;
2617 	}
2618 	
/* Suspend the calling thread for a checkpoint, and resume it afterwards.
 *
 *   signum = signal number (not referenced in the body)
 *
 * Outline:
 *   - the checkpoint thread itself (state ST_CKPNTHREAD) returns at once;
 *   - otherwise, if the thread state can be moved from ST_SIGENABLED to
 *     ST_SUSPINPROG, the signal state, TLS state and register context are
 *     saved;
 *   - code after getcontext() runs in two situations, told apart by
 *     restoreinprog: in the original process (restoreinprog == 0) the thread
 *     marks itself ST_SUSPENDED and sleeps until the state changes; on a
 *     restart (restoreinprog != 0) it marks itself ST_RUNENABLED and joins
 *     wait_for_all_restored().
 */
static void stopthisthread (int signum)

{
  int rc;
  Thread *thread;
  /* These three constants are only used by the disabled backtrace block below */
#define BT_SIZE 1024
#define STDERR_FD 826
#define LOG_FD 826

#ifdef PTRACE
  ptrace_unlock_inferiors();
  ptrace_remove_notexisted();
  ptrace_detach_checkpoint_threads ();
  ptrace_detach_user_threads ();
#endif

  DPRINTF (("mtcp stopthisthread*: tid %d returns to %p\n",
            mtcp_sys_kernel_gettid (), __builtin_return_address (0)));

  thread = getcurrenthread ();                                              // see which thread this is

  // If this is the checkpoint thread - exit immediately
  if ( mtcp_state_value(&thread -> state) == ST_CKPNTHREAD ) {
    return ;
  }

  /* Debugging aid, permanently disabled by the constant 0 in the condition */
  if (0 && thread == motherofall) {
#include <execinfo.h>
    void *buffer[BT_SIZE];
    int nptrs;

    DPRINTF (( "printing stacktrace of the motherofall Thread\n\n" ));
    nptrs = backtrace (buffer, BT_SIZE);
    backtrace_symbols_fd ( buffer, nptrs, STDERR_FD );
    backtrace_symbols_fd ( buffer, nptrs, LOG_FD );
  }
  /* NOTE(review): mtcp_state_set(p, new, old) is presumably an atomic
   * compare-and-swap that succeeds only if *p == old - confirm against its
   * definition.
   */
  if (mtcp_state_set (&(thread -> state), ST_SUSPINPROG, ST_SIGENABLED)) {  // make sure we don't get called twice for same thread
    static int is_first_checkpoint = 1;

    save_sig_state (thread);      // save signal state (and block signal delivery)
    save_tls_state (thread);      // save thread local storage state

    /* Grow stack only on first ckpt.  Kernel agrees this is main stack and
     * will mmap it.  On second ckpt and later, we would segfault if we tried
     * to grow the former stack beyond the portion that is already mmap'ed.
     *
     * On later checkpoints only a warning is printed when the current stack
     * position is more than 3/4 of kbStack kilobytes away from the position
     * recorded at the first checkpoint.  The line number in that warning is
     * computed relative to __LINE__, so do not insert or remove lines
     * between the kbStack declaration and the mtcp_printf call.
     */
    if (thread == motherofall) {
      static char *orig_stack_ptr;
      int kbStack = 2048;
      if (is_first_checkpoint) {
        orig_stack_ptr = (char *)&kbStack;
        is_first_checkpoint = 0;
        DPRINTF(("mtcp_stopthisthread: temp. grow main stack by %d kilobytes\n",
                 kbStack));
        growstack(kbStack);
      } else if (orig_stack_ptr - (char *)&kbStack > 3 * kbStack*1024 / 4) {
        mtcp_printf("WARNING:  Stack within %d bytes of end;\n"
                    "  Consider increasing 'kbStack' at line %d of mtcp/%s\n",
                    kbStack*1024/4, __LINE__-9, __FILE__);
      }
    }

    ///JA: new code ported from v54b
    /* Save the register context.  The code below this call is written to be
     * reached both right after the save and again on restart; the two cases
     * are distinguished by restoreinprog.
     */
    rc = getcontext (&(thread -> savctx));
    if (rc < 0) {
      mtcp_printf ("mtcp stopthisthread: getcontext rc %d errno %d\n",
                   rc, errno);
      mtcp_abort ();
    }
    DPRINTF (("mtcp stopthisthread*: after getcontext\n"));
    if (mtcp_state_value(&restoreinprog) == 0) {

      /* We are the original process and all context is saved
       * restoreinprog is 0 ; wait for ckpt thread to write ckpt, and resume.
       */

      WMB; // matched by RMB in checkpointhread

      /* Next comes the first time we use the old stack. */
      /* Tell the checkpoint thread that we're all saved away */
      if (!mtcp_state_set (&(thread -> state), ST_SUSPENDED, ST_SUSPINPROG))
        mtcp_abort ();  // tell checkpointhread all our context is saved
      mtcp_state_futex (&(thread -> state), FUTEX_WAKE, 1, NULL);                            // wake checkpoint thread if it's waiting for me

      /* Then we wait for the checkpoint thread to write the checkpoint file then wake us up */

      DPRINTF (("mtcp stopthisthread*: thread %d suspending\n", thread -> tid));
      /* Loop because the futex wait may return while the state is unchanged */
      while (mtcp_state_value(&thread -> state) == ST_SUSPENDED) {
        mtcp_state_futex (&(thread -> state), FUTEX_WAIT, ST_SUSPENDED, NULL);
      }

#ifdef PTRACE
      DPRINTF (("mtcp stopthisthread*: thread %d after suspending before deleting files\n", thread -> tid));
      delete_file(0, delete_ptrace_leader, has_ptrace_file);
      delete_file(1, delete_setoptions_leader, has_setoptions_file);
      delete_file(2, delete_checkpoint_leader, has_checkpoint_file);
      ptrace_attach_threads(0);
#endif

      /* Maybe there is to be a checkpoint verification.  If so, and we're the main    */
      /* thread, exec the restore program.  If so and we're not the main thread, exit. */

      if ((verify_total != 0) && (verify_count == 0)) {

        /* If not the main thread, exit.  Either normal exit() or _exit()
         * seems to cause other threads to exit.
         */

        if (thread != motherofall) {
          mtcp_sys_exit(0);
        }

        /* This is the main thread, verify checkpoint then restart by doing
         * a restart.
         * The restore will rename the file after it has done the restart.
         */

        DPRINTF (("mtcp checkpointeverything*: verifying checkpoint...\n"));
        execlp ("mtcp_restart", "mtcp_restart", "--verify", temp_checkpointfilename, NULL);
        /* Only reached if execlp failed */
        mtcp_printf ("mtcp checkpointeverything: error execing mtcp_restart %s: %s\n", temp_checkpointfilename, strerror (errno));
        mtcp_abort ();
      }

      /* No verification, resume where we left off */

      DPRINTF (("mtcp stopthisthread*: thread %d resuming\n", thread -> tid));
    }

    /* Else restoreinprog >= 1;  This stuff executes to do a restart */

    else {
      if (!mtcp_state_set (&(thread -> state), ST_RUNENABLED, ST_SUSPENDED))
        mtcp_abort ();  // checkpoint was written when thread in SUSPENDED state
      wait_for_all_restored ();
      DPRINTF (("mtcp stopthisthread*: thread %d restored\n", thread -> tid));

      if (thread == motherofall) {

        /* If we're a restore verification, rename the temp file
         * over the permanent one
         */

        if (mtcp_restore_verify) renametempoverperm ();
      }

#ifdef PTRACE
      ptrace_attach_threads(1);
#endif 
    }
  }
  DPRINTF (("mtcp stopthisthread*: tid %d returning to %p\n",
            mtcp_sys_kernel_gettid (), __builtin_return_address (0)));
#ifdef PTRACE
  ptrace_lock_inferiors();
#endif
}
2775 	
2776 	/********************************************************************************************************************************/
2777 	/*																*/
2778 	/*  Wait for all threads to finish restoring their context, then release them all to continue on their way.			*/
2779 	/*																*/
2780 	/*    Input:															*/
2781 	/*																*/
2782 	/*	restoreinprog = number of threads, including this, that hasn't called 'wait_for_all_restored' yet			*/
2783 	/*	thread list locked													*/
2784 	/*																*/
2785 	/*    Output:															*/
2786 	/*																*/
2787 	/*	restoreinprog = decremented												*/
2788 	/*	                if now zero, all threads woken and thread list unlocked							*/
2789 	/*																*/
2790 	/********************************************************************************************************************************/
2791 	
2792 	static void wait_for_all_restored (void)
2793 	
2794 	{
2795 	  int rip;
2796 	
2797 	  do rip = mtcp_state_value(&restoreinprog);                         // dec number of threads cloned but not completed longjmp'ing
2798 	  while (!mtcp_state_set (&restoreinprog, rip - 1, rip));
2799 	  if (-- rip == 0) {
2800 	
2801 	    /* raise the signals which were pending for the entire process at the time
2802 	     * of checkpoint. It is assumed that if a signal is pending for all threads
2803 	     * including the ckpt-thread, then it was sent to the process as opposed to
2804 	     * sent to individual threads.
2805 	     */
2806 	    int i;
2807 	    for (i = NSIG; i > 0; --i) {
2808 	      if (sigismember(&sigpending_global, i) == 1) {
2809 	        kill(getpid(), i);
2810 	      }
2811 	    }
2812 	
2813 	    if (callback_restore_virtual_pid_table != NULL) {
2814 	      DPRINTF(("Before callback_restore_virtual_pid_table: Thread:%d \n", 
2815 	               mtcp_sys_kernel_gettid()));
2816 	      (*callback_restore_virtual_pid_table)();
2817 	      DPRINTF(("After callback_restore_virtual_pid_table: Thread:%d \n",
2818 	               mtcp_sys_kernel_gettid()));
2819 	    }
2820 	
2821 	    mtcp_state_futex (&restoreinprog, FUTEX_WAKE, 999999999, NULL);  // if this was last of all, wake everyone up
2822 	
2823 	    // NOTE:  This is last safe moment for hook.  All previous threads
2824 	    //   have executed the "else" and are waiting on the futex.
2825 	    //   This last thread has not yet unlocked the threads: unlk_threads()
2826 	    //   So, no race condition occurs.
2827 	    //   By comparison, *callback_post_ckpt() is called before creating
2828 	    //   additional user threads.  Only motherofall (checkpoint thread existed)
2829 	    /* call weak symbol of this file, possibly overridden by the user's strong symbol  */
2830 	    /* user must compile his/her code with -Wl,-export-dynamic to make it visible */
2831 	    mtcpHookRestart();
2832 	    unlk_threads ();                                                 // ... and release the thread list
2833 	  } else {
2834 	    while ((rip = mtcp_state_value(&restoreinprog)) > 0) {           // otherwise, wait for last of all to wake this one up
2835 	      mtcp_state_futex (&restoreinprog, FUTEX_WAIT, rip, NULL);
2836 	    }
2837 	  }
2838 	}
2839 	
2840 	/********************************************************************************************************************************/
2841 	/*																*/
2842 	/*  Save signal mask and list of pending signals delivery										*/
2843 	/*																*/
2844 	/********************************************************************************************************************************/
2845 	
2846 	static void save_sig_state (Thread *thisthread)
2847 	{
2848 	  /* For checkpoint thread, we want to block delivery of all but some special signals*/
2849 	  if (thisthread == ckpthread) {
2850 	    /* 
2851 	     * For the checkpoint thread, we should not block SIGSETXID which is used
2852 	     * by the setsid family of system calls to change the session leader. Glibc
2853 	     * uses this signal to notify the process threads of the change in session
2854 	     * leader information. This signal is not documented and is used internally
2855 	     * by glibc. It is defined in <glibc-src-root>/nptl/pthreadP.h
2856 	     * screen was getting affected by this since it used setsid to change the
2857 	     * session leaders.
2858 	     */
2859 	#define SIGSETXID (__SIGRTMIN + 1)
2860 	    sigset_t set;
2861 	
2862 	    sigfillset(&set);
2863 	    sigdelset(&set, SIGSETXID);
2864 	
2865 	    if (pthread_sigmask(SIG_SETMASK, &set, NULL) < 0) {
2866 	      mtcp_printf("mtcp %s: error getting sigal mask: %s\n",
2867 	          __FUNCTION__, strerror(errno));
2868 	      mtcp_abort ();
2869 	    }
2870 	  }
2871 	  // Save signal block mask
2872 	  if (pthread_sigmask (SIG_SETMASK, NULL, &(thisthread -> sigblockmask)) < 0) {
2873 	    mtcp_printf("mtcp %s: error getting sigal mask: %s\n",
2874 	                __FUNCTION__, strerror(errno));
2875 	    mtcp_abort ();
2876 	  }
2877 	
2878 	  // Save pending signals
2879 	  sigpending ( &(thisthread->sigpending) );
2880 	}
2881 	
2882 	/********************************************************************************************************************************/
2883 	/*																*/
2884 	/*  Restore signal mask and all pending signals										*/
2885 	/*																*/
2886 	/********************************************************************************************************************************/
2887 	
2888 	static void restore_sig_state (Thread *thisthread)
2889 	{
2890 	  int i;
2891 	  DPRINTF (("mtcp restore_sig_state*: restoring handlers for thread %d\n",
2892 		    thisthread->original_tid));
2893 	  if (pthread_sigmask (SIG_SETMASK, &(thisthread -> sigblockmask), NULL) < 0) {
2894 	    mtcp_printf("mtcp %s: error setting sigal mask: %s\n",
2895 	                __FUNCTION__, strerror(errno));
2896 	    mtcp_abort ();
2897 	  }
2898 	
2899 	  // Raise the signals which were pending for only this thread at the time of checkpoint.
2900 	  for (i = NSIG; i > 0; --i) {
2901 	    if (sigismember(&(thisthread -> sigpending), i)  == 1  &&
2902 	        sigismember(&(thisthread -> sigblockmask), i) == 1 &&
2903 	        sigismember(&(sigpending_global), i) == 0) {
2904 	      raise(i);
2905 	    }
2906 	  }
2907 	}
2908 	
2909 	/********************************************************************************************************************************/
2910 	/*																*/
2911 	/*  Save all signal handlers										*/
2912 	/*																*/
2913 	/********************************************************************************************************************************/
2914 	static void save_sig_handlers (void)
2915 	{
2916 	  int i;
2917 	
2918 	  if (dmtcp_exists) {
2919 	    mtcp_printf("mtcp:%s Illegal function call when running under DMTCP*****\n",
2920 	                __FUNCTION__);
2921 	    // Do a simple return instead of killing the process
2922 	    return;
2923 	    //mtcp_abort();
2924 	  }
2925 	
2926 	  /* Now save all the signal handlers */
2927 	  DPRINTF (("mtcp save_sig_handlers*: saving signal handlers\n"));
2928 	  for (i = NSIG; i > 0; --i) {
2929 	    if (_real_sigaction (i, NULL, &sigactions[i]) < 0) {
2930 	      if (errno == EINVAL)
2931 	         memset (&sigactions[i], 0, sizeof sigactions[i]);
2932 	      else {
2933 	        mtcp_printf ("mtcp save_sig_handlers: error saving signal %d action: %s\n",
2934 	                     i, strerror(errno));
2935 	        mtcp_abort ();
2936 	      }
2937 	    }
2938 	
2939 	    DPRINTF (("mtcp save_sig_handlers*: saving signal handler for %d -> %p\n",
2940 	              i,
2941 	              (sigactions[i].sa_flags & SA_SIGINFO ?
2942 	                 (void *)(sigactions[i].sa_sigaction) :
2943 	                 (void *)(sigactions[i].sa_handler)) ));
2944 	  }
2945 	}
2946 	
2947 	/********************************************************************************************************************************/
2948 	/*																*/
2949 	/*  Restore all saved signal handlers										*/
2950 	/*																*/
2951 	/********************************************************************************************************************************/
2952 	static void restore_sig_handlers (Thread *thisthread)
2953 	{
2954 	  int i;
2955 	
2956 	  if (dmtcp_exists) {
2957 	    mtcp_printf("mtcp:%s Illegal function when running under DMTCP*****\n",
2958 	                __FUNCTION__);
2959 	    // Do a simple return instead of killing the process
2960 	    return;
2961 	    //mtcp_abort();
2962 	  }
2963 	
2964 	  DPRINTF (("mtcp restore_sig_handlers*: restoring signal handlers\n"));
2965 	#if 0
2966 	# define VERBOSE_DEBUG
2967 	#endif
2968 	  for(i = NSIG; i > 0; --i) {
2969 	#ifdef VERBOSE_DEBUG
2970 	    DPRINTF (("mtcp restore_sig_handlers*: restore signal handler for %d -> %p\n",
2971 	              i,
2972 	              (sigactions[i].sa_flags & SA_SIGINFO ?
2973 	                 sigactions[i].sa_sigaction :
2974 	                 sigactions[i].sa_handler) ));
2975 	#endif
2976 	
2977 	    if (_real_sigaction(i, &sigactions[i], NULL) < 0) {
2978 	        if (errno != EINVAL) {
2979 	          mtcp_printf ("mtcp restore_sig_handlers:" \
2980 			       " error restoring signal %d handler: %s\n",
2981 			       i, strerror(errno));
2982 	          mtcp_abort ();
2983 	        }
2984 	    }
2985 	  }
2986 	}
2987 	
2988 	/********************************************************************************************************************************/
2989 	/*																*/
2990 	/*  Save state necessary for TLS restore											*/
2991 	/*  Linux saves stuff in the GDT, switching it on a per-thread basis								*/
2992 	/*																*/
2993 	/********************************************************************************************************************************/
2994 	
/*
 * Capture the calling thread's TLS-related state into *thisthread:
 *   - on i386, the current FS and GS segment register values;
 *   - the GDT TLS descriptor(s), fetched with mtcp_sys_get_thread_area
 *     into thisthread->gdtentrytls[].
 * Any failure to read a descriptor is fatal (mtcp_abort).
 */
static void save_tls_state (Thread *thisthread)

{
  int i, rc;

#ifdef __i386__
  /* Store the 16-bit segment selectors straight into the thread struct. */
  asm volatile ("movw %%fs,%0" : "=m" (thisthread -> fs));
  asm volatile ("movw %%gs,%0" : "=m" (thisthread -> gs));
#endif
#ifdef __x86_64__
  /* Nothing is captured here on x86_64; the stores below are disabled. */
  //asm volatile ("movl %%fs,%0" : "=m" (thisthread -> fs));
  //asm volatile ("movl %%gs,%0" : "=m" (thisthread -> gs));
#endif

  /* Start from all-zero descriptors so unused slots are well defined. */
  memset (thisthread -> gdtentrytls, 0, sizeof thisthread -> gdtentrytls);

  /* On older Linuxes, we must save several GDT entries available to threads. */

#if MTCP__SAVE_MANY_GDT_ENTRIES
  for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i ++) {
    /* entry_number selects which GDT slot get_thread_area should return. */
    thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN].entry_number = i;
    rc = mtcp_sys_get_thread_area (&(thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN]));
    if (rc < 0) {
      mtcp_printf ("mtcp checkpointeverything: error saving GDT TLS entry[%d]: %s\n", i, strerror (mtcp_sys_errno));
      mtcp_abort ();
    }
  }

  /* With newer Linuxes, we just save the one GDT entry indexed by the TLS    */
  /* segment register, so we don't need the GDT_ENTRY_TLS_... definitions.    */
  /* The index of the GDT entry is the saved selector (TLSSEGREG) divided by 8. */
  /* NOTE(review): on x86_64 no selector is stored above, so TLSSEGREG reads  */
  /* whatever the Thread struct already holds -- confirm that is intended.    */

#else
  i = thisthread -> TLSSEGREG / 8;
  thisthread -> gdtentrytls[0].entry_number = i;
  rc = mtcp_sys_get_thread_area (&(thisthread -> gdtentrytls[0]));
  if (rc < 0) {
    mtcp_printf ("mtcp checkpointeverything: error saving GDT TLS entry[%d]: %s\n", i, strerror (mtcp_sys_errno));
    mtcp_abort ();
  }
#endif
}
3036 	
/*
 * Search for the len-byte sequence at 'subarray' among the candidate start
 * positions array[0] .. array[2047].
 *
 *   array    - start of the region to scan
 *   subarray - pattern to look for
 *   len      - pattern length in bytes; must be at least sizeof(int)
 *              (anything smaller, including a negative value, aborts)
 *
 * Returns a pointer to the first match, or NULL when none of the 2048
 * start positions matches.
 *
 * A candidate near the end of the window is compared against bytes beyond
 * array+2048, so the caller must make sure array+2048+len-1 is readable.
 * The comparison is done byte by byte: it stops at the first difference
 * and performs no unaligned or type-punned word loads.
 */
static char *memsubarray (char *array, char *subarray, int len) {
   char *i_ptr;
   int j;

   // Assume subarray length is at least sizeof(int) and < 2048.
   // Validate before touching subarray; the cast keeps the comparison signed
   // so that a negative len is rejected as well.
   if (len < (int)sizeof(int))
     mtcp_abort();
   for (i_ptr = array; i_ptr < array+2048; i_ptr++) {
     for (j = 0; j < len; j++) {
       if (i_ptr[j] != subarray[j])
         break;
     }
     if (j == len)
       return i_ptr;
   }
   return NULL;
}
/*
 * Return the current value of the segment register used for TLS:
 * GS when built for i386, FS when built for x86_64.
 * NOTE(review): when neither __i386__ nor __x86_64__ is defined, tlssegreg
 * is returned without ever being written -- confirm no other architecture
 * builds this file.
 */
static int mtcp_get_tls_segreg(void)
{ mtcp_segreg_t tlssegreg;
#ifdef __i386__
  asm volatile ("movw %%gs,%0" : "=g" (tlssegreg)); /* any general register */
#endif
#ifdef __x86_64__
  asm volatile ("movl %%fs,%0" : "=q" (tlssegreg)); /* q = a,b,c,d for i386; 8 low bits of r class reg for x86_64 */
#endif
  return (int)tlssegreg;
}
3065 	static void *mtcp_get_tls_base_addr(void)
3066 	{
3067 	  struct user_desc gdtentrytls;
3068 	
3069 	#if MTCP__SAVE_MANY_GDT_ENTRIES
3070 	  if (mtcp_get_tls_segreg() / 8 != GDT_ENTRY_TLS_MIN) {
3071 	    mtcp_printf ("mtcp_init: gs %X not set to first TLS GDT ENTRY %X\n",
3072 	                 gs, GDT_ENTRY_TLS_MIN * 8 + 3);
3073 	    mtcp_abort ();
3074 	  }
3075 	#endif
3076 	
3077 	  gdtentrytls.entry_number = mtcp_get_tls_segreg() / 8;
3078 	  if ( mtcp_sys_get_thread_area ( &gdtentrytls ) < 0 ) {
3079 	    mtcp_printf ("mtcp_init: error getting GDT TLS entry: %s\n",
3080 	        strerror (mtcp_sys_errno));
3081 	    mtcp_abort ();
3082 	  }
3083 	  return (void *)(*(unsigned long *)&(gdtentrytls.base_addr));
3084 	}
3085 	
3086 	static void renametempoverperm (void)
3087 	
3088 	{
3089 	  if (rename (temp_checkpointfilename, perm_checkpointfilename) < 0) {
3090 	    mtcp_printf ("mtcp checkpointeverything: error renaming %s to %s: %s\n",  			temp_checkpointfilename, perm_checkpointfilename,
3091 			 strerror (errno));
3092 	    mtcp_abort ();
3093 	  }
3094 	}
3095 	
3096 	/********************************************************************************************************************************/
3097 	/*																*/
3098 	/*  Get current thread struct pointer												*/
3099 	/*  It is keyed by the calling thread's gettid value										*/
3100 	/*  Maybe improve someday by using TLS												*/
3101 	/*																*/
3102 	/********************************************************************************************************************************/
3103 	
3104 	static Thread *getcurrenthread (void)
3105 	
3106 	{
3107 	  int tid;
3108 	  Thread *thread;
3109 	
3110 	  tid = mtcp_sys_kernel_gettid ();
3111 	  lock_threads ();
3112 	  for (thread = threads; thread != NULL; thread = thread -> next) {
3113 	    if (thread -> tid == tid) {
3114 	      unlk_threads ();
3115 	      return (thread);
3116 	    }
3117 	  }
3118 	  mtcp_printf ("mtcp getcurrenthread: can't find thread id %d\n", tid);
3119 	  mtcp_abort ();
3120 	  return thread; /* NOTREACHED : stop compiler warning */
3121 	}
3122 	
3123 	/********************************************************************************************************************************/
3124 	/*																*/
3125 	/*  Lock and unlock the 'threads' list												*/
3126 	/*																*/
3127 	/********************************************************************************************************************************/
3128 	
/*
 * Acquire the lock protecting the 'threads' list.
 * Repeats mtcp_state_set(&threadslocked, 1, 0) until it reports success,
 * calling mtcp_state_futex(FUTEX_WAIT, 1) between attempts.
 * NOTE(review): mtcp_state_set looks like a compare-and-swap (new value 1,
 * expected value 0) -- confirm against its definition.
 */
static void lock_threads (void)

{
  while (!mtcp_state_set (&threadslocked, 1, 0)) {
    /* Wait on the futex word with comparison value 1 before retrying. */
    mtcp_state_futex (&threadslocked, FUTEX_WAIT, 1, NULL);
  }
  RMB; // don't prefetch anything until we have the lock
}
3137 	
/*
 * Release the lock protecting the 'threads' list: issue the write barrier,
 * call mtcp_state_set(&threadslocked, 0, 1), then issue a FUTEX_WAKE with
 * count 1 on the lock word.
 * The result of mtcp_state_set is not checked (see FIXME below).
 */
static void unlk_threads (void)

{
  WMB; // flush data written before unlocking
  // FIXME: Should we be checking return value of mtcp_state_set? Can it ever fail?
  mtcp_state_set(&threadslocked , 0, 1);
  /* Wake up to one waiter blocked in lock_threads(). */
  mtcp_state_futex (&threadslocked, FUTEX_WAKE, 1, NULL);
}
3146 	
3147 	/********************************************************************************************************************************/
3148 	/*																*/
3149 	/*  Read /proc/self/maps line, converting it to an Area descriptor struct							*/
3150 	/*																*/
3151 	/*    Input:															*/
3152 	/*																*/
3153 	/*	mapsfd = /proc/self/maps file, positioned to beginning of a line							*/
3154 	/*																*/
3155 	/*    Output:															*/
3156 	/*																*/
3157 	/*	readmapsline = 0 : was at end-of-file, nothing read									*/
3158 	/*	               1 : read and processed one line										*/
3159 	/*	*area = filled in													*/
3160 	/*																*/
3161 	/*    Note:															*/
3162 	/*																*/
/*	Line from /proc/self/maps is in form:											*/
3164 	/*																*/
3165 	/*	<startaddr>-<endaddrexclusive> rwxs <fileoffset> <devmaj>:<devmin> <inode>    <filename>\n				*/
3166 	/*	all numbers in hexadecimal except inode is in decimal									*/
3167 	/*	anonymous will be shown with offset=devmaj=devmin=inode=0 and no '     filename'					*/
3168 	/*																*/
3169 	/********************************************************************************************************************************/
3170 	
3171 	static int readmapsline (int mapsfd, Area *area)
3172 	
3173 	{
3174 	  char c, rflag, sflag, wflag, xflag;
3175 	  int i, rc;
3176 	  struct stat statbuf;
3177 	  VA devmajor, devminor, devnum, endaddr, inodenum, startaddr;
3178 	
3179 	  c = mtcp_readhex (mapsfd, &startaddr);
3180 	  if (c != '-') {
3181 	    if ((c == 0) && (startaddr == 0)) return (0);
3182 	    goto skipeol;
3183 	  }
3184 	  c = mtcp_readhex (mapsfd, &endaddr);
3185 	  if (c != ' ') goto skipeol;
3186 	  if (endaddr < startaddr) goto skipeol;
3187 	
3188 	  rflag = c = mtcp_readchar (mapsfd);
3189 	  if ((c != 'r') && (c != '-')) goto skipeol;
3190 	  wflag = c = mtcp_readchar (mapsfd);
3191 	  if ((c != 'w') && (c != '-')) goto skipeol;
3192 	  xflag = c = mtcp_readchar (mapsfd);
3193 	  if ((c != 'x') && (c != '-')) goto skipeol;
3194 	  sflag = c = mtcp_readchar (mapsfd);
3195 	  if ((c != 's') && (c != 'p')) goto skipeol;
3196 	
3197 	  c = mtcp_readchar (mapsfd);
3198 	  if (c != ' ') goto skipeol;
3199 	
3200 	  c = mtcp_readhex (mapsfd, &devmajor);
3201 	  if (c != ' ') goto skipeol;
3202 	  area -> offset = devmajor;
3203 	
3204 	  c = mtcp_readhex (mapsfd, &devmajor);
3205 	  if (c != ':') goto skipeol;
3206 	  c = mtcp_readhex (mapsfd, &devminor);
3207 	  if (c != ' ') goto skipeol;
3208 	  c = mtcp_readdec (mapsfd, &inodenum);
3209 	  area -> name[0] = '\0';
3210 	  while (c == ' ') c = mtcp_readchar (mapsfd);
3211 	  if (c == '/' || c == '[') { /* absolute pathname, or [stack], [vdso], etc. */
3212 	    i = 0;
3213 	    do {
3214 	      area -> name[i++] = c;
3215 	      if (i == sizeof area -> name) goto skipeol;
3216 	      c = mtcp_readchar (mapsfd);
3217 	    } while (c != '\n');
3218 	    area -> name[i] = '\0';
3219 	  }
3220 	  if (mtcp_strstartswith(area -> name, nscd_mmap_str)  ||
3221 	      mtcp_strstartswith(area -> name, nscd_mmap_str2) ||
3222 	      mtcp_strstartswith(area -> name, nscd_mmap_str3)) {
3223 	    /* if nscd is active */
3224 	  } else if ( mtcp_strstartswith(area -> name, sys_v_shmem_file) ) {
3225 	    /* System V Shared-Memory segments are handled by DMTCP. */
3226 	  } else if ( mtcp_strendswith(area -> name, " (deleted)") ) {
3227 	    /* Deleted File */
3228 	  } else if (area -> name[0] == '/') {                 /* if an absolute pathname */
3229 	    rc = stat (area -> name, &statbuf);
3230 	    if (rc < 0) {
3231 	      mtcp_printf ("ERROR:  mtcp readmapsline: error %d statting %s\n",
3232 	                   -rc, area -> name);
3233 	      return (1); /* 0 would mean last line of maps; could do mtcp_abort() */
3234 	    }
3235 	    devnum = makedev (devmajor, devminor);
3236 	    if ((devnum != statbuf.st_dev) || (inodenum != statbuf.st_ino)) {
3237 	      mtcp_printf ("ERROR:  mtcp readmapsline: image %s dev:inode %X:%u"
3238 			   " not eq maps %X:%u\n",
3239 	                   area -> name, statbuf.st_dev, statbuf.st_ino,
3240 			   devnum, inodenum);
3241 	      return (1); /* 0 would mean last line of maps; could do mtcp_abort() */
3242 	    }
3243 	  } else {
3244 	    /* Special area like [heap] or anonymous area. */
3245 	  }
3246 	
3247 	  if (c != '\n') goto skipeol;
3248 	
3249 	  area -> addr = (void *)startaddr;
3250 	  area -> size = endaddr - startaddr;
3251 	  area -> prot = 0;
3252 	  if (rflag == 'r') area -> prot |= PROT_READ;
3253 	  if (wflag == 'w') area -> prot |= PROT_WRITE;
3254 	  if (xflag == 'x') area -> prot |= PROT_EXEC;
3255 	  area -> flags = MAP_FIXED;
3256 	  if (sflag == 's') area -> flags |= MAP_SHARED;
3257 	  if (sflag == 'p') area -> flags |= MAP_PRIVATE;
3258 	  if (area -> name[0] == '\0') area -> flags |= MAP_ANONYMOUS;
3259 	
3260 	  return (1);
3261 	
3262 	skipeol:
3263 	  DPRINTF (("ERROR:  mtcp readmapsline*: bad maps line <%c", c));
3264 	  while ((c != '\n') && (c != '\0')) {
3265 	    c = mtcp_readchar (mapsfd);
3266 	    mtcp_printf ("%c", c);
3267 	  }
3268 	  mtcp_printf (">\n");
3269 	  mtcp_abort ();
3270 	  return (0);  /* NOTREACHED : stop compiler warning */
3271 	}
3272 	
3273 	/********************************************************************************************************************************/
3274 	/*																*/
3275 	/*  Do restore from checkpoint file												*/
3276 	/*  This routine is called from the mtcp_restore program to perform the restore							*/
3277 	/*  It resides in the libmtcp.so image in exactly the same spot that the checkpointed process had its libmtcp.so loaded at, so this 	*/
3278 	/*    can't possibly interfere with restoring the checkpointed process								*/
3279 	/*  The restore can't use malloc because that might create memory sections.							*/
3280 	/*  Strerror seems to mess up with its Locale stuff in here too.								*/
3281 	/*																*/
3282 	/*    Input:															*/
3283 	/*																*/
3284 	/*	fd = checkpoint file, positioned just after the CS_RESTOREIMAGE data							*/
3285 	/*																*/
3286 	/********************************************************************************************************************************/
3287 	
/* UNUSED_IN_64_BIT expands to the 'unused' attribute on x86_64 and to nothing
 * elsewhere.  It is applied to STRINGS below, which mtcp_restore_start only
 * references in its non-x86_64 code.
 */
#ifdef __x86_64__
# define UNUSED_IN_64_BIT __attribute__ ((unused))
#else
# define UNUSED_IN_64_BIT
#endif

/* Size in bytes of the STRINGS buffer. */
#define STRINGS_LEN 10000
/* Static buffer into which mtcp_restore_start copies the command file name,
 * argv strings and envp strings (non-x86_64 builds only).
 */
static char UNUSED_IN_64_BIT STRINGS[STRINGS_LEN];
3296 	void mtcp_restore_start (int fd, int verify, pid_t gzip_child_pid,char *ckpt_newname,
3297 				 char *cmd_file, char *argv[], char *envp[] )
3298 	
3299 	{
3300 	#ifndef __x86_64__
3301 	  int i;
3302 	  char *strings = STRINGS;
3303 	#endif
3304 	
3305 	  DEBUG_RESTARTING = 1;
3306 	  /* If we just replace extendedStack by (tempstack+STACKSIZE) in "asm"
3307 	   * below, the optimizer generates non-PIC code if it's not -O0 - Gene
3308 	   */
3309 	  long long * extendedStack = tempstack + STACKSIZE;
3310 	
3311 	  /* Not used until we do longjmps, but get it out of the way now */
3312 	
3313 	  // FIXME: Should we be checking return value of mtcp_state_set? Can it ever fail?
3314 	  mtcp_state_set(&restoreinprog ,1, 0);
3315 	
3316 	  mtcp_sys_gettimeofday (&restorestarted, NULL);
3317 	
3318 	  /* Save parameter away in a static memory location as we're about to wipe the stack */
3319 	
3320 	  mtcp_restore_cpfd   = fd;
3321 	  mtcp_restore_verify = verify;
3322 	  mtcp_restore_gzip_child_pid = gzip_child_pid;
3323 	  // Copy newname to save it too
3324 	  {
3325 	    int i;
3326 	    for(i=0;ckpt_newname[i];i++){
3327 	      mtcp_ckpt_newname[i] = ckpt_newname[i];
3328 	    }
3329 	    mtcp_ckpt_newname[i] = '\0';
3330 	  }
3331 	
3332 	
3333 	#ifndef __x86_64__
3334 	  // Copy command line to libmtcp.so, so that we can re-exec if randomized vdso
3335 	  //   steps on us.  This won't be needed when we use the linker to map areas.
3336 	  strings = STRINGS;
3337 	  // This version of STRCPY copies source string into STRINGS,
3338 	  // and sets destination string to point there.
3339 	# define STRCPY(x,y) \
3340 		if (strings + 256 < STRINGS + STRINGS_LEN) { \
3341 		  mtcp_sys_strcpy(strings,y); \
3342 		  x = strings; \
3343 		  strings += mtcp_sys_strlen(y) + 1; \
3344 		} else { \
3345 		  DPRINTF(("MTCP:  ran out of string space." \
3346 			   "  Trying to continue anyway\n")); \
3347 		}
3348 	  STRCPY(mtcp_restore_cmd_file, cmd_file);
3349 	  for (i = 0; argv[i] != NULL; i++) {
3350 	    STRCPY(mtcp_restore_argv[i], argv[i]);
3351 	  }
3352 	  mtcp_restore_argv[i] = NULL;
3353 	  for (i = 0; envp[i] != NULL; i++) {
3354 	    STRCPY(mtcp_restore_envp[i], envp[i]);
3355 	  }
3356 	  mtcp_restore_envp[i] = NULL;
3357 	#endif
3358 	
3359 	  /* Switch to a stack area that's part of the shareable's memory address range
3360 	   * and thus not used by the checkpointed program
3361 	   */
3362 	
3363 	  asm volatile (CLEAN_FOR_64_BIT(mov %0,%%esp\n\t)
3364 	                /* This next assembly language confuses gdb,
3365 			   but seems to work fine anyway */
3366 	                CLEAN_FOR_64_BIT(xor %%ebp,%%ebp\n\t)
3367 	                : : "g" (extendedStack) : "memory");
3368 	
3369 	  /* Once we're on the new stack, we can't access any local variables or parameters */
3370 	  /* Call the restoreverything to restore files and memory areas                    */
3371 	
3372 	  /* This should never return */
3373 	  mtcp_restoreverything();
3374 	  asm volatile ("hlt");
3375 	}
3376 	
3377 	
3378 	/********************************************************************************************************************************/
3379 	/*																*/
3380 	/*  Restore proper heap														*/
3381 	/*																*/
3382 	/********************************************************************************************************************************/
3383 	static void restore_heap()
3384 	{
3385 	  /*
3386 	   * If the original start of heap is lower than the current end of heap, we
3387 	   * want to mmap the area between mtcp_saved_break and current break. This
3388 	   * happens when the size of checkpointed program is smaller then the size of
3389 	   * mtcp_restart program.
3390 	   */
3391 	  void* current_break = mtcp_sys_brk (NULL);
3392 	  if (current_break > mtcp_saved_break) {
3393 	    DPRINTF(("mtcp finishrestore: Area between mtcp_saved_break:%p and "
3394 	             "Current_break:%p not mapped, mapping it now\n", 
3395 	             mtcp_saved_break, current_break));
3396 	    size_t oldsize = mtcp_saved_break - saved_heap_start;
3397 	    size_t newsize = current_break - saved_heap_start;
3398 	
3399 	    void* addr = mremap (saved_heap_start, oldsize, newsize, 0);
3400 	    if (addr == NULL) {
3401 	      mtcp_printf("mtcp finishrestore: mremap failed to map area between "
3402 	                  "mtcp_saved_break (%p) and current_break (%p)\n",
3403 	                  mtcp_saved_break, current_break);
3404 	      mtcp_abort();
3405 	    }
3406 	  }
3407 	}
3408 	
3409 	/********************************************************************************************************************************/
3410 	/*																*/
3411 	/*  The original program's memory and files have been restored									*/
3412 	/*																*/
3413 	/********************************************************************************************************************************/
3414 	
3415 	static void finishrestore (void)
3416 	{
3417 	  struct timeval stopped;
3418 	  int nnamelen;
3419 	
3420 	  DPRINTF (("mtcp finishrestore*: mtcp_printf works; libc should work\n"));
3421 	
3422 	  restore_heap();
3423 	
3424 	  if ( (nnamelen = strlen(mtcp_ckpt_newname))
3425 	       && strcmp(mtcp_ckpt_newname,perm_checkpointfilename) ) {
3426 	    // we start from different place - change it!
3427 	    DPRINTF(("mtcp finishrestore*: checkpoint file name was changed\n"));
3428 	    if (strlen(mtcp_ckpt_newname) >= MAXPATHLEN) {
3429 	      mtcp_printf("mtcp finishrestore: new ckpt file name (%s) too long (>=512 bytes)\n",
3430 	                  mtcp_ckpt_newname);
3431 	      mtcp_abort();
3432 	    }
3433 	    strncpy(perm_checkpointfilename,mtcp_ckpt_newname,MAXPATHLEN);
3434 	    memcpy(temp_checkpointfilename,perm_checkpointfilename,MAXPATHLEN);
3435 	    strncpy(temp_checkpointfilename + nnamelen, ".temp",MAXPATHLEN - nnamelen);
3436 	  }
3437 	
3438 	  mtcp_sys_gettimeofday (&stopped, NULL);
3439 	  stopped.tv_usec += (stopped.tv_sec - restorestarted.tv_sec) * 1000000 - restorestarted.tv_usec;
3440 	  TPRINTF (("mtcp finishrestore*: time %u uS\n", stopped.tv_usec));
3441 	
3442 	  /* Now we can access all our files and memory that existed at the time of the checkpoint  */
3443 	  /* We are still on the temporary stack, though                                            */
3444 	
3445 	  /* Fill in the new mother process id */
3446 	  motherpid = mtcp_sys_getpid();
3447 	
3448 	  /* Call another routine because our internal stack is whacked and we can't have local vars */
3449 	
3450 	  ///JA: v54b port
3451 	  // so restarthread will have a big stack
3452 	  asm volatile (CLEAN_FOR_64_BIT(mov %0,%%esp)
3453 			: : "g" (motherofall -> savctx.SAVEDSP - 128 ) : "memory");  // -128 for red zone
3454 	  restarthread (motherofall);
3455 	}
3456 	
/*
 * Bring one checkpointed thread back to life.
 *
 *   threadv - the Thread record to restore, passed as void * so this routine
 *             can serve as a clone() entry point.
 *
 * Steps, in order:
 *   1. restore the thread's TLS state;
 *   2. for motherofall only: compute sigpending_global, install the
 *      STOPSIGNAL handler, call set_tid_address, invoke callback_post_ckpt(1)
 *      if set, restore terminal settings, optionally chdir to the saved
 *      working directory, and (when not under DMTCP) restore signal handlers;
 *   3. restore this thread's signal state;
 *   4. create one new kernel thread per entry in thread->children, each
 *      starting in restarthread on the child's saved stack;
 *   5. resume the saved context with setcontext().
 *
 * The return value exists only to satisfy the clone entry-point signature;
 * setcontext is not expected to return, and if it does the process aborts.
 */
static int restarthread (void *threadv)
{
  int rip;
  Thread *child;
  Thread *const thread = threadv;
  /* One argument record on this stack, refilled for every child created below. */
  struct MtcpRestartThreadArg mtcpRestartThreadArg;

  restore_tls_state (thread);


  if (thread == motherofall) {
    // Compute the set of signals which was pending for all the threads at the
    // time of checkpoint. This is a heuristic to compute the set of signals
    // which were pending for the entire process at the time of checkpoint.
    // (Start from the full set and intersect with every thread's pending set.)
    sigset_t tmp;
    sigfillset ( &tmp );
    Thread *th;
    for (th = threads; th != NULL; th = th -> next) {
      sigandset ( &sigpending_global, &tmp, &(th->sigpending) );
      tmp = sigpending_global;
    }

    setup_sig_handler ();

    set_tid_address (&(thread -> child_tid));

    if (callback_post_ckpt != NULL) {
        DPRINTF(("mtcp finishrestore*: before callback_post_ckpt(1=restarting)"
		 " (&%x,%x) \n",
		 &callback_post_ckpt, callback_post_ckpt));
        (*callback_post_ckpt)(1);
        DPRINTF(("mtcp finishrestore*: after callback_post_ckpt(1=restarting)\n"));
    }
    /* Do it once only, in motherofall thread. */

    restore_term_settings();

    if (dmtcp_info_restore_working_directory
        && chdir(saved_working_directory) == -1) {
      perror("chdir");
      mtcp_abort ();
    }

    /* DMTCP restores signal handlers.  But if we are running standalone,
     * MTCP must do it.
     * Because signal handlers are per-process, we only do this once.
     */
    if (!dmtcp_exists)
        restore_sig_handlers(thread);
  }

  restore_sig_state (thread);

  for (child = thread -> children; child != NULL; child = child -> siblings) {

    /* Increment number of threads created but haven't completed their longjmp */
    /* (retry loop: re-read the counter until the update from rip to rip + 1 succeeds) */

    do rip = mtcp_state_value(&restoreinprog);
    while (!mtcp_state_set (&restoreinprog, rip + 1, rip));

    /* Create the thread so it can finish restoring itself.                       */
    /* Don't do CLONE_SETTLS (it'll puke).  We do it later via restore_tls_state. */

    ///JA: v54b port
    errno = -1;

    void *clone_arg = (void *)child;

    /*
     * DMTCP needs to know original_tid of the thread being created by the
     *  following clone() call.
     *
     * Threads are created by using syscall which is intercepted by DMTCP and
     *  the original_tid is sent to DMTCP as a field of MtcpRestartThreadArg
     *  structure. DMTCP will automatically extract the actual argument
     *  (clone_arg -> arg) from clone_arg and will pass it on to the real
     *  clone call.
     *                                                           (--Kapil)
     */
    mtcpRestartThreadArg.arg = (void *)child;
    mtcpRestartThreadArg.original_tid = child -> original_tid;
    clone_arg = (void *) &mtcpRestartThreadArg;

   /*
    * syscall is wrapped by DMTCP when configured with PID-Virtualization.
    * It calls __clone which goes to DMTCP:__clone which then calls MTCP:__clone.
    * DMTCP:__clone checks for tid-conflict with any original tid. If
    * conflict, it replaces the thread with a new one with a new tid.
    * DMTCP:__clone wrapper calls the glibc:__clone if the computation is not
    * in RUNNING state (must be restarting), it calls the mtcp:__clone otherwise.
    * IF No PID-Virtualization, call glibc:__clone because threads created
    * during mtcp_restart should not go to MTCP:__clone; MTCP remembers those
    * threads from the checkpoint image.
    */

    /* If running under DMTCP */
    pid_t tid;
    if (dmtcp_info_pid_virtualization_enabled == 1) {
      /* Wrapped path: the argument is the MtcpRestartThreadArg record. */
      tid = syscall(SYS_clone, restarthread,
          (void *)(child -> savctx.SAVEDSP - 128),  // -128 for red zone
          (child -> clone_flags & ~CLONE_SETTLS) | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
          clone_arg, child -> parent_tidptr, NULL, child -> actual_tidptr);
    } else {
      /* Direct path: the child Thread record itself is the argument. */
      tid = ((*clone_entry)( restarthread,
	    (void *)(child -> savctx.SAVEDSP - 128),  // -128 for red zone
            (child -> clone_flags & ~CLONE_SETTLS) | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
            child, child -> parent_tidptr, NULL, child -> actual_tidptr));
    }

    if (tid < 0) {
      mtcp_printf ("mtcp restarthread: error %d recreating thread\n", errno);
      mtcp_printf ("mtcp restarthread:   clone_flags %X, savedsp %p\n",
                   child -> clone_flags, child -> savctx.SAVEDSP);
      mtcp_abort ();
    }
    DPRINTF((" Parent:%d, tid of newly created thread:%d\n\n", thread->tid, tid));
  }

  /* All my children have been created, jump to the stopthisthread routine just after getcontext call */
  /* Note that if this is the restored checkpointhread, it jumps to the checkpointhread routine       */

  if (mtcp_have_thread_sysinfo_offset())
    mtcp_set_thread_sysinfo(saved_sysinfo);
  ///JA: v54b port
  DPRINTF (("mtcp restarthread*: calling setcontext: thread->tid: %d, original_tid:%d\n",
            thread->tid, thread->original_tid));
  setcontext (&(thread -> savctx)); /* Shouldn't return */
  mtcp_abort ();
  return (0); /* NOTREACHED : stop compiler warning */
}
3587 	
3588 	/********************************************************************************************************************************/
3589 	/*																*/
3590 	/*  Restore the GDT entries that are part of a thread's state									*/
3591 	/*																*/
3592 	/*  The kernel provides set_thread_area system call for a thread to alter a particular range of GDT entries, and it switches 	*/
3593 	/*  those entries on a per-thread basis.  So from our perspective, this is per-thread state that is saved outside user 		*/
3594 	/*  addressable memory that must be manually saved.										*/
3595 	/*																*/
3596 	/********************************************************************************************************************************/
3597 	
/*
 * Reinstall the TLS state saved in *thisthread for the calling thread:
 *   - write motherpid into the pid slot (and, for motherofall, the tid slot)
 *     located at TLS_PID_OFFSET() / TLS_TID_OFFSET() from the saved TLS base;
 *   - hand the saved GDT TLS descriptor(s) back to the kernel with
 *     mtcp_sys_set_thread_area (fatal on failure);
 *   - on i386, reload the FS and GS segment registers;
 *   - record the calling thread's kernel tid in thisthread->tid.
 */
static void restore_tls_state (Thread *thisthread)

{
  int rc;
#if MTCP__SAVE_MANY_GDT_ENTRIES
  int i;
#endif

  /* The assumption that this location holds the pid is stated to have been checked earlier (tls_pid check); */
  /* that check is not visible in this part of the file.                                                    */

  *(pid_t *)(*(unsigned long *)&(thisthread -> gdtentrytls[0].base_addr) + TLS_PID_OFFSET()) = motherpid;

  /* Likewise, we must store the new pid into the mother thread's tid slot (see the tls_tid check) */

  if (thisthread == motherofall) {
    *(pid_t *)(*(unsigned long *)&(thisthread -> gdtentrytls[0].base_addr) + TLS_TID_OFFSET()) = motherpid;
  }

  /* Restore every saved GDT TLS entry (GDT_ENTRY_TLS_MIN .. GDT_ENTRY_TLS_MAX) */

#if MTCP__SAVE_MANY_GDT_ENTRIES
  for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i ++) {
    rc = mtcp_sys_set_thread_area (&(thisthread -> gdtentrytls[i-GDT_ENTRY_TLS_MIN]));
    if (rc < 0) {
      mtcp_printf ("mtcp restore_tls_state: error %d restoring GDT TLS entry[%d]\n", mtcp_sys_errno, i);
      mtcp_abort ();
    }
  }

  /* For newer Linuxes, we just restore the single GDT entry that was saved in gdtentrytls[0] */

#else
  rc = mtcp_sys_set_thread_area (&(thisthread -> gdtentrytls[0]));
  if (rc < 0) {
    mtcp_printf ("mtcp restore_tls_state: error %d restoring GDT TLS entry[%d]\n", mtcp_sys_errno, thisthread -> gdtentrytls[0].entry_number);
    mtcp_abort ();
  }
#endif

  /* Restore the rest of the stuff */

#ifdef __i386__
  /* Reload the 16-bit segment selectors saved in the thread struct. */
  asm volatile ("movw %0,%%fs" : : "m" (thisthread -> fs));
  asm volatile ("movw %0,%%gs" : : "m" (thisthread -> gs));
#endif
#ifdef __x86_64__
/* Don't directly set fs.  It would only set 32 bits, and we just
 *  set the full 64-bit base of fs, using sys_set_thread_area,
 *  which called arch_prctl.
 *asm volatile ("movl %0,%%fs" : : "m" (thisthread -> fs));
 *asm volatile ("movl %0,%%gs" : : "m" (thisthread -> gs));
 */
#endif

  /* The restored thread runs under a new kernel tid; remember it. */
  thisthread -> tid = mtcp_sys_kernel_gettid ();
}
3654 	
3655 	/********************************************************************************************************************************/
3656 	/*																*/
/*  Set the thread's STOPSIGNAL handler.  Threads are sent STOPSIGNAL when they are to suspend execution of the application,	*/
/*  save their state and wait for the checkpointhread to write the checkpoint file.						*/
3659 	/*																*/
3660 	/*    Output:															*/
3661 	/*																*/
3662 	/*	Calling thread will call stopthisthread () when sent a STOPSIGNAL							*/
3663 	/*																*/
3664 	/********************************************************************************************************************************/
3665 	
3666 	static void setup_sig_handler (void)
3667 	{
3668 	  struct sigaction act, old_act;
3669 	
3670 	  act.sa_handler = &stopthisthread;
3671 	  sigfillset(&act.sa_mask);
3672 	  act.sa_flags = SA_RESTART;
3673 	
3674 	  if (_real_sigaction(STOPSIGNAL, &act, &old_act) == -1) {
3675 	    mtcp_printf ("mtcp setupthread: error setting up signal handler: %s\n",
3676 	                 strerror (errno));
3677 	    mtcp_abort ();
3678 	  }
3679 	
3680 	  if ((old_act.sa_handler != SIG_IGN) && (old_act.sa_handler != SIG_DFL) && 
3681 	      (old_act.sa_handler != stopthisthread)) {
3682 	    mtcp_printf ("mtcp setupthread: signal handler %d already in use (%p).\n"
3683 	                 " You may employ a different signal by setting the\n"
3684 	                 " environment variable MTCP_SIGCKPT (or DMTCP_SIGCKPT)"
3685 			 " to the number\n of the signal MTCP should "
3686 	                 "use for checkpointing.\n", STOPSIGNAL, old_act.sa_handler);
3687 	    mtcp_abort ();
3688 	  }
3689 	}
3690 	
3691 	/********************************************************************************************************************************/
3692 	/*                                                                                                                              */
3693 	/*  Sync shared memory pages with backup files on disk                                                                          */
3694 	/*                                                                                                                              */
3695 	/********************************************************************************************************************************/
3696 	static void sync_shared_mem(void)
3697 	{
3698 	  int mapsfd;
3699 	  Area area;
3700 	
3701 	  mapsfd = mtcp_sys_open2 ("/proc/self/maps", O_RDONLY);
3702 	  if (mapsfd < 0) {
3703 	    mtcp_printf ("mtcp sync_shared_memory: error opening /proc/self/maps: %s\n",
3704 	                 strerror (mtcp_sys_errno));
3705 	    mtcp_abort ();
3706 	  }
3707 	
3708 	  while (readmapsline (mapsfd, &area)) {
3709 	    /* Skip anything that has no read or execute permission.  This occurs on one page in a Linux 2.6.9 installation.  No idea why.  This code would also take care of kernel sections since we don't have read/execute permission there.  */
3710 	
3711 	    if (!((area.prot & PROT_READ) || (area.prot & PROT_WRITE))) continue;
3712 	
3713 	    if (!(area.flags & MAP_SHARED)) continue;
3714 	
3715 	    if (strstr(area.name, " (deleted)")) continue;
3716 	
3717 	    DPRINTF(("mtcp sync_shared_memory: syncing %X at %p from %s + %X\n", area.size, area.addr, area.name, area.offset));
3718 	
3719 	    if ( msync(area.addr, area.size, MS_SYNC) < 0 ){
3720 	      mtcp_printf ("mtcp sync_shared_memory: error syncing %X at %p from %s + %X\n", area.size, area.addr, area.name, area.offset);
3721 	      mtcp_abort();
3722 	    }
3723 	  }
3724 	
3725 	  close (mapsfd);
3726 	}