1    	/*****************************************************************************
2    	 *   Copyright (C) 2006-2010 by Michael Rieker, Jason Ansel, Kapil Arya, and *
3    	 *                                                            Gene Cooperman *
4    	 *   mrieker@nii.net, jansel@csail.mit.edu, kapil@ccs.neu.edu, and           *
5    	 *                                                          gene@ccs.neu.edu *
6    	 *                                                                           *
7    	 *   This file is part of the MTCP module of DMTCP (DMTCP:mtcp).             *
8    	 *                                                                           *
9    	 *  DMTCP:mtcp is free software: you can redistribute it and/or              *
10   	 *  modify it under the terms of the GNU Lesser General Public License as    *
11   	 *  published by the Free Software Foundation, either version 3 of the       *
12   	 *  License, or (at your option) any later version.                          *
13   	 *                                                                           *
14   	 *  DMTCP:dmtcp/src is distributed in the hope that it will be useful,       *
15   	 *  but WITHOUT ANY WARRANTY; without even the implied warranty of           *
16   	 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
17   	 *  GNU Lesser General Public License for more details.                      *
18   	 *                                                                           *
19   	 *  You should have received a copy of the GNU Lesser General Public         *
20   	 *  License along with DMTCP:dmtcp/src.  If not, see                         *
21   	 *  <http://www.gnu.org/licenses/>.                                          *
22   	 *****************************************************************************/
23   	
24   	/* To test:  gcc -DSTANDALONE THIS_FILE; ./a.out */
25   	
26   	#include <fcntl.h>
27   	#include <stdio.h>
28   	#include <stdlib.h>
29   	#include <sys/types.h>
30   	#include <sys/stat.h>
31   	#include <fcntl.h>
32   	#include <string.h>
33   	#include <sys/utsname.h> /* uname */
34   	#include <sys/time.h>
35   	#include <sys/resource.h> /* getrlimit, setrlimit */
36   	#include <sys/personality.h>
37   	#define ADDR_NO_RANDOMIZE  0x0040000  /* In case of old Linux, not defined */
38   	#define ADDR_COMPAT_LAYOUT 0x0200000  /* Not yet defined as of Ubuntu 8.04 */
39   	#include <unistd.h>
40   	#include <errno.h>
41   	#include <elf.h> // For value of AT_SYSINFO, Elf??_auxv_t
42   	#include "mtcp_sys.h" // For CLEAN_FOR_64BIT
43   	#include "mtcp_internal.h" // For CLEAN_FOR_64BIT and MAXPATHLEN
44   	
// Address-space randomization (/proc/sys/kernel/randomize_va_space) is
// turned off for this process.  For a _given_ binary, this fixes the
// address of the vdso.  Luckily, on restart, we get our vdso from
// mtcp_restart.  So, we need to maintain two vdso segments:  one from the
// user binary and one from each invocation of mtcp_restart during iterated
// restarts.
//
// Defining NO_RAND_VA_PERSONALITY compiles in the code path of
// mtcp_check_vdso_enabled() that sets ADDR_NO_RANDOMIZE through personality()
// and then re-executes the program.
# define NO_RAND_VA_PERSONALITY 1
52   	
//======================================================================
// Get and set AT_SYSINFO for purposes of patching address in vdso

#ifdef __x86_64__
# define ELF_AUXV_T Elf64_auxv_t
# define UINT_T uint64_t
#else
# define ELF_AUXV_T Elf32_auxv_t
# define UINT_T uint32_t
#endif

// Returns the value of AT_SYSINFO found in the kernel's auxiliary vector
// (auxv), or NULL if auxv contains no AT_SYSINFO entry.
// Ideally:  mtcp_at_sysinfo() == *mtcp_addr_sysinfo()
//
// The auxv is located by starting from the environment pointer array:
//   &environ[-1]   is expected to be &argv[argc]   (a NULL slot),
//   environ[0..]   are the envp entries, terminated by NULL,
//   and the auxv entries start right after that terminating NULL.
// The value of `environ' seen on the first call is cached and reused, so it
// is best if the first call happens early, before the user makes problems
// by moving environment variables, putting in a weird stack, etc.
//
// Calls exit(1) if the memory around `environ' does not look like the
// initial process stack.
extern char **environ;
static void * get_at_sysinfo() {
  // Sanity bound, counted in pointer-sized words above &argv[argc], for
  // where an argv/envp string may live.  The same bound is used for both
  // checks:  the envp strings sit above the argv strings, so a tighter
  // bound for envp than for argv would reject valid large environments.
  enum { MAX_STRING_DISTANCE = 100000 };
  void **stack;
  int i;
  ELF_AUXV_T *auxv;
  static char **my_environ = NULL;

  if (my_environ == NULL)
    my_environ = environ;

  stack = (void **)&my_environ[-1];
  if (*stack != NULL) {
    mtcp_printf("This should be argv[argc] == NULL and it's not.\n"
	"NO &argv[argc], stack: %p\n", stack);
    exit(1);
  }

  // stack[-1] should be argv[argc-1]
  if ( (void **)stack[-1] < stack
       || (void **)stack[-1] > stack + MAX_STRING_DISTANCE ) {
    mtcp_printf("candidate argv[argc-1] failed consistency check\n");
    exit(1);
  }

  // stack[1], stack[2], ... are envp[0], envp[1], ... up to the NULL
  for (i = 1; stack[i] != NULL; i++) {
    if ( (void **)stack[i] < stack
         || (void **)stack[i] > stack + MAX_STRING_DISTANCE ) {
      mtcp_printf("candidate envp[%d] failed consistency check\n", i - 1);
      exit(1);
    }
  }
  stack = &stack[i+1];

  // Now stack is beginning of auxiliary vector (auxv)
  // auxv->a_type = AT_NULL marks the end of auxv
  for (auxv = (ELF_AUXV_T *)stack; auxv->a_type != AT_NULL; auxv++) {
    if ( auxv->a_type == (UINT_T)AT_SYSINFO ) {
      mtcp_printf("AT_SYSINFO      (at 0x%p) is:  0x%lx\n",
        &auxv->a_un.a_val, auxv->a_un.a_val);
      return (void *)auxv->a_un.a_val;
    }
  }
  return NULL;  /* Couldn't find AT_SYSINFO */
}
129  	
130  	// From glibc-2.7: glibc-2.7/nptl/sysdeps/i386/tls.h
131  	// SYSINFO_OFFSET given by:
132  	//  #include "glibc-2.7/nptl/sysdeps/i386/tls.h"
133  	//  tcbhead_t dummy;
134  	//  #define SYSINFO_OFFSET &(dummy.sysinfo) - &dummy
135  	
136  	// Some reports say it was 0x18 in past.  Should we also check that?
137  	#define DEFAULT_SYSINFO_OFFSET "0x10"
138  	
139  	int mtcp_have_thread_sysinfo_offset() {
140  	#ifdef RESET_THREAD_SYSINFO
141  	  static int result = -1; // Reset to 0 or 1 on first call.
142  	#else
143  	  static int result = 0;
144  	#endif
145  	  if (result == -1) {
146  	    void * sysinfo;
147  	    asm (CLEAN_FOR_64_BIT(mov %%gs:) DEFAULT_SYSINFO_OFFSET ", %0\n\t"
148  		 : "=r" (sysinfo));
149  	    result = (sysinfo == get_at_sysinfo());
150  	  }
151  	  return result;
152  	}
153  	
154  	// AT_SYSINFO is what kernel calls sysenter address in vdso segment.
155  	// Kernel saves it for each thread in %gs:SYSINFO_OFFSEt ??
156  	//  as part of kernel TCB (thread control block) at beginning of TLS ??
157  	void *mtcp_get_thread_sysinfo() {
158  	  void *sysinfo;
159  	  asm volatile (CLEAN_FOR_64_BIT(mov %%gs:) DEFAULT_SYSINFO_OFFSET ", %0\n\t"
160  	                : "=r" (sysinfo) );
161  	  return sysinfo;
162  	}
163  	
164  	void mtcp_set_thread_sysinfo(void *sysinfo) {
165  	  asm volatile (CLEAN_FOR_64_BIT(mov %0, %%gs:) DEFAULT_SYSINFO_OFFSET "\n\t"
166  	                : : "r" (sysinfo) );
167  	}
168  	
//======================================================================
// Used to check if vdso is an issue

#define MAX_ARGS 500

// Reads `filename' (a file of NUL-separated strings, e.g.
// /proc/self/cmdline) and stores a pointer to each string in vector[0..],
// followed by a terminating NULL pointer.
//
//   vector   - must have room for at least MAX_ARGS+1 pointers
//   filename - path of the file to read
//
// Returns 0 on success, -1 if reading the file failed.  Calls exit(1) if
// the file cannot be opened.
//
// At most MAX_ARGS strings and at most 10000 bytes of the file are used; a
// longer file is silently truncated.  The pointers stored in vector[] refer
// to a static buffer owned by this function:  they stay valid after return
// (so they can be handed to execve()) until the next call overwrites them.
static int write_args(char **vector, char *filename) {
  enum { ARGS_BUF_SIZE = 10000 };
  // static, not automatic:  vector[] keeps pointing here after we return.
  static char strings[ARGS_BUF_SIZE + 1];
  char *str = strings;
  ssize_t i;
  int fd;

  if (-1 == (fd = open(filename, O_RDONLY))) {
    perror("open");
    exit(1);
  }
  ssize_t num_read = mtcp_read_all(fd, strings, ARGS_BUF_SIZE);
  close(fd);

  if (num_read < 0)
    return -1;

  // Guarantee that the last string is terminated inside the buffer, even
  // when the file was truncated or does not end in a NUL byte.
  strings[num_read] = '\0';

  for (i = 0; str - strings < num_read && i < MAX_ARGS; i++) {
    vector[i] = str;
    while (*str++ != '\0')
      ;
  }
  vector[i] = NULL;
  return 0;
}
198  	
/* Decodes the environment variable MTCP_OLDPERS, a string of binary digits
 * with the most significant digit first, and returns its value.  Every
 * character other than '1' counts as a 0 digit.
 * Prints an internal-error message and calls exit(1) if MTCP_OLDPERS is
 * not set.
 */
static unsigned long getenv_oldpers() {
    const char *digit = getenv("MTCP_OLDPERS");
    unsigned long value;

    if (digit == NULL) {
      mtcp_printf("MTCP: internal error: %s:%d\n", __FILE__, __LINE__);
      exit(1);
    }
    for (value = 0; *digit != '\0'; digit++) {
      value <<= 1;
      if (*digit == '1')
        value += 1;
    }
    return value;
}
210  	
/* Stores `oldpers' in the environment variable MTCP_OLDPERS as a string of
 * sizeof(int)*8 binary digits, most significant digit first (the format
 * that getenv_oldpers() decodes).  Any existing value is overwritten.
 * Returns the result of setenv():  0 on success, -1 on error.
 */
static int setenv_oldpers(int oldpers) {
    static char oldpers_str[sizeof(oldpers)*8+1];
    /* Shift an unsigned copy:  right-shifting a negative int is
     * implementation-defined. */
    unsigned int bits = (unsigned int)oldpers;
    /* Start at the LAST valid index; the terminator must stay in bounds. */
    size_t i = sizeof(oldpers_str) - 1;

    oldpers_str[i] = '\0';
    while (i > 0) {
      oldpers_str[--i] = ((bits & 1u) ? '1' : '0');
      bits >>= 1;
    }
    return setenv("MTCP_OLDPERS", oldpers_str, 1);
}
221  	
/* Turn off randomize_va (by re-exec'ing) or warn user if vdso_enabled is on.
 *
 * With NO_RAND_VA_PERSONALITY defined:
 *  - If the personality already has ADDR_NO_RANDOMIZE set, restore the
 *    personality recorded in MTCP_OLDPERS (when that variable is present),
 *    remove the variable and return.
 *  - Otherwise set ADDR_NO_RANDOMIZE, cap RLIMIT_STACK, record the old
 *    personality in MTCP_OLDPERS and re-execute /proc/self/exe with the
 *    arguments read from /proc/self/cmdline.  On success this call does not
 *    return.  If the re-exec cannot be done, the old personality is put
 *    back and the check below is made instead.
 *
 * Fallback check:  if /proc/sys/vm/vdso_enabled exists and reads '1', and
 * mtcp_have_thread_sysinfo_offset() reports 0, print instructions for the
 * user and exit(1).
 */
void mtcp_check_vdso_enabled() {
  /* Initialized so that an empty read leaves a defined value (not '1'). */
  char buf[1] = { '\0' };
#ifdef RESET_THREAD_SYSINFO
  get_at_sysinfo(); /* Initialize pointer to environ for later calls */
#endif

#ifdef NO_RAND_VA_PERSONALITY
  /* Set ADDR_NO_RANDOMIZE bit;
   * In Ubuntu Linux 2.6.24 kernel, This places vdso in  a different
   * fixed position in mtcp_init (since /lib/ld-2.7.so is inserted
   * above [vdso] and below [stack].  mtcp_restart has no /lib/ld-2.7.so.
   */
  int pers = personality(0xffffffffUL); /* get current personality */
  if (pers & ADDR_NO_RANDOMIZE) { /* if no addr space randomization ... */
    if (getenv("MTCP_OLDPERS") != NULL) {
      personality(getenv_oldpers()); /* restore orig pre-exec personality */
      if (-1 == unsetenv("MTCP_OLDPERS"))
        perror("unsetenv");
    }
    return; /* skip the rest */
  }

  /* Address space randomization is on (the other case returned above). */
  {
    unsigned long oldpers = pers;
    /* then turn off randomization and (just in case) remove ADDR_COMPAT_LAYOUT*/
    personality((pers | ADDR_NO_RANDOMIZE) & ~ADDR_COMPAT_LAYOUT);
    if ( ADDR_NO_RANDOMIZE & personality(0xffffffffUL) ) { /* if it's off now */
      char runtime[MAXPATHLEN+1];
      int i = readlink("/proc/self/exe", runtime, MAXPATHLEN);
      if ( i != -1 ) {
        char *argv[MAX_ARGS+1];
        extern char **environ;
        struct rlimit rlim;
        int stack_limit_ok = 0;

        /* "make" has the capability to raise RLIMIT_STACK to infinity.
         * This is a problem.  When the kernel (2.6.24 or later) detects this,
         * it falls back to an older "standard" memory layout for libs.
         *
         * "standard" memory layout puts [vdso] segment in low memory, which
         *  MTCP currently doesn't handle properly.
         *
         * glibc:nptl/sysdeps/<ARCH>/pthreaddef.h defines the default stack for
         *  pthread_create to be ARCH_STACK_DEFAULT_SIZE if rlimit is set to be
         *  unlimited.  Values tried here earlier were 32 MB on x86_64 and
         *  2 MB otherwise.
         *
         * XXX: TODO: Due to some reason, manual restart of checkpointed
         *  processes fails if ARCH_STACK_DEFAULT_SIZE is less than 256MB. It
         *  has to do with VDSO. The location of VDSO section conflicts with the
         *  location of process libraries and hence it is unmapped which causes
         *  failure during the restarting phase. If we set the stack limit to
         *  256 MB or higher, we do not see this bug.
         * It should also be noted that the process will call setrlimit to set
         *  the resource limits to their pre-checkpoint values.
         */
#define ARCH_STACK_DEFAULT_SIZE (256 * 1024 * 1024)

        if (-1 != getrlimit(RLIMIT_STACK, &rlim)) {
          rlim.rlim_cur = rlim.rlim_max = ARCH_STACK_DEFAULT_SIZE;
          /* Results of these two calls are deliberately not checked:  what
           * matters is only whether the hard limit is still infinite. */
          setrlimit(RLIMIT_STACK, &rlim);
          getrlimit(RLIMIT_STACK, &rlim);
          stack_limit_ok = (rlim.rlim_max != RLIM_INFINITY);
        }
        if (!stack_limit_ok) {
          mtcp_printf("Failed to reduce RLIMIT_STACK"
                      " below RLIM_INFINITY\n");
          exit(1);
        }

        /* Re-exec only if argv was really filled in; otherwise execve()
         * would be handed an uninitialized argument vector. */
        if (0 == write_args(argv, "/proc/self/cmdline")) {
          runtime[i] = '\0';   /* readlink() does not NUL-terminate */
          setenv_oldpers(oldpers);
          execve(runtime, argv, environ);
        }
      }
      if (-1 == personality(oldpers)) /* reset if we couldn't exec */
        perror("personality");
    }
  }
#endif

  /* We failed to turn off address space rand., but maybe vdso is not enabled
   * On newer kernels, there is no /proc/sys/vm/vdso_enabled, we will cross our
   *  fingers and continue anyways.
   */
  FILE * stream = fopen("/proc/sys/vm/vdso_enabled", "r");
  if (stream == NULL)
    return;  /* In older kernels, if it doesn't exist, it can't be enabled. */
  clearerr(stream);
  if (fread(buf, sizeof(buf[0]), 1, stream) < 1) {
    if (ferror(stream)) {
      perror("fread");
      exit(1);
    }
  }
  if (-1 == fclose(stream)) {
    perror("fclose");
    exit(1);
  }
  /* This call also caches AT_SYSINFO for use by mtcp_set_thread_sysinfo() */
  if (mtcp_have_thread_sysinfo_offset())
    return;
  if (buf[0] == '1') {
    mtcp_printf("\n\n\nPROBLEM:  cat /proc/sys/vm/vdso_enabled returns 1\n"
    "  Further, I failed to find SYSINFO_OFFSET in TLS.\n"
    "  Can't work around this problem.\n"
    "  Please run this program again after doing as root:\n"
    "                                    echo 0 > /proc/sys/vm/vdso_enabled\n"
    "  Alternatively, upgrade kernel to one that allows for a personality\n"
    "  with ADDR_NO_RANDOMIZE in /usr/include/linux/personality.h.\n");
    exit(1);
  }
}
340  	
#ifdef STANDALONE
/* Stand-alone test driver (see "To test" near the top of the file):  runs
 * the vdso check, then has a shell print its stack size limit.
 */
int main() {
  static const char show_stack_limit[] = "echo ulimit -s | sh";

  mtcp_check_vdso_enabled();
  system(show_stack_limit);
  return 0;
}
#endif