3

I have a script run and monitored by svscan/supervise. When the last

luces-hf/run:

#!/bin/bash
# Service "run" script (luces-hf/run): streams a long-lived HTTP response
# and hands matching lines to luces.sh.
# Print each script line as the shell reads it.
set -v
# Proxy setting exported to the commands below (curl reads http_proxy).
export http_proxy=[redacted]
# Pipeline stages:
#   curl -N -s           : silent request with output buffering disabled
#   grep --line-buffered : keep only lines containing "event", flushed per line
#   xargs -n 1 luces.sh  : run luces.sh with at most one argument per call;
#                          http_proxy is set to empty for xargs and its children
# NOTE(review): xargs splits its input on whitespace, so a line such as
# "event: HF" presumably causes two luces.sh invocations — confirm intended.
# NOTE(review): the pipeline only ends when curl ends or a stage fails on a
# closed pipe; a finished luces.sh child does not by itself stop it.
curl -N -s https://[redacted] |\
grep --line-buffered "event" | http_proxy='' xargs -n 1 luces.sh

It seems that when the script launches luces.sh, and luces.sh hangs and remains a zombie, the whole pipeline keeps running. I would like the pipeline to fail instead, so that the script finishes and gets restarted by supervise.

This is the process tree:

24788 ?        Ss     0:00 /bin/sh /usr/bin/svscanboot
24790 ?        S      0:01  \_ svscan /etc/service
24794 ?        S      0:00  |   \_ supervise luces-hf
29876 ?        S      0:00  |   |   \_ /bin/bash ./run
29877 ?        S      0:00  |   |       \_ curl -N -s https://[redacted][long request, never ending; server-sent events, one line at a time]
29878 ?        S      0:00  |   |       \_ grep --line-buffered event
29879 ?        S      0:00  |   |       \_ xargs -n 1 luces.sh
 5885 ?        Z      0:00  |   |           \_ [luces.sh] <defunct>

EDIT: luces.sh does three HTTP requests using curl.

EDIT : strace output for the xargs command in the script:

execve("/usr/bin/xargs", ["xargs", "-n", "1", "luces.sh"], [/* 6 vars */]) = 0
brk(0)                                  = 0x1d07000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fad73c49000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=43357, ...}) = 0
mmap(NULL, 43357, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fad73c3e000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\37\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1840928, ...}) = 0
mmap(NULL, 3949248, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fad73664000
mprotect(0x7fad7381f000, 2093056, PROT_NONE) = 0
mmap(0x7fad73a1e000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1ba000) = 0x7fad73a1e000
mmap(0x7fad73a24000, 17088, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fad73a24000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fad73c3d000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fad73c3b000
arch_prctl(ARCH_SET_FS, 0x7fad73c3b740) = 0
mprotect(0x7fad73a1e000, 16384, PROT_READ) = 0
mprotect(0x609000, 4096, PROT_READ)     = 0
mprotect(0x7fad73c4b000, 4096, PROT_READ) = 0
munmap(0x7fad73c3e000, 43357)           = 0
brk(0)                                  = 0x1d07000
brk(0x1d28000)                          = 0x1d28000
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
mmap(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fad73c1a000
rt_sigaction(SIGCHLD, {SIG_DFL, [CHLD], SA_RESTORER|SA_RESTART, 0x7fad7369ad40}, {SIG_DFL, [], 0}, 8) = 0
fstat(0, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fad73c48000
read(0, "event: HF\n", 4096)            = 10
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 16952
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 16952
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=16952, si_status=0, si_utime=0, si_stime=0} ---
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 16968
read(0, 0x7fad73c48000, 4096)           = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=16968, si_status=0, si_utime=0, si_stime=0} ---
read(0, "event: HF\n", 4096)            = 10
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 16968
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 17023
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17023
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17023, si_status=0, si_utime=0, si_stime=0} ---
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 17040
read(0, 0x7fad73c48000, 4096)           = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17040, si_status=0, si_utime=0, si_stime=0} ---
read(0, "event: HF\n", 4096)            = 10
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17040
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 17065
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17065
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17065, si_status=0, si_utime=0, si_stime=0} ---
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 17087
read(0, 0x7fad73c48000, 4096)           = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17087, si_status=0, si_utime=0, si_stime=0} ---
read(0, "event: HF\n", 4096)            = 10
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17087
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 17127
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17127
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17127, si_status=0, si_utime=0, si_stime=0} ---
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 17147
read(0, 0x7fad73c48000, 4096)           = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17147, si_status=0, si_utime=0, si_stime=0} ---
read(0, "event: HF\n", 4096)            = 10
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17147
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 17274
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17274
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17274, si_status=0, si_utime=0, si_stime=0} ---
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fad73c3ba10) = 17293
read(0, 0x7fad73c48000, 4096)           = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17293, si_status=0, si_utime=0, si_stime=0} ---
marianov
  • 173
  • 5
  • 1
    The pipe isn't ending because curl is still alive; xargs will keep running as long as it has input. I don't understand why xargs isn't reaping its child though. Can you run `strace -o xargs.strace xargs …` and see what's going on? – Gilles 'SO- stop being evil' Sep 01 '15 at 22:27
  • @Gilles: added the strace (I included it in the script, replacing the call to xargs; was that what you suggested?) – marianov Sep 04 '15 at 17:13
  • The trace is what I asked for. So xargs is hanging when its `read` call is interrupted by a child dying. This looks normal, except that I'd expect the trace to show another, unfinished call to `read`. Has `curl … | grep …` produced more lines already? If it hasn't, the system state would be perfectly normal apart from the missing unfinished call to `read`: `xargs` is sitting around waiting for the next input line. – Gilles 'SO- stop being evil' Sep 04 '15 at 18:10

0 Answers0