Browse Source

First version of more enhanced sync.sh

experiments/enhance-sync
Sven Karsten 4 months ago
parent
commit
4e5e3047a5
  1. 18
      run.sh
  2. 14
      scripts/run/run.py
  3. 106
      sync.sh

18
run.sh

@ -106,20 +106,13 @@ scp "${last_build_file}" ${user_at_dest}:${dest_folder}/
echo ""
echo ""
# Kick off the synchronization of the output, but only when a
# synchronization destination has been given (sync_to is non-empty).
if [ -n "${sync_to}" ]; then
  printf '%s\n' \
    "## Start synchronization of output ##" \
    "###############################################"
  ./sync.sh "${target_keyword}" "${sync_to}"
  # two empty lines separate this section from the following output
  printf '\n\n'
fi
if [ $inputs_arg -lt $# ]; then
for ((i = ${inputs_arg}; i < $#; i++)); do
echo "## Start the run on target ##"
echo "###############################################"
echo "Start run with input folder ${args[i]}"
./local_scripts/run_${target}.sh ${user_at_dest} ${dest_folder} ${prepare_before_run} "${args[i]}"
ssh -t ${user_at_dest} "touch ${dest_folder}/${args[i]}_started.txt"
echo ""
echo ""
done
@ -128,6 +121,15 @@ else
echo "###############################################"
echo "Start run with input folder"
./local_scripts/run_${target}.sh ${user_at_dest} ${dest_folder} ${prepare_before_run}
ssh -t ${user_at_dest} "touch ${dest_folder}/started.txt"
echo ""
echo ""
fi
# Kick off the synchronization of the output, but only when a
# synchronization destination has been given (sync_to is non-empty).
if [ -n "${sync_to}" ]; then
  printf '%s\n' \
    "## Start synchronization of output ##" \
    "###############################################"
  ./sync.sh "${target_keyword}" "${sync_to}"
  # two empty lines separate this section from the following output
  printf '\n\n'
fi

14
scripts/run/run.py

@ -49,6 +49,8 @@ from get_parallelization_layout import get_parallelization_layout
# read in global settings
global_settings = GlobalSettings(IOW_ESM_ROOT, input_name)
#os.system("touch " + IOW_ESM_ROOT + "/" + global_settings.run_name + "_started.txt")
# remove finished marker
if glob.glob(IOW_ESM_ROOT + "/" + global_settings.run_name + "_finished.txt"):
os.system("rm " + IOW_ESM_ROOT + "/" + global_settings.run_name + "_finished.txt")
@ -101,8 +103,10 @@ for run in range(global_settings.runs_per_job):
if int(start_date) >= int(global_settings.final_date):
print('IOW_ESM job finished integration to final date '+global_settings.final_date)
os.system("touch " + IOW_ESM_ROOT + "/" + global_settings.run_name + "_finished.txt")
os.system("rm "+IOW_ESM_ROOT + "/" + global_settings.run_name + "_running.txt")
sys.exit()
os.system("touch " + IOW_ESM_ROOT + "/" + global_settings.run_name + "_running.txt")
########################################################################
# STEP 2b: ATTEMPT HANDLING: PREPARATION #
@ -121,6 +125,8 @@ for run in range(global_settings.runs_per_job):
if attempt is None:
print("All attempts are exhausted. Abort.")
print("To start from scratch, please remove " + global_settings.attempt_handler_obj_file + ".")
os.system("rm "+IOW_ESM_ROOT + "/" + global_settings.run_name + "_running.txt")
os.system("touch " + IOW_ESM_ROOT + "/" + global_settings.run_name + "_failed.txt")
sys.exit()
# do the customer's preparation here
@ -190,6 +196,8 @@ for run in range(global_settings.runs_per_job):
# if we have no attempt handling and the model crashed we can only stop the entire job
if crashed and (global_settings.attempt_handler is None):
print('IOW_ESM job finally failed integration from '+str(start_date)+' to '+str(end_date))
os.system("rm "+IOW_ESM_ROOT + "/" + global_settings.run_name + "_running.txt")
os.system("touch " + IOW_ESM_ROOT + "/" + global_settings.run_name + "_failed.txt")
sys.exit()
try:
@ -221,6 +229,8 @@ for run in range(global_settings.runs_per_job):
if global_settings.attempt_handler.next_attempt is None:
print('All attempts exhausted. IOW_ESM job finally failed integration from '+str(start_date)+' to '+str(end_date))
os.system("rm "+global_settings.attempt_handler_obj_file) # remove attempt handler state to enable restart
os.system("rm "+IOW_ESM_ROOT + "/" + global_settings.run_name + "_running.txt")
os.system("touch " + IOW_ESM_ROOT + "/" + global_settings.run_name + "_failed.txt")
sys.exit()
print('Go on with next attempt.', flush=True)
@ -230,6 +240,7 @@ for run in range(global_settings.runs_per_job):
# if the attempt failed we throw away the work and start a new job
if resubmit_command is None:
print('No command for resubmitting specified in global_settings.py. Abort.')
os.system("rm "+IOW_ESM_ROOT + "/" + global_settings.run_name + "_running.txt")
sys.exit()
print('Run failed. Try again with command '+resubmit_command+'.', flush=True)
@ -239,6 +250,8 @@ for run in range(global_settings.runs_per_job):
# if the crashed attempt was not evaluated to false, we stop here
if crashed:
print('Error: Attempt '+str(attempt)+' has crashed but has been successfully evaluated. Abort.', flush=True)
os.system("touch " + IOW_ESM_ROOT + "/" + global_settings.run_name + "_failed.txt")
os.system("rm "+IOW_ESM_ROOT + "/" + global_settings.run_name + "_running.txt")
sys.exit()
print(' attempt '+str(attempt)+' succeeded.', flush=True)
@ -281,6 +294,7 @@ postprocess_handling.postprocess_handling(global_settings, models, initial_start
# if this run has successfully finished, mark it
if int(start_date) >= int(global_settings.final_date):
os.system("touch " + IOW_ESM_ROOT + "/" + global_settings.run_name + "_finished.txt")
os.system("rm "+IOW_ESM_ROOT + "/" + global_settings.run_name + "_running.txt")

106
sync.sh

@ -16,10 +16,110 @@ user_at_dst=${user_at_dest}
# short aliases for the destination folder and the full destination
dst_folder=${dest_folder}
dst=${dest}
# One-line synchronization loop kept in a string so that it can be handed to
# "bash -c" (see the screen call that uses $script). Per iteration it runs
# rsync from ${src} into ${dest_folder} and writes the output to
# nohup_<counter>.out; every iteration whose output file has exactly one line
# increments "terminate", and the loop ends once "terminate" exceeds
# ${timeout}. Between iterations it sleeps for one hour.
# ${src}, ${dest_folder} and ${timeout} are expanded here; the backslash-escaped
# expansions are kept literal at this point.
# NOTE(review): the triple escaping presumably survives one more level of
# double quoting before the loop is executed -- confirm against the caller.
script="cd ${dest_folder}; counter=0; terminate=0; while [ 1 ]; do let counter=counter+1; nohup rsync -r -i -u -l ${src}/* ${dest_folder}/ > nohup_\\\${counter}.out 2>&1; if [ \\\`cat nohup_\\\${counter}.out | wc -l\\\` -eq 1 ]; then let terminate=terminate+1; fi; if [ \\\$terminate -gt ${timeout} ]; then break; fi; sleep 3600; done"
cat <<eof > local_sync_script.sh
my_PID=\$\$
# Append the message given as first argument as one line to sync_log.txt
# in the current working directory.
function log(){
  # the argument is quoted so that the message is written verbatim
  # (no word splitting and no pathname expansion of its content)
  echo "\$1" >> sync_log.txt
}
# Watchdog that is started in the background by the generated script.
# It polls the source host until no model run is active anymore, then stops
# the hourly synchronization loop, triggers one final synchronization and
# finally kills the children of the screen session.
# A run counts as active while there are more *started.txt markers than
# *finished.txt plus *failed.txt markers in the source folder.
# The positional parameters in the file names below are not escaped, hence
# they are substituted already when this script is generated.
function check_for_running() {
  cd ${dest_folder}
  log "Check if any model is still running"
  while [ 1 ]; do
    # Count the marker files on the source host. Everything but digits is
    # removed from the answer: ssh -t may allocate a pseudo-terminal, and then
    # the count ends with a carriage return that breaks the arithmetic below.
    started=\`ssh -t ${user_at_src} "ls ${src_folder}/*started.txt 2> /dev/null | wc -l" | tr -cd '0-9'\`
    finished=\`ssh -t ${user_at_src} "ls ${src_folder}/*finished.txt 2> /dev/null | wc -l" | tr -cd '0-9'\`
    failed=\`ssh -t ${user_at_src} "ls ${src_folder}/*failed.txt 2> /dev/null | wc -l" | tr -cd '0-9'\`
    let running=started-finished-failed
    # "less or equal" instead of "equal": stale finished/failed markers can
    # outnumber the started markers and must not keep this loop alive forever
    if [ "\$running" -le 0 ]; then
      # if no model is running we stop
      log "No model running anymore"
      break
    fi
    sleep 10
  done
  log "Kill running snchronization to stop the hourly loop and remove PID file"
  # the PID file is written by the hourly loop; wait until it exists
  while [ ! -f PID-$1-$2 ]; do
    sleep 5
  done
  # kill the children of the process whose PID is stored in the PID file;
  # removing the file afterwards tells the hourly loop to stop
  pkill -P \`cat PID-$1-$2\`
  rm PID-$1-$2
  sleep 5
  log "Perform a last snychronization"
  ./rsync-$1-$2.sh last
  log "Kill the synchronization as such. Goodbye."
  # look up the PID of the screen session by its command line and kill its children
  screen_PID=\`ps ux | grep "SCREEN -dSm rsync-$1-$2" | grep -v "grep" | awk '{print \$2}'\`
  pkill -P \${screen_PID}
}
# ---- main part of the generated synchronization script ----
# work inside the destination folder and remove the log file of a previous call
cd ${dest_folder}
rm sync_log.txt
log "Create script that performs the actual synchronization"
# The helper script written here prints the PID of its own shell first, takes
# a counter as first argument and stores the rsync output in sync_<counter>.out.
# The folder named work is excluded from the synchronization.
cat <<EOF > rsync-$1-$2.sh
echo \\\$BASHPID
counter=\\\$1
rsync -r -i -u -l --exclude 'work' ${src}/* ${dest_folder}/ > sync_\\\${counter}.out 2>&1
EOF
log "Make file executable"
chmod u+x rsync-$1-$2.sh
log "Start checking if synchronization is still needed in background"
# the watchdog runs concurrently with the loop below
check_for_running &
# wait a little bit
#sleep 30
log "Start synchronization loop that is called every hour"
counter=0
while [ 1 ]; do
let counter=counter+1
# run the helper in the foreground; its standard output (the PID it prints)
# is redirected into the PID file that check_for_running reads
./rsync-$1-$2.sh \$counter > PID-$1-$2
sleep 5
# if the PID file does not exist anymore it means that check_for_running has killed the synchronization
# and we can stop here
if [ ! -f PID-$1-$2 ]; then
log "check_for_running has killed the hourly synchronization"
break
fi
# together with the 5 seconds above this adds up to one hour between two calls
sleep 3595
# if file is removed during sleeping and last synchronization is still running we stop here anyway
if [ ! -f PID-$1-$2 ]; then
log "check_for_running has killed the hourly synchronization"
break
fi
done
log "Wait for last synchronization..."
# wait for the background job (check_for_running) to finish
wait
log "...done."
eof
#cript="cd ${dest_folder}; counter=0; terminate=0; while [ 1 ]; do let counter=counter+1; nohup rsync -r -i -u -l ${src}/* ${dest_folder}/ > nohup_\${counter}.out 2>&1; if [ \`cat nohup_\${counter}.out | wc -l\` -eq 1 ]; then let terminate=terminate+1; fi; if [ \$terminate -gt ${timeout} ]; then break; fi; sleep 3600; done"
# print the commands for creating the destination folder and for starting the
# screen session (two quoting variants of the latter) before executing them
echo ssh -t ${user_at_dst} \"mkdir -p ${dst_folder}\"
echo ssh -t ${user_at_dst} \"screen -dSm \\\"rsync-$1-$2\\\" bash -c \\\"$script\\\"";" sleep 1\"
echo ssh -t ${user_at_dst} \"screen -dSm \"rsync-$1-$2\" bash -c \"$script\"";" sleep 1\"
# make sure the destination folder exists on the destination host
ssh -t ${user_at_dst} "mkdir -p ${dst_folder}"
# start a detached screen session named rsync-$1-$2 that runs the one-line
# loop stored in $script
# NOTE(review): the call further below starts a second screen session with the
# same name that runs sync_script.sh; in this diff view this line looks like
# the pre-change launcher -- confirm that only one of the two should remain.
ssh ${user_at_dst} -t "screen -dSm \"rsync-$1-$2\" bash -c \"$script\"; sleep 1"
# copy the generated script to the destination and make it executable there
scp local_sync_script.sh ${dest}/sync_script.sh
ssh -t ${user_at_dst} "chmod u+x ${dst_folder}/sync_script.sh"
# run the copied script inside a detached screen session on the destination host
ssh ${user_at_dst} -t "cd ${dst_folder}; screen -dSm \"rsync-$1-$2\" bash -c ./sync_script.sh; sleep 1"
# the local copy of the generated script is not needed anymore
rm local_sync_script.sh
Loading…
Cancel
Save