check out code and set up directories:
cd /nobackupp2/dmenemen/llc_4320/regions/CalSWOT/run_template4
cvs co -r checkpoint65v MITgcm_code
cd ~/llc_4320/regions/CalSWOT/run_template4/MITgcm
mkdir build run
lfs setstripe -c 84 run

get an interactive session:
qsub -I -q alphatst -l select=1140:ncpus=28:model=bro_ele,walltime=24:00:00 -m abe -M menemenlis@jpl.nasa.gov
# qsub -I -q alphatst -l select=1148:ncpus=28:model=bro_ele,walltime=72:00:00 -m abe -M menemenlis@jpl.nasa.gov
# qsub -I -q alphatst -l select=1138:ncpus=28:model=bro_ele,walltime=24:00:00 -m abe -M menemenlis@jpl.nasa.gov

set up build environment:
tcsh
module purge
module load comp-intel/2015.0.090 mpi-sgi/mpt.2.15r13 netcdf/4.0
setenv MPI_UD_RECV_MSGS 131072
setenv MPI_UD_TIMEOUT 80
setenv MPI_QUERYABLE ON
setenv MPI_BUFS_PER_PROC 10000

build:
cd ~/llc_4320/regions/CalSWOT/run_template4/MITgcm/build
../tools/genmake2 -of ../../code-async/linux_amd64_ifort+mpi_ice_nas -mpi -mods '../../code ../../code-async'
make depend
make -j 64
cp mitgcmuv ../run/mitgcmuv_64x64x30000

set up run directory:
cd ~/llc_4320/regions/CalSWOT/run_template4/MITgcm/run
ln -sf /nobackup/dmenemen/forcing/ECMWF_operational/* .
ln -sf /nobackup/dmenemen/forcing/ncep_rgau/runoff1p2472-360x180x12.bin .
ln -sf ../../data/* .
cp ../../input/* .

set up run environment and launch:
tcsh
module purge
module load comp-intel/2015.0.090 mpi-sgi/mpt.2.15r13 netcdf/4.0
setenv MPI_UD_RECV_MSGS 131072
setenv MPI_UD_TIMEOUT 80
setenv MPI_QUERYABLE ON
setenv MPI_BUFS_PER_PROC 10000
setenv MPI_IB_RAILS 2
cd ~/llc_4320/regions/CalSWOT/run_template4/MITgcm/run
mpiexec -n 31920 ./mitgcmuv_64x64x30000 |& tee mpiexec.log

alternate run environment with mpt.2.12r23:
tcsh
module purge
module load comp-intel/2015.0.090 mpi-sgi/mpt.2.12r23 netcdf/4.0
setenv MPI_BUFS_PER_PROC 10000
cd ~/llc_4320/regions/CalSWOT/run_template4/MITgcm/run
mpiexec -n 31920 ./mitgcmuv_64x64x30000 |& tee mpiexec.log

==========
to monitor and debug the run:
qstat alphatst
qstat -n 928984 | less
ssh r1i0n1
ps -Af | grep -i mitg
gdb -p 19296

on head node, do:
pdsh -F $PBS_NODEFILE -a pgrep -l -u dmenemen

menemenlis [8:12 PM]: thanks
chenbro [8:12 PM]: note edit:
pdsh -F $PBS_NODEFILE -a pkill -u dmenemen
will clean up
[11:56] you need to login to the node you want, typically the first *compute* node, which is the *second* node in the hosts file, get the pid of the process you want, and attach to it with "gdb -p <pid>", where <pid> is replaced by the numeric pid of the process. Once that starts up and you get a gdb prompt, give the command "trace". I think "tr" or maybe even "t" is sufficient, but I'm not positive.
menemenlis [11:58 PM]: where does hosts file live? i used to know in a previous life
bron [11:59 PM]: use qstat to find out the jobid of your job, and give the command "qstat -n <jobid>", and it will print out the list of nodes assigned to your job

==============
to check nodes that are down:
pbsnodes -l bro_ele
rack numbers are in the 0-15 range; e.g., these five nodes belong to bro_ele and are down:
...
r7i6n12        offline          16-12-08 DRT errors on HCA port 2
r11i2n14       down             node down: communication closed - CRA - 2016-12-17
r15i0n0        offline          dedicated node for electra monitoring - ciotti
r15i1n1        down,offline     Jose, HET testing.
r15i7n17       offline          ciotti using for monitoring
...

=============================
to check that nodes are clean, on head node:
pdsh -F $PBS_NODEFILE -a pgrep -l -u dmenemen
to clean up:
pdsh -F $PBS_NODEFILE -a pkill -u dmenemen
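For reference, the mpiexec rank count follows directly from the qsub resource request: select=1140 Broadwell nodes times ncpus=28 cores per node gives 31920 MPI ranks. A minimal sanity check (the variable names are illustrative, not from the original notes):

```shell
# Hypothetical check: ranks implied by the qsub line should match mpiexec -n.
nodes=1140    # select=1140 in the qsub request
ncpus=28      # ncpus=28 per bro_ele (Broadwell) node
ranks=$((nodes * ncpus))
echo "expected MPI ranks: $ranks"   # prints 31920, matching mpiexec -n 31920
```

Worth rerunning whenever the select= count changes (e.g., the commented-out 1148- and 1138-node requests imply 32144 and 31864 ranks, respectively).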
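The pbsnodes listing above can be summarized with standard text tools. This sketch counts down/offline nodes; it feeds in the sample lines from the notes via a here-doc rather than calling pbsnodes (which is only available on the front ends):

```shell
# Count down/offline nodes from pbsnodes -l style output.
# Sample lines copied from the notes; in practice use: pbsnodes -l bro_ele | grep -c -E 'down|offline'
grep -c -E 'down|offline' <<'EOF'
r7i6n12        offline          16-12-08 DRT errors on HCA port 2
r11i2n14       down             node down: communication closed - CRA - 2016-12-17
r15i0n0        offline          dedicated node for electra monitoring - ciotti
r15i1n1        down,offline     Jose, HET testing.
r15i7n17       offline          ciotti using for monitoring
EOF
# prints 5
```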