1 Star 0 Fork 0

lurenchina/oklin

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
pbsstat 6.85 KB
一键复制 编辑 原始数据 按行查看 历史
lurenchina 提交于 2024-07-23 11:01 . 第一次提交
#!/bin/sh
# Name: pbsstat
# Usage: pbsstat [-f]
# Torque resource manager utility script:
# Print a 1-line summary of jobs on each node.
# The printout may be customized as needed.
# Author: irosebird.com & ougan.net & linbingping
# Version: 2.0
# Date: 27 September 2007
# Command arguments
#
# parse_args: validate the command line and set the global "listflagged".
#   no arguments  -> listflagged=0 (list every node)
#   -f            -> listflagged=1 (list only nodes flagged by "*")
#   anything else -> print the usage text and exit with status 1
# The argument is quoted so that an empty or multi-word argument reaches the
# usage branch cleanly instead of triggering a "test" syntax error first.
parse_args() {
  if test $# -eq 0
  then
    listflagged=0
  elif test $# -eq 1 && test "$1" = "-f"
  then
    # Argument -f: Print only those nodes that are flagged
    echo "Listing only nodes that are flagged by *"
    listflagged=1
  else
    echo "Usage: $0 [-f]"
    exit 1
  fi
}
parse_args "$@"
# Locations of commands used.
# pbsnodes and qstat are looked up on PATH.
PBSNODES=pbsnodes
QSTAT=qstat
# awk is taken from /bin/awk when that file is executable; otherwise fall
# back to whichever awk is found on PATH so the script still runs on systems
# that install awk elsewhere.
AWK=/bin/awk
[ -x "$AWK" ] || AWK=awk
# Table heading. Meaning of each column:
#   node          hostname of the node
#   state         Torque state of the node
#   load          CPU load average
#   pmem          physical memory
#   ncpu          number of CPUs
#   mem           physical + virtual memory
#   resi          resident (used) memory
#   usrs          number of sessions / number of users
#   tasks         number of tasks started by Torque on the node
#   jobids/users  jobids and matching usernames of the Torque jobs on the node
printf '%s\n' \
  "+----------------------------------------------------------------------------------------------+" \
  "| node | state | load | pmem | ncpu | mem | resi | usrs | tasks | jobids/users |"
#
# Show the Torque node status and parse the results
#
# print_node_status: run "$PBSNODES -a" and print the table rows.
# Globals read:
#   PBSNODES, QSTAT, AWK  commands to run
#   listflagged           1 = print only flagged nodes, 0 = print every node
# Output (stdout): for every listed node, one separator line and one data row.
print_node_status() {
"$PBSNODES" -a | "$AWK" -v listflagged="$listflagged" -v QSTAT="$QSTAT" '
# Map a Torque state string to the 4-character code shown in the table.
# States that are not listed here map to the empty string.
function state_code(s) {
	if (s == "job-exclusive") return "excl"
	if (s == "job-exclusive,busy" || s == "busy") return "busy"
	if (s == "free") return "free"
	if (s == "offline" || s == "offline,job-exclusive" ||
	    s == "offline,job-exclusive,busy") return "offl"
	if (s == "down" || s == "down,offline" || s == "down,job-exclusive" ||
	    s == "down,offline,job-exclusive" || s == "down,offline,busy" ||
	    s == "down,offline,job-exclusive,busy") return "down"
	if (s == "UNKN") return "UNKN"
	return ""
}
# Wrap a state code in the ANSI colour sequence used for display.
# Only excl, free and down are coloured; every other code is returned as is.
function colorize(c) {
	if (c == "excl") return "\033[1;31;40mexcl\033[0m"
	if (c == "free") return "\033[1;32;40mfree\033[0m"
	if (c == "down") return "\033[1;5;37;41mdown\033[0m"
	return c
}
BEGIN {
	#
	# First get the list of jobids versus usernames from qstat
	#
	QSTAT = QSTAT " -r"			# Append -r flag (running jobs) to qstat.
	while ((QSTAT | getline) > 0) {		# Parse lines from qstat -r
		if (++line>5) {			# Skip first 5 header lines
			split($1,b,".")		# Jobid is b[1]
			username[b[1]] = $2	# Username of this jobid
		}
	}
	close(QSTAT)
}
# Parse the output of pbsnodes
#
NF==1 {	node=$1				# 1st line is nodename
	nodename[node] = node		# Node name
	getline				# Get the next input line
	numjobs[node] = 0		# Torque jobs on the node
	numtasks[node] = 0		# Number of tasks started by Torque on the node
	listnode=0			# Set to > 0 if this node gets flagged
	while (NF >= 3) {		# Read a number of non-blank lines
		if ($1 == "state") {
			# Keep the plain code for the comparisons below and a
			# separate (possibly coloured) string for display only.
			code[node] = state_code($3)
			state[node] = colorize(code[node])
		}
		else if ($1 == "np") np[node] = $3
		else if ($1 == "properties") properties[node] = $3
		else if ($1 == "ntype") ntype[node] = $3
		else if ($1 == "jobs") numtasks[node] = NF - 2
		else if ($1 == "status") {
			# Get the node status subfields
			split (substr($0,15), a, ",")	# Remove leading "status =", split subfields separated by ","
			for (field in a) {		# Process individual status subfields
				split(a[field],b,"=")	# Split var=value fields
				if (b[1]=="arch") arch[node]=b[2]
				else if (b[1]=="opsys") opsys[node]=b[2]
				else if (b[1]=="sessions") sessions[node]=b[2]
				else if (b[1]=="nsessions") nsessions[node]=int(b[2])
				else if (b[1]=="nusers") nusers[node]=b[2]
				else if (b[1]=="idletime") idletime[node]=b[2]
				else if (b[1]=="totmem") totmem[node]=b[2]
				else if (b[1]=="availmem") availmem[node]=b[2]
				else if (b[1]=="physmem") physmem[node]=b[2]
				else if (b[1]=="ncpus") ncpus[node]=b[2]
				else if (b[1]=="loadave") loadave[node]=b[2]
				else if (b[1]=="netload") netload[node]=b[2]
				else if (b[1]=="size") size[node]=b[2]
				else if (b[1]=="jobs") {
					# Get the list of jobids/users for this node
					if (b[2] == "? 0") b[2] = ""	# Fix for a bug in pbsnodes ?
					numjobs[node]=split(b[2],c)
					for (i=1; i <= numjobs[node]; i++) {
						split(c[i], d, ".")
						# Get jobid and username
						jobid = d[1]
						user = username[jobid]
						# Case where the node pbs_mom has a (dead job) jobid unknown to pbs_server:
						if (length(user) == 0) {	# Flag non-existent username
							user="NONE*"
							listnode++
						}
						# Append jobid and username to the job list
						jobidlist[node] = jobidlist[node] " " jobid "/" user
					}
				} else if (b[1]=="rectime") rectime[node]=b[2]
			}
		}
		# Get the next input line. Stop at end of input (or on a read
		# error): getline then leaves NF unchanged, so without this
		# check the loop would never terminate when the last node
		# stanza is not followed by a blank line.
		if ((getline) <= 0) break
	}
	# Print out values that we are interested in. Flag unexpected values with a "*".
	# Flag nodes with status busy, down, offline or unknown.
	# The comparison uses the plain state code, not the coloured display string.
	if (code[node] == "busy" || code[node] == "down" || code[node] == "offl" || code[node] == "UNKN") {
		stateflag="*"
		listnode++
	} else
		stateflag=" "
	# Flag unexpected CPU load average
	loaddiff = loadave[node] - numtasks[node]
	if (loaddiff > 0.5 || loaddiff < -0.5) {
		loadflag="*"
		listnode++
	} else
		loadflag=" "
	# Resident memory
	resi = (totmem[node]-availmem[node])/1024
	if (resi > 50 && resi > physmem[node]/1024 - 50) {	# High memory usage
		resiflag="*"
		listnode++
	} else
		resiflag=" "
	# Flag unexpected number of processes or users
	if (nsessions[node] > 2*ncpus[node] + 1) {	# More than 2 sessions per job
		sessflag="*"
		listnode++
	} else if (nusers[node] > ncpus[node]) {	# More users than nCPUs is bad
		sessflag="*"
		listnode++
	} else
		sessflag=" "
	# Flag unexpected number of jobs
	if (numjobs[node] > numtasks[node]) {	# Should be at least 1 task per job
		jobflag="*"
		listnode++
	} else
		jobflag=" "
	# CONFIGURE: Comment out the line below
	# Omit down nodes from the flagged list because we do not bother to see them
	# (Use "pbsnodes -l" to list down nodes).
	if (code[node] == "down") listnode=0
	# The braces keep the separator and the data row together, so that with
	# -f an unflagged node prints nothing at all.
	if (!listflagged || listnode > 0) {
		printf ("+-------------+-------+-------+-------+------+--------+--------+------+-------+----------------+\n")
		printf ("|%-13s| %-4s%1s | %5.2f%-1s|%6d | %3d | %6d | %4d%1s | %1d/%1d%1s | %3d%2s | %-15s|\n",
			node, state[node], stateflag, loadave[node], loadflag,
			physmem[node]/1024, ncpus[node], totmem[node]/1024, resi, resiflag,
			nsessions[node], nusers[node], sessflag, numtasks[node], jobflag, jobidlist[node])
	}
}'
}
print_node_status
echo "+----------------------------------------------------------------------------------------------+"
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/lurenchina/oklin.git
git@gitee.com:lurenchina/oklin.git
lurenchina
oklin
oklin
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385