package GMJobsInfo; use POSIX qw(ceil); use English; use LogUtils; use strict; # The returned hash looks something like this: our $j = { 'jobID' => { gmuser => '*', # from .local lrms => '*', queue => '', localid => '*', subject => '', starttime => '*', # MDS time format jobname => '*', gmlog => '*', cleanuptime => '*', # MDS time format delegexpiretime => '*', # MDS time format clientname => '', clientsoftware => '*', activityid => [ '*' ], sessiondir => '', diskspace => '', failedstate => '*', fullaccess => '*', lifetime => '*', # seconds jobreport => '*', interface => '*', # added for GLUE2, the interface the job was submitted. If missing, gridftp voms => [ '*' ], #ref array, contains all the voms attributes in the user certificate that A-REX authorization accepted vomsvo => '*', # string, the first VO in the voms array without the slashes # from .description description => '', # rsl or xml # from .grami -- not kept when the job is deleted stdin => '*', stdout => '*', stderr => '*', count => '*', reqwalltime => '*', # units: s reqcputime => '*', # units: s runtimeenvironments=> [ '*' ], # from .status status => '', statusmodified => '', # seconds since epoch statusread => '', # seconds since epoch localowner => '', # from .failed errors => [ '*' ], lrmsexitcode => '*', # from .diag exitcode => '*', nodenames => [ '*' ], UsedMem => '*', # units: kB; summed over all execution threads CpuTime => '*', # units: s; summed over all execution threads WallTime => '*', # units: s; real-world time elapsed LRMSStartTime => '*', # units: MDS time format LRMSEndTime => '*', # units: MDS time format completiontime => '*' # MDS time format } }; our $log = LogUtils->getLogger(__PACKAGE__); # # switch effective user if possible. This is reversible. # sub switchEffectiveUser { my ($user) = @_; return unless $UID == 0; my ($name, $pass, $uid, $gid); if ($user eq '.') { ($uid, $gid) = (0, 0); } else { ($name, $pass, $uid, $gid) = getpwnam($user); return unless defined $gid; } eval { $EGID = $gid; $EUID = $uid; }; } sub collect { my ($controls, $remotegmdirs, $nojobs) = @_; my $gmjobs = {}; while (my ($user, $control) = each %$controls) { switchEffectiveUser($user); my $controldir = $control->{controldir}; my $newjobs = get_gmjobs($controldir, $nojobs); $newjobs->{$_}{gmuser} = $user for keys %$newjobs; $gmjobs->{$_} = $newjobs->{$_} for keys %$newjobs; } # switch to root or to gridftpd user? switchEffectiveUser('.'); if ($remotegmdirs) { for my $pair (@$remotegmdirs) { my ($controldir, $sessiondir) = split ' ', $pair; my $newjobs = get_gmjobs($controldir, $nojobs); $gmjobs->{$_} = $newjobs->{$_} for keys %$newjobs; } } # switch back to root switchEffectiveUser('.'); return $gmjobs; } sub get_gmjobs { my ($controldir, $nojobs) = @_; my %gmjobs; my $jobstoscan = 0; my $jobsskipped = 0; # read the list of jobs from the jobdir and create the @gridmanager_jobs # the @gridmanager_jobs contains the IDs from the job.ID.status foreach my $controlsubdir ("$controldir/accepting", "$controldir/processing", "$controldir/finished") { unless (opendir JOBDIR, $controlsubdir ) { $log->warning("Can't access the job control directory: $controlsubdir") and return {}; } my @allfiles = grep /\.status/, readdir JOBDIR; closedir JOBDIR; my @gridmanager_jobs = map {$_=~m/job\.(.+)\.status/; $_=$1;} @allfiles; # count job IDs to scan $jobstoscan = $jobstoscan + @gridmanager_jobs; $log->verbose("Found ". scalar @gridmanager_jobs. " jobs in $controlsubdir"); foreach my $ID (@gridmanager_jobs) { my $job = $gmjobs{$ID} = {}; my $gmjob_local = $controldir."/job.".$ID.".local"; my $gmjob_status = $controlsubdir."/job.".$ID.".status"; my $gmjob_failed = $controldir."/job.".$ID.".failed"; my $gmjob_description = $controldir."/job.".$ID.".description"; my $gmjob_grami = $controldir."/job.".$ID.".grami"; my $gmjob_diag = $controldir."/job.".$ID.".diag"; unless ( open (GMJOB_LOCAL, "<$gmjob_local") ) { $log->debug( "Job $ID: Can't read jobfile $gmjob_local, skipping job" ); delete $gmjobs{$ID}; $jobsskipped++; next; } my @local_allines = ; $job->{activityid} = []; # parse the content of the job.ID.local into the %gmjobs hash foreach my $line (@local_allines) { if ($line=~m/^(\w+)=(.+)$/) { # TODO: multiple activityid support. # is this still used? if not, remove the code. # looking at trunk it doesn't seem to exist anymore. if ($1 eq "activityid") { push @{$job->{activityid}}, $2; } else { # a job can belong to a user that has multiple voms roles # for completeness all added to the datastructure # in an array if ($1 eq "voms") { push @{$job->{voms}}, $2; # vomsvo to hold the selected vo, I assume is the first in the list. # will be used to calculate vo statistics # must match advertised (i.e. slashes are removed) unless (defined $job->{vomsvo}) { my $vostring = $2; if ($vostring =~ /^\/+(\w+)/) { $vostring = $1; }; $job->{vomsvo} = $vostring; } } else { $job->{$1}=$2; } } } } close GMJOB_LOCAL; # Extrasct jobID uri if ($job->{globalid}) { $job->{globalid} =~ s/.*JobSessionDir>([^<]+)<.*/$1/; } else { $log->debug("Job $ID: 'globalid' missing from .local file"); } # Rename queue -> share if (exists $job->{queue}) { $job->{share} = $job->{queue}; delete $job->{queue}; } else { $log->debug("Job $ID: 'queue' missing from .local file"); } # check for interface field if (! $job->{interface}) { $log->debug("Job $ID: 'interface' missing from .local file, reverting to org.nordugrid.gridftpjob"); $job->{interface} = 'org.nordugrid.gridftpjob'; } # read the job.ID.status into "status" unless (open (GMJOB_STATUS, "<$gmjob_status")) { $log->debug("Job $ID: Can't open status file $gmjob_status, skipping job"); delete $gmjobs{$ID}; $jobsskipped++; next; } else { my @file_stat = stat GMJOB_STATUS; my ($first_line) = ; close GMJOB_STATUS; unless ($first_line) { $log->debug("Job $ID: Failed to read status from file $gmjob_status, skipping job"); delete $gmjobs{$ID}; $jobsskipped++; next; } chomp ($first_line); $job->{status} = $first_line; if (@file_stat) { # localowner my $uid = $file_stat[4]; my $user = (getpwuid($uid))[0]; if ($user) { $job->{localowner} = $user; } else { $log->debug("Job $ID: Cannot determine user name for owner (uid $uid)"); } $job->{"statusmodified"} = $file_stat[9]; $job->{"statusread"} = time(); } else { $log->debug("Job $ID: Cannot stat status file: $!"); } } # check for localid if (! $job->{localid}) { if ($job->{status} eq 'INLRMS') { $log->debug("Job $ID: has no local ID but is in INLRMS state, this should not happen"); } $job->{localid} = 'UNDEFINEDVALUE'; } # Comes the splitting of the terminal job state # check for job failure, (job.ID.failed ) "errors" if (-e $gmjob_failed) { unless (open (GMJOB_FAILED, "<$gmjob_failed")) { $log->debug("Job $ID: Can't open $gmjob_failed"); } else { my $chars; read GMJOB_FAILED, $chars, 1024; my @allines = split "\n", $chars; close GMJOB_FAILED; $job->{errors} = \@allines; } } if ($job->{"status"} eq "FINISHED") { #terminal job state mapping if ( $job->{errors} ) { if (grep /Job is canceled by external request/, @{$job->{errors}}) { $job->{status} = "KILLED"; } elsif ( defined $job->{errors} ) { $job->{status} = "FAILED"; } } } # if jobs are not printed, it's sufficient to have jobid, status, # subject, queue and share. Can skip the rest. next if $nojobs; # read the job.ID.grami file unless ($job->{status} eq 'DELETED') { unless ( open (GMJOB_GRAMI, "<$gmjob_grami") ) { # this file is is kept by A-REX during the hole existence of the # job. grid-manager from arc0, however, deletes it after the job # has finished. $log->debug("Job $ID: Can't open $gmjob_grami"); } else { my $sessiondir = $job->{sessiondir} || ''; while (my $line = ) { if ($line =~ m/^joboption_(\w+)='(.*)'$/) { my ($param, $value) = ($1, $2); $param =~ s/'\\''/'/g; # unescape quotes # These parameters are quoted by A-REX if ($param eq "stdin") { $job->{stdin} = $value; $job->{stdin} =~ s/^\Q$sessiondir\E\/*//; } elsif ($param eq "stdout") { $job->{stdout} = $value; $job->{stdout} =~ s/^\Q$sessiondir\E\/*//; } elsif ($param eq "stderr") { $job->{stderr} = $value; $job->{stderr} =~ s/^\Q$sessiondir\E\/*//; } elsif ($param =~ m/^runtime_/) { push @{$job->{runtimeenvironments}}, $value; } } elsif ($line =~ m/^joboption_(\w+)=(\w+)$/) { my ($param, $value) = ($1, $2); # These parameters are not quoted by A-REX if ($param eq "count") { $job->{count} = int($value); } elsif ($param eq "walltime") { $job->{reqwalltime} = int($value); } elsif ($param eq "cputime") { $job->{reqcputime} = int($value); } elsif ($param eq "starttime") { $job->{starttime} = $value; } } } close GMJOB_GRAMI; } } #read the job.ID.description file unless ($job->{status} eq 'DELETED') { unless ( open (GMJOB_DESCRIPTION, "<$gmjob_description") ) { $log->debug("Job $ID: Can't open $gmjob_description"); } else { while (my $line = ) { chomp $line; next unless $line; if ($line =~ m/^\s*[&+|(]/) { $job->{description} = 'rsl'; last } if ($line =~ m/http\:\/\/www.eu-emi.eu\/es\/2010\/12\/adl/) { $job->{description} = 'adl'; last } my $nextline = ; if ($nextline =~ m/http\:\/\/www.eu-emi.eu\/es\/2010\/12\/adl/) { $job->{description} = 'adl'; last } $log->debug("Job $ID: Can't identify job description language"); last; } close GMJOB_DESCRIPTION; } } #read the job.ID.diag file if (-s $gmjob_diag) { unless ( open (GMJOB_DIAG, "<$gmjob_diag") ) { $log->debug("Job $ID: Can't open $gmjob_diag"); } else { my %nodenames; my ($kerneltime, $usertime); while (my $line = ) { $line=~m/^nodename=(\S+)/ and $nodenames{$1} = 1; $line=~m/^WallTime=(\d+)(\.\d*)?/ and $job->{WallTime} = ceil($1); $line=~m/^exitcode=(\d+)/ and $job->{exitcode} = $1; $line=~m/^AverageTotalMemory=(\d+)kB/ and $job->{UsedMem} = ceil($1); $line=~m/^KernelTime=(\d+)(\.\d*)?/ and $kerneltime=$1; $line=~m/^UserTime=(\d+)(\.\d*)?/ and $usertime=$1; $line=~m/^LRMSStartTime=(\d*Z)/ and $job->{LRMSStartTime}=$1; $line=~m/^LRMSEndTime=(\d*Z)/ and $job->{LRMSEndTime}=$1; } # Use completion time from diag instead of status, more reliable if ( ( $job->{status} eq 'FINISHED' ) ) { my @file_stat = stat GMJOB_DIAG; if (@file_stat) { my ($s,$m,$h,$D,$M,$Y) = gmtime($file_stat[9]); my $ts = sprintf("%4d%02d%02d%02d%02d%02d%1s",$Y+1900,$M+1,$D,$h,$m,$s,"Z"); $job->{"completiontime"} = $ts; } else { $log->debug("Job $ID: Cannot stat diag file: $!"); } } close GMJOB_DIAG; $job->{nodenames} = [ sort keys %nodenames ] if %nodenames; $job->{CpuTime}= ceil($kerneltime + $usertime) if defined $kerneltime and defined $usertime; } } } # job ID loop } # controlsubdir loop $log->verbose("Number of jobs to scan: $jobstoscan ; Number of jobs skipped: $jobsskipped"); return \%gmjobs; } #### TEST ##### TEST ##### TEST ##### TEST ##### TEST ##### TEST ##### TEST #### sub test() { require Data::Dumper; LogUtils::level('VERBOSE'); my $controldirs = {'*' => { 'controldir' => '/tmp/jobstatus' } }; my $results = GMJobsInfo::collect(\%$controldirs); print Data::Dumper::Dumper($results); } #test; 1;