@@ -17,21 +17,34 @@ import (
1717var qacctClient qacct.QAcct
1818var qstatClient qstat.QStat
1919
20+ var newlyFinishedJobs <- chan qacct.JobDetail
21+
2022var log * zap.Logger
2123
2224func init () {
2325 var err error
2426 log , _ = zap .NewProduction ()
27+
28+ qstatClient , err = qstat .NewCommandLineQstat (qstat.CommandLineQStatConfig {})
29+ if err != nil {
30+ log .Fatal ("Failed to initialize qstat client" , zap .String ("error" ,
31+ err .Error ()))
32+ }
33+
2534 qacctClient , err = qacct .NewCommandLineQAcct (qacct.CommandLineQAcctConfig {})
2635 if err != nil {
2736 log .Fatal ("Failed to initialize qacct client" , zap .String ("error" ,
2837 err .Error ()))
2938 }
30- qstatClient , err = qstat .NewCommandLineQstat (qstat.CommandLineQStatConfig {})
39+
40+ // watch for newly finished jobs
41+ newlyFinishedJobs , err = qacct .WatchFile (context .Background (),
42+ qacct .GetDefaultQacctFile (), 1024 )
3143 if err != nil {
32- log .Fatal ("Failed to initialize qstat client" , zap . String ( "error " ,
33- err .Error ()))
44+ log .Fatal ("Failed to initialize job watcher " ,
45+ zap . String ( "error" , err .Error ()))
3446 }
47+
3548}
3649
3750func main () {
@@ -48,7 +61,7 @@ func run(ctx context.Context) {
4861 log .Info ("Context cancelled, stopping ClusterScheduler" )
4962 return
5063 default :
51- finishedJobs , err := GetFinishedJobs ()
64+ finishedJobs , err := GetFinishedJobsWithWatcher ()
5265 if err != nil {
5366 log .Error ("Error getting finished jobs" , zap .String ("error" ,
5467 err .Error ()))
@@ -107,10 +120,47 @@ type SimpleJob struct {
107120 MasterNode string `json:"master_node"`
108121}
109122
123+ func GetFinishedJobsWithWatcher () ([]* SimpleJob , error ) {
124+ jobs := []* SimpleJob {}
125+
126+ for {
127+ // get next job or timeout after 0.1s of there is no new job
128+ select {
129+ case fjob := <- newlyFinishedJobs :
130+ state := fmt .Sprintf ("%d" , fjob .ExitStatus )
131+ if state == "0" {
132+ state = "done"
133+ } else {
134+ state = "failed"
135+ }
136+ simpleJob := SimpleJob {
137+ // ignore job arrays for now
138+ JobId : fmt .Sprintf ("%d" , fjob .JobNumber ),
139+ Cluster : fjob .QName ,
140+ JobName : fjob .JobName ,
141+ Partition : fjob .GrantedPE ,
142+ Account : fjob .Account ,
143+ User : fjob .Owner ,
144+ State : state ,
145+ ExitCode : fmt .Sprintf ("%d" , fjob .ExitStatus ),
146+ Submit : parseTimestampInt64 (fjob .SubmitTime ),
147+ Start : parseTimestampInt64 (fjob .StartTime ),
148+ End : parseTimestampInt64 (fjob .EndTime ),
149+ MasterNode : fjob .HostName ,
150+ }
151+ jobs = append (jobs , & simpleJob )
152+ case <- time .After (100 * time .Millisecond ):
153+ return jobs , nil
154+ }
155+ }
156+ return jobs , nil
157+ }
158+
110159func GetFinishedJobs () ([]* SimpleJob , error ) {
111160 // Use qacct NativeSpecification to get finished jobs
112161 qacctOutput , err := qacctClient .NativeSpecification ([]string {"-j" , "*" })
113162 if err != nil {
163+ // no jobs found or the qacct command failed
114164 return nil , fmt .Errorf ("error running qacct command: %v" , err )
115165 }
116166
@@ -137,9 +187,9 @@ func GetFinishedJobs() ([]*SimpleJob, error) {
137187 User : job .Owner ,
138188 State : state ,
139189 ExitCode : fmt .Sprintf ("%d" , job .ExitStatus ),
140- Submit : parseTimestamp (job .QSubTime ),
141- Start : parseTimestamp (job .StartTime ),
142- End : parseTimestamp (job .EndTime ),
190+ Submit : parseTimestampInt64 (job .SubmitTime ),
191+ Start : parseTimestampInt64 (job .StartTime ),
192+ End : parseTimestampInt64 (job .EndTime ),
143193 MasterNode : job .HostName ,
144194 }
145195 }
@@ -150,7 +200,8 @@ func GetRunningJobs() ([]*SimpleJob, error) {
150200
151201 qstatOverview , err := qstatClient .NativeSpecification ([]string {"-g" , "t" })
152202 if err != nil {
153- return nil , fmt .Errorf ("error running qstat command: %v" , err )
203+ // no jobs running
204+ return nil , nil
154205 }
155206 jobsByTask , err := qstat .ParseGroupByTask (qstatOverview )
156207 if err != nil {
@@ -193,7 +244,8 @@ func GetRunningJobs() ([]*SimpleJob, error) {
193244 // get running jobs
194245 qstatOutput , err := qstatClient .NativeSpecification ([]string {"-j" , "*" })
195246 if err != nil {
196- return nil , fmt .Errorf ("error running qstat command: %v" , err )
247+ // no jobs running; qstat -j * found 0 jobs (TODO)
248+ return nil , nil
197249 }
198250
199251 jobs , err := qstat .ParseSchedulerJobInfo (qstatOutput )
@@ -242,6 +294,14 @@ func SendJobs(ctx context.Context, jobs []*SimpleJob) (int, error) {
242294 return len (jobs ), nil
243295}
244296
297+ func parseTimestampInt64 (ts int64 ) * timestamppb.Timestamp {
298+ // ts is 6 digits behind the second (microseconds)
299+ sec := ts / 1e6
300+ nsec := (ts - sec * 1e6 ) * 1e3
301+ t := time .Unix (sec , nsec )
302+ return timestamppb .New (t )
303+ }
304+
245305// 2024-10-24 09:49:59.911136
246306func parseTimestamp (s string ) * timestamppb.Timestamp {
247307 loc , err := time .LoadLocation ("Local" )
0 commit comments