1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49 import time
50 import string
51 import threading
52 from random import choice
53 import popen2
54 import os
55 import sys
56 import types
57 import random as rd
58 import operator
59 import signal
60 import re
61
62
63
64
65
66
67
68 G_verbose = 2
69 G_exit = False
70
71
72
73
74
75
76
78 """Mail a file """
79 if MyConf().info.mail == '':
80 print 'Define address mail with setMail() function'
81 return
82 ret = os.path.isfile(File)
83 if ret:
84 os.system('mail -s "APCScheduler : %s" %s < %s'%(File, MyConf().info.mail, File))
85 else:
86 print 'File "'+File+'" doesn\'t exist'
87
88
def newJob(MyAppli, scheduler='LOCAL'):
    """Return a JobClass that runs MyAppli on the requested scheduler.

    MyAppli   -- application object, handed unchanged to JobClass.
    scheduler -- scheduler keyword: 'LOCAL' (default), 'GLITE', 'BQS' or
                 'GE'. The comparison is exact and case-sensitive.

    Returns the new JobClass instance, or None when the keyword is not
    recognised (the problem is then reported on stderr).
    """
    # Only the selected scheduler class is instantiated, so the other
    # back-ends never run their constructors.
    if scheduler == 'GLITE':
        MySched = SchedulerGLITE()
    elif scheduler == 'LOCAL':
        MySched = SchedulerLOCAL()
    elif scheduler == 'BQS':
        MySched = SchedulerBQS()
    elif scheduler == 'GE':
        MySched = SchedulerSGE()
    else:
        # Errors go to stderr like everywhere else in this file, and the
        # message names the offending keyword so the caller can fix it.
        sys.stderr.write("\nERROR: unknown scheduler '%s' !! "
                         "Use 'LOCAL', 'GLITE', 'BQS' or 'GE'.\n"
                         % (scheduler,))
        return None
    return JobClass(MyAppli, MySched)
103
104
106 """ Define mail address"""
107
108 MyConf().info.mail = mail
109
110
112 """ Define mail address"""
113 file_tmp = os.path.expanduser(fileconf)
114 ret = os.path.isfile(file_tmp)
115 if ret:
116 ConfigGrid().setFileConf(file_tmp)
117 else:
118 sys.stderr.write("\nERROR: file '%s' doesn't exist !\n"%fileconf)
119
120
122 """ Define repository, create it if necessary"""
123
124 global S_GridRepository
125 if NameRepository[0] != '/':
126 NameRepository = os.path.join(os.getcwd(),NameRepository)
127 if NameRepository[-1] != '/':
128 NameRepository += '/'
129
130 if not os.path.isdir(NameRepository):
131
132 os.makedirs(NameRepository)
133 MyConf().info.workDir = NameRepository
134
135
137 """Set verbose level [0-20]: 0 no message, 20 all messages"""
138 global G_verbose
139 if level >= 0 and level <= 20:
140 G_verbose = level
141 else:
142 print "Level must be between 0 and 20."
143
144
145 -def submitCmd(cmd, TpsMax=3600, verbose=True):
153
154
156 if os.environ.has_key('LD_LIBRARY_PATH'):
157 os.environ['LD_LIBRARY_PATH'] += ':'+path
158 else:
159 os.environ['LD_LIBRARY_PATH'] = path
160
161
163 return "APCscheduler june 2012, version: 0.9.3"
164
165
166
167
168
169
171
172 os.system('myproxy-destroy -d')
173 os.system('glite-voms-proxy-destroy')
174
175 if nb_hours <= 0:
176 sys.stderr.write("\nERROR: enter a positive number\n")
177 sys.exit(1)
178 return os.system('glite-voms-proxy-init -voms %s -hours %d'%(MyConf().gLite.vo, nb_hours))
179
180
def _cancelJob(signum, frame):
    """Signal handler: try to cancel every submitted job, then exit.

    signum -- number of the received signal; also used as the exit status.
    frame  -- interrupted stack frame, printed for diagnosis.

    Sets the module flag G_exit, asks the scheduler of each job registered
    in JobClass.dictJobSubmit to cancel it, and ends with sys.exit(signum).
    """
    global G_exit
    print('Signal handler called with signal %s' % (signum,))
    print(frame)
    G_exit = True
    # Iterate over a copy: cancelling may update the job registry.
    for job in list(JobClass.dictJobSubmit.values()):
        try:
            print("try cancel job  %s" % (job.name(),))
            job._Scheduler.cancel(job._Appli)
        except Exception:
            # Best effort: one failing cancel must not prevent the others.
            # SystemExit / KeyboardInterrupt are deliberately not caught so
            # a second interrupt can still abort the handler.
            sys.stderr.write("\nWARNING: job cancel failed: %s\n"
                             % (sys.exc_info()[1],))
    sys.exit(signum)
193
194
195 signal.signal(signal.SIGINT, _cancelJob)
196
197
def _AddAlea(MyStr, NbChar):
    """Return MyStr followed by '_' and NbChar random alphanumeric characters.

    MyStr  -- prefix string.
    NbChar -- number of random characters to append (0 gives MyStr + '_').

    The alphabet is restricted to ASCII letters and digits so the result is
    independent of the current locale (string.letters is not).
    """
    Alphabet = string.ascii_letters + string.digits
    Alea = ''.join([choice(Alphabet) for _ in range(NbChar)])
    return MyStr + '_' + Alea
205
206
208 InternList = []
209 for elt in MyList:
210 if elt not in InternList:
211 InternList.append(elt)
212 return(InternList)
213
214
def _GetWordsAfter(buffer, keyword):
    """Return the first whitespace-delimited word that follows keyword.

    buffer  -- text to search.
    keyword -- marker string; only its first occurrence is considered.

    The word may be on the same line as keyword or on a following line.
    Returns None when keyword is absent or when only whitespace follows it.
    """
    idx = buffer.find(keyword)
    if idx < 0:
        return None
    # split() without argument skips spaces and newlines alike, which covers
    # both "key: value" and "key:\nvalue" without assuming an empty line.
    words = buffer[idx + len(keyword):].split()
    if not words:
        return None
    return words[0]
231
232
234 val = MyExe.find('/')
235 if val >= 0:
236 if val == 0:
237 if os.access(MyExe, os.X_OK):
238 return MyExe
239 else:
240 return None
241 if MyExe[0:2] == './':
242
243 file = os.getcwd()+MyExe[1:]
244 if os.access(file, os.X_OK):
245 return file
246 return None
247 else:
248
249
250 file = os.getcwd()+'/'+MyExe
251 if os.access(file, os.X_OK):
252 return file
253 return None
254 else:
255 if os.access(MyExe, os.X_OK):
256 return MyExe
257 else:
258 pathlist = os.getenv('PATH').split(os.pathsep)
259 for path in pathlist:
260 file = os.path.join(path, MyExe)
261
262 if os.access(file, os.X_OK):
263 if path == '.':
264 file = os.getcwd()+'/'+ MyExe
265 return file
266 return None
267
268
def _ConvHHMMSSInSec(time):
    """Convert a colon-separated duration into a number of seconds.

    time -- string such as 'HH:MM:SS', 'MM:SS' or 'SS'; each field to the
            left weighs 60 times the field to its right.

    Returns the total as an integer. Raises ValueError when a field is not
    an integer literal.
    """
    # Horner accumulation; int() replaces the Python-2-only long() and still
    # handles arbitrarily large values.
    TotalInSec = 0
    for field in time.split(':'):
        TotalInSec = TotalInSec * 60 + int(field)
    return TotalInSec
277
278
280 myp = Process('ldd '+ exelib)
281 myp.wait()
282 output = myp.stdOut()
283 setlines =output.split('\n')
284 setlines.remove('')
285 for line in setlines:
286 if line.find('=>') != -1:
287 lib = line.split()[2]
288 if (lib=='not'):
289 print "path lib not find !! Update LD_LIBRARY_PATH var env "
290 return False
291 else:
292 lib = line.split()[0]
293 if (lib.find('/lib64') != 0) and (lib.find('/usr/lib64') != 0):
294 PasLa = True
295
296 for elt in listlib:
297 if elt == lib :
298 PasLa = False
299 break
300 if PasLa:
301 listlib.append(lib)
302 ret = _FindLib(lib, listlib)
303 if not ret:
304 return ret
305 return True
306
307
def _readFile(file):
    """Return the whole content of file, or None if it cannot be opened.

    file -- path of the file to read (text mode).
    """
    try:
        pf = open(file, 'r')
    except IOError:
        return None
    # try/finally guarantees the handle is released even if read() fails.
    try:
        return pf.read()
    finally:
        pf.close()
318
def _writeFile(file, buf):
    """Append buf to file; do nothing if the file cannot be opened.

    file -- path of the file, opened in append mode (created if missing).
    buf  -- string to write.

    Always returns None.
    """
    try:
        pf = open(file, 'a')
    except IOError:
        return None
    # try/finally guarantees the handle is released even if write() fails.
    try:
        pf.write(buf)
    finally:
        pf.close()
328
329
330
332 if file.find('/') >= 0:
333 return(file[0:file.rfind('/')])
334 else:
335 se=SEtools( MyConf().gLite.vo, MyConf().gLite.se)
336 return(se._simpleDelTag(file))
337
338
340 if file.find('/') >= 0:
341 return(file[0:file.rfind('/')])
342 else:
343 return(file)
344
346 if name[-1] == '/':
347 return name[:-1]
348 else:
349 return name
350
352 if file.find('/') >= 0:
353 return(file[file.rfind('/')+1:])
354 else:
355 return('')
356
357
358
359
360
361
362
363
365 """Generic class parser """
367 self._file = file
368 if not os.path.isfile(file):
369
370
371 return
372 pFile = open(file, 'r')
373 conf = pFile.read()
374 pFile.close()
375 tabline = conf.split('\n')
376 for line in tabline:
377 words = line.split()
378 if len(words) == 0:
379 continue
380 if words[0][0] == "#":
381 continue
382 if hasattr(self, words[0]):
383 if len(words) == 1:
384 sys.stderr.write("\nERROR: in file %s, no value for argument '%s' !!\n"%(file, words[0]))
385 exit(1)
386 myType = type(getattr(self, words[0]))
387 if myType == types.StringType:
388 setattr(self, words[0], words[1])
389 else:
390 try:
391 setattr(self, words[0], (myType)(words[1]))
392 except ValueError:
393 sys.stderr.write("\nERROR: parameter '%s' in file %s must be %s !!\n"%(words[0],file, myType))
394 exit(1)
395 else:
396 sys.stdout.write("\nWARNING: unknown parameter '%s' in file conf %s\n"%(words[0],file))
397
398
399
401 """Personnal Glite parameters"""
402 _defaultFile = os.path.expanduser('~/.apcgrid/apcgrid.conf')
404
405 self.file = ConfigGrid._defaultFile
406 self.vo= ''
407 self.lfchost = ''
408 self.se = ''
409 self.ce = ''
410 ConfigFile.__init__(self,self.file)
411 os.environ['LCG_CATALOG_TYPE']='lfc'
412 os.environ['LFC_HOST']=self.lfchost
413 if G_verbose>5:print "Fin INIT ConfigGrid"
414
419
420
422 """Personnal information"""
424 self.file = os.path.expanduser('~/.apcgrid/apcperso.conf')
425 self.mail = ''
426 self.workDir = os.getcwd()+'/'
427 ConfigFile.__init__(self,self.file )
428
429
431 """Personnal configuration for APCScheduler with design pattern Singleton"""
436
437 instance = None
438
443
446
448 return setattr(self.instance, attr, val)
449
450
451
452
453
455 - def __init__(self, nameObj, fileSystem):
456 self._name = nameObj
457 self._fs = fileSystem
458 self._perm = None
459 fileSystem.defType(self)
def size(self):
    """Return the size of the object; not implemented at this level.

    NOTE(review): presumably overridden by the concrete object classes
    defined next to this one -- confirm against the class hierarchy.
    """
    # A bare 'raise' with no active exception fails with an unrelated error
    # (or re-raises whatever the caller happens to be handling).
    raise NotImplementedError("size() is not implemented for this object type")
466
467
469 - def __init__(self, nameObj, fileSystem):
473 - def isFile(self): return True
474 - def isLink(self): return False
475 - def isDir(self): return False
476 - def isNone(self): return False
477
478
480 - def __init__(self, nameObj, fileSystem):
483 - def size(self): return 0
484 - def isFile(self): return False
485 - def isLink(self): return False
486 - def isDir(self): return False
487 - def isNone(self): return True
488
489
491 - def __init__(self, nameObj, fileSystem):
494 - def size(self): return 0
495 - def isFile(self): return False
496 - def isLink(self): return True
497 - def isDir(self): return False
498 - def isNone(self): return False
499
500
502 - def __init__(self, nameObj, fileSystem):
506 - def isFile(self): return False
507 - def isLink(self): return False
508 - def isDir(self): return True
509 - def isNone(self): return False
510
511
513 - def _init(self, obj): pass
514 - def getName(self): return 'NOT DEFINE'
def isDir(self, obj):
    """Tell whether obj is a directory; not implemented at this level.

    NOTE(review): presumably overridden by the concrete file-system classes
    defined next to this one -- confirm against the class hierarchy.
    """
    # A bare 'raise' with no active exception fails with an unrelated error
    # (or re-raises whatever the caller happens to be handling).
    raise NotImplementedError("isDir() is not implemented for this file system")
525
526
532 - def isDir(self,obj): return (obj._perm[0]=='d')
533 - def isFile(self,obj): return (obj._perm[0]=='-')
534 - def isLink(self,obj): return (obj._perm[0]=='l')
535 - def sizeFile(self,obj): return obj._size
537 myp = Process("lfc-ls -l /grid/%s/%s | grep ^[^d] | awk '{s=s+$5} END {print s}'"%(self._vo, obj._name))
538 myp.wait(30)
539 if myp.getExitValue() != 0:
540 sys.stderr.write(myp.stdErr())
541 return -1
542 elif len(myp.stdOut()) == 0:
543 return 0
544 else:
545 if G_verbose>=15:
546 print "stdout='"+myp.stdOut()+"'"
547 try:
548 mys = int(myp.stdOut())
549 except:
550 mys = 0
551 return mys
552
559
560
562
563 path = obj._name
564 if path=='':
565 obj._perm = 'd'
566 obj._size = 0
567 return True
568 if path.find('/') >= 0:
569 path_tmp = path[0:path.rfind('/')]
570 name_tmp = path[path.rfind('/')+1:]
571 else:
572 path_tmp = ''
573 name_tmp = path
574 if G_verbose>=10:
575 print 'path_tmp='+path_tmp
576 print 'name_tmp='+name_tmp
577 mycmd = "lfc-ls -l /grid/%s/%s | grep %s"%(self._vo, path_tmp, name_tmp )
578 myp = Process(mycmd)
579 myp.wait(30)
580 if myp.getExitValue() != 0:
581 sys.stderr.write(myp.stdErr())
582 list = myp.stdOut().split('\n')
583 for dir in list:
584 if dir == "" : continue
585 mot = dir.split()
586 if G_verbose>=10:
587 print dir
588 print mot
589 if mot[8] == name_tmp:
590 if G_verbose>=10: print "find objet, permission is ", mot[0]
591 obj._perm = mot[0]
592 obj._size = int(mot[4])
593
594
596 - def _init(self, obj): pass
598 - def isDir(self,obj): return os.path.isdir(obj._name)
599 - def isFile(self,obj): return os.path.isfile(obj._name)
600 - def isLink(self,obj): return os.path.islink(obj._name)
601 - def sizeFile(self,obj):return os.path.getsize(obj._name)
603 pp = Process('ls -l %s'%obj._name)
604 pp.wait(60)
605 size = 0
606 for line in pp.stdOut().split('\n'):
607 if line == '': continue
608 if line[0:5] == "total": continue
609 if (line[0] == '-') : size += int(line.split()[4])
610 return size
611
612
1100
1101
1137
1138
1140 """Thread to supervise a process"""
1142 threading.Thread.__init__(self)
1143 self.Finish = False
1144 self.Process = ObjProcess
1145
1151
1152
1154
1155 """Safe process method wait() no """
def __init__(self, cmd, stdout='', stderr=''):
    """Start cmd with its standard output and error redirected to files.

    cmd    -- shell command line to run.
    stdout -- file receiving standard output; '' (default) selects a
              randomly named file in the current directory and sets
              self._rmStdOut.
    stderr -- same for standard error (self._rmStdErr).
    """
    self._cmd = cmd

    # Common prefix of the automatically named output files.
    prefix = os.getcwd() + "/" + _AddAlea("APCprocess", 7)

    self._rmStdOut = (stdout == '')
    if self._rmStdOut:
        self._nameStdOut = prefix + ".out"
    else:
        self._nameStdOut = stdout

    self._rmStdErr = (stderr == '')
    if self._rmStdErr:
        self._nameStdErr = prefix + ".err"
    else:
        self._nameStdErr = stderr

    # The redirections are appended to the command line itself.
    redirection = " 1>%s 2>%s" % (self._nameStdOut, self._nameStdErr)
    popen2.Popen3.__init__(self, cmd + redirection, True)

    # Bookkeeping, initialised once the child has been launched.
    self._Status = 'SubmitRunning'
    self._ret = None
    self._exitValue = None
    self._readErrFlag = True
    self._readOutFlag = True
1185
1186
1187
1188
1189
1190
1192 """Call when process finish"""
1193 if self._exitValue != None: return
1194 self._exitValue = self.poll()>>8
1195 if self._exitValue > 127:
1196 self._exitValue -= 256
1197 if G_verbose>=15:print "exit value "+str( self._exitValue)
1198 if self._exitValue == 0:
1199 self._Status = "FinishOK"
1200 else:
1201 self._Status = "FinishNOK"
1202 self._retrieveOut()
1203 self._retrieveErr()
1204 self.fromchild.close()
1205 self.tochild.close()
1206 self.childerr.close()
1207
1208
1210 """update status, no return"""
1211 if self._Status.find('Finish') >= 0:
1212
1213 return
1214 cmd = 'ps -p %d -o state'%self.pid
1215 (o,i,e) = popen2.popen3(cmd)
1216 output = o.read()
1217 o.close()
1218 i.close()
1219 e.close()
1220 if G_verbose>=15 : print cmd+'\n'+output
1221 rep = output.split('\n')
1222 rep.remove('')
1223 if len(rep) >= 2:
1224 if (rep[1] in ['Z','X']):
1225
1226 self._ExaminePoll()
1227 else:
1228
1229 pass
1230 else:
1231
1232 self._ExaminePoll()
1233
1235 self._updateStatus()
1236 return (self._Status == 'SubmitRunning')
1237
1239 self._updateStatus()
1240 return (self._Status.find('Finish') >= 0)
1241
1243 self._updateStatus()
1244 return (self._Status == 'FinishOK')
1245
1247 return popen2.Popen3.wait(self)
1248
def wait(self, TimeOut=-1):
    """Wait for the end of the process, optionally with a time limit.

    TimeOut -- -1 (default) blocks until the child exits. Any other value
               is the number of seconds given to the SuperviseProcess
               thread; if its Finish flag is still false afterwards the
               child is killed with SIGKILL and the status becomes
               "FinishKill".

    Returns None; the outcome is available through the status accessors.
    """
    if TimeOut == -1:
        try:
            self._ret = self._wait()
        except OSError:
            # The child may already have been reaped. Only OSError is
            # ignored, so a SystemExit raised by the SIGINT handler during
            # the blocking wait is no longer swallowed.
            pass
        self._ExaminePoll()
    else:
        t = SuperviseProcess(self)
        t.start()
        t.join(TimeOut)
        if not t.Finish:
            try:
                # Direct system call instead of spawning a 'kill -9' shell.
                os.kill(self.pid, signal.SIGKILL)
            except OSError:
                # The process ended between the join and the kill.
                pass
            self._Status = "FinishKill"
            self._ret = None
            if G_verbose >= 15:
                print("Time out, kill process")
        else:
            self._ExaminePoll()
    # Safe to repeat: the _readOutFlag/_readErrFlag guards make each
    # retrieval happen once.
    self._retrieveOut()
    self._retrieveErr()
1272
1273
1275
1276 try:
1277 fo = open(nameFile,'r')
1278 except IOError:
1279 return ''
1280 out = fo.read()
1281 fo.close()
1282 return out
1283
1285 if self._rmStdOut:
1286
1287 if self._readOutFlag:
1288 self._readOutFlag = False
1289 self._out = self._readFile(self._nameStdOut)
1290 os.system("rm -rf "+self._nameStdOut)
1291
1293 if self._rmStdErr:
1294
1295 if self._readErrFlag:
1296 self._readErrFlag = False
1297 self._err = self._readFile(self._nameStdErr)
1298 os.system("rm -rf "+self._nameStdErr)
1299
1301 if self.isFinish():
1302 if self._rmStdOut:
1303 return self._out
1304 else:
1305 return self._readFile(self._nameStdOut)
1306 else:
1307 return None
1308
1310 if self.isFinish():
1311 if self._rmStdErr:
1312 return self._err
1313 else:
1314 return self._readFile(self._nameStdErr)
1315 else:
1316 return None
1317
1319 return self._exitValue
1320
1324
1325
1327 """ si le nombre de process running est superieur a la limite :
1328 - si parametre Wait est False alors le process echoue
1329 - sinon attente qu'un process du groupe si termine pour le lancer
1330 """
1331 MaxProcess = 10
1332 listProcess = []
1333 - def __init__(self, cmd, stdout='' , stderr='', Wait=True):
1351
1361
1362
1364 """
1365 Reparti et regule les process submit Glite sur les WMS disponibles de la VO
1366 """
1367
1368 SlistProcess = []
1369 Squeue = {}
1370
1371
1373 listWMS=[]
1374 WMSpossible = False
1375 for key, value in _ProcessSubWMS.Squeue.items():
1376 if not set([key]).issubset(setIdx) and value[0] > 0:
1377
1378 WMSpossible = True
1379 if value[0] > value[1]:
1380
1381 listWMS.append((value[1], key))
1382 if listWMS==[]:
1383 return [WMSpossible, None]
1384 else:
1385 listWMS.sort(key=operator.itemgetter(0))
1386
1387 return [True, listWMS[0][1]]
1388 _checkSlotFree = staticmethod(_checkSlotFree)
1389
1390
1391 - def __init__(self, cmd, queueWMS, stdout='', stderr=''):
1401
1402
1411
1412
1413
1414
1415
1416
1417
1418
1421 self._TypeSched =''
1422 self._delegateProxy = "No"
1423
1425 """cancel Application"""
1426 pass
1427
1429 """submit Appli"""
1430 pass
1431
1433 self._delegateProxy = "ToDo"
1434
1436 """Update status attribut of Appli object"""
1437
1438 if (Appli._Status.find('SubmitScheduled')==0):
1439
1440 TimeToSubmit = time.time() - Appli._TimeStart
1441 Grace = Appli._TimeOutToStart -TimeToSubmit
1442 if G_verbose>5:print '\nTimeToSubmit=',TimeToSubmit,'Grace= ',Grace
1443 if Grace < 0:
1444 print 'Time Out !!!!!!'
1445 self.cancel(Appli)
1446 if not self._ReSubmit(Appli):
1447 Appli._Status = 'FinishTimeOut'
1448
1449 if Appli._Status.find('Finish') >= 0:
1450 try:
1451 del JobClass.dictJobSubmit[Appli._APCSchedID]
1452 except:
1453 pass
1454 if (Appli._Status.find('FinishOK') >= 0 or Appli._Status.find('FinishNOK') >= 0):
1455 self.retrieveOutput(Appli)
1456 if Appli._Status.find('FinishOK') >= 0 :
1457 self._cleaner(Appli)
1458 Appli._Cleaner()
1459
1461 """Retrieve Appli outputs in work directory"""
1462 pass
1463
def wait(self, Appli):
    """Block until Appli leaves every 'Submit...' state.

    Appli -- application whose _Status is polled through self.status(),
             sleeping Appli._Timer seconds between two polls.

    Returns immediately when the status does not start with 'Submit'.
    """
    if not Appli._Status.startswith('Submit'):
        return
    while True:
        self.status(Appli)
        if not Appli._Status.startswith('Submit'):
            break
        time.sleep(Appli._Timer)
1472
1473
1474
1476 """reSubmit Appli"""
1477 Appli._FlagResubmit = True
1478
1480 """scheduler cleaner after end of job """
1481 pass
1482
1485
1487 if Appli._keepStdFile:
1488 return _readFile(Appli.getPathFile('STDOUT'))
1489 else:
1490 return Appli._stdOut
1491
1493 if Appli._keepStdFile:
1494 return _readFile(Appli.getPathFile('STDERR'))
1495 else:
1496 return Appli._stdErr
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1508
1509 DoProxyTest = True
1510 S_DelegateProxy = False
1512 SchedulerAbstract.__init__(self)
1513 self._FileConf = ''
1514 self._FileJDL = ''
1515 self._pFile = ''
1516 self._Req = 'Requirements = other.GlueCEStateStatus == "Production" && ( ! ( RegExp(".*node16.*",other.CEId) ) );'
1517 self._LoadBalCE = None
1518 self._defCE = None
1519
1522
1525
1528
1531
def cancel(self, Appli):
    """Cancel Appli by running the scheduler's cancel command.

    Appli -- application to cancel; its _Cleaner() is called as soon as
             the command has been launched.

    The command gets 30 seconds. Its standard output is echoed on success;
    on failure the command, its status and its standard error are written
    to stderr.
    """
    cmd = self._CancelCmd(Appli)
    if G_verbose >= 10:
        print("cancel job: " + cmd)
    pp = Process(cmd)
    Appli._Cleaner()
    pp.wait(30)
    if not pp.isOk():
        # Both values must be passed as one tuple: formatting with a single
        # argument raised TypeError before anything was reported.
        sys.stderr.write("\nERROR:\n commend '%s' is %s\n"
                         % (pp._cmd, pp.getStatus()))
        sys.stderr.write(pp.stdErr())
    else:
        sys.stdout.write(pp.stdOut())
1543
1546
1549
1551 self._pFile = open(self._FileConf, 'w+')
1552 self._FillConf(wms)
1553 self._pFile.close()
1554
1556
1557 buffer = Appli._FillFileJDL()
1558
1559
1560 buffer += 'StdOutput = "%s";\n'%Appli.getNameStdOut()
1561 buffer += 'StdError = "%s";\n'%Appli.getNameStdErr()
1562
1563 outtemp = 'OutputSandbox = {"%s","%s"'%(Appli.getNameStdOut(),Appli.getNameStdErr())
1564 for elt in Appli._ListOutput:
1565
1566
1567 if elt.find("lfn:/grid") == -1:
1568
1569 outtemp += ',"%s"'%elt
1570 outtemp += '};\n'
1571 buffer += outtemp
1572
1573
1574 self._pFile = open(self._FileJDL, 'w+')
1575
1576
1577 print >> self._pFile,'%s'%buffer
1578 Appli._ce = self._FillJDL(Appli)
1579 self._pFile.close()
1580
1582 """
1583 """
1584 cmd = 'glite-voms-proxy-info'
1585 myp = Process(cmd)
1586 myp.wait(5)
1587 output = myp.stdOut()
1588 err = myp.stdErr()
1589 if G_verbose>5:print cmd
1590 if G_verbose>9: print output
1591 if G_verbose>9: print err
1592 minHourProxyTimeOut = 48
1593 initProxy =False
1594 if myp.isOk():
1595 timeleft_hhmmss = _GetWordsAfter(output,'timeleft :')
1596 timeleft_ss = _ConvHHMMSSInSec(timeleft_hhmmss)
1597 if G_verbose>=5:
1598 print 'proxy timeleft: ',timeleft_hhmmss,timeleft_ss
1599 if int(timeleft_ss) < 60*60*minHourProxyTimeOut:
1600 sys.stdout.write("\nProxy valid but time life proxy too short < %d hours."%minHourProxyTimeOut)
1601 initProxy = True
1602 else:
1603 if G_verbose>=2: print 'Proxy OK'
1604 SchedulerGrid.DoProxyTest = False
1605 else:
1606 initProxy = True
1607
1608 if initProxy:
1609 nb_hours= 3*24
1610 try:
1611 hours=input("\nProxy initialisation, enter duration in hours and I add %d hours or [Enter] to pass: "%minHourProxyTimeOut)
1612 try:
1613 nb_hours = max(minHourProxyTimeOut+hours, minHourProxyTimeOut)
1614 except:
1615 sys.stderr.write("\n%s is not a number, fix duration for 3 days\n")
1616 nb_hours= 3*24
1617 except:
1618
1619
1620
1621 sys.stderr.write("\nCan't initialisation proxy no interactive mode !\n")
1622 SchedulerGrid.DoProxyTest = False
1623 return
1624
1625
1626
1627 _initProxy(nb_hours)
1628
1629
1632
1634 ret = os.system('myproxy-init -d -n')
1635 time.sleep(2)
1636 if G_verbose >=10:
1637 print "status myproxy-init is ", ret
1638 return ret
1639
1641 if self._delegateProxy == "ToDo":
1642 if not SchedulerGrid.S_DelegateProxy:
1643 self._initDelegateProxy()
1644 self._delegateProxy = "Yes"
1645
1646 prefixe = Appli._PathLocWD + Appli._APCSchedID
1647 self._FileJDL = prefixe+'.jdl'
1648 self._FileConf= prefixe+'.conf'
1649 self.doFileJDL(Appli)
1650 Appli._wmsIdx = self._GetWMS(Appli._wmsUsed)
1651 if Appli._wmsIdx == None:
1652
1653 Appli._Status = 'FinishSubmitNOK'
1654 if len(Appli._wmsUsed)==0:
1655 sys.stderr.write("\nERROR:\n _GetWMS any wms available\n")
1656 del JobClass.dictJobSubmit[Appli._APCSchedID]
1657 return
1658 Appli._wmsUsed=Appli._wmsUsed.union(set([Appli._wmsIdx]))
1659 self.doFileConf(Appli._wmsIdx)
1660 cmd = self.submitCmd(Appli)
1661 Appli._ProcessSubmit = _ProcessSubWMS(cmd, Appli._wmsIdx)
1662 Appli._Status = "SubmitQuery"
1663 if G_verbose >=2:
1664 mes='\nTry submit ' +Appli._APCSchedID+ ' on wms '+self._shortWMSname(Appli)
1665 sys.stdout.write(mes)
1666
1667
1669 output = Appli._ProcessSubmit.stdOut()
1670 error_sub = Appli._ProcessSubmit.stdErr()
1671 if G_verbose>=15:
1672 print 'submit output:'+output+'\nsubmit erreur:'+error_sub
1673
1674
1675 if output.find('successfully submitted') >= 0:
1676 Appli._Status = 'SubmitScheduled'
1677 Appli._TimeStart = time.time()
1678 if G_verbose>=1:
1679 mes='\nSubmit ' +Appli._APCSchedID+ ' on wms '+self._shortWMSname(Appli)+' is ok.'
1680 mes += "Used CE "+Appli._ce
1681 sys.stdout.write(mes)
1682
1683 pID = open(Appli._FileID, 'r')
1684 a = pID.read()
1685 Appli._IDgrid = a.split('\n')[1]
1686 pID.close()
1687 if G_verbose>=15:print 'ID grid: '+Appli._IDgrid
1688 else:
1689 if G_verbose>=10:
1690 print 'submit NOK\nsubmit output:'+output+'\nsubmit erreur:'+error_sub
1691 if G_verbose>=1:
1692 print '\nSubmit ' +Appli._APCSchedID+ ' is NOK on wms', self._shortWMSname(Appli)
1693 _writeFile(Appli.getPathFile('SUBERR'), error_sub)
1694 self._ReSubmit(Appli)
1695
1696
1698 if Appli._Status.find('Submit') != 0:
1699 if G_verbose>10: print "status can't change !!!"
1700 return
1701
1702 if Appli._Status == "SubmitQuery":
1703 if Appli._ProcessSubmit.isFinish():
1704 self._AfterSubmit(Appli)
1705 if Appli._Status == "SubmitQuery":
1706 return
1707 Appli._ProcessSubmit = None
1708
1709
1710 if Appli.isFinish(): return
1711 else:
1712 return
1713
1714 if Appli._Status == "SubmitRetrieveSE":
1715 if not Appli._threadCopy.isAlive():
1716 if Appli._StatusAppli==0:
1717 Appli._Status = "FinishOK"
1718 else:
1719 Appli._Status = "FinishNOK"
1720 if G_verbose>=1:
1721 mes='\n'+Appli._APCSchedID +' retrieve finish.'
1722 sys.stdout.write(mes)
1723 return
1724
1725
1726 if os.path.isfile(Appli._FileID):
1727 cmd = self.statusCmd(Appli._FileID)
1728 myp = Process(cmd)
1729 myp.wait()
1730 ouput = myp.stdOut()
1731
1732 if G_verbose>9: print ouput
1733 status = _GetWordsAfter(ouput,'Current Status:')
1734 if status == None:
1735 Appli._Status = 'FinishNOKLostStatus'
1736 else:
1737 if G_verbose>=1:
1738 mes='\n'+Appli._APCSchedID +' status is ' + str(status)
1739 if G_verbose>=2:
1740 mes+='.\t[WMS: %s\tCE: %s]'%(self._shortWMSname(Appli).split('.')[0], Appli._ce)
1741 sys.stdout.write(mes)
1742 if (status.find('Running')>= 0):
1743 Appli._Status = 'SubmitRunning'
1744 elif (status.find('Done')>= 0):
1745 exitcode = _GetWordsAfter(ouput,'Exit code:')
1746 if exitcode != None:
1747 try:
1748 Appli._StatusAppli = int(exitcode)
1749 if Appli._StatusAppli == 0:
1750 Appli._Status = 'FinishOK'
1751 else:
1752 Appli._Status = 'FinishNOK'
1753 except:
1754 Appli._Status = 'FinishOK'
1755 else:
1756 Appli._Status = 'FinishOK'
1757 elif status.find('Cleared')>= 0:
1758 Appli._Status = 'FinishUnknow'
1759 elif status.find('Cancelled')>= 0:
1760 Appli._Status = 'FinishCancelled'
1761 elif status.find('Aborted')>= 0:
1762 if not self._ReSubmit(Appli):
1763 Appli._Status = 'FinishAborted'
1764 _writeFile(Appli.getPathFile('STDERR'), ouput)
1765 else:
1766 print "FileID doesn't exist", Appli._FileID
1767
1768 Appli._Status = 'FinishLostIDgrid'
1769
1770
1771 SchedulerAbstract.status(self,Appli)
1772 if Appli._threadCopy != None:
1773 Appli._Status = "SubmitRetrieveSE"
1774
1775
1777 cmd, outpath = self.retrieveCmd(Appli._FileID, Appli)
1778 if G_verbose>9:
1779 print cmd, outpath
1780 myp = Process(cmd)
1781 myp.wait(360)
1782 if G_verbose>9:
1783 print myp.stdOut()
1784 print myp.stdErr()
1785
1786 retrieveStatus = False
1787 if os.path.isdir(outpath) and myp.isOk():
1788 outpath_glite = _GetWordsAfter(myp.stdOut(),'stored in the directory:')
1789 if outpath_glite != None:
1790 retrieveStatus = True
1791
1792 os.system('mv %s/%s %s'%(outpath_glite, Appli.getNameStdOut(), Appli._PathLocWD))
1793
1794
1795 os.system('mv %s/%s %s'%(outpath_glite, Appli.getNameStdErr(), Appli._PathLocWD))
1796
1797
1798 for i in range(len(Appli._ListOutput)):
1799 RetFile = "%s/%s"%(outpath_glite, Appli._ListOutput[i])
1800 if os.path.isfile(RetFile):
1801 cmd = 'mv %s %s'%(RetFile, Appli.getPathFile("OUT",i))
1802 os.system(cmd)
1803 else:
1804 print "Can't retrieve %s , doesn't exist!!"%RetFile
1805
1806
1807 os.system('rm -rf '+ outpath)
1808
1809
1810 for i in range(len(Appli._ListOutputSE)):
1811 cmd = 'lcg-cp --vo %s %s file:%s'%(MyConf().gLite.vo, Appli._ListOutputSE[i], Appli.getPathFile("OUT_SE",i))
1812 if G_verbose > 5: print cmd
1813 os.system(cmd)
1814
1815 if not retrieveStatus:
1816 print "============retrieveOutput : pb"
1817 print myp.stdOut()
1818 print myp.stdErr()
1819 print "\n%s\n%s\n%s "%(outpath, cmd, myp._Status)
1820 print "==============================="
1821 Appli._Status = 'FinishNOKretrievePB'
1822 return
1823
1824 Appli._RetrieveFromSE()
1825
1827 SchedulerAbstract._ReSubmit(self, Appli)
1828 if G_verbose>8:print "Try _ReSubmit"
1829 cmd = 'rm -rf %s'%Appli._FileID
1830 os.system(cmd)
1831 Appli._Status='NotSubmit'
1832 self.submit(Appli)
1833 return Appli._Status != 'FinishSubmitNOK'
1834
def setCE(self, CEName):
    """Force jobs onto the computing element whose id starts with CEName.

    CEName -- beginning of the CE identifier, looked up in the output of
              'lcg-infosites --vo <vo> ce | grep <CEName>'.

    On success self._Req becomes a GlueCEUniqueID requirement and
    self._defCE the full identifier. If a CE is already defined a warning
    is written and nothing changes. Any lookup failure ends the program
    with sys.exit(1).
    """
    if self._defCE is not None:
        sys.stderr.write("\nWARNING: CE already definied, conflict between load balancing and setCE ?")
        return
    cmd = "lcg-infosites --vo %s ce | grep %s" % (MyConf().gLite.vo, CEName)
    myp = Process(cmd)
    myp.wait(100)
    if not myp.isOk():
        print("\nout:\n" + myp.stdOut())
        print("\nerr:\n" + myp.stdErr())
        sys.stderr.write("ERROR with command " + cmd)
        sys.stderr.write(myp.stdErr())
        sys.exit(1)
    out = myp.stdOut()
    if out == "":
        sys.stderr.write("\nERROR: unknown CE for your VO\nMy check procedure:\n %s " % cmd)
        # sys.exit, not the interactive-only builtin exit().
        sys.exit(1)
    # grep may return several lines (CEName can appear in another column or
    # for several queues); accept the first one whose CE column matches
    # instead of giving up when the first line is not the right one.
    sawTable = False
    for line in out.split('\n'):
        splitout = line.split()
        if len(splitout) < 6:
            continue
        sawTable = True
        if splitout[5].startswith(CEName):
            self._Req = 'Requirements = other.GlueCEUniqueID == "%s";' % splitout[5]
            self._defCE = splitout[5]
            return
    if sawTable:
        sys.stderr.write("\nERROR: I don't exactly find your CE '%s' in \n%s\nresult:\n%s " % (CEName, cmd, out))
    else:
        sys.stderr.write("\nERROR: invalid format lcg-infosites --vo xx ce, wait for 6 columns:\n%s" % out)
    sys.exit(1)
1864
1865
1868
1869
1871 """scheduler cleaner after end of job """
1872 if Appli.isOk():
1873 file= os.path.join(Appli._PathLocWD,Appli._APCSchedID)
1874 os.system("rm -rf "+file+".jdl")
1875 os.system("rm -rf "+file+".conf")
1876
1878
1879 if listJob._name =="":
1880 filetemp = listJob._ListJob[0].addFullPath(_AddAlea("",6))
1881 else:
1882 filetemp = listJob._ListJob[0].addFullPath(_AddAlea(listJob._name.replace(' ','_') ,6))
1883 filetemp = filetemp+'.ids'
1884 listJob.concatId(filetemp)
1885
1886
1887 cmd = self._resultCmd(filetemp)
1888 lce=Process(cmd)
1889 lce.wait(60)
1890 if not lce.isOk():
1891 sys.stdout.write(lce.stdOut())
1892 sys.stderr.write("ERROR: "+lce.stdErr())
1893 buffer = "\nCompute Element used:\n---------------------\n"
1894 return buffer+lce.stdOut()
1895
1896
1897
1898
1899
1901 ListWMS = []
1902
1904 SchedulerGrid.__init__(self)
1905 if not os.path.isfile(MyConf().gLite.file):
1906 sys.stderr.write("ERROR: file glite '%s' configuration doesn't exist.\nDo apcgrid-init to define it !!\n"%MyConf().gLite.file)
1907 sys.exit(1)
1908 self._VO= 'VirtualOrganisation = "%s";'%MyConf().gLite.vo
1909 self._MyProxy= 'MyProxyServer = "%s";'%os.getenv('MYPROXY_SERVER')
1910 self._Rank = 'Rank = ( other.GlueCEStateWaitingJobs == 0 ) ? ((other.GlueCEStateFreeCPUs==0)?-2222:other.GlueCEStateFreeCPUs) : - other.GlueCEStateWaitingJobs * 10 / (other.GlueCEStateRunningJobs + 1) * ( (other.GlueCEStateFreeCPUs == 0)?500:1 ) ;'
1911 self._ForceWMS = -1
1912 self._TypeSched = 'GLITE'
1913 self._checkProxy()
1914 self._InitListWMS()
1915
1916
1925
1926 - def setWMS(self, listWMSName):
1942
1944 for ask_wms in listWMS:
1945 findwms = False
1946 for idx in range(len(SchedulerGLITE.ListWMS)):
1947 if SchedulerGLITE.ListWMS[idx].find(ask_wms) >= 0:
1948 _ProcessSubWMS.Squeue[idx] = [0, 0]
1949 findwms = True
1950 break
1951 if not findwms:
1952 sys.stderr.write("\nERROR: your WMS %s isn't available for this VO %s"%(ask_wms, MyConf().gLite.vo))
1953 sys.exit(1)
1954
1991
1992
1993
1997
1998
2016
2017
2020
2022 return wms.split('/')[2].split(':')[0]
2023
2027
2029 wms = SchedulerGLITE.ListWMS[wmsIdx]
2030 self._NSAd= 'NSAddresses = "%s:7772";'%self._splitWMS(wms)
2031 self._LBAd= 'LBAddresses = "%s:9000";'%self._splitWMS(wms)
2032 self._WMProxy= 'WMProxyEndpoints = {"%s"};'%wms
2033
2035 self.initWithRB(wms)
2036 print >> self._pFile,'%s'%self._VO
2037 print >> self._pFile,'%s'%self._NSAd
2038 print >> self._pFile,'%s'%self._LBAd
2039 print >> self._pFile,'%s'%self._WMProxy
2040
2042 if self._defCE != None:
2043 sys.stderr.write("\nWARNING: CE already definied, conflict between load balancing and setCE ?")
2044 return
2045 self._defCE = True
2046 try:
2047 lb_class = globals()[vers]
2048 except:
2049 sys.stderr.write("ERROR: unknow class "+vers)
2050 sys.exit(1)
2051
2052 if query != "":
2053 self._LoadBalCE = lb_class(query)
2054 else:
2055 self._LoadBalCE = lb_class()
2056
2057
2059 if self._delegateProxy == "Yes":
2060 print >> self._pFile,'%s'%self._MyProxy
2061 print >> self._pFile,'%s'%self._Rank
2062 if Appli._MPIcpu != []:
2063 if self._defCE != None:
2064 requirmt = 'Requirements = other.GlueCEUniqueID == "%s";' %self._defCE
2065 ce_out = self._defCE
2066 else:
2067 requirmt = 'Requirements = Member("MPI-START", other.GlueHostApplicationSoftwareRunTimeEnvironment)'
2068 requirmt += '&& Member("OPENMPI", other.GlueHostApplicationSoftwareRunTimeEnvironment)'
2069 requirmt += '&& ( other.GlueCEStateStatus == "Production" ) '
2070 requirmt += '&& ( other.GlueCEInfoTotalCPUs >= %d);'%Appli._MPIcpuTotal
2071 ce_out = "unknow"
2072 else:
2073 requirmt = self._Req
2074 if self._LoadBalCE != None:
2075 ce = self._LoadBalCE.choiceCE()
2076 ce_out = ce.split(".")[0]
2077 if ce != None:
2078 if G_verbose>=5: sys.stdout.write("\nSelect CE: %s"%ce)
2079 requirmt='Requirements = other.GlueCEUniqueID == "%s";'%ce
2080 elif self._defCE:
2081 ce_out = self._defCE.split(".")[0]
2082 else:
2083 ce_out = "unknow"
2084 print >> self._pFile,'%s'% requirmt
2085 return ce_out
2086
2088 if self._delegateProxy == "Yes":
2089 if True:
2090 dpjp = Process('glite-wms-job-delegate-proxy -a -e %s --noint'%self._NameWMS(Appli))
2091 dpjp.wait(60)
2092 if not dpjp.isOk():
2093 sys.stderr.write("\nERROR:\n commend '%s' NOK\n"%dpjp._cmd)
2094 sys.stderr.write(dpjp.stdErr())
2095 if G_verbose >= 10: sys.stdout.write(dpjp.stdOut())
2096 cmd = 'glite-wms-job-submit --noint --config %s -o %s -a %s'%(self._FileConf, Appli._FileID, self._FileJDL)
2097 return cmd
2098
2100 cmd = 'unset PYTHONHOME;glite-wms-job-status -i '+FileID + ' --noint --verbosity 1'
2101 if G_verbose>=10: print cmd
2102 return cmd
2103
2105 outDir= MyConf().info.workDir+Appli._APCSchedID
2106 cmd = 'glite-wms-job-output -i '+FileID
2107 cmd += ' --dir '+outDir+ ' --noint'
2108 return cmd, outDir
2109
2111 cmd = "glite-wms-job-cancel -i %s --noint"%Appli.getPathFile('ID')
2112 return cmd
2113
2115 cmd='echo a | glite-wms-job-status -i %s --verbosity 2 | awk -F: \'{if(/Current/){s=$NF;n++;} if(/Destination/) {d[$2" "s]++;} } END {for ( x in d ){print x": "d[x]" / "n;} }\' | sort'%fileID
2116 return cmd
2117
2118
2119
2120
2121
2124 self.listCE = []
2125 self._update()
2126 self._lastUpdate = time.time()
2127
2131
2132 if ( time.time() - self._lastUpdate)/60 > 10:
2133 self._lastUpdate = time.time()
2134 self._update()
2135
2136
2138 """Algos CE loadbalancing de Tristan Beau """
2140 if query == "":
2141 self.query = "CEStatus=Production,PlatformArch=x86_64,EstRespTime=0"
2142 else:
2143 self.query = query
2144 LoadBalancingCE.__init__(self)
2145 self.ce_idx=0
2146 self.dec = 1
2147
2149 buffer="\nCEname: FreeSlot"
2150 for ce in self.listCE:
2151
2152 buffer +='\n%s : \t%d'%(ce[0],ce[1])
2153 return buffer
2154
2156 self.listCE = []
2157 lce=Process("lcg-info --list-ce --vo %s --attrs WaitingJobs,FreeJobSlots,TotalCPUs --query %s --sed"%(MyConf().gLite.vo, self.query))
2158 lce.wait(60)
2159 if not lce.isOk():
2160 sys.stdout.write(lce.stdOut())
2161 sys.stderr.write("ERROR ERROR ERROR "+lce.stdErr())
2162
2163 lineout=lce.stdOut().split('\n')
2164 for line in lineout:
2165 fields=line.split('%')
2166 if len(fields) != 4:
2167 break
2168
2169 t=[fields[0], int(round(0.9*int(fields[2])))]
2170 self.listCE.append(t)
2171
2172 self.listCE.sort(key=operator.itemgetter(1),reverse=True)
2173 if G_verbose>=5:
2174 sys.stdout.write("\nCE selection:\n")
2175 print self
2176
def choiceCE(self):
    """Return the next computing element (CE) to submit to.

    CEs are consumed in the order of ``self.listCE``: each call takes
    ``self.dec`` slot(s) from the current entry and moves on to the next
    entry once its slots are used up.  When no entry with free slots is
    available, the default CE of the configuration is returned.
    """
    # NOTE(review): the 'def' line is missing from this listing; the name
    # and signature are taken from the LoadBalancingCE.choiceCE(self) call.
    LoadBalancingCE.choiceCE(self)
    # The list can be empty or shorter than the cursor (self.listCE is
    # rebuilt by _update() while ce_idx is kept): never index past its end.
    if self.ce_idx >= len(self.listCE):
        return MyConf().gLite.ce
    entry = self.listCE[self.ce_idx]
    # '<=' rather than '==': the counter can go below zero when dec > 1.
    if entry[1] <= 0:
        return MyConf().gLite.ce
    ce = entry[0]
    entry[1] -= self.dec
    # Advance only while a next entry exists.  The previous bound
    # (ce_idx < len) let the cursor reach len(listCE), which made the
    # following call fail with IndexError.
    if entry[1] <= 0 and self.ce_idx < len(self.listCE) - 1:
        self.ce_idx += 1
    return ce
2188
2189
2191 """version 1, mais retrie le tableau s'il n'est plus ordonne"""
2194
def choiceCE(self):
    """Return the CE at the head of ``self.listCE``, or None.

    None is returned when the list is empty or when the head entry has no
    free slot.  With a single entry its name is returned and the counter is
    left untouched.  Otherwise ``self.dec`` is subtracted from the head
    counter and the list is re-sorted (largest counter first) as soon as the
    head falls below the second entry.
    """
    # NOTE(review): the 'def' line is missing from this listing; the name
    # and signature are taken from the LoadBalancingCE.choiceCE(self) call.
    LoadBalancingCE.choiceCE(self)
    candidates = self.listCE
    if not candidates:
        return None
    head = candidates[0]
    if head[1] == 0:
        return None
    name = head[0]
    if len(candidates) > 1:
        head[1] -= self.dec
        if head[1] < candidates[1][1]:
            candidates.sort(key=operator.itemgetter(1), reverse=True)
    return name
2210
2211
2213 """Repartition au hasard sur les slots libres des CE selectionnes"""
2216
def _update(self):
    """Rebuild the shuffled slot list from the refreshed CE list.

    After LoadBalCE_v1._update() has filled ``self.listCE``, every CE name
    is repeated once per slot counted for it, the result is shuffled and
    the read cursor ``self.i_ces`` is reset to 0.
    """
    # NOTE(review): the 'def' line is missing from this listing; the name
    # is taken from the LoadBalCE_v1._update(self) call below.
    LoadBalCE_v1._update(self)
    slots = []
    for entry in self.listCE:
        # entry is [CE name, slot count]
        slots.extend([entry[0]] * entry[1])
    self.ces = slots
    rd.shuffle(self.ces)
    self.nb_ces = len(self.ces)
    self.i_ces = 0
2226
def choiceCE(self):
    """Return the next CE of the shuffled slot list.

    Once every slot of ``self.ces`` has been handed out, the default CE of
    the configuration is returned instead.
    """
    # NOTE(review): the 'def' line and the indentation are missing from this
    # listing; the cursor is assumed to advance only when a slot is used.
    LoadBalancingCE.choiceCE(self)
    if self.i_ces < self.nb_ces:
        ce = self.ces[self.i_ces]
        self.i_ces += 1
    else:
        ce = MyConf().gLite.ce
    return ce
2235
2236
2237
2238
2239
2240
2241
2242
2248
def submit(self, Appli):
    """Submit Appli as a local process.

    Builds the shell command for ``Appli``, starts it through ``Process``,
    sets the status to "SubmitScheduled" and refreshes it with
    ``self.status``.

    Appli: application object; its work directory (_PathLocWD), executable
    (_AppliName), arguments (_Arg), MPI setting (_MPIcpu) and _keepStdFile
    flag are read here.
    """
    # NOTE(review): the 'def' line is missing from this listing; the
    # signature is taken from the self._Scheduler.submit(self._Appli) call.
    cmd = 'cd %s; ' % (Appli._PathLocWD)
    if Appli._MPIcpu != []:
        # _MPIcpu is [number of nodes, CPUs per node].
        NbProc = Appli._MPIcpu[0] * Appli._MPIcpu[1]
        # Append to the 'cd' prefix: a plain assignment here discarded it,
        # so MPI runs were not started from the work directory.
        cmd += 'mpirun -np %d %s %s' % (NbProc, Appli._AppliName, Appli._Arg)
    else:
        cmd += '%s %s' % (Appli._AppliName, Appli._Arg)
    if Appli._keepStdFile:
        self.Processus = Process(cmd, Appli.getPathFile('STDOUT'), Appli.getPathFile('STDERR'))
    else:
        self.Processus = Process(cmd)
    Appli._Status = "SubmitScheduled"
    self.status(Appli)
2264
def status(self, Appli):
    """Update status attribut of Appli object.

    Nothing changes unless the current status starts with 'Submit'.
    Otherwise the status and exit value of the local process are copied to
    ``Appli._Status`` and ``Appli._StatusAppli`` before the base-class
    ``status`` is called.
    """
    # NOTE(review): the 'def' line is missing from this listing; the
    # signature is taken from the SchedulerAbstract.status(self, Appli) call.
    if not Appli._Status.startswith('Submit'):
        if G_verbose > 10:
            print("status can't change !!!")
        return
    proc = self.Processus
    proc._updateStatus()
    Appli._Status = proc._Status
    Appli._StatusAppli = proc.getExitValue()
    SchedulerAbstract.status(self, Appli)
2274
2276 """Retrieve Appli outputs in work directory"""
2277 if not Appli._keepStdFile:
2278 Appli._stdOut = self.Processus.stdOut()
2279 Appli._stdErr = self.Processus.stdErr()
2280
2281 if Appli._stdErr == "":
2282 os.system('rm '+ Appli.getPathFile("STDERR"))
2283
2284 if not self.Processus.isOk() and G_verbose>10:
2285 sys.stderr.write("\nERROR with command :\n%s\n"%self.Processus._cmd)
2286 sys.stderr.write(self.Processus.stdErr())
2287
2289 """cancel Application"""
2290 cmd="kill -2 %d"%self.Processus.pid
2291 print cmd
2292 os.system(cmd)
2293
2294
2295
2296
2297
2298
2299
2302 SchedulerAbstract.__init__(self)
2303
2304 self._TypeCluster = ''
2305
2306 self._ScriptBatchFile = ''
2307
2308 self._ScriptBatchDesc = None
2309
2311 """Add in batch script scheluder command"""
2312 pass
2313
2317
2319 """Add in batch script user commands before run"""
2320 pass
2321
2323 """Add in batch script exe"""
2324 self._ScriptBatchDesc.write("\n\n# Add Application")
2325 if Appli._MPIcpu == []:
2326 self._ScriptBatchDesc.write("\n%s %s"%(Appli._AppliName, Appli._Arg))
2327 else:
2328 self._AddExeMPI(Appli)
2329
2331 """Add in batch script user commands after run"""
2332 pass
2333
def _CreateBatchScript(self, Appli):
    """Create a batch script for cluster scheduler.

    The script is written to ``<work dir><job id>.sh``.  Its content comes,
    in this order, from _AddSchedulerCommand, _AddPlateformEnv, a ``touch``
    of the run-marker file, _AddBeforeRun, _AddExe and _AddAfterRun.  The
    file is then made executable (mode 755).
    """
    # NOTE(review): the 'def' line is missing from this listing; the
    # signature is taken from the self._CreateBatchScript(Appli) calls.
    self._ScriptBatchFile = Appli._PathLocWD + Appli._APCSchedID + '.sh'
    self._ScriptBatchDesc = open(self._ScriptBatchFile, "w")
    if G_verbose > 5:
        print(self._ScriptBatchFile)
    try:
        self._AddSchedulerCommand(Appli)
        self._AddPlateformEnv(Appli)
        # self._FileRun is read here, so one of the hooks above must set it.
        self._ScriptBatchDesc.write("\n\ntouch " + self._FileRun)
        self._AddBeforeRun(Appli)
        self._AddExe(Appli)
        self._AddAfterRun(Appli)
        self._ScriptBatchDesc.write("\n")
    finally:
        # Close the handle even when a hook raises, so it is not leaked.
        self._ScriptBatchDesc.close()
    os.system('chmod 755 ' + self._ScriptBatchFile)
2353
2354
2355
2356
2357
2364
2365
2367
2368 self._ScriptBatchDesc.write("\necho $? >> "+self._FileRun)
2369
2371 """submit Appli"""
2372 Appli._CPUTime = _ConvHHMMSSInSec(Appli._CPUTime)
2373 if Appli._CPUTimePerWeek != '':
2374 Appli._CPUTimePerWeek = _ConvHHMMSSInSec(Appli._CPUTimePerWeek)
2375 else:
2376 Appli._CPUTimePerWeek = 0
2377 Appli._farm = 'anastasie'
2378 self._CreateBatchScript(Appli)
2379 os.putenv('BQSCLUSTER', Appli._farm)
2380 myp = Process("qsub "+self._ScriptBatchFile)
2381 myp.wait(5)
2382 if myp.isOk():
2383 Appli._Status = 'SubmitScheduled'
2384 Appli._TimeStart = time.time()
2385 else:
2386 Appli._Status = 'FinishSubmitNOK'
2387
2389 os.putenv('BQSCLUSTER', Appli._farm)
2390 mp=Process("qdel "+ Appli._APCSchedID)
2391 mp.wait()
2392
2394 if Appli._Status.find('Submit') != 0:
2395 if G_verbose>10: print "status can't change !!!"
2396 return
2397 if os.path.isfile(self._FileRun):
2398 Appli._Status = 'SubmitRunning'
2399 finish = False
2400 if os.path.isfile(Appli.getPathFile("STDOUT")):
2401 finish = True
2402 elif os.path.isfile(Appli.getPathFile("STDERR")):
2403 finish = True
2404 if finish:
2405 rvs = _readFile(self._FileRun)
2406 try:
2407 rv = int(rvs)
2408 Appli._StatusAppli = rv
2409 if rv == 0 : Appli._Status = 'FinishOK'
2410 else: Appli._Status = 'FinishNOK'
2411 except:
2412 Appli._Status = 'FinishNOK'
2413 os.system('rm -rf '+self._FileRun)
2414 os.system('rm -rf '+self._ScriptBatchFile)
2415 if G_verbose>5: print "status :"+Appli._Status
2416
2417
2418 SchedulerAbstract.status(self,Appli)
2419
2421 Appli._AddEnv(self._ScriptBatchDesc)
2422 if Appli._MPIcpu != []:
2423
2424 self._ScriptBatchDesc.write("\n. /usr/local/shared/bin/openmpi_env.sh")
2425
2427 NbNode = Appli._MPIcpu[0]
2428 NbProc = NbNode*Appli._MPIcpu[1]
2429 if NbProc > 1:
2430
2431 self._ScriptBatchDesc.write("\n/usr/local/openmpi/bin/mpirun -x LD_LIBRARY_PATH -x PATH --mca pls_rsh_agent /usr/local/products/bqs/bqsrsh -machinefile $BQS_PROCLISTPATH -np $BQS_PROCNUMBER %s %s"%(Appli._AppliName, Appli._Arg))
2432 else:
2433
2434 self._ScriptBatchDesc.write("\n%s %s"%(Appli._AppliName, Appli._Arg))
2435
2437 """Add in batch script scheluder command"""
2438
2439 self._FileRun = Appli._PathLocWD + Appli._APCSchedID+'.run'
2440
2441 pf = self._ScriptBatchDesc
2442 pf.write("#!/bin/bash")
2443 pf.write("\n#PBS -N %s"%Appli._APCSchedID)
2444 pf.write("\n#PBS -l platform=LINUX")
2445 pf.write("\n#PBS -l M=%dMB"%Appli._MemorySize)
2446 if Appli._CPUTimePerWeek == 0:
2447 MyTime = Appli._CPUTime*self.SecToUISec
2448 pf.write("\n#PBS -l T=%d"%MyTime)
2449
2450 if (Appli._MemorySize > 2200) or (MyTime > 2400000):
2451 pf.write("\n#PBS -q J")
2452 else:
2453 MyTime = Appli._CPUTimePerWeek*self.SecToUISec
2454 pf.write("\n#PBS -l T=%d"%MyTime)
2455 pf.write("\n#PBS -q V")
2456 pf.write("\n#PBS -V")
2457 pf.write("\n#PBS -o "+Appli.getPathFile("STDOUT"))
2458 pf.write("\n#PBS -e "+Appli.getPathFile("STDERR"))
2459 if Appli._MPIcpu == []:
2460 if Appli._AccessDir != '':
2461 pf.write("\n#PBS -l "+Appli._AccessDir)
2462 else:
2463 NbNode = Appli._MPIcpu[0]
2464 NbProc = NbNode*Appli._MPIcpu[1]
2465 if NbProc > 1:
2466 Appli._farm = 'pistoo'
2467 if Appli._AccessDir != '':
2468 pf.write("\n#PBS -l %s=%d"%(Appli._AccessDir, NbProc))
2469 pf.write("\n#PBS -l ptype=OpenMPI")
2470 StringCPU = "\n#PBS -l proc=%d"%(NbProc)
2471 if NbNode != NbProc:
2472 StringCPU += ",machine=%d"%(NbNode)
2473 pf.write(StringCPU)
2474 else:
2475
2476 if Appli._AccessDir != '':
2477 pf.write("\n#PBS -l "+Appli._AccessDir)
2478
2479
2480
2481
2482
2484 """
2485 check des ressources
2486 """
2487 pa_mame = ["pa_short","pa_medium","pa_long"]
2488 pa_memLim = [500, 3*1024, 4*1024]
2489 pa_timeLim = [6*60, 5*3600, 30*3600]
2490 pa_cpuMax = 112
2491 huge_time = 46*3600
2492 huge_mem = 16*1024
2493
2494
def __init__(self, memMB, cpuTime, nbCPU=1):
    """Store the resources requested for a job.

    memMB: requested memory; it is compared with limits given in MB.
    cpuTime: requested CPU time; it is compared with limits given in seconds.
    nbCPU: requested number of CPUs (default 1).
    """
    self.memMB, self.cpuTime, self.nbCPU = memMB, cpuTime, nbCPU
2499
2500
2506
2507
2509 memMax = GridEngineCC.huge_mem
2510 timeMax = GridEngineCC.huge_time
2511 if self.memMB > memMax:
2512 return "ERROR: too many memory, max is %dMB"%memMax
2513 if self.cpuTime > timeMax:
2514 return "ERROR: too many time, max is %f hours"%(timeMax/3600)
2515 return "Ok"
2516
2517
2519 memMax = GridEngineCC.pa_memLim[-1]
2520 timeMax = GridEngineCC.pa_timeLim[-1]
2521 cpuMax = GridEngineCC.pa_cpuMax
2522 if self.nbCPU > cpuMax:
2523 return "ERROR: too many CPUs for openmpi GE env., max is %d"%cpuMax
2524 if self.memMB > memMax:
2525 return "ERROR: too many memory for MPI job, max is %dMB"%memMax
2526 if self.cpuTime > timeMax:
2527 return "ERROR: too many time for MPI job, max is %f hours"%(timeMax/3600)
2528 return "Ok"
2529
2530
2547
2548
2554
2556
2557 self._ScriptBatchDesc.write("\nres=$? ")
2558 self._ScriptBatchDesc.write("\nexec 3<> "+self._FileEnd)
2559 self._ScriptBatchDesc.write("\necho $res >> "+self._FileEnd)
2560 self._ScriptBatchDesc.write("\nexec 3>&-")
2561
2563 """submit Appli"""
2564 Appli._CPUTime = _ConvHHMMSSInSec(Appli._CPUTime)
2565 if Appli._CPUTimePerWeek != '':
2566 Appli._CPUTimePerWeek = _ConvHHMMSSInSec(Appli._CPUTimePerWeek)
2567 else:
2568 Appli._CPUTimePerWeek = 0
2569 self._CreateBatchScript(Appli)
2570 myp = Process("qsub "+self._ScriptBatchFile)
2571 myp.wait(10)
2572 if myp.isOk():
2573 Appli._Status = 'SubmitScheduled'
2574 Appli._TimeStart = time.time()
2575 else:
2576 Appli._Status = 'FinishSubmitNOK'
2577
2579 mp=Process("qdel "+ Appli._APCSchedID)
2580 mp.wait()
2581
2582
2584 Appli._AddEnv(self._ScriptBatchDesc)
2585 if Appli._MPIcpu != []:
2586
2587 self._ScriptBatchDesc.write("\n. /usr/local/shared/bin/openmpi_env.sh")
2588
2590 NbNode = Appli._MPIcpu[0]
2591 NbProc = NbNode*Appli._MPIcpu[1]
2592 if NbProc > 1:
2593
2594 self._ScriptBatchDesc.write("\nmpiexec --mca btl ^udapl,openib --mca btl_tcp_if_include eth0 -n $NSLOTS %s %s"%(Appli._AppliName, Appli._Arg))
2595 else:
2596
2597 self._ScriptBatchDesc.write("\n%s %s"%(Appli._AppliName, Appli._Arg))
2598
def _AddSchedulerCommand(self, Appli):
    """Add in batch script scheduler command.

    Writes the shebang and the ``#$`` directives (job name, memory, CPU
    time, environment export, stdout/stderr files, parallel environment,
    access directory) to the open batch script, and records the names of
    the '.run' and '.end' marker files in self._FileRun / self._FileEnd.

    Raises ValueError when GridEngineCC.checkResource() rejects the request.
    """
    # NOTE(review): the 'def' line and the indentation are missing from this
    # listing.  The signature comes from the self._AddSchedulerCommand(Appli)
    # call; the nesting of the '-q demon' and access-directory writes is an
    # assumption (same shape as the '#PBS' variant) -- confirm.
    self._FileRun = Appli._PathLocWD + Appli._APCSchedID + '.run'
    self._FileEnd = Appli._PathLocWD + Appli._APCSchedID + '.end'

    pf = self._ScriptBatchDesc
    pf.write("#!/bin/bash -l")
    pf.write("\n#$ -N %s" % Appli._APCSchedID)
    pf.write("\n#$ -l vmem=%dM" % Appli._MemorySize)
    if Appli._CPUTimePerWeek == 0:
        MyTime = Appli._CPUTime
        pf.write("\n#$ -l ct=%d" % MyTime)
    else:
        MyTime = Appli._CPUTimePerWeek * self.SecToUISec
        pf.write("\n#$ -l ct=%d" % MyTime)
        pf.write("\n#$ -q demon")
    pf.write("\n#$ -V")

    NbProc = 1
    if Appli._MPIcpu != []:
        # _MPIcpu is [number of nodes, CPUs per node].
        NbProc = Appli._MPIcpu[0] * Appli._MPIcpu[1]
    sge = GridEngineCC(Appli._MemorySize, MyTime, NbProc)
    ret = sge.checkResource()
    if ret != "Ok":
        sys.stderr.write(ret)
        # A bare 'raise' has no active exception to re-raise here; raise an
        # explicit error that carries the reason instead.
        raise ValueError(ret)

    pf.write("\n#$ -o " + Appli.getPathFile("STDOUT"))
    pf.write("\n#$ -e " + Appli.getPathFile("STDERR"))
    if NbProc > 1:
        queue = sge.retQueueParal()
        if queue == None:
            # NOTE(review): the directive below is still written with
            # '-q None' in this case, as before -- decide whether to abort.
            sys.stderr.write("ERROR: can't define parallele queue for %dMB, %dseconds" % (Appli._MemorySize, MyTime))
        # Leading newline added: without it this directive was appended to
        # the end of the '#$ -e <file>' line.
        pf.write("\n#$ -pe openmpi %d -q %s" % (NbProc, queue))
    if Appli._AccessDir != '':
        pf.write("\n#$ -l %s=1" % Appli._AccessDir)
2638
2639
2641 if Appli._Status.find('Submit') != 0:
2642 if G_verbose>10: print "Status can't change !!!"
2643 return
2644 if os.path.isfile(self._FileRun):
2645 Appli._Status = 'SubmitRunning'
2646
2647
2648 try:
2649 finish = os.path.isfile(self._FileEnd)
2650 except:
2651 finish = False
2652 if finish:
2653 rvs = _readFile(self._FileEnd)
2654 print "run:"+rvs
2655 try:
2656 rv = int(rvs)
2657 Appli._StatusAppli = rv
2658 if rv == 0 : Appli._Status = 'FinishOK'
2659 else: Appli._Status = 'FinishNOK'
2660 except:
2661 print "\nCan't convert:'%s'"%rvs
2662 Appli._Status = 'FinishNOK'
2663 os.system('rm -rf '+self._FileRun)
2664 os.system('rm -rf '+self._FileEnd)
2665 os.system('rm -rf '+self._ScriptBatchFile)
2666 if G_verbose>5: print "Status :"+Appli._Status
2667
2668
2669 SchedulerAbstract.status(self,Appli)
2670
2671
2672
2673
2674
2675
2676
2677
2678
2681 self._AppliName = Appli
2682
2683 self._AppliOnly = Appli[Appli.rfind('/')+1:]
2684 if label=="":
2685 self._APCSchedID = self._AppliOnly
2686 else:
2687 self._APCSchedID = label.replace(' ','_')
2688 self._Arg = ''
2689
2690 self._MpiNodeCPUbyNode = [1,1]
2691
2692 self._StatusAppli = -1
2693 self._Status = 'NotSubmit'
2694 self._ListSrc = []
2695 self._ListInput = []
2696 self._ListOutput = []
2697 self._ListInputSE = []
2698 self._ListOutputSE = []
2699 self._PathLocWD = ''
2700 self._FileIDgrid = ''
2701 self._Timer = 10
2702 self._TimeOutToStart = 0
2703 self._TimeStart = 0
2704 self._RetrieveWithID = True
2705 self._RetrieveSEWithID = True
2706 self._CPUTime = '1:0:0'
2707 self._CPUTimePerWeek = ''
2708 self._AccessDir = ''
2709 self._MemorySize = 1024
2710 self._LocalDiskSize = 1024
2711 self._MPIcpu = []
2712 self._MPIcpuTotal = 0
2713 self._MainSE = ''
2714 self._WorkDirSE = '%s/APCScheduler'%os.getenv('USER')
2715 self._OutDirSE = None
2716 self._MainScript = None
2717 self._FlagResubmit = False
2718 self._keepStdFile= True
2719 self._stdOut= ''
2720 self._stdErr= ''
2721 self._wmsUsed = set([])
2722 self._wmsIdx = None
2723 self._threadCopy = None
2724
2726 """Return absolute name of namefile and None if doesn't exit"""
2727 return None
2728
2731
2734
2737
2739 sys.stderr.write("\nNot Available with kind of application")
2740 return False
2741
2743 """Define excutable/script argument.
2744 Fomrat: string"""
2745 self._Arg = Arg
2746
2747
2752
2757
2759 """[Specific grid]
2760 List : Define output file List to retrieve at the end of job via OutputSandbox and copy in job repository
2761 AddID : add prefixe identificator
2762 AddID==True : prefix is SchedulerID random number and letter
2763 AddID is type String : prefix is value of AddID
2764 else : no ID
2765 """
2766 self._ListOutput = List
2767 self._RetrieveWithID = AddID
2768
2769
2771 """[Specific grid]
2772 List : Define output file List to retrieve at the end of job on storage element
2773 AddID : add prefixe identificator
2774 AddID==True : prefix is SchedulerID random number and letter
2775 AddID is type String : prefix is value of AddID
2776 else : no ID
2777 """
2778 self._ListOutputSE = List
2779
2780
2781 self._RetrieveSEWithID = AddID
2782
2783
2785 """[Specific grid tarball] Define directory on SE where all files in tarball directory toSE/ will copied, if AddID is True APCScheduler add prefixe ID job to name file"""
2786 self._OutDirSE = outdir
2787
def setMPI(self, NodeCPUbyNode):
    """Define node number [Node, CPUbyNode]. Example: [4,2] total CPU is 4*2=8

    NodeCPUbyNode: two-item sequence [number of nodes, CPUs per node].
    It is stored as a new list, so a tuple is accepted too and an empty
    sequence compares equal to [] (the "no MPI" value tested elsewhere in
    this file).
    """
    self._MPIcpu = list(NodeCPUbyNode)
2791
2793 """Define CPU Time request for job.
2794 Format : string [[hh:]mm:]ss"""
2795 self._CPUTime = TimeMem
2796
2798 """Define CPU Time per week for long and slow job (like class V with BQS).
2799 Format : string [[hh:]mm:]ss"""
2800 self._CPUTimePerWeek = TimeMem
2801
2803 """[specific cluster] Requirement name global aria space disk. For example 'sps_planck' in CCIN2P3
2804 Fomrat : string """
2805 self._AccessDir = Dir
2806
2808 """Define memory request for job.
2809 Format : Integer in MByte"""
2810 self._MemorySize = Mem
2811
2813 """[specific cluster] Requirement local worker node space disk in MByte """
2814 self._LocalDiskSize = Mem
2815
2817 """[specific grid] Define current storage element"""
2818 self._MainSE = se
2819
def getFile(self, Key, index=0):
    """Return the file name (without directory) that matches a keyword.

    Key: one of 'STDOUT', 'STDERR', 'OUT', 'OUT_SE', 'SUBOUT', 'SUBERR',
    'TAR', 'ID'.
    index: position in the output list, used by 'OUT' and 'OUT_SE' only.

    Returns None, after printing a message, when Key is not recognised.
    """
    # Names that are simply the job identifier plus a fixed suffix.
    for keyword, suffix in (('SUBOUT', '.sub_out'),
                            ('SUBERR', '.sub_err'),
                            ('TAR', '.tar.gz')):
        if Key == keyword:
            return self._APCSchedID + suffix
    # Names delegated to a dedicated accessor.
    if Key == 'STDOUT':
        return self.getNameStdOut()
    if Key == 'STDERR':
        return self.getNameStdErr()
    if Key == 'OUT':
        return self.getNameOutput(index)
    if Key == 'OUT_SE':
        return self.getNameOutputSE(index)
    if Key == 'ID':
        return self.getNameJobID()
    print("Keyword '%s' is unknown" % Key)
    return None
2841
2843 """Same thing like getFile but with absolute path"""
2844 name = self.getFile(Key,index)
2845 if name == None:
2846 return None
2847 else:
2848 if G_verbose>=10 : print name
2849 return os.path.join(self._PathLocWD, name)
2850
2852 return (self._APCSchedID +'.stdout')
2853
2855 return (self._APCSchedID +'.stderr')
2856
def isFinish(self):
    """Return True when the status string contains 'Finish'."""
    # NOTE(review): the 'def' line is missing from this listing; the name
    # is taken from the self._Appli.isFinish() call.
    return 'Finish' in self._Status
2859
def isSubmit(self):
    """Return True when the status string starts with 'Submit'."""
    # NOTE(review): the 'def' line is missing from this listing; the name
    # is taken from the self._Appli.isSubmit() call.
    return self._Status.startswith('Submit')
2862
2863
def _processPrefix(self, AddID):
    """Return the prefix to put in front of a retrieved file name.

    AddID equal to True: the job identifier followed by '_'.
    AddID a str: that string itself.
    Anything else: the empty string.
    """
    # NOTE(review): the 'def' line is missing from this listing; the
    # signature is taken from the self._processPrefix(...) calls.
    # '== True' is kept on purpose: it also accepts the integer 1.
    if AddID == True:
        return self._APCSchedID + "_"
    if isinstance(AddID, str):
        return AddID
    return ""
2872
2873
def getNameOutput(self, idx):
    """Return the local name of output file number ``idx``.

    Indexes below len(_ListOutput) address _ListOutput; the following ones
    address _ListOutputSE, of which only the part after the last '/' is
    kept.  The name is prefixed according to _RetrieveWithID or
    _RetrieveSEWithID respectively.  An index beyond both lists writes an
    error on stderr and exits the program with status 1.
    """
    # NOTE(review): the 'def' line is missing from this listing; the
    # signature is taken from the self.getNameOutput(index) call.
    nb_sandbox = len(self._ListOutput)
    if idx < nb_sandbox:
        filename = self._ListOutput[idx]
        return self._processPrefix(self._RetrieveWithID) + filename
    se_idx = idx - nb_sandbox
    if se_idx >= len(self._ListOutputSE):
        sys.stderr.write("[getNameOutput]Error out of range")
        sys.exit(1)
    filename = self._ListOutputSE[se_idx].split('/')[-1]
    return self._processPrefix(self._RetrieveSEWithID) + filename
2888
2890 return (self._APCSchedID +'.id')
2891
2894
2896 return (self._Status == 'FinishOK')
2897
2899 """ no keep standard output and error file"""
2900 self._keepStdFile= False
2901
2902
2903
2904
2905
2906
2907
2908
def __init__(self, NameExe, label=""):
    """Initialise the application from an executable name.

    NameExe is resolved with self._isAppliExist(); when that returns None an
    error is written on stderr and the program exits with status 1.
    Otherwise the resolved name and ``label`` are passed to
    Application.__init__.
    """
    AbsNameExe = self._isAppliExist(NameExe)
    if AbsNameExe is None:
        sys.stderr.write("\nDon't find %s anywhere, check var. env PATH, wms-proxy problem.\n" % NameExe)
        sys.exit(1)
    Application.__init__(self, AbsNameExe, label)
2917
2918
2920 AbsNameExe = _AbsolutePathExe(NameExe)
2921 if AbsNameExe == None:
2922
2923 return None
2924
2925 return AbsNameExe
2926
2928 buffer = 'Executable = "%s";\n'%self._AppliOnly
2929 buffer += 'Arguments = "%s";\n'%self._Arg
2930 intemp = 'InputSandbox = {"%s"'%self._AppliName
2931 for elt in self._ListInput:
2932 intemp += ',"%s"'%elt
2933
2934 for elt in self._ListSrc:
2935 intemp += ',"%s"'%elt
2936
2937 buffer += intemp +'};\n'
2938 return buffer
2939
def _FillFileJDLEnv(self):
    """Return the comma-separated, quoted entries of the JDL Environment.

    Always contains LCG_CATALOG_TYPE and VO_NAME; LFC_HOST is added when
    that variable is set in the current environment.
    """
    # NOTE(review): the 'def' line is missing from this listing; the name
    # is taken from the self._FillFileJDLEnv() call.
    entries = ['"LCG_CATALOG_TYPE=lfc"', '"VO_NAME=%s"' % MyConf().gLite.vo]
    if 'LFC_HOST' in os.environ:
        entries.append('"LFC_HOST=' + os.environ['LFC_HOST'] + '"')
    return ','.join(entries)
2945
def _AddEnv(self, File):
    """Write the cluster environment exports to ``File``.

    For PATH and LD_LIBRARY_PATH, in that order, an ``export NAME=value``
    line is written when the variable is set in the current environment.
    File: object with a write() method (the batch script being built).
    """
    # NOTE(review): the 'def' line is missing from this listing; the
    # signature is taken from the AppliExe._AddEnv(self, File) call.
    for var in ('PATH', 'LD_LIBRARY_PATH'):
        value = os.getenv(var)
        if value is not None:
            File.write("\nexport " + var + "=" + value)
2954
2958
2959
2960
2961
2962
2966
2967
def _AddEnv(self, File):
    """Write the Python cluster environment exports to ``File``.

    Writes what AppliExe._AddEnv writes, then an ``export NAME=value`` line
    for PYTHONHOME and PYTHONPATH, in that order, when they are set in the
    current environment.
    """
    # NOTE(review): the 'def' line is missing from this listing; the
    # signature is taken from the AppliExe._AddEnv(self, File) call.
    AppliExe._AddEnv(self, File)
    for var in ('PYTHONHOME', 'PYTHONPATH'):
        value = os.getenv(var)
        if value is not None:
            File.write("\nexport " + var + "=" + value)
2977
2978
2979
2980
2981
2982
2983
2984
2985
2988 """ call by submit """
2989
2990 return self
2991
2993 """ call by job.copyTarBallOnSE() """
2994
2995 sys.stdout.write("\nWARNING: tarball '%s' already exist !"%master._nameTarball)
2996 return self
2997
2998
3011
3012
3013
3017
3018
3020 S_listTarballExist= []
def __init__(self, NameExe, MainScript=None, label="", KeepTarBall=False):
    """Initialise a tarball application.

    NameExe, label: passed to AppliExe.__init__.
    MainScript: optional main script of the tarball.  When given it is
    resolved with _AbsolutePathExe() and stored in self._MainScript; if it
    cannot be resolved an error is written on stderr and the program exits
    with status 1.
    KeepTarBall: stored in self._KeepTarBall.
    """
    AppliExe.__init__(self, NameExe, label)
    self._KeepTarBall = KeepTarBall
    if MainScript is None:
        return
    script_path = _AbsolutePathExe(MainScript)
    if script_path is None:
        sys.stderr.write("ERROR: Can't find '" + MainScript + "' executable.\n")
        sys.exit(1)
    self._MainScript = script_path
3037
3039 AbsNameExe = AppliExe._isAppliExist(self, NameExe)
3040 if AbsNameExe == None:
3041
3042 TarOk = False
3043 se=SEtools( MyConf().gLite.vo,MyConf().gLite.ce )
3044 if NameExe in AppliParachute.S_listTarballExist:
3045 TarOk = True
3046 else:
3047 if se.sizeFile(NameExe) > 0:
3048 TarOk = True
3049 AppliParachute.S_listTarballExist.append(NameExe)
3050 if G_verbose>=5:
3051 print 'AppliParachute.S_listTarballExist:', AppliParachute.S_listTarballExist
3052 if TarOk:
3053
3054 AbsNameExe = NameExe
3055 self._nameTarball = se._simpleDelTag(NameExe)
3056 self._stateObj = _AppliParachute_State._tarExist
3057 else:
3058 return None
3059 else:
3060
3061 self._stateObj = _AppliParachute_State._doTar
3062 return AbsNameExe
3063
3065
3066 BootFile = os.path.join(self._PathLocWD,"BootScript.sh")
3067 self._DoScriptBoot(BootFile)
3068
3069
3070 se=SEtools( MyConf().gLite.vo, self._MainSE)
3071 pathdir = '%s/%s'%(self._WorkDirSE,self._APCSchedID)
3072 se._mkdirSE(pathdir)
3073
3074
3075 self._stateObj.doTarBallIfNecessary(self)
3076
3077 bufferJDL = 'Executable = "BootScript.sh";\n'
3078 if self._MPIcpu !=[]:
3079 NbNode = self._MPIcpu[0]
3080 self._MPIcpuTotal = NbNode*self._MPIcpu[1]
3081
3082 bufferJDL += 'JobType = "MPICH";\n'
3083 bufferJDL += 'CPUNumber = %d;\n'%self._MPIcpuTotal
3084
3085 if self._WorkDirSE == None:
3086 sys.stderr.write("ERROR: You must define SetWorkDirSE()")
3087 raise
3088
3089
3090
3091 strArg = "%s "%self._nameTarball
3092 strArg+= "%s/%s "%(self._WorkDirSE,self._APCSchedID)
3093 strArg+= "%s"%self._Arg
3094
3095 bufferJDL += 'Arguments = "%s";\n'%strArg
3096 bufferJDL += 'Environment = {'+self._FillFileJDLEnv()+'};\n'
3097 intemp = 'InputSandbox = {"%s"'%BootFile
3098 for elt in self._ListInput:
3099 intemp += ',"%s"'%elt
3100
3101
3102 bufferJDL += intemp+'};\n'
3103 return bufferJDL
3104
3106 se=SEtools( MyConf().gLite.vo, self._MainSE)
3107
3108 pathdir = '%s/%s'%(self._WorkDirSE,self._APCSchedID)
3109 self._threadCopy = _CopySEThread(se, pathdir, self._PathLocWD, self._APCSchedID+'_')
3110 self._threadCopy.start()
3111
3112
3114 if self._threadCopy == None:
3115
3116
3117 pathdir = '%s/%s'%(self._WorkDirSE,self._APCSchedID)
3118 cmd='apcgrid-rm -r '+pathdir
3119 mp = Process(cmd)
3120 mp.wait()
3121 else:
3122
3123 if G_exit : self._threadCopy.join()
3124
3125
3127 if self._FlagResubmit: return
3128 if G_verbose>=10: sys.stdout.write("\ndef _DoTarBall(self, pathTar=""):\n")
3129 root = os.path.join(self._PathLocWD, self._APCSchedID)
3130 TarRoot = os.path.join(root, "tarball")
3131 os.makedirs(TarRoot)
3132 os.makedirs(TarRoot+"/lib")
3133 os.makedirs(TarRoot+"/toSE")
3134 os.makedirs(TarRoot+"/toUI")
3135 os.makedirs(TarRoot+"/python")
3136
3137
3138 os.system("cp %s %s "%(self._AppliName, TarRoot))
3139 namescript = TarRoot+"/tarballScript.py"
3140 self._DoScriptTarball(namescript)
3141 if self._MainScript != None: os.system("cp %s %s"%(self._MainScript, TarRoot))
3142
3143
3144 for elt in self._ListInput: os.system("cp -r %s %s"%(elt, TarRoot))
3145
3146
3147 filesrc = __file__
3148 if filesrc[-1]=='c':
3149 filesrc = filesrc[0:-1]
3150 if G_verbose>10: sys.stdout.write("add %s, not %s\n" %(filesrc, __file__))
3151 os.system("cp -r %s %s/python"%(filesrc, TarRoot))
3152 listlib = []
3153 if not _FindLib(self._AppliName, listlib):
3154 sys.exit(1)
3155
3156 for elt in listlib:
3157 cmd ="cp %s %s/lib"%(elt, TarRoot)
3158 os.system(cmd)
3159 if G_verbose>=10: print cmd
3160
3161
3162 cmd = "cd %s; tar cfz %s *"%(TarRoot, self.getPathFile('TAR'))
3163
3164 os.system(cmd)
3165 os.system("rm -rf "+root)
3166
3167
3168 se=SEtools( MyConf().gLite.vo, self._MainSE)
3169 if pathTar== "":
3170
3171 self._nameTarball='%s/%s/%s'%(self._WorkDirSE,self._APCSchedID,self.getFile("TAR"))
3172 else:
3173
3174 if se.sizeFile(pathTar) >=0:
3175 sys.stderr.write("\nFile '%s' exist !! Delete it or change name\n"%pathTar)
3176 sys.exit(1)
3177 self._nameTarball=se._simpleDelTag(pathTar)
3178 ret=se.cp(self.getPathFile('TAR'), 'se:'+self._nameTarball)
3179 if ret != 0:
3180 if G_verbose>1: sys.stderr.write("ERROR: Can't copy tarball %s to se:%s\n"%(self.getPathFile('TAR'),self._nameTarball))
3181 self._Status = 'FinishSubmitNOK'
3182 return
3183 if not self._KeepTarBall: os.system("rm -rf "+self.getPathFile('TAR'))
3184
3185
3187
3188 buffer = "import sys\nimport os"
3189 buffer += "\nsys.path.append('python')"
3190 buffer += "\nfrom APCScheduler import *"
3191 buffer += "\nsetVerboseLevel(1)"
3192 buffer += "\n\n#update LD_LIBRARY_PATH\naddLIBRARYPATH(os.environ['PWD']+'/lib')"
3193 buffer += "\nse=SEtools( '%s', '%s') "%(MyConf().gLite.vo, self._MainSE)
3194 buffer += "\n\n# copy input SE"
3195 for elt in self._ListInputSE:
3196 buffer += "\nse.cp('se:%s','%s')"%(elt,elt.split('/')[-1])
3197 buffer += "\n\n# call executable"
3198 buffer += "\nArg = ''\nfor i in range(2,len(sys.argv)): Arg += sys.argv[i]+ ' '"
3199 if self._MainScript == None:
3200 buffer += "\nret=submitCmd('./%s %%s'%%Arg,-1)"%(self._AppliOnly)
3201 else:
3202 buffer += "\nret=submitCmd('./%s %%s'%%Arg,-1)"%(os.path.basename(self._MainScript))
3203 if self._OutDirSE != None:
3204 buffer += "\n\n# copy out SE"
3205 buffer += "\nse.cpdir( 'toSE', 'se:%s') " %self._OutDirSE
3206 buffer += "\n\n# copy out UI"
3207 buffer += "\nse.cpdir( 'toUI', 'se:%s/data'%sys.argv[1])\n"
3208 buffer += "\nsys.exit(ret)"
3209 pf = open(namefile,'w+')
3210 pf.write(buffer)
3211 pf.close()
3212 os.system('chmod 755 '+namefile)
3213
3214
3216
3217 if not os.path.isfile(scriptname):
3218 buffer = "#!/bin/bash"
3219 buffer += "\nlcg-cp -v --vo $VO_NAME lfn:/grid/$VO_NAME/$1 file:`pwd`/tarball.tar.gz"
3220 buffer += "\ntar xzf tarball.tar.gz ; rm -rf tarball.tar.gz"
3221 buffer += "\nexport PATH=$PATH:./"
3222 buffer += "\nshift"
3223 buffer += '\n\necho "**************** date debut job code : " `date`'
3224 buffer += "\npython tarballScript.py $* 2>&1"
3225 buffer += "\nret=$?"
3226 buffer += '\necho "**************** date fin job code : " `date`'
3227
3228 buffer += "\nexit $ret"
3229 pf = open(scriptname,'w+')
3230 pf.write(buffer)
3231 pf.close()
3232 os.system('chmod 755 '+scriptname)
3233
3235 """Copy tarball on SE"""
3236 if self._Status != 'NotSubmit':
3237 sys.stdout.write("\nWARNING: call copyTarBallOnSE() before submit().\n")
3238 return
3239 self._stateObj = self._stateObj.doTarBallAndCopy(self, pathSE)
3240
3241
3242
3243
3244
3245
3246
3247
3248
3250
3251 dictJobSubmit = {}
3252
3254 self._Appli = MyAppli
3255 self._Scheduler = MySched
3256 self._Appli._PathLocWD = MyConf().info.workDir
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272 self._Appli._APCSchedID = _AddAlea(self._Appli._APCSchedID, 6)
3273 self._Appli._FileID = self._Appli.getPathFile('ID')
3274 self._TimeOutToStart = "24:0:0"
3275 if self._Appli._MainSE == '':
3276 self._Appli._MainSE = MyConf().gLite.se
3277
3279 """Time between two update status ,
3280 timeUpdate[[hh:]mm:]ss string"""
3281 sec = _ConvHHMMSSInSec(timeUpdate)
3282 if sec > 0 :
3283 self._Appli._Timer = sec
3284
3286 return (self._Appli._PathLocWD+File)
3287
3289 """Create a tarball and copy it on SE. Call it before submit()"""
3290 return (self._Appli.copyTarBallOnSE(FileSE))
3291
3293 return (self._Appli._APCSchedID)
3294
3296 self._Scheduler.status(self._Appli)
3297 return self._Appli._Status
3298
3300 return self._Appli.isFinish()
3301
3303 return self._Appli.isSubmit()
3304
3306 return self._Appli.isOk()
3307
def submit(self, TimeOutToStart="24:0:0"):
    """Submit the job through its scheduler.

    TimeOutToStart: [[hh:]mm:]ss string; it is converted to seconds and
    stored on the application before anything else.

    A job whose status is not 'NotSubmit' is not submitted again; a message
    is printed instead.  Otherwise the job registers itself in
    JobClass.dictJobSubmit under its identifier and is handed to the
    scheduler.
    """
    if G_verbose > 15:
        print('submit')
    appli = self._Appli
    appli._TimeOutToStart = _ConvHHMMSSInSec(TimeOutToStart)
    if G_verbose > 10:
        print("TimeOutToStart= %s" % (appli._TimeOutToStart,))
    if appli._Status != 'NotSubmit':
        print('Already submit !')
        return
    JobClass.dictJobSubmit[appli._APCSchedID] = self
    self._Scheduler.submit(appli)
3318
3320 self._Scheduler.wait(self._Appli)
3321
3325
3328
3331
def fullNameOutput(self, Idx):
    """Return the full path of output file number ``Idx``.

    Only indexes inside the application's _ListOutput are accepted; for any
    other index a message is printed and None is returned.
    """
    # NOTE(review): the 'def' line is missing from this listing; the
    # signature is taken from the self.fullNameOutput(Idx) call.
    if Idx < len(self._Appli._ListOutput):
        return self.addFullPath(self._Appli.getNameOutput(Idx))
    print("Idx too great")
    return None
3338
3340 return self._Scheduler.stdOut(self._Appli)
3341
3343 return self._Scheduler.stdErr(self._Appli)
3344
3345
3346
3348 return _readFile(self.fullNameOutput(Idx))
3349
3351 if MyConf().info.mail == '':
3352 print 'Define address mail with setMail() function'
3353 return
3354 cmd = ('mail -s "APCScheduler : %s status is %s" %s')%(self._Appli._APCSchedID,self._Appli._Status,MyConf().info.mail)
3355
3356 if self._Appli._Status == 'FinishOK':
3357 file = self.fullNameStdOut()
3358 if os.path.isfile(file):
3359 os.system(cmd + '< %s'%(file))
3360 else:
3361 os.system('echo "No file stdout available"|'+cmd )
3362 elif self._Appli._Status == 'FinishSubmitNOK':
3363 file = self._Appli.getPathFile('SUBERR')
3364 if os.path.isfile(file):
3365 os.system(cmd + '< %s'%(file))
3366 else:
3367 os.system('echo "No file submit stderr available"|'+cmd )
3368 elif self._Appli._Status == 'FinishTimeOut':
3369 os.system('echo " "|'+cmd )
3370 else:
3371 file = self.fullNameStdErr()
3372 if os.path.isfile(file):
3373 os.system(cmd + '< %s'%(file))
3374 else:
3375 os.system('echo "No file stderr available"|'+cmd )
3376
3377
3378
3379
3380
3381
3382
3383
3384
3386 S_EventFinish = threading.Event()
3387
3389 self._ListJob = []
3390 self._Timer = 60*5
3391 self._name = name
3392 self._timeStart = time.asctime()
3393
3395 """Add a job to the Run"""
3396 self._ListJob.append(job)
3397
3398
3400 """Time between two update status ,
3401 timeUpdate[[hh:]mm:]ss string"""
3402 sec = _ConvHHMMSSInSec(timeUpdate)
3403 if sec > 0 :
3404 self._Timer = sec
3405
3406
3408 """submit all jobs and wait all:
3409 TimeOutToStart: [[hh:]mm:]ss string, if job doesn't run after TimeOutToStart abort it.
3410 default value is 15 mn ie 15:0
3411 """
3412 if MaxRunning <=0:
3413 self.submitAll(TimeOutToStart)
3414 self.waitAll()
3415 return
3416
3417 NbRun = 1
3418 while NbRun != 0:
3419 NbRun = 0
3420 for job in self._ListJob:
3421 job.status()
3422
3423 if job.isSubmit() :
3424 NbRun += 1
3425
3426
3427 if NbRun < MaxRunning:
3428 for job in self._ListJob:
3429 if job._Appli._Status == 'NotSubmit':
3430 job.submit(TimeOutToStart)
3431
3432 NbRun += 1
3433 if NbRun >= MaxRunning:
3434 break
3435
3436
3437 if NbRun > 0:
3438 MultiJobsClass.S_EventFinish.wait(self._Timer)
3439 MultiJobsClass.S_EventFinish.clear()
3440
3441
def submitAll(self, TimeOutToStart="24:0:0"):
    """Submit every job of the run, in the order they were added.

    TimeOutToStart: [[hh:]mm:]ss string handed unchanged to the submit()
    method of each job; the default is "24:0:0".
    """
    for queued_job in self._ListJob:
        queued_job.submit(TimeOutToStart)
3449
3450
3465
3466
"""Gather output number IdxOutput of each successful job into FileConcat.

IdxOutput indexes the list of outputs defined through the application's
setOutput() method; FileConcat is the single file receiving them all.
"""
# NOTE(review): nesting rebuilt from a listing without indentation.
# Start from an empty FileConcat: remove a previous file, then recreate it.
if os.path.isfile(FileConcat):
    os.system('rm -rf %s' % FileConcat)
os.system('touch %s ' % FileConcat)
for job in self._ListJob:
    if not job.isOk():
        continue
    os.system('cat %s >> %s' % (job.fullNameOutput(IdxOutput), FileConcat))
    if G_verbose > 15:
        print('add file ' + job.fullNameOutput(IdxOutput))
3478
3479
"""Concatenate every declared output, one result file per output.

The output names are read from the first job of the run; output number i
goes to the file PrefixeFileConcat + '_' + <name of output i>.
"""
outputNames = self._ListJob[0]._Appli._ListOutput
for idx, outName in enumerate(outputNames):
    self.concatOutput(idx, PrefixeFileConcat + '_' + outName)
3484
3485
"""Gather the stdout file of each successful job into FileConcat."""
# NOTE(review): nesting rebuilt from a listing without indentation.
# Start from an empty FileConcat: remove a previous file, then recreate it.
if os.path.isfile(FileConcat):
    os.system('rm -rf %s' % FileConcat)
os.system('touch %s ' % FileConcat)
for job in self._ListJob:
    if not job.isOk():
        continue
    os.system('cat %s >> %s' % (job.fullNameStdOut(), FileConcat))
    if G_verbose > 15:
        print('add file ' + job.fullNameStdOut())
3495
3496
"""Gather the stderr file of every job, successful or not, into FileConcat."""
# NOTE(review): nesting rebuilt from a listing without indentation.
# Start from an empty FileConcat: remove a previous file, then recreate it.
if os.path.isfile(FileConcat):
    os.system('rm -rf %s' % FileConcat)
os.system('touch %s ' % FileConcat)
for job in self._ListJob:
    os.system('cat %s >> %s' % (job.fullNameStdErr(), FileConcat))
    if G_verbose > 15:
        print('add file ' + job.fullNameStdErr())
3505
3506
"""Gather the 'ID' file of every job into the single file FileConcat.

For each job the file appended is the one whose path is returned by
job._Appli.getPathFile('ID'); jobs are taken whatever their status.
"""
# NOTE(review): nesting rebuilt from a listing without indentation.
# Start from an empty FileConcat: remove a previous file, then recreate it.
if os.path.isfile(FileConcat):
    os.system('rm -rf %s' % FileConcat)
os.system('touch %s ' % FileConcat)
for job in self._ListJob:
    os.system('cat %s >> %s' % (job._Appli.getPathFile('ID'), FileConcat))
3514
3515
# Build and return a text report of the run: counters per outcome, the
# text returned by the scheduler's _resultListJob(), and, when some job
# is not OK, one paragraph per such job with its status and stderr.
# NOTE(review): the `def` line and all indentation are missing from this
# listing; which statements sit inside which `if`/`for` must be confirmed
# against the real source before restructuring anything here.
3517 NbOk = 0
3518 NbNOk = 0
3519 NbTimeOut = 0
3520 NbTot= len( self._ListJob)
# Classify each job. The second test is a substring match, so any status
# containing "FinishNOK" is counted as NOK.
3521 for job in self._ListJob:
3522 if job.isOk():
3523 NbOk += 1
3524 elif job.status().find("FinishNOK")>=0:
3525 NbNOk += 1
3526 elif job.status() == "FinishTimeOut":
3527 NbTimeOut += 1
3528
# NOTE(review): `job` below is whatever the loop above left behind, i.e.
# presumably the last job of the list; with an empty _ListJob it is never
# bound, and self._ListJob[0] further down fails as well - confirm that
# callers never ask for the result of an empty run.
# NOTE(review): `buffer` hides the Python 2 builtin of the same name.
3529 buffer = "output directory : %s\nResult:\n"%job._Appli._PathLocWD
3530 buffer += " %d job(s) OK on %d\n"%(NbOk, NbTot)
3531 if NbNOk > 0:
3532 buffer += " %d job(s) NOK\n"%(NbNOk)
3533 if NbTimeOut > 0:
3534 buffer += " %d job(s) time out to pass RUNNING\n"%(NbTimeOut)
3535
# Whatever is neither OK, NOK nor timed out is reported as "other".
3536 NbOtherNOK = NbTot-NbOk-NbNOk-NbTimeOut
3537 if NbOtherNOK > 0:
3538 buffer += " %d job(s) failed to other reason\n"%NbOtherNOK
# Scheduler-specific text, obtained from the scheduler of the first job.
3539 buffer += self._ListJob[0]._Scheduler._resultListJob(self)
3540 if NbTot != NbOk :
3541 buffer += "\njob(s) failed information:\n--------------------------\n"
3542 for job in self._ListJob:
3543 if not job.isOk():
3544 buffer += "status job %s is %s"%(job.name(), job.status())
# Exact comparison here: the application status is only appended when the
# status is precisely 'FinishNOK'.
3545 if job.status() == 'FinishNOK':
3546 buffer +=" with status %d."%job._Appli._StatusAppli
3547 buffer +="\n"
3548 err = job.stdErr()
3549 if err != None and err != '':
3550 buffer += "stderr:\n"+err+"\n"
3551 buffer += "===============================================================================\n\n"
3552 return buffer
3553
3554
3557
3558
3560 if MyConf().info.mail == '':
3561 sys.stderr.write('\nERROR sendMailResult() method:\nno address mail, set it with setMail() function')
3562 return
3563 FileResult = 'tmpresult.txt'
3564 pf = open(FileResult,'w+')
3565 pf.write(self.result())
3566 pf.close()
3567 cmd = 'mail -s "APCScheduler: result run %s started %s" %s < %s'%(self._name, self._timeStart, MyConf().info.mail, FileResult)
3568 os.system(cmd)
3569 time.sleep(1)
3570 os.system('rm -rf '+FileResult)
3571