Module APCScheduler
[hide private]
[frames] | no frames]

Source Code for Module APCScheduler

   1  #-*- coding: utf-8 -*- 
   2   
   3  # 
   4  #Copyright APC, CNRS, Université Paris Diderot, 2008 
   5  # 
   6  #http://www.apc.univ-paris7.fr 
   7  #https://forge.in2p3.fr/projects/apc-scheduler 
   8  # 
   9  #Authors: 
  10  #   Colley Jean-Marc <colley@in2p3.fr> 
  11  # 
  12  #This software is a computer program whose purpose is to simplify 
  13  #job submission on the grid with the gLite/WMS middleware: 
  14  # * creation of a binary tarball including the specified libraries 
  15  # * retrieval of data from the mailbox directory on the CE to the UI user directory 
  16  # * resubmission of "zombie" jobs on a different WMS after a time-out defined by the user 
  17  # * management of MPI jobs with the MPI-START method 
  18  # *  
  19  # 
  20  #This software is governed by the CeCILL-B license under French law and 
  21  #abiding by the rules of distribution of free software.  You can  use,  
  22  #modify and/ or redistribute the software under the terms of the CeCILL-B 
  23  #license as circulated by CEA, CNRS and INRIA at the following URL 
  24  #"http://www.cecill.info".  
  25  # 
  26  #As a counterpart to the access to the source code and  rights to copy, 
  27  #modify and redistribute granted by the license, users are provided only 
  28  #with a limited warranty  and the software's author,  the holder of the 
  29  #economic rights,  and the successive licensors  have only  limited 
  30  #liability.  
  31  # 
  32  #In this respect, the user's attention is drawn to the risks associated 
  33  #with loading,  using,  modifying and/or developing or reproducing the 
  34  #software by the user in light of its specific status of free software, 
  35  #that may mean  that it is complicated to manipulate,  and  that  also 
  36  #therefore means  that it is reserved for developers  and  experienced 
  37  #professionals having in-depth computer knowledge. Users are therefore 
  38  #encouraged to load and test the software's suitability as regards their 
  39  #requirements in conditions enabling the security of their systems and/or  
  40  #data to be ensured and,  more generally, to use and operate it in the  
  41  #same conditions as regards security.  
  42  # 
  43  #The fact that you are presently reading this means that you have had 
  44  #knowledge of the CeCILL-B license and that you accept its terms. 
  45  # 
  46   
  47   
  48   
  49  import time 
  50  import string 
  51  import threading 
  52  from random import choice 
  53  import popen2 
  54  import os 
  55  import sys 
  56  import types  
  57  import random as rd 
  58  import operator 
  59  import signal 
  60  import re 
  61   
  62  ############################################################### 
  63  # 
  64  #  Global variable 
  65  # 
  66  ############################################################### 
  67   
# Verbosity level consulted throughout the module; changed with
# setVerboseLevel(), which accepts values from 0 (silent) to 20 (everything).
G_verbose = 2
# Set to True by the SIGINT handler (_cancelJob) once cancellation started.
G_exit = False
  70   
  71  ############################################################### 
  72  # 
  73  #  Tool function 
  74  # 
  75  ############################################################### 
  76   
def sendMailFile(File):
    """Mail the content of a file to the configured address.

    The recipient is MyConf().info.mail (define it with setMail()) and the
    subject is "APCScheduler : <File>".  Nothing is sent, and a message is
    printed instead, when no address is configured or when File is not an
    existing regular file.

    File -- path of the file whose content becomes the mail body
    """
    import subprocess

    address = MyConf().info.mail
    if address == '':
        print('Define address mail with setMail() function')
        return
    if not os.path.isfile(File):
        print('File "' + File + '" doesn\'t exist')
        return
    # An argument list plus stdin redirection replaces the former shell
    # string, so a path or address holding spaces or shell metacharacters
    # is passed to `mail` untouched.
    body = open(File, 'r')
    try:
        try:
            subprocess.call(['mail', '-s', 'APCScheduler : %s' % File, address],
                            stdin=body)
        except OSError:
            # `mail` is missing or cannot be started: report, do not crash.
            sys.stderr.write("\nERROR: cannot run the 'mail' command\n")
    finally:
        body.close()
87 88
def newJob(MyAppli, scheduler='LOCAL'):
    """Build a job for MyAppli bound to the requested scheduler.

    MyAppli   -- application object handed to JobClass
    scheduler -- one of 'GLITE', 'LOCAL' (default), 'BQS' or 'GE'

    Returns the JobClass object, or None (after printing an error) when the
    scheduler name is not recognised.
    """
    if scheduler not in ('GLITE', 'LOCAL', 'BQS', 'GE'):
        print('Error unknown scheduler !!')
        return None

    if scheduler == 'LOCAL':
        backend = SchedulerLOCAL()
    elif scheduler == 'GLITE':
        backend = SchedulerGLITE()
    elif scheduler == 'BQS':
        backend = SchedulerBQS()
    else:
        # only 'GE' is left
        backend = SchedulerSGE()
    return JobClass(MyAppli, backend)
103 104
def setMail(mail):
    """Store the mail address used by sendMailFile().

    mail -- address saved in the shared configuration (MyConf().info.mail);
            it is stored as given, no validation is done
    """
    personal = MyConf().info
    personal.mail = mail
109 110
def setFileConfVO(fileconf):
    """Select the grid (VO) configuration file.

    fileconf -- path of the configuration file; a leading "~" is expanded

    When the file exists it is handed to ConfigGrid.setFileConf(); otherwise
    an error is written on stderr and nothing is changed.
    """
    expanded = os.path.expanduser(fileconf)
    if not os.path.isfile(expanded):
        sys.stderr.write("\nERROR: file '%s' doesn't exist !\n"%fileconf)
        return
    ConfigGrid().setFileConf(expanded)
119 120
def setRepository(NameRepository):
    """Define the working repository, creating it when necessary.

    NameRepository -- directory path; a relative path is taken from the
                      current directory and an empty string therefore means
                      the current directory itself

    The absolute path, always ending with "/", is stored in
    MyConf().info.workDir.
    """
    # startswith/endswith are safe on an empty string, where the former
    # NameRepository[0] / NameRepository[-1] raised IndexError.
    if not NameRepository.startswith('/'):
        NameRepository = os.path.join(os.getcwd(), NameRepository)
    if not NameRepository.endswith('/'):
        NameRepository += '/'

    if not os.path.isdir(NameRepository):
        os.makedirs(NameRepository)
    MyConf().info.workDir = NameRepository
134 135
def setVerboseLevel(level):
    """Set the module verbosity: 0 prints nothing, 20 prints everything.

    level -- value between 0 and 20 inclusive; any other value is refused
             with a printed message and the current level is kept
    """
    global G_verbose
    if not (0 <= level <= 20):
        print("Level must be between 0 and 20.")
        return
    G_verbose = level
143 144
def submitCmd(cmd, TpsMax=3600, verbose=True):
    """Run a shell command through Process and return its exit value.

    cmd     -- command line to run
    TpsMax  -- value passed to Process.wait()
    verbose -- when True the captured stdout and stderr are always copied to
               stderr; when False they are copied only if the command failed
    """
    runner = Process(cmd)
    if G_verbose > 4:
        print(cmd)
    runner.wait(TpsMax)

    failed = not runner.isOk()
    if verbose or failed:
        sys.stderr.write(runner.stdOut())
        sys.stderr.write(runner.stdErr())
    return runner.getExitValue()
153 154
def addLIBRARYPATH(path):
    """Append a directory to the LD_LIBRARY_PATH environment variable.

    path -- directory to add; it is appended after a ":" when the variable
            already exists, otherwise the variable is created with it
    """
    # `in` replaces the deprecated dict.has_key().
    if 'LD_LIBRARY_PATH' in os.environ:
        os.environ['LD_LIBRARY_PATH'] += ':' + path
    else:
        os.environ['LD_LIBRARY_PATH'] = path
160 161
def getVersion():
    """Return the human readable release string of APCscheduler."""
    release = "APCscheduler june 2012, version: 0.9.3"
    return release
164 165 166 # 167 # Private 168 # 169
def _initProxy(nb_hours=72):
    """Destroy the current proxies and create a new VOMS proxy.

    nb_hours -- life time of the new proxy in hours, must be positive

    Returns the exit status of glite-voms-proxy-init.  Exits the program
    with status 1 when nb_hours is not positive.
    """
    # Validate first: the original destroyed the existing proxies and only
    # then rejected a bad nb_hours, leaving the user without any proxy.
    if nb_hours <= 0:
        sys.stderr.write("\nERROR: enter a positive number\n")
        sys.exit(1)

    # NOTE(review): the original author flagged that this may destroy
    # proxies that should be kept - still to be checked.
    os.system('myproxy-destroy -d')
    os.system('glite-voms-proxy-destroy')

    return os.system('glite-voms-proxy-init -voms %s -hours %d'%(MyConf().gLite.vo, nb_hours))
179 180
def _cancelJob(signum, frame):
    """Signal handler: cancel every submitted job, then exit.

    signum -- number of the received signal, also used as exit status
    frame  -- interrupted stack frame (only printed)

    Sets the global G_exit flag, asks the scheduler of each job registered
    in JobClass.dictJobSubmit to cancel it, and calls sys.exit(signum).
    """
    print('Signal handler called with signal %s' % (signum,))
    print(frame)
    global G_exit
    G_exit = True
    for job in JobClass.dictJobSubmit.values():
        try:
            print("try cancel job  %s" % (job.name(),))
            job._Scheduler.cancel(job._Appli)
        except Exception:
            # Best effort: keep cancelling the remaining jobs, but say that
            # this one failed instead of hiding it silently.
            sys.stderr.write("cancel failed: %s\n" % (sys.exc_info()[1],))
    sys.exit(signum)


# Importing this module installs _cancelJob as the SIGINT handler, so an
# interrupted program cancels the jobs it has submitted before exiting.
signal.signal(signal.SIGINT, _cancelJob)
#signal.signal(signal.SIGKILL, _cancelJob)

198 -def _AddAlea(MyStr,NbChar):
199 Alea = '' 200 Alphabet = string.letters + string.digits 201 for i in range(NbChar): 202 Alea += choice(Alphabet) 203 MyStr += '_'+Alea 204 return(MyStr)
205 206
207 -def _ListUnique(MyList):
208 InternList = [] 209 for elt in MyList: 210 if elt not in InternList: 211 InternList.append(elt) 212 return(InternList)
213 214
215 -def _GetWordsAfter(buffer, keyword):
216 idx = buffer.find(keyword) 217 if idx >= 0: 218 # conserve tout apres keyword 219 next = buffer[idx+len(keyword):] 220 # decoupe par ligne 221 next = next.split('\n') 222 #print next 223 # suppression chaine vide 224 next.remove('') 225 #print next 226 # extrait premier mot de la premiere ligne 227 next = next[0].split() 228 #print next 229 return next[0] 230 return None
231 232
233 -def _AbsolutePathExe(MyExe):
234 val = MyExe.find('/') 235 if val >= 0: 236 if val == 0: 237 if os.access(MyExe, os.X_OK): 238 return MyExe 239 else: 240 return None 241 if MyExe[0:2] == './': 242 # exe local 243 file = os.getcwd()+MyExe[1:] 244 if os.access(file, os.X_OK): 245 return file 246 return None 247 else: 248 #elif MyExe[0:2] == '..': 249 # exe relatif 250 file = os.getcwd()+'/'+MyExe 251 if os.access(file, os.X_OK): 252 return file 253 return None 254 else: 255 if os.access(MyExe, os.X_OK): 256 return MyExe 257 else: 258 pathlist = os.getenv('PATH').split(os.pathsep) 259 for path in pathlist: 260 file = os.path.join(path, MyExe) 261 #print file 262 if os.access(file, os.X_OK): 263 if path == '.': 264 file = os.getcwd()+'/'+ MyExe 265 return file 266 return None
267 268
269 -def _ConvHHMMSSInSec(time):
270 dec = time.split(':') 271 ToSec = 1 272 TotalInSec = 0 273 for i in range(len(dec)-1,-1,-1): 274 TotalInSec += long(dec[i])*ToSec 275 ToSec *= 60 276 return TotalInSec
277 278
def _FindLib(exelib, listlib):
    """Collect recursively the shared libraries needed by exelib.

    exelib  -- executable or library analysed with `ldd`
    listlib -- list completed in place with the library paths found;
               libraries located under /lib64 or /usr/lib64 are left out

    Returns False as soon as ldd reports a library as "not found",
    True otherwise.
    """
    myp = Process('ldd '+ exelib)
    myp.wait()
    for line in myp.stdOut().split('\n'):
        words = line.split()
        if not words:
            # blank line: the original list.remove('') dropped only one and
            # raised ValueError when there was none
            continue
        if line.find('=>') != -1:
            if len(words) < 3:
                continue
            lib = words[2]
            if lib == 'not':
                print("path lib not find !! Update LD_LIBRARY_PATH var env ")
                return False
        else:
            lib = words[0]
        if not lib.startswith('/'):
            # not a file path (load address, "statically linked", virtual
            # library...): nothing to collect and nothing to run ldd on
            continue
        if lib.startswith('/lib64') or lib.startswith('/usr/lib64'):
            continue
        if lib not in listlib:
            listlib.append(lib)
            if not _FindLib(lib, listlib):
                return False
    return True
306 307
308 -def _readFile(file):
309 try: 310 pf=open(file,'r') 311 #print "OpenFile "+file 312 except IOError: 313 #print "file %s doesn'MPIt exist"%file 314 return None 315 buf = pf.read() 316 pf.close() 317 return buf
318
319 -def _writeFile(file,buf):
320 try: 321 pf=open(file,'a') 322 #print "OpenFile "+file 323 except IOError: 324 #print "file %s doesn't exist"%file 325 return None 326 pf.write(buf) 327 pf.close()
328 329 330
331 -def _getFileSE(file):
332 if file.find('/') >= 0: 333 return(file[0:file.rfind('/')]) 334 else: 335 se=SEtools( MyConf().gLite.vo, MyConf().gLite.se) 336 return(se._simpleDelTag(file))
337 338
339 -def _getFile(file):
340 if file.find('/') >= 0: 341 return(file[0:file.rfind('/')]) 342 else: 343 return(file)
344
345 -def _removeFinalSlash(name):
346 if name[-1] == '/': 347 return name[:-1] 348 else: 349 return name
350
351 -def _getPath(file):
352 if file.find('/') >= 0: 353 return(file[file.rfind('/')+1:]) 354 else: 355 return('')
356 357 358 ############################################################### 359 # 360 # Tool class 361 # 362 ############################################################### 363
class ConfigFile:
    """Generic parser of "name value" configuration files.

    A subclass first creates its attributes with default values, then calls
    ConfigFile.__init__(); every line "name value" whose name matches an
    existing attribute replaces that attribute, the value being converted
    to the type of the default.
    """

    def __init__(self, file):
        """Read file and update the attributes already set on self.

        file -- path of the configuration file; a missing file is ignored

        Blank lines and lines whose first word starts with "#" are skipped.
        Only the first word after the name is used as value.  An unknown
        name gives a warning on stdout; a missing value, or a value that
        cannot be converted, is reported on stderr and ends the program
        with status 1.
        """
        self._file = file
        if not os.path.isfile(file):
            return
        pFile = open(file, 'r')
        try:
            conf = pFile.read()
        finally:
            pFile.close()

        for line in conf.split('\n'):
            words = line.split()
            if len(words) == 0 or words[0][0] == "#":
                continue
            name = words[0]
            if not hasattr(self, name):
                sys.stdout.write("\nWARNING: unknown parameter '%s' in file conf %s\n"%(name,file))
                continue
            if len(words) == 1:
                sys.stderr.write("\nERROR: in file %s, no value for argument '%s' !!\n"%(file, name))
                # sys.exit rather than the exit() helper, which is only
                # installed by the site module
                sys.exit(1)
            myType = type(getattr(self, name))
            if myType is str:
                setattr(self, name, words[1])
                continue
            try:
                setattr(self, name, myType(words[1]))
            except (ValueError, TypeError):
                # TypeError: the name matched something that is not a plain
                # value (a method for instance)
                sys.stderr.write("\nERROR: parameter '%s' in file %s must be %s !!\n"%(name,file, myType))
                sys.exit(1)
397 #print inspect.getmembers(self) 398 399
class ConfigGrid(ConfigFile):
    """Personal gLite parameters read from the grid configuration file.

    Attributes filled from the file: vo, lfchost, se and ce (all strings,
    empty by default).
    """
    # File read by the next ConfigGrid(); replaced by setFileConf().
    _defaultFile = os.path.expanduser('~/.apcgrid/apcgrid.conf')

    def __init__(self):
        """Load the current default file and export the LFC environment.

        Sets LCG_CATALOG_TYPE to "lfc" and LFC_HOST to the lfchost value.
        """
        # declare the attributes (and their type) before parsing
        self.file = ConfigGrid._defaultFile
        for key in ('vo', 'lfchost', 'se', 'ce'):
            setattr(self, key, '')
        ConfigFile.__init__(self, self.file)
        os.environ['LCG_CATALOG_TYPE'] = 'lfc'
        os.environ['LFC_HOST'] = self.lfchost
        if G_verbose > 5:
            print("Fin INIT ConfigGrid")

    def setFileConf(self, file):
        """Make file the configuration read by future ConfigGrid objects.

        MyConf.instance is reset to None so that the shared configuration
        is built again on its next use.
        """
        ConfigGrid._defaultFile = file
        MyConf.instance = None
        if G_verbose >= 4:
            print("ConfigGrid: change file %s"%file)
419 420
class ConfigPerso(ConfigFile):
    """Personal information read from ~/.apcgrid/apcperso.conf.

    Attributes: mail (empty by default) and workDir (current directory,
    ending with "/", by default).
    """

    def __init__(self):
        """Set the defaults, then let ConfigFile override them from the file."""
        conf_path = os.path.expanduser('~/.apcgrid/apcperso.conf')
        self.file = conf_path
        self.mail = ''
        self.workDir = os.getcwd()+'/'
        ConfigFile.__init__(self, conf_path)
428 429
class MyConf(object):
    """Personal configuration for APCScheduler, shared as a singleton.

    MyConf() always gives back the same object, which carries two
    attributes: gLite (a ConfigGrid) and info (a ConfigPerso).
    """

    class __Singleton:
        """Holder of the two configuration objects."""

        def __init__(self):
            self.gLite = ConfigGrid()
            self.info = ConfigPerso()

    # The shared holder; None until the first MyConf() call, and reset to
    # None by ConfigGrid.setFileConf() to force the files to be read again.
    instance = None

    def __new__(cls):
        # Build the holder on first use only.  What is returned is the
        # __Singleton object itself, not an instance of MyConf.
        if not MyConf.instance:
            MyConf.instance = MyConf.__Singleton()
        return MyConf.instance

    # NOTE(review): since __new__ never returns a MyConf instance, the two
    # delegating methods below look unreachable through MyConf() - confirm.
    def __getattr__(self, attr):
        return getattr(self.instance, attr)

    def __setattr__(self, attr, val):
        return setattr(self.instance, attr, val)
449 450 # 451 # Storage Element management 452 # 453
class ObjFileSystem:
    """Entry (file, directory, link or nothing) of a file system.

    The constructor asks the given file system to classify the entry: its
    defType() is called with the new object (the FSystem implementations
    respond by replacing the object's class with FileFileSystem,
    DirFileSystem, LinkFileSystem or NoneFileSystem), which provides the
    concrete answers to the methods below.
    """

    def __init__(self, nameObj, fileSystem):
        """nameObj -- path of the entry; fileSystem -- object providing defType()."""
        self._name = nameObj
        self._fs = fileSystem
        self._perm = None
        fileSystem.defType(self)

    # The methods below are abstract.  They raise NotImplementedError; the
    # original bare `raise` had no active exception to re-raise and only
    # produced an unrelated error.
    def getName(self): raise NotImplementedError
    def size(self): raise NotImplementedError
    def isFile(self): raise NotImplementedError
    # isLink is not visible in the reviewed listing; added so that the set
    # of predicates matches the four concrete kinds.
    def isLink(self): raise NotImplementedError
    def isDir(self): raise NotImplementedError
    def isNone(self): raise NotImplementedError
466 467
class FileFileSystem(ObjFileSystem):
    """File system entry that is a regular file."""

    def __init__(self, nameObj, fileSystem):
        ObjFileSystem.__init__(self, nameObj, fileSystem)

    def getName(self): return 'FILE'
    # size is delegated to the file system the entry belongs to
    def size(self): return self._fs.sizeFile(self)
    def isFile(self): return True
    # isLink is not visible in the reviewed listing; added for consistency
    # with the link kind handled by FSystem.defType()
    def isLink(self): return False
    def isDir(self): return False
    def isNone(self): return False
477 478
class NoneFileSystem(ObjFileSystem):
    """File system entry that does not exist (or could not be classified)."""

    def __init__(self, nameObj, fileSystem):
        ObjFileSystem.__init__(self, nameObj, fileSystem)

    def getName(self): return 'NONE'
    def size(self): return 0
    def isFile(self): return False
    # isLink is not visible in the reviewed listing; added for consistency
    # with the link kind handled by FSystem.defType()
    def isLink(self): return False
    def isDir(self): return False
    def isNone(self): return True
488 489
class LinkFileSystem(ObjFileSystem):
    """File system entry that is a link."""

    def __init__(self, nameObj, fileSystem):
        ObjFileSystem.__init__(self, nameObj, fileSystem)

    def getName(self): return 'LINK'
    def size(self): return 0
    def isFile(self): return False
    # isLink is not visible in the reviewed listing; added so that a link
    # can be recognised through the same kind of predicate as the others
    def isLink(self): return True
    def isDir(self): return False
    def isNone(self): return False
499 500
class DirFileSystem(ObjFileSystem):
    """File system entry that is a directory."""

    def __init__(self, nameObj, fileSystem):
        ObjFileSystem.__init__(self, nameObj, fileSystem)

    def getName(self): return 'DIR'
    # size is delegated to the file system the entry belongs to
    def size(self): return self._fs.sizeDir(self)
    def isFile(self): return False
    # isLink is not visible in the reviewed listing; added for consistency
    # with the link kind handled by FSystem.defType()
    def isLink(self): return False
    def isDir(self): return True
    def isNone(self): return False
510 511
class FSystem:
    """Base class of the file system adapters used by ObjFileSystem.

    A concrete adapter answers isDir/isLink/isFile and sizeFile/sizeDir for
    an ObjFileSystem object; defType() uses the predicates to give the
    object its concrete class.
    """

    def _init(self, obj): pass
    def getName(self): return 'NOT DEFINE'

    # Abstract methods.  NotImplementedError replaces the original bare
    # `raise`, which had no active exception to re-raise.
    def isDir(self, obj): raise NotImplementedError
    def isFile(self, obj): raise NotImplementedError
    # defType() calls isLink() but no definition is visible in the reviewed
    # listing; declared here so that the contract is complete.
    def isLink(self, obj): raise NotImplementedError
    # `obj` added (optional, to stay compatible) to match the subclasses
    # and the callers, which pass the object.
    def sizeFile(self, obj=None): raise NotImplementedError
    def sizeDir(self, obj=None): raise NotImplementedError

    def defType(self, obj):
        """Replace the class of obj by the one matching its kind.

        Tested in this order: directory, link, file; NoneFileSystem when
        no predicate matches.
        """
        if self.isDir(obj):
            obj.__class__ = DirFileSystem
        elif self.isLink(obj):
            obj.__class__ = LinkFileSystem
        elif self.isFile(obj):
            obj.__class__ = FileFileSystem
        else:
            obj.__class__ = NoneFileSystem
525 526
class FSystemSE(FSystem):
    """File system adapter for the LFC catalogue (storage element side).

    Object names are paths relative to /grid/<vo>/.  The kind of an object
    comes from the first character of its permission string as printed by
    `lfc-ls -l`.
    """

    def __init__(self, vo):
        """vo -- name of the virtual organisation."""
        self._vo = vo

    def _init(self, obj): self._getPerm(obj)
    def getName(self): return 'SE'
    def isDir(self, obj): return (obj._perm[0] == 'd')
    def isFile(self, obj): return (obj._perm[0] == '-')
    # Not visible in the reviewed listing although FSystem.defType() calls
    # it.  Assumes lfc-ls -l marks symbolic links with a leading 'l', as
    # ls -l does - TODO confirm.
    def isLink(self, obj): return (obj._perm[0] == 'l')
    def sizeFile(self, obj): return obj._size

    def sizeDir(self, obj):
        """Return the summed size of the non-directory entries of obj.

        Returns -1 when the listing command fails and 0 when it prints
        nothing or something that is not a number.
        """
        myp = Process("lfc-ls -l /grid/%s/%s | grep ^[^d] | awk '{s=s+$5} END {print s}'"%(self._vo, obj._name))
        myp.wait(30)
        if myp.getExitValue() != 0:
            sys.stderr.write(myp.stdErr())
            return -1
        if len(myp.stdOut()) == 0:
            return 0
        if G_verbose >= 15:
            print("stdout='"+myp.stdOut()+"'")
        try:
            return int(myp.stdOut())
        except ValueError:
            return 0

    def defType(self, obj):
        """Classify obj; an object absent from the catalogue becomes NoneFileSystem."""
        self._init(obj)
        if obj._perm is None:
            obj.__class__ = NoneFileSystem
            return
        FSystem.defType(self, obj)

    def _getPerm(self, obj):
        """Fill obj._perm and obj._size from the listing of the parent directory.

        The empty name (root of the VO) is declared a directory of size 0.
        When the object is not found, obj._perm keeps its previous value.
        """
        path = obj._name
        if path == '':
            obj._perm = 'd'
            obj._size = 0
            return True
        cut = path.rfind('/')
        if cut >= 0:
            path_tmp = path[0:cut]
            name_tmp = path[cut+1:]
        else:
            path_tmp = ''
            name_tmp = path
        if G_verbose >= 10:
            print('path_tmp='+path_tmp)
            print('name_tmp='+name_tmp)
        # The listing is filtered below on the exact name; the former
        # "| grep <name>" prefilter treated the unquoted name as a shell
        # word and a regular expression, and could drop the wanted line.
        mycmd = "lfc-ls -l /grid/%s/%s"%(self._vo, path_tmp)
        myp = Process(mycmd)
        myp.wait(30)
        if myp.getExitValue() != 0:
            sys.stderr.write(myp.stdErr())
        for entry in myp.stdOut().split('\n'):
            if entry == "":
                continue
            mot = entry.split()
            if G_verbose >= 10:
                print(entry)
                print(mot)
            if len(mot) < 9:
                # not a complete "ls -l" line
                continue
            if mot[8] == name_tmp:
                if G_verbose >= 10:
                    print("find objet, permission is  %s" % (mot[0],))
                obj._perm = mot[0]
                obj._size = int(mot[4])
593 594
class FSystemUNIX(FSystem):
    """File system adapter for the local (UNIX) disk, based on os.path."""

    def _init(self, obj): pass
    def getName(self): return 'UNIX'
    def isDir(self, obj): return os.path.isdir(obj._name)
    def isFile(self, obj): return os.path.isfile(obj._name)
    # Not visible in the reviewed listing although FSystem.defType() calls it.
    def isLink(self, obj): return os.path.islink(obj._name)
    def sizeFile(self, obj): return os.path.getsize(obj._name)

    def sizeDir(self, obj):
        """Return the summed size of the regular files directly in obj.

        Sub-directories, links and hidden (dot) files are not counted, as
        with the `ls -l` listing this method used to parse.  Returns 0 when
        the directory cannot be listed.
        """
        import stat

        # os.listdir/os.lstat instead of parsing `ls -l <name>`: the path
        # was given unquoted to the shell and the columns shifted on names
        # containing spaces.
        try:
            names = os.listdir(obj._name)
        except OSError:
            return 0
        size = 0
        for name in names:
            if name.startswith('.'):
                continue
            try:
                info = os.lstat(os.path.join(obj._name, name))
            except OSError:
                # entry removed in the meantime
                continue
            if stat.S_ISREG(info.st_mode):
                size += info.st_size
        return size
611 612
class SEtools:
    """Copy, list and remove files between the local disk and a storage element.

    Storage element (SE) paths are written with the "se:" prefix and are
    relative to /grid/<VO>/ in the LFC catalogue.  The work is done by the
    lcg-* and lfc-* command line tools; recursive operations start
    apcgrid-cp / apcgrid-rm / lcg-* commands as LOCAL jobs.
    """

    def __init__(self, NameVO, NameSE):
        """NameVO -- virtual organisation; NameSE -- storage element.

        Raises RuntimeError when the LFC_HOST environment variable is not
        defined.
        """
        self._LFC_HOST = os.getenv('LFC_HOST')
        if self._LFC_HOST is None:
            print("ERROR: Define LFC_HOST in your environnement")
            # explicit exception: the original bare `raise` had no active
            # exception to re-raise
            raise RuntimeError("LFC_HOST is not defined")
        self._NameSE = NameSE
        self._NameVO = NameVO
        self._tag = "se:"
        self._SEis = None
        self._lentag = len(self._tag)
        # Dispatch tables indexed by the side holding the SE path
        # ("SRC": SE -> local, "DEST": local -> SE).
        self._cp = {'SRC': self._cp2Loc, 'DEST': self._cp2SE }
        self._cprec = {'SRC': self._cp2Locrec, 'DEST': self._cp2SErec }
        self._lscmd = {'SRC': self._lsSEfile,'DEST': self._lsLocfile }
        self._mkdir = {'SRC': self._mkdirLoc,'DEST': self._mkdirSE }

    def _whereSE(self, src, dest):
        """Record and return which of src/dest carries the "se:" tag.

        Returns "SRC" or "DEST" (src is tested first).  Raises RuntimeError
        when neither path starts with the tag.
        """
        if src[0:self._lentag] == self._tag:
            self._SEis = "SRC"
        elif dest[0:self._lentag] == self._tag:
            self._SEis = "DEST"
        else:
            print("ERROR: src or dest must begin by "+self._tag)
            raise RuntimeError("src or dest must begin by "+self._tag)
        return self._SEis

    def _deltag(self, src, dest):
        """Return [src, dest] with the tag removed from the SE side (see _whereSE)."""
        if self._SEis == "SRC":
            return [src[self._lentag:], dest]
        return [src, dest[self._lentag:]]

    def _SubmitCmd(self, cmd, TpsMax=3600):
        """Run cmd and return [stdout, stderr]; exit(1) when the command fails."""
        myp = Process(cmd)
        if G_verbose > 4:
            print(cmd)
        myp.wait(TpsMax)
        if not myp.isOk():
            # NOTE(review): results unused in the original too; presumably
            # the calls make Process consume its capture files - confirm.
            myp.stdOut()
            myp.stdErr()
            sys.stderr.write("\nERROR: "+cmd+"\n")
            sys.stderr.write(myp.stdErr())
            sys.exit(1)
        return [myp.stdOut(), myp.stdErr()]

    def _reportJob(self, job):
        """Copy the output of a finished job: stdout if verbose, stderr if failed."""
        if job.isOk() and G_verbose >= 2:
            sys.stdout.write(job.stdOut())
            sys.stdout.flush()
        if not job.isOk():
            sys.stderr.write(job.stdErr())

    def _fsObject(self, path, onSE):
        """Return the ObjFileSystem of path on the SE (onSE true) or on the local disk."""
        if onSE:
            return ObjFileSystem(path, FSystemSE(self._NameVO))
        return ObjFileSystem(path, FSystemUNIX())

    def _cp2SE(self, src, destSE):
        """Copy and register the local file src as destSE; return the exit value."""
        fSE = "lfn:/grid/"+self._NameVO+'/'+destSE
        fsrc = "file:"+src
        cmd = "lcg-cr -n 3 --vo %s -d %s %s -l %s"%(self._NameVO, self._NameSE, fsrc, fSE)
        return submitCmd(cmd, 60)

    def _cp2SErec(self, src, destSE, er, rec=False, nbstream=1):
        """Copy local src to the SE, recursively when rec is true.

        src      -- local file (rec false) or directory (rec true)
        destSE   -- SE path without tag
        er       -- regular expression selecting the file names, or empty/None
        nbstream -- value of the -n option of the copy commands

        Without rec, returns the exit value of the single copy.  With rec,
        destSE is created, every sub-directory is copied by an apcgrid-cp
        job run immediately, the files by lcg-cr jobs (10 at a time);
        nothing is returned.
        """
        if not rec:
            # the exit value is now returned, as _cp2Locrec does
            return self._cp2SE(src, destSE)
        self._mkdirSE(destSE)
        pp = Process('ls -l %s'%src)
        pp.wait(60)
        if not pp.isOk():
            return
        task_File = MultiJobsClass()
        for line in pp.stdOut().split('\n'):
            if line == '' or line[0:5] == "total":
                continue
            if line[0] == 'd':
                subdir = line.split()[8]
                if G_verbose >= 2:
                    sys.stdout.write("\ncpdir "+src+'/'+subdir)
                    sys.stdout.flush()
                MyExe = AppliExe('apcgrid-cp')
                newdirSE = "%s/%s"%(destSE, subdir)
                arg = '-r -n %d %s/%s se:%s -v %d --ncd '%(nbstream, src, subdir, newdirSE, G_verbose)
                if er:
                    arg = arg + ' -e "%s"'%er
                MyExe.setArg(arg)
                MyExe.noKeepStdFile()
                job = newJob(MyExe, 'LOCAL')
                job.timerUpdateStatus("1")
                job.submitAndWait()
                self._reportJob(job)
            else:
                nfile = line.split()[8]
                if er and re.match(er, nfile) is None:
                    continue
                fSE = "lfn:/grid/"+self._NameVO+'/'+destSE+'/'+nfile
                fsrc = "file:"+src+'/'+nfile
                arg = " -n %d --vo %s -d %s %s -l %s"%(nbstream, self._NameVO, self._NameSE, fsrc, fSE)
                if G_verbose >= 4:
                    sys.stdout.write("\ncp "+src+'/'+nfile)
                    sys.stdout.flush()
                MyExe = AppliExe('lcg-cr')
                MyExe.setArg(arg)
                MyExe.noKeepStdFile()
                task_File.append(newJob(MyExe, 'LOCAL'))

        task_File.timerUpdateStatus("1")
        task_File.submitAndWaitAll(MaxRunning=10)
        for job in task_File._ListJob:
            self._reportJob(job)

    def _cp2Loc(self, srcSE, dest, ns=1):
        """Copy the SE file srcSE to the local file dest; return the exit value.

        ns is accepted but unused: the -n option is left out of the command
        (the original notes that it blocks everything).
        """
        fSE = "lfn:/grid/"+self._NameVO+'/'+srcSE
        fdest = "file:"+dest
        # NOTE(review): unconditional print kept from the original; it looks
        # like debugging output - confirm before removing.
        print(dest)
        cmd = "lcg-cp --vo %s %s %s"%(self._NameVO, fSE, fdest)
        return submitCmd(cmd)

    def _lsSEfile(self, dir):
        """Return the command listing the non-directory entries of an SE directory."""
        return "lfc-ls -l /grid/%s/%s| grep ^[^d]|awk '{print $9}'"%(self._NameVO, dir)

    def _lsLocfile(self, dir):
        """Return the command listing the non-directory entries of a local directory."""
        return "ls -F --file-type %s |grep [^/]$ "%dir

    def _mkdirSE(self, dir):
        """Create the SE directory dir; an already existing one is accepted.

        Raises RuntimeError for any other failure, after copying the
        command's stderr.
        """
        cmd = "lfc-mkdir /grid/%s/%s"%(self._NameVO, dir)
        myp = Process(cmd)
        myp.wait(120)
        if not myp.isOk():
            # maybe it already exists ?
            myerr = myp.stdErr()
            if myerr.find('File exists') < 0:
                sys.stderr.write(myerr)
                myp.stdOut()
                raise RuntimeError("command failed: "+cmd)

    def _mkdirLoc(self, dir):
        """Create the local directory dir (and its parents) when missing."""
        if not os.path.isdir(dir):
            os.makedirs(dir)

    def _tagSEpresent(self, file):
        """Return True when file starts with the "se:" tag."""
        if len(file) >= self._lentag:
            return file[0:self._lentag] == self._tag
        return False

    def _simpleDelTag(self, file):
        """Return file without its leading "se:" tag, if any."""
        if self._tagSEpresent(file):
            return file[self._lentag:]
        return file

    def _rmrec(self, dir, er, checkdir=True, rec=False):
        """Remove the files of an SE directory, and the directory if asked.

        dir      -- SE path, with or without tag
        er       -- regular expression selecting the file names; "" selects
                    everything and also removes dir itself once empty
                    (like `rm -r dir`)
        checkdir -- when true and dir is not a directory, it is removed as
                    a single file
        rec      -- when true the sub-directories are handled by
                    apcgrid-rm jobs
        """
        dir = self._simpleDelTag(dir)
        if checkdir and not self.isDir(dir):
            self.rm(dir)
            return
        rmCurDir = (er == "")
        if rmCurDir:
            er = ".*"
        pp = Process('lfc-ls -l /grid/%s/%s'%(self._NameVO, dir))
        pp.wait(60)
        if pp.isOk():
            task_rmFile = MultiJobsClass()
            task_rmDir = MultiJobsClass()
            for line in pp.stdOut().split('\n'):
                if line == '':
                    break
                if (line[0] == 'd') and rec:
                    newdir = dir+'/'+line.split()[8]
                    if G_verbose >= 4:
                        sys.stdout.write("\nrm dir "+newdir)
                    MyExe = AppliExe('apcgrid-rm')
                    arg = '-r %s -v %d --ncd '%(newdir, G_verbose)
                    if er != ".*":
                        # no expression given to the child means: also
                        # remove the emptied sub-directories
                        arg = arg +' -e "%s"'%er
                    MyExe.setArg(arg)
                    MyExe.noKeepStdFile()
                    task_rmDir.append(newJob(MyExe, 'LOCAL'))
                else:
                    nfile = line.split()[8]
                    if re.match(er, nfile) is None:
                        continue
                    nfile = dir+'/'+nfile
                    arg = "--vo %s -a lfn:/grid/%s/%s --force"%(self._NameVO, self._NameVO, nfile)
                    if G_verbose >= 4:
                        sys.stdout.write("\nrm %s"%nfile)
                    MyExe = AppliExe('lcg-del')
                    MyExe.setArg(arg)
                    MyExe.noKeepStdFile()
                    task_rmFile.append(newJob(MyExe, 'LOCAL'))

            task_rmFile.timerUpdateStatus("1")
            task_rmDir.timerUpdateStatus("1")

            task_rmFile.submitAndWaitAll(MaxRunning=10)
            for job in task_rmFile._ListJob:
                self._reportJob(job)

            task_rmDir.submitAndWaitAll(MaxRunning=10)
            for job in task_rmDir._ListJob:
                self._reportJob(job)
        if rmCurDir:
            # Remove the directory only if it is now empty.  The original
            # command ended with a stray '"': the shell rejected it, the
            # output was therefore always empty and the directory always
            # looked empty.
            pp = Process('lfc-ls /grid/%s/%s'%(self._NameVO, dir))
            pp.wait(60)
            if not pp.isOk():
                sys.stderr.write(pp.stdErr())
            elif pp.stdOut() == '':
                if G_verbose >= 4:
                    sys.stdout.write("\nrmdir se:"+dir)
                cmd = "lfc-rm -r /grid/%s/%s"%(self._NameVO, dir)
                self._SubmitCmd(cmd, 60)

    def _cp2Locrec(self, srcSE, dest, er, rec=False, nbstream=1, pref=""):
        """Copy srcSE from the SE to local dest, recursively when rec is true.

        srcSE    -- SE file (rec false) or directory (rec true), without tag
        dest     -- local destination
        er       -- regular expression selecting the file names; "" means all
        nbstream -- passed to the apcgrid-cp sub-jobs
        pref     -- prefix added to every local file or directory name

        Without rec, returns the exit value of the single copy.  With rec,
        dest is created, every sub-directory is copied by an apcgrid-cp job
        run immediately, the files by lcg-cp jobs (10 at a time); nothing
        is returned.
        """
        if not rec:
            return self._cp2Loc(srcSE, dest, nbstream)
        self._mkdirLoc(dest)

        if er == "":
            er = ".*"
        pp = Process('lfc-ls -l /grid/%s/%s'%(self._NameVO, srcSE))
        pp.wait(60)
        if not pp.isOk():
            return
        task_File = MultiJobsClass()
        for line in pp.stdOut().split('\n'):
            if line == '':
                break
            if line[0] == 'd':
                subdir = line.split()[8]
                newdirSE = srcSE+'/'+subdir
                newdir = dest+'/'+pref+subdir
                if G_verbose >= 4:
                    sys.stdout.write("\ncp dir "+newdirSE)
                MyExe = AppliExe('apcgrid-cp')
                arg = 'se:%s %s -r -v %d -n %d --ncd'%(newdirSE, newdir, G_verbose, nbstream)
                if er:
                    arg = arg + ' -e "%s"'%er
                MyExe.setArg(arg)
                if G_verbose >= 4:
                    sys.stdout.write("\napcgrid-cp "+arg)
                MyExe.noKeepStdFile()
                job = newJob(MyExe, 'LOCAL')
                job.timerUpdateStatus("1")
                job.submitAndWait()
                self._reportJob(job)
            else:
                nfile = line.split()[8]
                if er and re.match(er, nfile) is None:
                    continue
                pfile = srcSE+'/'+nfile
                if G_verbose >= 4:
                    sys.stdout.write("\ncp %s"%pfile)
                MyExe = AppliExe('lcg-cp')
                fSE = "lfn:/grid/"+self._NameVO+'/'+pfile
                fdest = "file:"+dest+"/"+pref+nfile
                # the -n option is left out: the original notes that it
                # blocks everything
                arg = " --vo %s %s %s"%(self._NameVO, fSE, fdest)
                if G_verbose >= 4:
                    sys.stdout.write("\nlcg-cp "+arg)
                MyExe.setArg(arg)
                MyExe.noKeepStdFile()
                task_File.append(newJob(MyExe, 'LOCAL'))

        task_File.timerUpdateStatus("1")
        task_File.submitAndWaitAll(MaxRunning=10)
        for job in task_File._ListJob:
            self._reportJob(job)

    def _testProxy(self, stderr, dotest):
        """Renew the proxy when stderr mentions a credential problem.

        Returns True only when dotest is true, "credential" appears in
        stderr and _initProxy() succeeded.
        """
        # `in` also matches at position 0, which the former find() > 0 missed
        if dotest and ('credential' in stderr):
            return (_initProxy() == 0)
        return False

    #
    # PUBLIC
    #
    def cp(self, src, dest):
        """Copy one file; the path on the storage element must begin with se:"""
        self._whereSE(src, dest)
        a = self._deltag(src, dest)
        return self._cp[self._SEis](a[0], a[1])

    def cprec(self, src, dest, re, rec, nbstream, check):
        """Copy between the UI and the SE, optionally recursively.

        src, dest -- paths; the one on the storage element begins with se:
        re        -- regular expression selecting the file names, or None
        rec       -- true for a recursive copy
        nbstream  -- number of streams given to the copy commands
        check     -- when true, source and destination are verified first

        Returns 1 when a verification fails, otherwise the result of the
        copy method.
        """
        self._whereSE(src, dest)
        a = self._deltag(src, dest)
        a[0] = _removeFinalSlash(a[0])
        a[1] = _removeFinalSlash(a[1])
        if check:
            if G_verbose >= 10:
                print("check cprec")
            srcOnSE = (self._SEis == "SRC")
            objSrc = self._fsObject(a[0], srcOnSE)
            objDest = self._fsObject(a[1], not srcOnSE)

            if G_verbose >= 10:
                print("objSrc.getName() %s" % (objSrc.getName(),))
                print("objDest.getName() %s" % (objDest.getName(),))
                print("rec %s" % (rec,))
                print("re %s" % (re,))

            # Directory expected to hold dest when dest is not itself a
            # directory.  On the local side an empty directory part means
            # the current directory; on the SE the empty path is the root
            # of the VO and is kept as is.
            base = os.path.dirname(a[1])
            if srcOnSE and base == "":
                base = './'

            if rec or (re is not None):
                # RECURSIVE
                if not objSrc.isDir():
                    sys.stderr.write("\nERROR 1: '%s' must be a directory."%a[0])
                    return 1
                if not objDest.isDir():
                    if not objDest.isNone():
                        sys.stderr.write("\nERROR 2: '%s' must be a directory."%a[1])
                        return 1
                    # dest does not exist: its parent must be a directory
                    objBase = self._fsObject(base, not srcOnSE)
                    if not objBase.isDir():
                        sys.stderr.write("\nERROR 3: '%s' must be a directory."%base)
                        return 1
            else:
                # NON RECURSIVE: src must be an existing non-directory
                if objSrc.isDir():
                    sys.stderr.write("\nERROR: '%s' can't be a directory or used -r option"%a[0])
                    return 1
                if objSrc.isNone():
                    sys.stderr.write("\nERROR: '%s' unknown."%a[0])
                    return 1
                if objDest.isDir():
                    # dest is a directory: append the source file name
                    a[1] = a[1] + '/'+os.path.basename(a[0])
                else:
                    # dest is a file name: its parent must be a directory
                    objBase = self._fsObject(base, not srcOnSE)
                    if G_verbose >= 10:
                        print("objBase is  %s" % (objBase.getName(),))
                    if not objBase.isDir():
                        sys.stderr.write("\nERROR 4: '%s' must be a directory."%base)
                        return 1
        return self._cprec[self._SEis](a[0], a[1], re, rec, nbstream)

    def cpdir(self, src, dest, prefixe=''):
        """Copy the files of a directory (not its sub-directories).

        The directory on the storage element must begin with se:.  prefixe
        is added in front of every destination file name.
        """
        SEis = self._whereSE(src, dest)
        (srcf, destf) = self._deltag(src, dest)

        self._mkdir[SEis](destf)

        # list the files of the source directory; the listing is run once
        # (the original ran the same command a second time)
        myp = Process(self._lscmd[SEis](srcf))
        myp.wait(120)
        if myp.isOk():
            for myfile in myp.stdOut().split():
                self._cp[SEis](srcf+'/'+myfile, destf+'/'+prefixe+myfile)
        else:
            if G_verbose >= 10:
                print("[SEtools.cpdir] Nothing to copy")

    def isDir(self, path):
        """Tell whether path is a directory of the storage element.

        Returns True or False, or None when the listing of the parent
        directory fails.  The empty path (root of the VO) is a directory.
        """
        if path == '':
            return True
        cut = path.rfind('/')
        if cut >= 0:
            path_tmp = path[0:cut]
            name_tmp = path[cut+1:]
        else:
            path_tmp = ''
            name_tmp = self._simpleDelTag(path)
        if G_verbose >= 15:
            print('path_tmp='+path_tmp)
            print('name_tmp='+name_tmp)
        myp = Process("lfc-ls -l /grid/%s/%s | grep ^d| awk '{print $9}'"%(self._NameVO, self._simpleDelTag(path_tmp)))
        myp.wait(30)
        if myp.getExitValue() != 0:
            sys.stderr.write(myp.stdErr())
            return None
        return name_tmp in myp.stdOut().split('\n')

    def ls(self, dir="", retry=False):
        """Print the long listing of an SE directory; return the exit value.

        retry -- when true, a credential error triggers a proxy renewal
                 followed by a single new attempt
        """
        dir = self._simpleDelTag(dir)
        cmd = "lfc-ls -l /grid/%s/%s"%(self._NameVO, dir)
        if G_verbose >= 15:
            print(cmd)
        myp = Process(cmd)
        myp.wait(30)
        if myp.getExitValue() != 0:
            sys.stderr.write("ERROR:\n")
            sys.stderr.write(myp.stdErr())
            if self._testProxy(myp.stdErr(), retry):
                return self.ls(dir, False)
        else:
            sys.stdout.write(myp.stdOut())
        return myp.getExitValue()

    def sizeDir(self, dir):
        """Return the summed size of the non-directory entries of an SE directory.

        Returns -1 when dir is not a directory, when the command fails or
        when its output is not a number; 0 when there is no output.
        """
        if not self.isDir(dir):
            return -1
        myp = Process("lfc-ls -l /grid/%s/%s | grep ^[^d] | awk '{s=s+$5} END {print s}'"%(self._NameVO, self._simpleDelTag(dir)))
        myp.wait(30)
        if myp.getExitValue() != 0:
            sys.stderr.write(myp.stdErr())
            return -1
        if len(myp.stdOut()) == 0:
            return 0
        if G_verbose >= 15:
            print("stdout='"+myp.stdOut()+"'")
        try:
            return int(myp.stdOut())
        except ValueError:
            return -1

    def sizeFile(self, file):
        """Return the size of an SE file, or -1 when it cannot be obtained."""
        if self.isDir(file):
            return -1
        cmd = "lfc-ls -l /grid/%s/%s | grep ^[^d] | awk '{if (NR == 1) print $0}' | awk '{print $5}'"%(self._NameVO, self._simpleDelTag(file))
        myp = Process(cmd)
        myp.wait(30)
        if myp.getExitValue() != 0:
            sys.stderr.write(myp.stdErr())
            return -1
        if len(myp.stdOut()) == 0:
            return -1
        if G_verbose >= 15:
            print(len(myp.stdOut()))
            print("stdout='"+myp.stdOut()+"'")
        try:
            return int(myp.stdOut())
        except ValueError:
            return -1

    def rm(self, file):
        """Delete the SE file `file` (path without tag) and all its replicas."""
        cmd = "lcg-del --vo %s -a lfn:/grid/%s/%s --force"%(self._NameVO, self._NameVO, file)
        self._SubmitCmd(cmd, 60)

    def rmdir(self, dir):
        """Delete the files of the SE directory dir, then the directory."""
        # list the files of the directory
        cmd = self._lsSEfile(dir)
        names = self._SubmitCmd(cmd, 60)[0].split()

        for myfile in names:
            self.rm(dir+'/'+myfile)

        # remove the directory itself
        cmd = "lfc-rm -r /grid/%s/%s"%(self._NameVO, dir)
        self._SubmitCmd(cmd, 60)
1100 1101
1102 -class _CopySEThread(threading.Thread):
1103 MaxThread = 10 1104 listThread = [] 1105 lockStaticVar = threading.Lock() 1106
1107 - def __init__(self, objCopy, pathdir, PathLocWD, prefixe ):
1108 threading.Thread.__init__(self) 1109 self.obj = objCopy 1110 self.pathdir = pathdir 1111 self.PathLocWD = PathLocWD 1112 self.pref = prefixe 1113 while self.TooManyThread(): time.sleep(1) 1114 _CopySEThread.lockStaticVar.acquire() 1115 _CopySEThread.listThread.append(self) 1116 _CopySEThread.lockStaticVar.release()
1117
1118 - def TooManyThread(self):
1119 _CopySEThread.lockStaticVar.acquire() 1120 #print "CopySEThread: ", len(_CopySEThread.listThread) 1121 test = len(_CopySEThread.listThread) > _CopySEThread.MaxThread 1122 _CopySEThread.lockStaticVar.release() 1123 return test
1124
1125 - def run(self):
1126 # copy from SE 1127 self.obj._cp2Locrec(self.pathdir+'/data', self.PathLocWD, "", True, 1, self.pref) 1128 # rm SE tempory directory 1129 mp = Process('apcgrid-rm -r '+self.pathdir) 1130 mp.wait() 1131 # free resource 1132 _CopySEThread.lockStaticVar.acquire() 1133 _CopySEThread.listThread.remove(self) 1134 _CopySEThread.lockStaticVar.release() 1135 # set flag finish 1136 MultiJobsClass.S_EventFinish.set()
1137 1138
class SuperviseProcess(threading.Thread):
    """Thread that blocks on a process and flags when it has returned.

    ``Finish`` stays False until the ``_wait()`` call of the supervised
    object returns.
    """

    def __init__(self, ObjProcess):
        threading.Thread.__init__(self)
        self.Process = ObjProcess
        self.Finish = False

    def run(self):
        # Thread body: block until the process is over, then raise the flag.
        supervised = self.Process
        supervised._wait()
        self.Finish = True
1151 1152
class Process(popen2.Popen3):
    """Shell command whose stdout/stderr are redirected to files.

    When no file name is given for a stream, a random name in the current
    directory is used; the content is then read once into memory and the
    file removed as soon as the process is known to be finished.

    ``_Status`` is 'SubmitRunning' while the command runs, then
    'FinishOK', 'FinishNOK' or 'FinishKill' (time-out in ``wait``).
    """

    def __init__(self, cmd, stdout='' , stderr=''):
        """Start ``cmd``; ``stdout``/``stderr`` name files to keep, '' = temporary."""
        self._cmd = cmd
        # Defaults so that stdOut()/stdErr() never meet a missing attribute.
        self._out = ''
        self._err = ''

        nameAlea = os.getcwd()+"/"+_AddAlea("APCprocess",7)
        if stdout == '':
            self._rmStdOut = True
            self._nameStdOut = nameAlea+".out"
        else:
            self._rmStdOut = False
            self._nameStdOut = stdout

        if stderr == '':
            self._rmStdErr = True
            self._nameStdErr = nameAlea+".err"
        else:
            self._rmStdErr = False
            self._nameStdErr = stderr

        cmd_add = cmd + " 1>%s 2>%s"%(self._nameStdOut, self._nameStdErr)
        # call of the base class constructor
        popen2.Popen3.__init__(self, cmd_add, True)
        self._Status = 'SubmitRunning'
        self._ret = None
        self._exitValue = None
        self._readErrFlag = True
        self._readOutFlag = True

    def _closePipes(self):
        """Close the pipes opened by Popen3 (unused: streams go to files)."""
        for pipe in (self.fromchild, self.tochild, self.childerr):
            if pipe is not None:
                pipe.close()

    def _ExaminePoll(self):
        """Call when process finish: record exit value, status and outputs."""
        if self._exitValue is not None:
            return
        # high byte of the wait status, mapped to a signed value
        self._exitValue = self.poll()>>8
        if self._exitValue > 127:
            self._exitValue -= 256
        if G_verbose>=15:
            print("exit value "+str(self._exitValue))
        if self._exitValue == 0:
            self._Status = "FinishOK"
        else:
            self._Status = "FinishNOK"
        self._retrieveOut()
        self._retrieveErr()
        self._closePipes()

    def _updateStatus(self):
        """update status, no return"""
        if self._Status.find('Finish') >= 0:
            # the process is over, nothing can change
            return
        cmd = 'ps -p %d -o state'%self.pid
        (o,i,e) = popen2.popen3(cmd)
        output = o.read()
        o.close()
        i.close()
        e.close()
        if G_verbose>=15:
            print(cmd+'\n'+output)
        # Keep non blank lines only: header first, then the state if the
        # pid still exists.
        rep = [line.strip() for line in output.split('\n') if line.strip()]
        if len(rep) >= 2 and rep[1] not in ('Z', 'X'):
            # still running
            return
        # zombie/dead state, or the pid does not exist any more
        self._ExaminePoll()

    def isAlive(self):
        """Return True while the command is still running."""
        self._updateStatus()
        return (self._Status == 'SubmitRunning')

    def isFinish(self):
        """Return True once the status is one of the 'Finish...' values."""
        self._updateStatus()
        return (self._Status.find('Finish') >= 0)

    def isOk(self):
        """Return True when the command ended with exit value 0."""
        self._updateStatus()
        return (self._Status == 'FinishOK')

    def _wait(self):
        """Blocking wait of the base class."""
        return popen2.Popen3.wait(self)

    def wait(self,TimeOut=-1):
        """Wait for the end of the command.

        TimeOut -- seconds to wait; -1 waits without limit. When the
        delay expires the process is killed and the status becomes
        'FinishKill' (the exit value stays None).
        """
        if TimeOut == -1:
            # wait no limit
            try:
                self._ret = self._wait()
            except OSError:
                # the process was (most likely) already reaped
                pass
            self._ExaminePoll()
        else:
            # wait with time out and kill if no answer
            t = SuperviseProcess(self)
            t.start()
            t.join(TimeOut)
            if not t.Finish:
                os.system("kill -9 %d"%self.pid)
                self._Status = "FinishKill"
                self._ret = None # return 9
                if G_verbose>=15:
                    print("Time out, kill process")
                # _ExaminePoll is never run for a killed process: release
                # the pipes here so they are not leaked.
                self._closePipes()
            else:
                self._ExaminePoll()
            self._retrieveOut()
            self._retrieveErr()

    def _readFile(self, nameFile):
        """Return the content of ``nameFile``, '' when it can not be opened."""
        try:
            fo = open(nameFile,'r')
        except IOError:
            return ''
        try:
            return fo.read()
        finally:
            fo.close()

    def _removeFile(self, nameFile):
        """Delete a temporary output file, ignoring a missing file."""
        try:
            os.remove(nameFile)
        except OSError:
            pass

    def _retrieveOut(self):
        """Load the temporary stdout file once, then delete it."""
        if self._rmStdOut and self._readOutFlag:
            self._readOutFlag = False
            self._out = self._readFile(self._nameStdOut)
            self._removeFile(self._nameStdOut)

    def _retrieveErr(self):
        """Load the temporary stderr file once, then delete it."""
        if self._rmStdErr and self._readErrFlag:
            self._readErrFlag = False
            self._err = self._readFile(self._nameStdErr)
            self._removeFile(self._nameStdErr)

    def stdOut(self):
        """Return the standard output, or None while the command runs."""
        if not self.isFinish():
            return None
        if self._rmStdOut:
            return self._out
        return self._readFile(self._nameStdOut)

    def stdErr(self):
        """Return the standard error, or None while the command runs."""
        if not self.isFinish():
            return None
        if self._rmStdErr:
            return self._err
        return self._readFile(self._nameStdErr)

    def getExitValue(self):
        """Return the recorded exit value (None if none was recorded)."""
        return self._exitValue

    def getStatus(self):
        """Return the refreshed status string."""
        self._updateStatus()
        return self._Status
1324 1325
class GroupProcess(Process):
    """Process belonging to a group limited to MaxProcess running members.

    When the limit is reached:
    - if ``Wait`` is False the process is not started and its status is
      'FinishSubmitNOK' (exit value -1);
    - otherwise the constructor waits until a member of the group is
      finished before starting the command.
    """
    MaxProcess = 10
    listProcess = []

    def __init__(self, cmd, stdout='' , stderr='', Wait=True):
        if len(GroupProcess.listProcess) >= GroupProcess.MaxProcess:
            if not Wait:
                self._markNotSubmitted(cmd)
                return
            self._waitEndOfOne()
        Process.__init__(self, cmd, stdout, stderr)
        GroupProcess.listProcess.append(self)

    def _markNotSubmitted(self, cmd):
        """Give a rejected instance the attributes read by the accessors."""
        self._cmd = cmd
        self._Status = "FinishSubmitNOK"
        self._exitValue = -1
        self._ret = None
        # Nothing was run: empty in-memory outputs, no file to read.
        self._rmStdOut = True
        self._rmStdErr = True
        self._readOutFlag = False
        self._readErrFlag = False
        self._out = ''
        self._err = ''

    def _waitEndOfOne(self):
        """Poll the group every second until one member is finished."""
        while True:
            # isFinish() may remove the member from listProcess: iterate
            # over a copy.
            for pp in list(GroupProcess.listProcess):
                if pp.isFinish():
                    return
            time.sleep(1)

    def _updateStatus(self):
        """Refresh the status and leave the group once finished."""
        if self._Status != "FinishSubmitNOK":
            Process._updateStatus(self)
        if self._Status.find('Finish') >= 0:
            try:
                GroupProcess.listProcess.remove(self)
            except ValueError:
                # already removed, or never registered
                pass
1361 1362
class _ProcessSubWMS(Process):
    """
    Spread and regulate the gLite submit processes over the WMS available
    for the VO.
    """
    # Static variables
    SlistProcess = []   # references of the submit processes
    Squeue = {}         # {wmsidx: [max, nbProcess]}

    @staticmethod
    def _checkSlotFree(setIdx):
        """Look for a WMS outside ``setIdx`` with a free submit slot.

        Returns [possible, idx]: ``possible`` tells whether at least one
        enabled WMS (max > 0) is outside ``setIdx``; ``idx`` is the least
        loaded such WMS with a free slot, or None when all are full.
        """
        usable = False
        candidates = []
        for idx, slot in _ProcessSubWMS.Squeue.items():
            if idx in setIdx or slot[0] <= 0:
                continue
            # this WMS can be used
            usable = True
            if slot[0] > slot[1]:
                # a slot is free on it
                candidates.append((slot[1], idx))
        if candidates == []:
            return [usable, None]
        least_loaded = min(candidates, key=operator.itemgetter(0))
        return [True, least_loaded[1]]

    def __init__(self, cmd, queueWMS, stdout='', stderr=''):
        if queueWMS not in _ProcessSubWMS.Squeue:
            sys.stderr.write("\nERROR:\n ProcessSubWMS unknow queueWMS %d\n"%queueWMS)
            return
        self._queueWMS = queueWMS
        # one more process in queue 'queueWMS'
        _ProcessSubWMS.Squeue[queueWMS][1] += 1
        # keep a reference in the general list
        _ProcessSubWMS.SlistProcess.append(self)
        Process.__init__(self, cmd, stdout, stderr)

    def _updateStatus(self):
        # base class update first
        Process._updateStatus(self)
        if self._Status.find('Finish') < 0:
            return
        if self in _ProcessSubWMS.SlistProcess:
            _ProcessSubWMS.Squeue[self._queueWMS][1] -= 1
            _ProcessSubWMS.SlistProcess.remove(self)


###############################################################
#
# Scheduler Interface
#
###############################################################
1411 1412 1413 ############################################################### 1414 # 1415 # Scheduler Interface 1416 # 1417 ############################################################### 1418
class SchedulerAbstract:
    """Base interface of the schedulers; most methods are empty hooks."""

    def __init__(self):
        self._TypeSched =''
        self._delegateProxy = "No"

    def cancel(self, Appli):
        """cancel Application"""
        pass

    def submit(self, Appli):
        """submit Appli"""
        pass

    def delegateProxy(self):
        """Ask for a proxy delegation (state "ToDo")."""
        self._delegateProxy = "ToDo"

    def status(self, Appli):
        """Update status attribut of Appli object"""
        # Only test time out
        if Appli._Status.find('SubmitScheduled') == 0:
            TimeToSubmit = time.time() - Appli._TimeStart
            Grace = Appli._TimeOutToStart - TimeToSubmit
            if G_verbose>5:
                print('\nTimeToSubmit= %s Grace=  %s' % (TimeToSubmit, Grace))
            if Grace < 0:
                print('Time Out !!!!!!')
                self.cancel(Appli)
                if not self._ReSubmit(Appli):
                    Appli._Status = 'FinishTimeOut'

        if Appli._Status.find('Finish') >= 0:
            try:
                del JobClass.dictJobSubmit[Appli._APCSchedID]
            except KeyError:
                # already removed from the submitted jobs
                pass
            if (Appli._Status.find('FinishOK') >= 0 or Appli._Status.find('FinishNOK') >= 0):
                self.retrieveOutput(Appli)
            # tested again: retrieveOutput may have changed the status
            if Appli._Status.find('FinishOK') >= 0:
                self._cleaner(Appli)
                # NOTE(review): the flattened listing does not show whether
                # Appli._Cleaner() is also meant to run for non-OK final
                # states -- confirm against the original file.
                Appli._Cleaner()

    def retrieveOutput(self, Appli):
        """Retrieve Appli outputs in work directory"""
        pass

    def wait(self, Appli):
        """wait the end of Appli"""
        if Appli._Status.find('Submit') != 0:
            return
        self.status(Appli)
        while Appli._Status.find('Submit') == 0:
            time.sleep(Appli._Timer)
            self.status(Appli)

    def _ReSubmit(self, Appli):
        """Flag Appli as resubmitted; return False (nothing is resubmitted here)."""
        Appli._FlagResubmit = True
        # Callers test the result: make the "not resubmitted" answer explicit.
        return False

    def _cleaner(self, Appli):
        """scheduler cleaner after end of job """
        pass

    def _resultListJob(self, listJob):
        return ""

    def stdOut(self, Appli):
        """Return Appli standard output, from its file when it is kept."""
        if Appli._keepStdFile:
            return _readFile(Appli.getPathFile('STDOUT'))
        else:
            return Appli._stdOut

    def stdErr(self, Appli):
        """Return Appli standard error, from its file when it is kept."""
        if Appli._keepStdFile:
            return _readFile(Appli.getPathFile('STDERR'))
        else:
            return Appli._stdErr


##################################################
#
# GENERIC GRID
#
1497 1498 1499 1500 1501 ################################################## 1502 # 1503 # GENERIC GRID 1504 # 1505 1506
class SchedulerGrid(SchedulerAbstract):
    """Generic grid scheduler: JDL/configuration files, submit, status, retrieve.

    The command lines themselves come from hooks (``submitCmd``,
    ``statusCmd``, ``retrieveCmd``, ``_CancelCmd``, ...) implemented by
    the sub-classes.
    """
    # Class (static) variables
    DoProxyTest = True
    S_DelegateProxy = False

    def __init__(self):
        SchedulerAbstract.__init__(self)
        self._FileConf = ''
        self._FileJDL = ''
        self._pFile = ''
        self._Req = 'Requirements = other.GlueCEStateStatus == "Production" && ( ! ( RegExp(".*node16.*",other.CEId) ) );'
        self._LoadBalCE = None
        self._defCE = None

    # Hooks. The optional parameters match the way the methods are
    # called in this class.
    def _FillJDL(self, Appli=None):
        pass

    def _FillConf(self, wms=None):
        pass

    def submitCmd(self, FileID):
        pass

    def statusCmd(self, FileID):
        pass

    def cancel(self, Appli):
        """Run the cancel command of Appli (30 s limit) and clean Appli."""
        cmd = self._CancelCmd(Appli)
        if G_verbose>=10:
            print("cancel job: "+cmd)
        pp = Process(cmd)
        Appli._Cleaner()
        pp.wait(30)
        if not pp.isOk():
            # both values go through one formatting tuple
            sys.stderr.write("\nERROR:\n commend '%s' is %s\n" % (pp._cmd, pp.getStatus()))
            sys.stderr.write(pp.stdErr())
        else:
            sys.stdout.write(pp.stdOut())

    def _CancelCmd(self, Appli):
        pass

    def retrieveCmd(self, FileID, Appli, outDir=None):
        pass

    def doFileConf(self, wms):
        """Write the configuration file ``self._FileConf`` for ``wms``."""
        self._pFile = open(self._FileConf, 'w+')
        try:
            self._FillConf(wms)
        finally:
            self._pFile.close()

    def doFileJDL(self, Appli):
        """Write the JDL file ``self._FileJDL`` and set ``Appli._ce``."""
        # 1- Application part
        jdl = Appli._FillFileJDL()

        # 2- job part
        jdl += 'StdOutput = "%s";\n'%Appli.getNameStdOut()
        jdl += 'StdError = "%s";\n'%Appli.getNameStdErr()

        sandbox = 'OutputSandbox = {"%s","%s"'%(Appli.getNameStdOut(),Appli.getNameStdErr())
        for elt in Appli._ListOutput:
            if elt.find("lfn:/grid") == -1:
                # SE files are not put in the OutputSandbox
                sandbox += ',"%s"'%elt
        sandbox += '};\n'
        jdl += sandbox

        # 3- scheduler part
        self._pFile = open(self._FileJDL, 'w+')
        try:
            self._pFile.write('%s\n' % jdl)
            Appli._ce = self._FillJDL(Appli)
        finally:
            self._pFile.close()

    def checkProxy(self):
        """Check the remaining proxy life time and ask to renew it if short.

        ``DoProxyTest`` is cleared when the proxy is long enough or when
        no answer can be read; after ``_initProxy`` it is left set so that
        the caller checks again.
        """
        cmd = 'glite-voms-proxy-info'
        myp = Process(cmd)
        myp.wait(5)
        output = myp.stdOut()
        err = myp.stdErr()
        if G_verbose>5:
            print(cmd)
        if G_verbose>9:
            print(output)
            print(err)
        minHourProxyTimeOut = 48
        initProxy = False
        if myp.isOk():
            timeleft_hhmmss = _GetWordsAfter(output,'timeleft :')
            timeleft_ss = _ConvHHMMSSInSec(timeleft_hhmmss)
            if G_verbose>=5:
                print('proxy timeleft:  %s %s' % (timeleft_hhmmss, timeleft_ss))
            if int(timeleft_ss) < 60*60*minHourProxyTimeOut:
                sys.stdout.write("\nProxy valid but time life proxy too short < %d hours."%minHourProxyTimeOut)
                initProxy = True
            else:
                if G_verbose>=2:
                    print('Proxy OK')
                SchedulerGrid.DoProxyTest = False
        else:
            initProxy = True

        if not initProxy:
            return

        nb_hours = 3*24
        try:
            # NOTE(review): Python 2 input() evaluates what is typed; an
            # empty answer raises and is handled below as "pass".
            hours = input("\nProxy initialisation, enter duration in hours and I add %d hours or [Enter] to pass: "%minHourProxyTimeOut)
            try:
                nb_hours = max(minHourProxyTimeOut+hours, minHourProxyTimeOut)
            except TypeError:
                sys.stderr.write("\n%s is not a number, fix duration for 3 days\n" % (hours,))
                nb_hours = 3*24
        except Exception:
            # If the duration can not be read, the proxy pass phrase can
            # not be read either: go on without proxy initialisation.
            sys.stderr.write("\nCan't initialisation proxy no interactive mode !\n")
            SchedulerGrid.DoProxyTest = False
            return
        # The result of _initProxy is checked by calling checkProxy() again.
        _initProxy(nb_hours)

    def loadBalancingCE(self, vers, query):
        pass

    def _initDelegateProxy(self):
        """Run ``myproxy-init -d -n`` and return its status."""
        ret = os.system('myproxy-init -d -n')
        time.sleep(2)
        if G_verbose >=10:
            print("status myproxy-init is  %s" % (ret,))
        return ret

    def submit(self, Appli):
        """Write the job files and start the submit process of Appli.

        On return ``Appli._Status`` is 'SubmitQuery', or 'FinishSubmitNOK'
        when no WMS is left for this job.
        """
        if self._delegateProxy == "ToDo":
            if not SchedulerGrid.S_DelegateProxy:
                self._initDelegateProxy()
            self._delegateProxy = "Yes"

        prefixe = Appli._PathLocWD + Appli._APCSchedID
        self._FileJDL = prefixe+'.jdl'
        self._FileConf= prefixe+'.conf'
        self.doFileJDL(Appli)
        Appli._wmsIdx = self._GetWMS(Appli._wmsUsed)
        if Appli._wmsIdx is None:
            # no WMS available
            Appli._Status = 'FinishSubmitNOK'
            if len(Appli._wmsUsed)==0:
                sys.stderr.write("\nERROR:\n _GetWMS any wms available\n")
            # NOTE(review): the flattened listing does not show whether this
            # removal belongs to the "no WMS ever tried" case only -- confirm.
            del JobClass.dictJobSubmit[Appli._APCSchedID]
            return
        Appli._wmsUsed=Appli._wmsUsed.union(set([Appli._wmsIdx]))
        self.doFileConf(Appli._wmsIdx)
        cmd = self.submitCmd(Appli)
        Appli._ProcessSubmit = _ProcessSubWMS(cmd, Appli._wmsIdx)
        Appli._Status = "SubmitQuery"
        if G_verbose >=2:
            mes='\nTry submit ' +Appli._APCSchedID+ ' on wms '+self._shortWMSname(Appli)
            sys.stdout.write(mes)

    def _AfterSubmit(self, Appli):
        """Read the result of the finished submit process of Appli."""
        output = Appli._ProcessSubmit.stdOut()
        error_sub = Appli._ProcessSubmit.stdErr()
        if G_verbose>=15:
            print('submit output:'+output+'\nsubmit erreur:'+error_sub)

        if output.find('successfully submitted') >= 0:
            Appli._Status = 'SubmitScheduled'
            Appli._TimeStart = time.time()
            if G_verbose>=1:
                mes='\nSubmit ' +Appli._APCSchedID+ ' on wms '+self._shortWMSname(Appli)+' is ok.'
                mes += "Used CE "+Appli._ce
                sys.stdout.write(mes)
            # extract ID grid: second line of the ID file
            pID = open(Appli._FileID, 'r')
            try:
                content = pID.read()
            finally:
                pID.close()
            Appli._IDgrid = content.split('\n')[1]
            if G_verbose>=15:
                print('ID grid: '+Appli._IDgrid)
        else:
            if G_verbose>=10:
                print('submit NOK\nsubmit output:'+output+'\nsubmit erreur:'+error_sub)
            if G_verbose>=1:
                print('\nSubmit ' +Appli._APCSchedID+ ' is NOK on wms %s' % self._shortWMSname(Appli))
            _writeFile(Appli.getPathFile('SUBERR'), error_sub)
            self._ReSubmit(Appli)

    def status(self, Appli):
        """Update ``Appli._Status`` from the submit process or the grid status."""
        if Appli._Status.find('Submit') != 0:
            if G_verbose>10:
                print("status can't change !!!")
            return

        if Appli._Status == "SubmitQuery":
            if not Appli._ProcessSubmit.isFinish():
                return
            self._AfterSubmit(Appli)
            if Appli._Status == "SubmitQuery":
                # resubmitted: a new submit process is running
                return
            Appli._ProcessSubmit = None
            # When the job can not be resubmitted any more it is
            # declared finished.
            if Appli.isFinish():
                return

        if Appli._Status == "SubmitRetrieveSE":
            if not Appli._threadCopy.isAlive():
                if Appli._StatusAppli==0:
                    Appli._Status = "FinishOK"
                else:
                    Appli._Status = "FinishNOK"
                if G_verbose>=1:
                    mes='\n'+Appli._APCSchedID +' retrieve finish.'
                    sys.stdout.write(mes)
            return

        if os.path.isfile(Appli._FileID):
            cmd = self.statusCmd(Appli._FileID)
            myp = Process(cmd)
            myp.wait()
            output = myp.stdOut()

            if G_verbose>9:
                print(output)
            status = _GetWordsAfter(output,'Current Status:')
            if status is None:
                Appli._Status = 'FinishNOKLostStatus'
            else:
                if G_verbose>=1:
                    mes='\n'+Appli._APCSchedID +' status is ' + str(status)
                    if G_verbose>=2:
                        mes+='.\t[WMS: %s\tCE: %s]'%(self._shortWMSname(Appli).split('.')[0], Appli._ce)
                    sys.stdout.write(mes)
                if status.find('Running') >= 0:
                    Appli._Status = 'SubmitRunning'
                elif status.find('Done') >= 0:
                    # no readable exit code is counted as a success
                    Appli._Status = 'FinishOK'
                    exitcode = _GetWordsAfter(output,'Exit code:')
                    if exitcode is not None:
                        try:
                            Appli._StatusAppli = int(exitcode)
                        except (TypeError, ValueError):
                            pass
                        else:
                            if Appli._StatusAppli != 0:
                                Appli._Status = 'FinishNOK'
                elif status.find('Cleared') >= 0:
                    Appli._Status = 'FinishUnknow'
                elif status.find('Cancelled') >= 0:
                    Appli._Status = 'FinishCancelled'
                elif status.find('Aborted') >= 0:
                    if not self._ReSubmit(Appli):
                        Appli._Status = 'FinishAborted'
                        # NOTE(review): nesting assumed, the flattened
                        # listing does not show it -- confirm.
                        _writeFile(Appli.getPathFile('STDERR'), output)
        else:
            print("FileID doesn't exist %s" % (Appli._FileID,))
            # temporary
            Appli._Status = 'FinishLostIDgrid'

        # manage time out and call retrieve output
        SchedulerAbstract.status(self,Appli)
        if Appli._threadCopy is not None:
            Appli._Status = "SubmitRetrieveSE"

    def retrieveOutput(self, Appli):
        """Fetch the output sandbox of Appli and move the files in place.

        Sets ``Appli._Status`` to 'FinishNOKretrievePB' when the sandbox
        could not be retrieved.
        """
        cmd, outpath = self.retrieveCmd(Appli._FileID, Appli)
        if G_verbose>9:
            print("%s %s" % (cmd, outpath))
        myp = Process(cmd)
        myp.wait(360)
        if G_verbose>9:
            print(myp.stdOut())
            print(myp.stdErr())
        # use the output directory given in the output message
        retrieveStatus = False
        if os.path.isdir(outpath) and myp.isOk():
            outpath_glite = _GetWordsAfter(myp.stdOut(),'stored in the directory:')
            if outpath_glite is not None:
                retrieveStatus = True
                # stdout
                os.system('mv %s/%s %s'%(outpath_glite, Appli.getNameStdOut(), Appli._PathLocWD))
                # stderr
                os.system('mv %s/%s %s'%(outpath_glite, Appli.getNameStdErr(), Appli._PathLocWD))

                # output sandbox
                for i, name in enumerate(Appli._ListOutput):
                    RetFile = "%s/%s"%(outpath_glite, name)
                    if os.path.isfile(RetFile):
                        os.system('mv %s %s'%(RetFile, Appli.getPathFile("OUT",i)))
                    else:
                        print("Can't retrieve %s , doesn't exist!!"%RetFile)

                # remove directory
                os.system('rm -rf '+ outpath)

                # output SE
                for i, lfn in enumerate(Appli._ListOutputSE):
                    cp_cmd = 'lcg-cp --vo %s %s file:%s'%(MyConf().gLite.vo, lfn, Appli.getPathFile("OUT_SE",i))
                    if G_verbose > 5:
                        print(cp_cmd)
                    os.system(cp_cmd)

        if not retrieveStatus:
            print("============retrieveOutput : pb")
            print(myp.stdOut())
            print(myp.stdErr())
            print("\n%s\n%s\n%s "%(outpath, cmd, myp._Status))
            print("===============================")
            Appli._Status = 'FinishNOKretrievePB'
            return

        Appli._RetrieveFromSE()

    def _ReSubmit(self, Appli):
        """Submit Appli again; return False when no submission was possible."""
        SchedulerAbstract._ReSubmit(self, Appli)
        if G_verbose>8:
            print("Try _ReSubmit")
        cmd = 'rm -rf %s'%Appli._FileID
        os.system(cmd)
        Appli._Status='NotSubmit'
        self.submit(Appli)
        return Appli._Status != 'FinishSubmitNOK'

    def setCE(self, CEName):
        """Force the CE whose name starts with ``CEName``; exit on failure."""
        if self._defCE is not None:
            sys.stderr.write("\nWARNING: CE already definied, conflict between load balancing and setCE ?")
            return
        cmd = "lcg-infosites --vo %s ce | grep %s"%(MyConf().gLite.vo,CEName )
        myp = Process(cmd)
        myp.wait(100)
        if not myp.isOk():
            print("\nout:\n"+myp.stdOut())
            print("\nerr:\n"+myp.stdErr())
            sys.stderr.write("ERROR with command "+cmd)
            sys.stderr.write(myp.stdErr())
            sys.exit(1)
        out = myp.stdOut()
        if out == "":
            sys.stderr.write("\nERROR: unknown CE for your VO\nMy check procedure:\n %s "%cmd)
            sys.exit(1)
        # only the first matching line is used; the CE is its 6th column
        splitout = out.split('\n')[0].split()
        if len(splitout) < 6:
            sys.stderr.write("\nERROR: invalid format lcg-infosites --vo xx ce, wait for 6 columns:\n%s"%out)
            sys.exit(1)
        if splitout[5].find(CEName) != 0:
            sys.stderr.write("\nERROR: I don't exactly find your CE '%s' in \n%s\nresult:\n%s "%(CEName, cmd, out))
            sys.exit(1)
        self._Req = 'Requirements = other.GlueCEUniqueID == "%s";'%splitout[5]
        self._defCE = splitout[5]

    def setLocalCE(self):
        """Force the CE named in the configuration."""
        return self.setCE(MyConf().gLite.ce)

    def _cleaner(self, Appli):
        """scheduler cleaner after end of job """
        if Appli.isOk():
            prefix = os.path.join(Appli._PathLocWD,Appli._APCSchedID)
            os.system("rm -rf "+prefix+".jdl")
            os.system("rm -rf "+prefix+".conf")

    def _resultListJob(self, listJob):
        """Return a text summary produced by ``_resultCmd`` for listJob."""
        # file gathering every job ID
        if listJob._name =="":
            filetemp = listJob._ListJob[0].addFullPath(_AddAlea("",6))
        else:
            filetemp = listJob._ListJob[0].addFullPath(_AddAlea(listJob._name.replace(' ','_') ,6))
        filetemp = filetemp+'.ids'
        listJob.concatId(filetemp)

        cmd = self._resultCmd(filetemp)
        lce=Process(cmd)
        lce.wait(60)
        if not lce.isOk():
            sys.stdout.write(lce.stdOut())
            sys.stderr.write("ERROR: "+lce.stdErr())
        header = "\nCompute Element used:\n---------------------\n"
        return header+lce.stdOut()


# GLITE
#
1895 1896 1897 # GLITE 1898 # 1899
class SchedulerGLITE(SchedulerGrid):
    """gLite/WMS scheduler: WMS list handling and glite-wms-job-* commands."""
    ListWMS = []

    def __init__(self):
        SchedulerGrid.__init__(self)
        if not os.path.isfile(MyConf().gLite.file):
            sys.stderr.write("ERROR: file glite '%s' configuration doesn't exist.\nDo apcgrid-init to define it !!\n"%MyConf().gLite.file)
            sys.exit(1)
        self._VO= 'VirtualOrganisation = "%s";'%MyConf().gLite.vo
        self._MyProxy= 'MyProxyServer = "%s";'%os.getenv('MYPROXY_SERVER')
        self._Rank = 'Rank = ( other.GlueCEStateWaitingJobs == 0 ) ? ((other.GlueCEStateFreeCPUs==0)?-2222:other.GlueCEStateFreeCPUs) : - other.GlueCEStateWaitingJobs * 10 / (other.GlueCEStateRunningJobs + 1) * ( (other.GlueCEStateFreeCPUs == 0)?500:1 ) ;'
        self._ForceWMS = -1
        self._TypeSched = 'GLITE'
        self._checkProxy()
        self._InitListWMS()

    def _checkProxy(self):
        """Check the proxy (twice at most); exit when it is still not valid."""
        if SchedulerGrid.DoProxyTest:
            self.checkProxy()
        if SchedulerGrid.DoProxyTest:
            self.checkProxy()
        if SchedulerGrid.DoProxyTest:
            print("ERROR: Proxy Failed")
            sys.exit(1)

    def _lookupWMS(self, ask_wms):
        """Return the index of the first WMS containing ``ask_wms``; exit if none."""
        for idx, wms in enumerate(SchedulerGLITE.ListWMS):
            if wms.find(ask_wms) >= 0:
                return idx
        sys.stderr.write("\nERROR: your WMS %s isn't available for this VO %s"%(ask_wms, MyConf().gLite.vo))
        sys.exit(1)

    def setWMS(self, listWMSName):
        """Enable only the WMS named in ``listWMSName`` (2 submits each)."""
        for idx in range(len(SchedulerGLITE.ListWMS)):
            # every WMS disabled: max 0, 0 process in progress
            _ProcessSubWMS.Squeue[idx] = [0, 0]
        for ask_wms in listWMSName:
            _ProcessSubWMS.Squeue[self._lookupWMS(ask_wms)] = [2, 0]

    def excludeWMS(self, listWMS):
        """Disable the WMS named in ``listWMS``."""
        for ask_wms in listWMS:
            _ProcessSubWMS.Squeue[self._lookupWMS(ask_wms)] = [0, 0]

    def _InitListWMS(self):
        """Private: Initialisation WMS List"""
        if len(SchedulerGLITE.ListWMS) > 0:
            return

        cmd = "lcg-infosites --vo %s wms"%MyConf().gLite.vo
        if G_verbose>=5:
            print(cmd)
        MyProcess = Process(cmd)
        MyProcess.wait()
        output = MyProcess.stdOut()
        erreur = MyProcess.stdErr()
        RetCode = MyProcess.getExitValue()
        if G_verbose>=5:
            print("output: \n"+ output)
            print("erreur: \n"+ erreur)
            print("Code de retour: " + str(RetCode))

        if RetCode != 0:
            sys.stderr.write("ERROR get WMS list: "+cmd)
            sys.stderr.write(erreur)
            sys.exit(1)
        # drop every blank line, whatever their number
        List = [line for line in output.split('\n') if line.strip()]
        if len(List) == 0:
            sys.stderr.write("ERROR: Any WMS avalaible for VO %s !"%MyConf().gLite.vo)
            sys.exit(1)

        SchedulerGLITE.ListWMS = _ListUnique(List)
        if G_verbose>=5:
            for wms in SchedulerGLITE.ListWMS:
                sys.stdout.write("\n"+wms)

        # One queue per WMS in _ProcessSubWMS: at most 2 submit processes
        # on the same WMS, 0 in progress.
        for idx in range(len(SchedulerGLITE.ListWMS)):
            _ProcessSubWMS.Squeue[idx] = [2, 0]

    def _RemoveWMS(self, wmsIdx):
        # max number of submit processes set to 0
        _ProcessSubWMS.Squeue[wmsIdx][0] = 0

    def _GetWMS(self, setIdx):
        """Return the index of a free WMS outside ``setIdx``, None if none is usable.

        Waits (0.5 s polling) while every usable WMS is full.
        """
        if len(SchedulerGLITE.ListWMS) == 0:
            sys.stderr.write("\nERROR:any WMS available\n" )
            return None
        res = _ProcessSubWMS._checkSlotFree(setIdx)
        if not res[0]:
            return None
        # wait a free WMS
        while res[1] is None:
            time.sleep(0.5)
            # refreshing the status frees the slots of finished submits
            for pp in list(_ProcessSubWMS.SlistProcess):
                pp.isFinish()
            res = _ProcessSubWMS._checkSlotFree(setIdx)
        if G_verbose>=5:
            sys.stderr.write("\nfree WMS :"+ SchedulerGLITE.ListWMS[res[1]])
            sys.stdout.write("\nnb process submit :"+ str(len(_ProcessSubWMS.SlistProcess)))
        return res[1]

    def _NameWMS(self, Appli):
        return SchedulerGLITE.ListWMS[Appli._wmsIdx]

    def _splitWMS(self, wms):
        """Return the host part of a WMS URL (text between '//' and ':' or '/')."""
        return wms.split('/')[2].split(':')[0]

    def _shortWMSname(self, Appli):
        return self._splitWMS(self._NameWMS(Appli))

    def initWithRB(self, wmsIdx):
        """Build the NS/LB/WMProxy configuration lines for WMS ``wmsIdx``."""
        wms = SchedulerGLITE.ListWMS[wmsIdx]
        host = self._splitWMS(wms)
        self._NSAd= 'NSAddresses = "%s:7772";'%host
        self._LBAd= 'LBAddresses = "%s:9000";'%host
        self._WMProxy= 'WMProxyEndpoints = {"%s"};'%wms

    def _FillConf(self,wms):
        """Write the configuration lines in the opened file ``self._pFile``."""
        self.initWithRB(wms)
        for line in (self._VO, self._NSAd, self._LBAd, self._WMProxy):
            self._pFile.write('%s\n' % line)

    def loadBalancingCE(self, vers="LoadBalCE_v2", query=""):
        """Enable CE load balancing with the class named ``vers``."""
        if self._defCE is not None:
            sys.stderr.write("\nWARNING: CE already definied, conflict between load balancing and setCE ?")
            return
        # marker: load balancing is in charge of the CE choice
        self._defCE = True
        try:
            lb_class = globals()[vers]
        except KeyError:
            sys.stderr.write("ERROR: unknow class "+vers)
            sys.exit(1)

        if query != "":
            self._LoadBalCE = lb_class(query)
        else:
            self._LoadBalCE = lb_class()

    def _FillJDL(self,Appli):
        """Write the scheduler part of the JDL; return the CE name for display."""
        if self._delegateProxy == "Yes":
            self._pFile.write('%s\n' % self._MyProxy)
        self._pFile.write('%s\n' % self._Rank)

        # _defCE is either a CE name (setCE) or the marker True set by
        # loadBalancingCE(); only a name can be written in the JDL.
        fixedCE = self._defCE
        if fixedCE is True:
            fixedCE = None

        if Appli._MPIcpu != []:
            if fixedCE is not None:
                requirmt = 'Requirements = other.GlueCEUniqueID == "%s";' %fixedCE
                ce_out = fixedCE
            else:
                requirmt = 'Requirements = Member("MPI-START", other.GlueHostApplicationSoftwareRunTimeEnvironment)'
                requirmt += '&& Member("OPENMPI", other.GlueHostApplicationSoftwareRunTimeEnvironment)'
                requirmt += '&& ( other.GlueCEStateStatus == "Production" ) '
                requirmt += '&& ( other.GlueCEInfoTotalCPUs >= %d);'%Appli._MPIcpuTotal
                ce_out = "unknow"
        else:
            requirmt = self._Req
            ce_out = "unknow"
            if self._LoadBalCE is not None:
                ce = self._LoadBalCE.choiceCE()
                # test before any use: choiceCE may give no CE
                if ce is not None:
                    ce_out = ce.split(".")[0]
                    if G_verbose>=5:
                        sys.stdout.write("\nSelect CE: %s"%ce)
                    requirmt='Requirements = other.GlueCEUniqueID == "%s";'%ce
            elif fixedCE:
                ce_out = fixedCE.split(".")[0]
        self._pFile.write('%s\n' % requirmt)
        return ce_out

    def submitCmd(self, Appli):
        """Return the submit command, delegating the proxy first if asked."""
        if self._delegateProxy == "Yes":
            dpjp = Process('glite-wms-job-delegate-proxy -a -e %s --noint'%self._NameWMS(Appli))
            dpjp.wait(60)
            if not dpjp.isOk():
                sys.stderr.write("\nERROR:\n commend '%s' NOK\n"%dpjp._cmd)
                sys.stderr.write(dpjp.stdErr())
            if G_verbose >= 10:
                sys.stdout.write(dpjp.stdOut())
        cmd = 'glite-wms-job-submit --noint --config %s -o %s -a %s'%(self._FileConf, Appli._FileID, self._FileJDL)
        return cmd

    def statusCmd(self, FileID):
        cmd = 'unset PYTHONHOME;glite-wms-job-status -i '+FileID + ' --noint --verbosity 1'
        if G_verbose>=10:
            print(cmd)
        return cmd

    def retrieveCmd(self, FileID, Appli):
        """Return (command, output directory) to fetch the output sandbox."""
        outDir= MyConf().info.workDir+Appli._APCSchedID
        cmd = 'glite-wms-job-output -i '+FileID
        cmd += ' --dir '+outDir+ ' --noint'
        return cmd, outDir

    def _CancelCmd(self, Appli):
        cmd = "glite-wms-job-cancel -i %s --noint"%Appli.getPathFile('ID')
        return cmd

    def _resultCmd(self, fileID):
        cmd='echo a | glite-wms-job-status -i %s --verbosity 2 | awk -F: \'{if(/Current/){s=$NF;n++;} if(/Destination/) {d[$2" "s]++;} } END {for ( x in d ){print x": "d[x]" / "n;} }\' | sort'%fileID
        return cmd

#
# Compute Element Load Balancing
#
2117 2118 # 2119 # Compute Element Load Balancing 2120 # 2121
class LoadBalancingCE:
    """Base class of the CE load balancing algorithms.

    Sub-classes implement ``_update`` (fill ``listCE``) and ``__str__``,
    and extend ``choiceCE``.
    """
    # minutes between two refreshes of the CE list
    RefreshMinutes = 10

    def __init__(self):
        self.listCE = []
        self._update()
        self._lastUpdate = time.time()

    def __str__(self):
        raise NotImplementedError("__str__ must be defined by the sub-class")

    def _update(self):
        raise NotImplementedError("_update must be defined by the sub-class")

    def choiceCE(self):
        """Refresh the CE list when it is older than RefreshMinutes.

        Returns None; the choice itself is made by the sub-classes.
        """
        if (time.time() - self._lastUpdate)/60 > self.RefreshMinutes:
            self._lastUpdate = time.time()
            self._update()
2135 2136
class LoadBalCE_v1(LoadBalancingCE):
    """CE load-balancing algorithm by Tristan Beau.

    self.listCE holds [CE name, usable free slots] pairs sorted by
    decreasing number of slots. choiceCE() consumes the slots of one CE
    before moving on to the next one.
    """

    def __init__(self, query=""):
        """query: lcg-info selection string; a default one is used when empty."""
        if query == "":
            self.query = "CEStatus=Production,PlatformArch=x86_64,EstRespTime=0"
        else:
            self.query = query
        LoadBalancingCE.__init__(self)
        self.ce_idx = 0
        self.dec = 1

    def __str__(self):
        rows = ["\nCEname: FreeSlot"]
        for ce in self.listCE:
            rows.append('\n%s : \t%d' % (ce[0], ce[1]))
        return "".join(rows)

    def _update(self):
        """Query lcg-info and rebuild self.listCE, sorted by free slots."""
        self.listCE = []
        # The list is rebuilt from scratch: restart from its head, otherwise
        # a stale index skips the best CEs or points past a shorter list.
        self.ce_idx = 0
        lce = Process("lcg-info --list-ce --vo %s --attrs WaitingJobs,FreeJobSlots,TotalCPUs --query %s --sed" % (MyConf().gLite.vo, self.query))
        lce.wait(60)
        if not lce.isOk():
            sys.stdout.write(lce.stdOut())
            sys.stderr.write("ERROR ERROR ERROR "+lce.stdErr())

        for line in lce.stdOut().split('\n'):
            fields = line.split('%')
            if len(fields) != 4:
                # first line that is not 'name%waiting%free%total' ends parsing
                break
            # only 90% of the advertised free slots are considered usable
            self.listCE.append([fields[0], int(round(0.9*int(fields[2])))])

        self.listCE.sort(key=operator.itemgetter(1), reverse=True)
        if G_verbose >= 5:
            sys.stdout.write("\nCE selection:\n")
            print(self)

    def choiceCE(self):
        """Return the next CE to use, or the configured default CE.

        The default CE (MyConf().gLite.ce) is returned when no CE is known,
        when every listed CE has been used up, or when the current CE has
        no free slot.
        """
        LoadBalancingCE.choiceCE(self)
        # Guard against an empty list and against an index that moved past
        # the last CE (the original code raised IndexError in both cases).
        if self.ce_idx >= len(self.listCE):
            return MyConf().gLite.ce
        entry = self.listCE[self.ce_idx]
        if entry[1] == 0:
            return MyConf().gLite.ce
        ce = entry[0]
        entry[1] -= self.dec
        if entry[1] <= 0:
            self.ce_idx += 1
        return ce
2188 2189
class LoadBalCE_v2(LoadBalCE_v1):
    """Version 1, but re-sorts the table when it is no longer ordered."""

    def __init__(self, query=""):
        LoadBalCE_v1.__init__(self, query)

    def choiceCE(self):
        """Return the CE with the most free slots, or None when none is free.

        A slot is taken from the head entry (unless it is the only CE) and
        the list is re-sorted as soon as the head falls below the second
        entry.
        """
        LoadBalancingCE.choiceCE(self)
        table = self.listCE
        if len(table) == 0:
            return None
        head = table[0]
        if head[1] == 0:
            return None
        if len(table) == 1:
            # a single CE: its slot counter is left untouched
            return head[0]
        chosen = head[0]
        head[1] -= self.dec
        if head[1] < table[1][1]:
            table.sort(key=operator.itemgetter(1), reverse=True)
        return chosen
2210 2211
class LoadBalCE_v3(LoadBalCE_v1):
    """Random distribution over the free slots of the selected CEs."""

    def __init__(self, query=""):
        LoadBalCE_v1.__init__(self, query)

    def _update(self):
        """Rebuild the shuffled list holding one entry per free slot."""
        LoadBalCE_v1._update(self)
        self.ces = []
        for entry in self.listCE:
            # one occurrence of the CE name per usable free slot
            self.ces.extend([entry[0]] * entry[1])
        rd.shuffle(self.ces)
        self.nb_ces = len(self.ces)
        self.i_ces = 0

    def choiceCE(self):
        """Return the next shuffled slot's CE, or the default CE when exhausted."""
        LoadBalancingCE.choiceCE(self)
        if self.i_ces < self.nb_ces:
            picked = self.ces[self.i_ces]
            self.i_ces += 1
            return picked
        return MyConf().gLite.ce
##################################################
#
# LOCAL: to be done for the sub-pipelines
#
class SchedulerLOCAL(SchedulerAbstract):
    """Scheduler running the application as a local process."""

    def __init__(self):
        SchedulerAbstract.__init__(self)
        # Process object of the running application, None before submit()
        self.Processus = None
        self._retrieveFlag = False

    def submit(self, Appli):
        """Start Appli as a local process from its working directory."""
        cmd = 'cd %s; ' % (Appli._PathLocWD)
        if Appli._MPIcpu != []:
            NbNode = Appli._MPIcpu[0]
            NbProc = NbNode*Appli._MPIcpu[1]
            # '+=' (not '='): the MPI command must also run from the work
            # directory, exactly like the sequential case below.
            cmd += 'mpirun -np %d %s %s' % (NbProc, Appli._AppliName, Appli._Arg)
        else:
            cmd += '%s %s' % (Appli._AppliName, Appli._Arg)
        if Appli._keepStdFile:
            self.Processus = Process(cmd, Appli.getPathFile('STDOUT'), Appli.getPathFile('STDERR'))
        else:
            self.Processus = Process(cmd)
        Appli._Status = "SubmitScheduled"
        self.status(Appli)

    def status(self, Appli):
        """Update the status attributes of Appli from the local process."""
        if Appli._Status.find('Submit') != 0:
            # only a job in a 'Submit...' state can still change
            if G_verbose > 10:
                print("status can't change !!!")
            return
        self.Processus._updateStatus()
        Appli._Status = self.Processus._Status
        Appli._StatusAppli = self.Processus.getExitValue()
        SchedulerAbstract.status(self, Appli)

    def retrieveOutput(self, Appli):
        """Retrieve Appli outputs in the work directory."""
        if not Appli._keepStdFile:
            Appli._stdOut = self.Processus.stdOut()
            Appli._stdErr = self.Processus.stdErr()

        if Appli._stdErr == "":
            os.system('rm ' + Appli.getPathFile("STDERR"))

        if not self.Processus.isOk() and G_verbose > 10:
            sys.stderr.write("\nERROR with command :\n%s\n" % self.Processus._cmd)
            sys.stderr.write(self.Processus.stdErr())

    def cancel(self, Appli):
        """Cancel the application by sending SIGINT (kill -2) to its process."""
        if self.Processus is None:
            # nothing was submitted, so there is no process to interrupt
            return
        cmd = "kill -2 %d" % self.Processus.pid
        print(cmd)
        os.system(cmd)
2293 2294 2295 ############################################################### 2296 # 2297 # Abstract Class Cluster Scheduler 2298 # 2299
class SchedulerCluster(SchedulerAbstract):
    """Abstract scheduler for batch clusters driven by a generated shell script.

    _CreateBatchScript() writes the script by calling, in order, the
    _Add* hooks that concrete schedulers override.
    """

    def __init__(self):
        SchedulerAbstract.__init__(self)
        # type of cluster
        self._TypeCluster = ''
        # name of the batch script file
        self._ScriptBatchFile = ''
        # file object of the batch script while it is being written
        self._ScriptBatchDesc = None

    def _AddSchedulerCommand(self, Appli):
        """Add the scheduler directives to the batch script."""
        pass

    def _AddPlateformEnv(self, Appli):
        """Add the platform-specific commands to the batch script."""
        pass

    def _AddBeforeRun(self, Appli):
        """Add the user commands executed before the run."""
        pass

    def _AddExe(self, Appli):
        """Add the executable line to the batch script.

        MPI applications are delegated to self._AddExeMPI(), which is
        provided by the concrete schedulers.
        """
        self._ScriptBatchDesc.write("\n\n# Add Application")
        if Appli._MPIcpu == []:
            self._ScriptBatchDesc.write("\n%s %s" % (Appli._AppliName, Appli._Arg))
        else:
            self._AddExeMPI(Appli)

    def _AddAfterRun(self, Appli):
        """Add the user commands executed after the run."""
        pass

    def _CreateBatchScript(self, Appli):
        """Create the executable batch script for the cluster scheduler."""
        MyFile = Appli._PathLocWD + Appli._APCSchedID
        self._ScriptBatchFile = MyFile+'.sh'
        self._ScriptBatchDesc = open(self._ScriptBatchFile, "w")
        if G_verbose > 5:
            print(self._ScriptBatchFile)
        try:
            self._AddSchedulerCommand(Appli)
            self._AddPlateformEnv(Appli)
            self._ScriptBatchDesc.write("\n\ntouch "+self._FileRun)
            self._AddBeforeRun(Appli)
            self._AddExe(Appli)
            self._AddAfterRun(Appli)
            self._ScriptBatchDesc.write("\n")
        finally:
            # always release the file handle, even when a hook raises
            self._ScriptBatchDesc.close()
        os.system('chmod 755 '+self._ScriptBatchFile)
2353 2354 # 2355 # BQS, CCIN2P3 scheduler 2356 # 2357
class SchedulerBQS(SchedulerCluster):
    """Scheduler for BQS, the CCIN2P3 batch system."""

    def __init__(self):
        SchedulerCluster.__init__(self)
        self._TypeCluster = 'BQS'
        # marker file touched by the batch script when the job starts;
        # the exit value of the application is appended to it at the end
        self._FileRun = ''
        # convention: UI second to walltime second
        self.SecToUISec = 35

    def _AddAfterRun(self, Appli):
        # save the return value of the application in the run file
        self._ScriptBatchDesc.write("\necho $? >> "+self._FileRun)

    def submit(self, Appli):
        """Submit Appli with qsub."""
        Appli._CPUTime = _ConvHHMMSSInSec(Appli._CPUTime)
        if Appli._CPUTimePerWeek != '':
            Appli._CPUTimePerWeek = _ConvHHMMSSInSec(Appli._CPUTimePerWeek)
        else:
            Appli._CPUTimePerWeek = 0
        # default farm; _AddSchedulerCommand() may switch to 'pistoo'
        Appli._farm = 'anastasie'
        self._CreateBatchScript(Appli)
        os.putenv('BQSCLUSTER', Appli._farm)
        myp = Process("qsub "+self._ScriptBatchFile)
        myp.wait(5)
        if myp.isOk():
            Appli._Status = 'SubmitScheduled'
            Appli._TimeStart = time.time()
        else:
            Appli._Status = 'FinishSubmitNOK'

    def cancel(self, Appli):
        """Cancel the job with qdel."""
        os.putenv('BQSCLUSTER', Appli._farm)
        mp = Process("qdel " + Appli._APCSchedID)
        mp.wait()

    def status(self, Appli):
        """Update Appli status from the run file and the std output files."""
        if Appli._Status.find('Submit') != 0:
            if G_verbose > 10:
                print("status can't change !!!")
            return
        if os.path.isfile(self._FileRun):
            Appli._Status = 'SubmitRunning'
            # the job is over once one of its standard files exists
            finish = (os.path.isfile(Appli.getPathFile("STDOUT")) or
                      os.path.isfile(Appli.getPathFile("STDERR")))
            if finish:
                rvs = _readFile(self._FileRun)
                try:
                    rv = int(rvs)
                except (TypeError, ValueError):
                    # no usable exit value in the run file
                    Appli._Status = 'FinishNOK'
                else:
                    Appli._StatusAppli = rv
                    if rv == 0:
                        Appli._Status = 'FinishOK'
                    else:
                        Appli._Status = 'FinishNOK'
                os.system('rm -rf '+self._FileRun)
                os.system('rm -rf '+self._ScriptBatchFile)
        if G_verbose > 5:
            print("status :"+Appli._Status)

        # manage time out and call retrieve output
        SchedulerAbstract.status(self, Appli)

    def _AddBeforeRun(self, Appli):
        Appli._AddEnv(self._ScriptBatchDesc)
        if Appli._MPIcpu != []:
            # use OpenMPI
            self._ScriptBatchDesc.write("\n. /usr/local/shared/bin/openmpi_env.sh")

    def _AddExeMPI(self, Appli):
        NbNode = Appli._MPIcpu[0]
        NbProc = NbNode*Appli._MPIcpu[1]
        if NbProc > 1:
            # use OpenMPI
            self._ScriptBatchDesc.write("\n/usr/local/openmpi/bin/mpirun -x LD_LIBRARY_PATH -x PATH --mca pls_rsh_agent /usr/local/products/bqs/bqsrsh -machinefile $BQS_PROCLISTPATH -np $BQS_PROCNUMBER %s %s" % (Appli._AppliName, Appli._Arg))
        else:
            # OpenMPI but without mpirun
            self._ScriptBatchDesc.write("\n%s %s" % (Appli._AppliName, Appli._Arg))

    def _AddSchedulerCommand(self, Appli):
        """Add the BQS directives (#PBS lines) to the batch script."""
        self._FileRun = Appli._PathLocWD + Appli._APCSchedID+'.run'

        pf = self._ScriptBatchDesc
        pf.write("#!/bin/bash")
        pf.write("\n#PBS -N %s" % Appli._APCSchedID)
        pf.write("\n#PBS -l platform=LINUX")
        pf.write("\n#PBS -l M=%dMB" % Appli._MemorySize)
        if Appli._CPUTimePerWeek == 0:
            MyTime = Appli._CPUTime*self.SecToUISec
            pf.write("\n#PBS -l T=%d" % MyTime)
            # check class J
            if (Appli._MemorySize > 2200) or (MyTime > 2400000):
                pf.write("\n#PBS -q J")
        else:
            MyTime = Appli._CPUTimePerWeek*self.SecToUISec
            pf.write("\n#PBS -l T=%d" % MyTime)
            pf.write("\n#PBS -q V")
        pf.write("\n#PBS -V")
        pf.write("\n#PBS -o "+Appli.getPathFile("STDOUT"))
        pf.write("\n#PBS -e "+Appli.getPathFile("STDERR"))

        NbNode = NbProc = 1
        if Appli._MPIcpu != []:
            NbNode = Appli._MPIcpu[0]
            NbProc = NbNode*Appli._MPIcpu[1]
        if NbProc > 1:
            # parallel job: goes to the 'pistoo' farm
            Appli._farm = 'pistoo'
            if Appli._AccessDir != '':
                pf.write("\n#PBS -l %s=%d" % (Appli._AccessDir, NbProc))
            pf.write("\n#PBS -l ptype=OpenMPI")
            StringCPU = "\n#PBS -l proc=%d" % (NbProc)
            if NbNode != NbProc:
                StringCPU += ",machine=%d" % (NbNode)
            pf.write(StringCPU)
        elif Appli._AccessDir != '':
            # sequential job, or MPI job with a single CPU: stays on anastasie
            pf.write("\n#PBS -l "+Appli._AccessDir)
2478 2479 # 2480 # SGE, Sun Grid Engine 2481 # 2482
class GridEngineCC:
    """Check the resources requested from Grid Engine.

    The class attributes hold the limits: parallel queue names with their
    memory (MB) and time (seconds) ceilings, the maximum number of CPUs of
    a parallel job, and the overall time and memory ceilings applied to
    sequential jobs.
    """
    pa_mame = ["pa_short", "pa_medium", "pa_long"]
    pa_memLim = [500, 3*1024, 4*1024]
    pa_timeLim = [6*60, 5*3600, 30*3600]
    pa_cpuMax = 112
    huge_time = 46*3600
    huge_mem = 16*1024

    def __init__(self, memMB, cpuTime, nbCPU=1):
        """memMB: memory in MB, cpuTime: time in seconds, nbCPU: CPU count."""
        self.memMB, self.cpuTime, self.nbCPU = memMB, cpuTime, nbCPU

    def checkResource(self):
        """Return "Ok" or an error message describing the exceeded limit."""
        if self.nbCPU > 1:
            return self._checkResourceParal()
        return self._checkResourceSeq()

    def _checkResourceSeq(self):
        """Check a sequential request against the overall ceilings."""
        if self.memMB > GridEngineCC.huge_mem:
            return "ERROR: too many memory, max is %dMB" % GridEngineCC.huge_mem
        if self.cpuTime > GridEngineCC.huge_time:
            return "ERROR: too many time, max is %f hours" % (GridEngineCC.huge_time/3600)
        return "Ok"

    def _checkResourceParal(self):
        """Check a parallel request against the largest parallel queue."""
        if self.nbCPU > GridEngineCC.pa_cpuMax:
            return "ERROR: too many CPUs for openmpi GE env., max is %d" % GridEngineCC.pa_cpuMax
        largestMem = GridEngineCC.pa_memLim[-1]
        if self.memMB > largestMem:
            return "ERROR: too many memory for MPI job, max is %dMB" % largestMem
        longestTime = GridEngineCC.pa_timeLim[-1]
        if self.cpuTime > longestTime:
            return "ERROR: too many time for MPI job, max is %f hours" % (longestTime/3600)
        return "Ok"

    def _rankOf(self, value, limits):
        """Return the index of the first queue whose limit holds value, or -1."""
        for rank in range(len(GridEngineCC.pa_mame)):
            if value <= limits[rank]:
                return rank
        return -1

    def retQueueParal(self):
        """Return the smallest parallel queue fitting memory and time, or None."""
        imem = self._rankOf(self.memMB, GridEngineCC.pa_memLim)
        itime = self._rankOf(self.cpuTime, GridEngineCC.pa_timeLim)
        if imem < 0 or itime < 0:
            return None
        # the most demanding of the two criteria decides the queue
        return GridEngineCC.pa_mame[max(imem, itime)]
2547 2548
class SchedulerSGE(SchedulerBQS):
    """Scheduler for SGE (Sun Grid Engine)."""

    def __init__(self):
        SchedulerCluster.__init__(self)
        self._TypeCluster = 'SGE'
        # marker file touched by the batch script when the job starts
        self._FileRun = ''
        # file receiving the exit value of the application at the end
        self._FileEnd = ''
        # SchedulerBQS.__init__ is not called, so this attribute was never
        # defined and the 'CPU time per week' branch raised AttributeError.
        # NOTE(review): 1 is used because the other branch passes plain
        # seconds to 'ct'; confirm the BQS factor is not wanted here.
        self.SecToUISec = 1

    def _AddAfterRun(self, Appli):
        # save the return value of the application in the end file
        self._ScriptBatchDesc.write("\nres=$? ")
        self._ScriptBatchDesc.write("\nexec 3<> "+self._FileEnd)
        self._ScriptBatchDesc.write("\necho $res >> "+self._FileEnd)
        self._ScriptBatchDesc.write("\nexec 3>&-")

    def submit(self, Appli):
        """Submit Appli with qsub."""
        Appli._CPUTime = _ConvHHMMSSInSec(Appli._CPUTime)
        if Appli._CPUTimePerWeek != '':
            Appli._CPUTimePerWeek = _ConvHHMMSSInSec(Appli._CPUTimePerWeek)
        else:
            Appli._CPUTimePerWeek = 0
        self._CreateBatchScript(Appli)
        myp = Process("qsub "+self._ScriptBatchFile)
        myp.wait(10)
        if myp.isOk():
            Appli._Status = 'SubmitScheduled'
            Appli._TimeStart = time.time()
        else:
            Appli._Status = 'FinishSubmitNOK'

    def cancel(self, Appli):
        """Cancel the job with qdel."""
        mp = Process("qdel " + Appli._APCSchedID)
        mp.wait()

    def _AddBeforeRun(self, Appli):
        Appli._AddEnv(self._ScriptBatchDesc)
        if Appli._MPIcpu != []:
            # use OpenMPI
            self._ScriptBatchDesc.write("\n. /usr/local/shared/bin/openmpi_env.sh")

    def _AddExeMPI(self, Appli):
        NbNode = Appli._MPIcpu[0]
        NbProc = NbNode*Appli._MPIcpu[1]
        if NbProc > 1:
            # use OpenMPI
            self._ScriptBatchDesc.write("\nmpiexec --mca btl ^udapl,openib --mca btl_tcp_if_include eth0 -n $NSLOTS %s %s" % (Appli._AppliName, Appli._Arg))
        else:
            # OpenMPI but without mpirun
            self._ScriptBatchDesc.write("\n%s %s" % (Appli._AppliName, Appli._Arg))

    def _AddSchedulerCommand(self, Appli):
        """Add the SGE directives (#$ lines) to the batch script.

        Raises RuntimeError when GridEngineCC.checkResource() rejects the
        requested memory, time or CPU count.
        """
        self._FileRun = Appli._PathLocWD + Appli._APCSchedID+'.run'
        self._FileEnd = Appli._PathLocWD + Appli._APCSchedID+'.end'

        pf = self._ScriptBatchDesc
        pf.write("#!/bin/bash -l")
        pf.write("\n#$ -N %s" % Appli._APCSchedID)
        pf.write("\n#$ -l vmem=%dM" % Appli._MemorySize)
        if Appli._CPUTimePerWeek == 0:
            MyTime = Appli._CPUTime
            pf.write("\n#$ -l ct=%d" % MyTime)
        else:
            MyTime = Appli._CPUTimePerWeek*self.SecToUISec
            pf.write("\n#$ -l ct=%d" % MyTime)
            pf.write("\n#$ -q demon")
        pf.write("\n#$ -V")

        # check resources
        NbProc = 1
        if Appli._MPIcpu != []:
            NbNode = Appli._MPIcpu[0]
            NbProc = NbNode*Appli._MPIcpu[1]
        sge = GridEngineCC(Appli._MemorySize, MyTime, NbProc)
        ret = sge.checkResource()
        if ret != "Ok":
            sys.stderr.write(ret)
            # a bare 'raise' has no exception to re-raise here
            raise RuntimeError(ret)

        pf.write("\n#$ -o "+Appli.getPathFile("STDOUT"))
        pf.write("\n#$ -e "+Appli.getPathFile("STDERR"))
        if NbProc > 1:
            queue = sge.retQueueParal()
            if queue is None:
                sys.stderr.write("ERROR: can't define parallele queue for %dMB, %dseconds" % (Appli._MemorySize, MyTime))
            # leading newline: without it the directive was appended to the
            # '-e' line, corrupting the stderr path and losing '-pe'
            pf.write("\n#$ -pe openmpi %d -q %s" % (NbProc, queue))
        if Appli._AccessDir != '':
            pf.write("\n#$ -l %s=1" % Appli._AccessDir)

    def status(self, Appli):
        """Update Appli status from the run and end marker files."""
        if Appli._Status.find('Submit') != 0:
            if G_verbose > 10:
                print("Status can't change !!!")
            return
        if os.path.isfile(self._FileRun):
            Appli._Status = 'SubmitRunning'

        # end condition is not robust to user kill or executable error ... to improve
        try:
            finish = os.path.isfile(self._FileEnd)
        except (AttributeError, TypeError):
            finish = False
        if finish:
            rvs = _readFile(self._FileEnd)
            print("run:"+rvs)
            try:
                rv = int(rvs)
            except (TypeError, ValueError):
                print("\nCan't convert:'%s'" % rvs)
                Appli._Status = 'FinishNOK'
            else:
                Appli._StatusAppli = rv
                if rv == 0:
                    Appli._Status = 'FinishOK'
                else:
                    Appli._Status = 'FinishNOK'
            os.system('rm -rf '+self._FileRun)
            os.system('rm -rf '+self._FileEnd)
            os.system('rm -rf '+self._ScriptBatchFile)
        if G_verbose > 5:
            print("Status :"+Appli._Status)

        # manage time out and call retrieve output
        SchedulerAbstract.status(self, Appli)
2670 2671 2672 ############################################################### 2673 # 2674 # Class Application 2675 # 2676 ############################################################### 2677 2678
class Application:
    """Description of a job: executable, arguments, files and resources.

    This base class stores the settings and builds the names of the files
    attached to the job. Several hooks (_isAppliExist, _Cleaner,
    _FillFileJDL, _AddEnv, copyTarBallOnSE) do nothing here.
    """

    def __init__(self, Appli, label=""):
        """Appli: executable path; label: optional job identifier."""
        self._AppliName = Appli
        # name without directory: '/bin/echo' gives 'echo'
        self._AppliOnly = Appli.rsplit('/', 1)[-1]
        self._APCSchedID = self._AppliOnly
        if label != "":
            self._APCSchedID = label.replace(' ', '_')
        self._Arg = ''
        # [1,1] is not an MPI executable; [4,2] means 4*2 = 8 CPUs
        self._MpiNodeCPUbyNode = [1, 1]
        # exit value of the executable once finished
        self._StatusAppli = -1
        # scheduling state of the job
        self._Status = 'NotSubmit'
        # file lists
        self._ListSrc = []
        self._ListInput = []
        self._ListOutput = []
        self._ListInputSE = []
        self._ListOutputSE = []
        # local working directory
        self._PathLocWD = ''
        self._FileIDgrid = ''
        # timers
        self._Timer = 10
        self._TimeOutToStart = 0
        self._TimeStart = 0
        # prefix policy of the retrieved files, see _processPrefix
        self._RetrieveWithID = True
        self._RetrieveSEWithID = True
        # requested resources: CPU time '[[hh:]mm:]ss', sizes in MB
        self._CPUTime = '1:0:0'
        self._CPUTimePerWeek = ''
        self._AccessDir = ''
        self._MemorySize = 1024
        self._LocalDiskSize = 1024
        self._MPIcpu = []
        self._MPIcpuTotal = 0
        # storage element settings
        self._MainSE = ''
        self._WorkDirSE = '%s/APCScheduler' % os.getenv('USER')
        self._OutDirSE = None
        self._MainScript = None
        self._FlagResubmit = False
        # standard output / error handling
        self._keepStdFile = True
        self._stdOut = ''
        self._stdErr = ''
        # WMS bookkeeping
        self._wmsUsed = set([])
        self._wmsIdx = None
        self._threadCopy = None

    def _isAppliExist(self, namefile):
        """Return the absolute name of namefile; always None in this base class."""
        return None

    def _Cleaner(self):
        pass

    def _FillFileJDL(self):
        pass

    def _AddEnv(self, File):
        pass

    def copyTarBallOnSE(self, FileSE):
        """Not available for this kind of application: report it, return False."""
        sys.stderr.write("\nNot Available with kind of application")
        return False

    def setArg(self, Arg):
        """Define the executable/script arguments.
        Format: string"""
        self._Arg = Arg

    def setInput(self, List):
        """[Specific grid] Define the local input file(s).
        Format: string list ['a','b']"""
        self._ListInput = List

    def setInputSE(self, List):
        """[Specific grid] Define the input file(s) on the storage element.
        Format: string list ['a','b']"""
        self._ListInputSE = List

    def setOutput(self, List, AddID=True):
        """[Specific grid]
        List  : output files to retrieve at the end of the job through the
                OutputSandbox and to copy in the job repository
        AddID : prefix added to the retrieved file names
           AddID == True   : prefix is the scheduler ID followed by '_'
           AddID is a str  : prefix is the value of AddID
           otherwise       : no prefix
        """
        self._ListOutput = List
        self._RetrieveWithID = AddID

    def setOutputSE(self, List, AddID=True):
        """[Specific grid]
        List  : output files to retrieve at the end of the job from the
                storage element
        AddID : prefix added to the retrieved file names
           AddID == True   : prefix is the scheduler ID followed by '_'
           AddID is a str  : prefix is the value of AddID
           otherwise       : no prefix
        """
        self._ListOutputSE = List
        # interpreted by _processPrefix
        self._RetrieveSEWithID = AddID

    def setOutputDirSE(self, outdir, AddID=False):
        """[Specific grid tarball] Define the SE directory receiving the
        files of the tarball directory toSE/. AddID is accepted but not
        used by this method."""
        self._OutDirSE = outdir

    def setMPI(self, NodeCPUbyNode):
        """Define the MPI layout [Node, CPUbyNode]. Example: [4,2] is 4*2=8 CPUs"""
        self._MPIcpu = NodeCPUbyNode

    def setCPUTime(self, TimeMem):
        """Define the CPU time requested for the job.
        Format: string [[hh:]mm:]ss"""
        self._CPUTime = TimeMem

    def setCPUTimePerWeek(self, TimeMem):
        """Define the CPU time per week for long and slow jobs (like class V
        with BQS).
        Format: string [[hh:]mm:]ss"""
        self._CPUTimePerWeek = TimeMem

    def setAccessDirectory(self, Dir):
        """[Specific cluster] Name of the required global disk space area,
        for example 'sps_planck' at CCIN2P3.
        Format: string"""
        self._AccessDir = Dir

    def setMemorySize(self, Mem):
        """Define the memory requested for the job.
        Format: integer, in MByte"""
        self._MemorySize = Mem

    def setLocalDiskSize(self, Mem):
        """[Specific cluster] Required local worker-node disk space, in MByte"""
        self._LocalDiskSize = Mem

    def setStorageElement(self, se):
        """[Specific grid] Define the current storage element"""
        self._MainSE = se

    def getFile(self, Key, index=0):
        """Return the file name for one of the keywords 'STDOUT', 'STDERR',
        'OUT', 'OUT_SE', 'SUBOUT', 'SUBERR', 'TAR', 'ID'; None (with a
        message on stdout) for any other keyword."""
        fixedSuffix = {'SUBOUT': '.sub_out', 'SUBERR': '.sub_err', 'TAR': '.tar.gz'}
        if Key in fixedSuffix:
            return self._APCSchedID + fixedSuffix[Key]
        if Key == 'STDOUT':
            return self.getNameStdOut()
        if Key == 'STDERR':
            return self.getNameStdErr()
        if Key == 'OUT':
            return self.getNameOutput(index)
        if Key == 'OUT_SE':
            return self.getNameOutputSE(index)
        if Key == 'ID':
            return self.getNameJobID()
        print("Keyword '%s' is unknown" % Key)
        return None

    def getPathFile(self, Key, index=0):
        """Same as getFile, but joined to the local working directory."""
        name = self.getFile(Key, index)
        if name == None:
            return None
        if G_verbose >= 10:
            print(name)
        return os.path.join(self._PathLocWD, name)

    def getNameStdOut(self):
        return '%s.stdout' % self._APCSchedID

    def getNameStdErr(self):
        return '%s.stderr' % self._APCSchedID

    def isFinish(self):
        """True when the status contains 'Finish'."""
        return 'Finish' in self._Status

    def isSubmit(self):
        """True when the status starts with 'Submit'."""
        return self._Status.startswith('Submit')

    def _processPrefix(self, AddID):
        """Translate an AddID setting into the prefix of a retrieved file."""
        if AddID == True:
            return self._APCSchedID + "_"
        if isinstance(AddID, str):
            return AddID
        return ""

    def getNameOutput(self, idx):
        """Return the retrieved name of output idx.

        Indexes first run over the sandbox outputs, then over the storage
        element outputs (base name only). An index past both lists writes
        an error and exits.
        """
        nbSandbox = len(self._ListOutput)
        if idx < nbSandbox:
            return self._processPrefix(self._RetrieveWithID) + self._ListOutput[idx]
        idxSE = idx - nbSandbox
        if idxSE < len(self._ListOutputSE):
            baseName = self._ListOutputSE[idxSE].split('/')[-1]
            return self._processPrefix(self._RetrieveSEWithID) + baseName
        sys.stderr.write("[getNameOutput]Error out of range")
        sys.exit(1)

    def getNameJobID(self):
        return '%s.id' % self._APCSchedID

    def getNameOutputSE(self, idx):
        return self.getNameOutput(len(self._ListOutput) + idx)

    def isOk(self):
        """True when the job finished with status 'FinishOK'."""
        return self._Status == 'FinishOK'

    def noKeepStdFile(self):
        """Do not keep the standard output and error files."""
        self._keepStdFile = False
2901 2902 2903 2904 ################################################## 2905 # 2906 # Application type executable/script stand alone 2907 # 2908
class AppliExe(Application):
    """Application made of a stand-alone executable or script."""

    def __init__(self, NameExe, label=""):
        """Resolve NameExe; write an error and exit(1) when it is not found."""
        AbsNameExe = self._isAppliExist(NameExe)
        if AbsNameExe is None:
            sys.stderr.write("\nDon't find %s anywhere, check var. env PATH, wms-proxy problem.\n" % NameExe)
            sys.exit(1)
        Application.__init__(self, AbsNameExe, label)

    def _isAppliExist(self, NameExe):
        """Return the absolute path of NameExe, or None when not found."""
        return _AbsolutePathExe(NameExe)

    def _FillFileJDL(self):
        """Return the JDL lines Executable, Arguments and InputSandbox."""
        sandbox = [self._AppliName]
        sandbox.extend(self._ListInput)
        sandbox.extend(self._ListSrc)
        quoted = ','.join(['"%s"' % elt for elt in sandbox])
        jdl = 'Executable = "%s";\n' % self._AppliOnly
        jdl += 'Arguments = "%s";\n' % self._Arg
        jdl += 'InputSandbox = {' + quoted + '};\n'
        return jdl

    def _FillFileJDLEnv(self):
        """Return the content of the JDL Environment attribute."""
        env = '"LCG_CATALOG_TYPE=lfc","VO_NAME=%s"' % MyConf().gLite.vo
        # 'in' replaces the deprecated dict.has_key()
        if 'LFC_HOST' in os.environ:
            env += ',"LFC_HOST=' + os.environ['LFC_HOST'] + '"'
        return env

    def _AddEnv(self, File):
        """Export PATH and LD_LIBRARY_PATH (when set) in the cluster script."""
        for name in ('PATH', 'LD_LIBRARY_PATH'):
            Val = os.getenv(name)
            if Val is not None:
                File.write("\nexport " + name + "=" + Val)

    def _RetrieveFromSE(self):
        # nothing to do
        pass
2958 2959 # 2960 # python 2961 # 2962
class AppliPython(AppliExe):
    """Application made of a Python script."""

    def __init__(self, NameExe, label=""):
        AppliExe.__init__(self, NameExe, label)

    def _AddEnv(self, File):
        """Python environment for the cluster: also export PYTHONHOME and
        PYTHONPATH when they are set."""
        AppliExe._AddEnv(self, File)
        for variable in ('PYTHONHOME', 'PYTHONPATH'):
            current = os.getenv(variable)
            if current != None:
                File.write("\nexport %s=" % variable + current)
#
# parachute
#


# STATE design pattern used to manage a re-usable tarball on the SE.
# In the end there are only 2 cases; the state-pattern approach is kept
# although it is not strictly necessary.
#
class _AppliParachute_TarExist:
    """State: the tarball already exists on the SE."""

    def doTarBallIfNecessary(self, master):
        """Called by submit: the tarball exists, nothing to do."""
        return self

    def doTarBallAndCopy(self, master, pathSE):
        """Called by job.copyTarBallOnSE(): only warn, the tarball exists."""
        warning = "\nWARNING: tarball '%s' already exist !" % master._nameTarball
        sys.stdout.write(warning)
        return self
2997 2998
class _AppliParachute_DoTar:
    """State: the tarball does not exist yet and has to be built."""

    def doTarBallIfNecessary(self, master):
        """Called by submit: build the tarball, the user did not ask to save it."""
        master._DoTarBall()
        nextState = _AppliParachute_State._tarExist
        return nextState

    def doTarBallAndCopy(self, master, pathSE):
        """Called by job.copyTarBallOnSE(): build the tarball and save it at pathSE."""
        master._DoTarBall(pathSE)
        nextState = _AppliParachute_State._tarExist
        return nextState


# container class
3011 3012 3013 # container class
class _AppliParachute_State:
    """Holder of the two shared state objects of the tarball state machine."""
    # state used when the tarball still has to be built
    _doTar = _AppliParachute_DoTar()
    # state used when the tarball is already on the SE
    _tarExist = _AppliParachute_TarExist()
3017 3018
3019 -class AppliParachute(AppliExe):
3020 S_listTarballExist= []
def __init__(self, NameExe, MainScript = None, label="", KeepTarBall=False):
    """MainScript is optional: it defines the main script of the tarball,
    i.e. the script APCScheduler calls in the tarball case. When absent,
    the executable is called directly. An error is written and the program
    exits when MainScript cannot be found."""

    AppliExe.__init__(self, NameExe, label)
    self._KeepTarBall = KeepTarBall

    if MainScript == None:
        return
    scriptPath = _AbsolutePathExe(MainScript)
    if scriptPath == None:
        sys.stderr.write("ERROR: Can't find '"+MainScript+"' executable.\n")
        sys.exit(1)
    self._MainScript = scriptPath
3037
def _isAppliExist(self, NameExe):
    """Return the name to use for NameExe, or None when it is found nowhere.

    A local executable selects the 'build the tarball' state. Otherwise
    NameExe is looked up as an existing tarball on the SE (names already
    found are cached in AppliParachute.S_listTarballExist), which selects
    the 'tarball exists' state.
    """
    localPath = AppliExe._isAppliExist(self, NameExe)
    if localPath != None:
        # local executable: the tarball has to be built
        self._stateObj = _AppliParachute_State._doTar
        return localPath

    # maybe on the SE?
    # NOTE(review): the second SEtools argument is gLite.ce here while the
    # other methods pass self._MainSE -- confirm this is intended.
    se = SEtools(MyConf().gLite.vo, MyConf().gLite.ce)
    known = NameExe in AppliParachute.S_listTarballExist
    if not known and se.sizeFile(NameExe) > 0:
        known = True
        AppliParachute.S_listTarballExist.append(NameExe)
        if G_verbose >= 5:
            print('AppliParachute.S_listTarballExist: %s' % (AppliParachute.S_listTarballExist,))
    if not known:
        return None

    # NameExe exists on the SE
    self._nameTarball = se._simpleDelTag(NameExe)
    self._stateObj = _AppliParachute_State._tarExist
    return NameExe
3063
def _FillFileJDL(self):
    """Prepare the tarball job and return its JDL description.

    Writes the boot script, creates the temporary job directory on the SE,
    builds the tarball when the current state requires it, then returns
    the JDL lines (Executable, optional MPI settings, Arguments,
    Environment, InputSandbox).

    Raises RuntimeError when no work directory is defined on the SE.
    """
    # Checked first: the work directory is needed by every step below, and
    # the original test ran only after it had already been used.
    if self._WorkDirSE is None:
        message = "ERROR: You must define SetWorkDirSE()"
        sys.stderr.write(message)
        raise RuntimeError(message)

    # write BootScript.sh
    BootFile = os.path.join(self._PathLocWD, "BootScript.sh")
    self._DoScriptBoot(BootFile)

    # create the temporary directory on the SE
    se = SEtools(MyConf().gLite.vo, self._MainSE)
    pathdir = '%s/%s' % (self._WorkDirSE, self._APCSchedID)
    se._mkdirSE(pathdir)

    # build the tarball?
    # NOTE(review): the state returned by doTarBallIfNecessary() is not
    # stored, as in the original code -- confirm whether it should be.
    self._stateObj.doTarBallIfNecessary(self)

    bufferJDL = 'Executable = "BootScript.sh";\n'
    if self._MPIcpu != []:
        NbNode = self._MPIcpu[0]
        self._MPIcpuTotal = NbNode*self._MPIcpu[1]
        bufferJDL += 'JobType = "MPICH";\n'
        bufferJDL += 'CPUNumber = %d;\n' % self._MPIcpuTotal

    # boot script arguments: tarball path on the SE, work directory on the
    # SE, then the job arguments
    strArg = "%s " % self._nameTarball
    strArg += "%s/%s " % (self._WorkDirSE, self._APCSchedID)
    strArg += "%s" % self._Arg

    bufferJDL += 'Arguments = "%s";\n' % strArg
    bufferJDL += 'Environment = {'+self._FillFileJDLEnv()+'};\n'
    intemp = 'InputSandbox = {"%s"' % BootFile
    for elt in self._ListInput:
        intemp += ',"%s"' % elt

    bufferJDL += intemp+'};\n'
    return bufferJDL
3104
def _RetrieveFromSE(self):
    """Start the thread copying the job directory from the SE to the UI.

    The thread object is kept in self._threadCopy.
    """
    storage = SEtools(MyConf().gLite.vo, self._MainSE)
    remoteDir = '%s/%s' % (self._WorkDirSE, self._APCSchedID)
    filePrefix = self._APCSchedID + '_'
    copier = _CopySEThread(storage, remoteDir, self._PathLocWD, filePrefix)
    self._threadCopy = copier
    copier.start()
3111 3112
def _Cleaner(self):
    """Remove the temporary job directory on the SE.

    When a copy thread exists for this job, the directory is left to that
    thread; the thread is only joined when the global G_exit flag is set.
    """
    if self._threadCopy != None:
        # wait for the end of the thread, which removes the temporary
        # directory itself
        if G_exit:
            self._threadCopy.join()
        return
    # no file retrieval in progress for this job: everything can be removed
    remoteDir = '%s/%s' % (self._WorkDirSE, self._APCSchedID)
    remover = Process('apcgrid-rm -r ' + remoteDir)
    remover.wait()
3124 3125
def _DoTarBall(self, pathTar=""):
    """Build the job tarball and copy it on the storage element.

    pathTar: SE path under which the tarball is saved for re-use; when
    empty the tarball goes to the temporary job directory on the SE.
    Nothing is done on a resubmission. The program exits when the
    libraries cannot be resolved or when pathTar already exists; a failed
    copy sets the status to 'FinishSubmitNOK'.
    """
    if self._FlagResubmit:
        return
    if G_verbose >= 10:
        sys.stdout.write("\ndef _DoTarBall(self, pathTar=" "):\n")
    root = os.path.join(self._PathLocWD, self._APCSchedID)
    TarRoot = os.path.join(root, "tarball")
    os.makedirs(TarRoot)
    for subdir in ("/lib", "/toSE", "/toUI", "/python"):
        os.makedirs(TarRoot + subdir)

    # copy the executable and the scripts
    os.system("cp %s %s " % (self._AppliName, TarRoot))
    self._DoScriptTarball(TarRoot + "/tarballScript.py")
    if self._MainScript != None:
        os.system("cp %s %s" % (self._MainScript, TarRoot))

    # copy the input list
    for item in self._ListInput:
        os.system("cp -r %s %s" % (item, TarRoot))

    # add this module (the .py source rather than the compiled .pyc)
    moduleSrc = __file__
    if moduleSrc[-1] == 'c':
        moduleSrc = moduleSrc[0:-1]
        if G_verbose > 10:
            sys.stdout.write("add %s, not %s\n" % (moduleSrc, __file__))
    os.system("cp -r %s %s/python" % (moduleSrc, TarRoot))

    # add the libraries needed by the executable
    libraries = []
    if not _FindLib(self._AppliName, libraries):
        sys.exit(1)
    for lib in libraries:
        copyCmd = "cp %s %s/lib" % (lib, TarRoot)
        os.system(copyCmd)
        if G_verbose >= 10:
            print(copyCmd)

    # tar, then remove the staging directory
    localTar = self.getPathFile('TAR')
    os.system("cd %s; tar cfz %s *" % (TarRoot, localTar))
    os.system("rm -rf " + root)

    # copy to the SE
    se = SEtools(MyConf().gLite.vo, self._MainSE)
    if pathTar == "":
        # the tarball goes to the temporary directory on the SE
        self._nameTarball = '%s/%s/%s' % (self._WorkDirSE, self._APCSchedID, self.getFile("TAR"))
    else:
        # the tarball will be re-used
        if se.sizeFile(pathTar) >= 0:
            sys.stderr.write("\nFile '%s' exist !! Delete it or change name\n" % pathTar)
            sys.exit(1)
        self._nameTarball = se._simpleDelTag(pathTar)
    ret = se.cp(self.getPathFile('TAR'), 'se:' + self._nameTarball)
    if ret != 0:
        if G_verbose > 1:
            sys.stderr.write("ERROR: Can't copy tarball %s to se:%s\n" % (self.getPathFile('TAR'), self._nameTarball))
        self._Status = 'FinishSubmitNOK'
        return
    if not self._KeepTarBall:
        os.system("rm -rf " + self.getPathFile('TAR'))
3184 3185
def _DoScriptTarball(self, namefile):
    """Generate the Python driver script stored in the tarball.

    The generated script is the one the boot script runs on the compute
    element.  It extends the library path with ./lib, copies every entry of
    ``self._ListInputSE`` from the SE, runs the application (or the main
    script when one is defined) with the forwarded arguments, copies the
    'toSE' directory to ``self._OutDirSE`` when it is set, copies 'toUI' to
    the SE data directory named by its first argument, and exits with the
    status returned by submitCmd.

    namefile -- path of the script to write; it is made executable (755).
    """
    # name of the program started by the generated script
    if self._MainScript == None:
        target = self._AppliOnly
    else:
        target = os.path.basename(self._MainScript)

    pieces = [
        "import sys\nimport os",
        "\nsys.path.append('python')",
        "\nfrom APCScheduler import *",
        "\nsetVerboseLevel(1)",
        "\n\n#update LD_LIBRARY_PATH\naddLIBRARYPATH(os.environ['PWD']+'/lib')",
        "\nse=SEtools( '%s', '%s') " % (MyConf().gLite.vo, self._MainSE),
        "\n\n# copy input SE",
    ]
    for remote in self._ListInputSE:
        # the local copy keeps only the last component of the SE path
        pieces.append("\nse.cp('se:%s','%s')" % (remote, remote.split('/')[-1]))

    pieces.append("\n\n# call executable")
    pieces.append("\nArg = ''\nfor i in range(2,len(sys.argv)): Arg += sys.argv[i]+ ' '")
    pieces.append("\nret=submitCmd('./%s %%s'%%Arg,-1)" % (target))

    if self._OutDirSE != None:
        pieces.append("\n\n# copy out SE")
        pieces.append("\nse.cpdir( 'toSE', 'se:%s') " % self._OutDirSE)

    pieces.append("\n\n# copy out UI")
    pieces.append("\nse.cpdir( 'toUI', 'se:%s/data'%sys.argv[1])\n")
    pieces.append("\nsys.exit(ret)")

    script = open(namefile, 'w+')
    script.write("".join(pieces))
    script.close()
    os.system('chmod 755 ' + namefile)
3213 3214
def _DoScriptBoot(self, scriptname):
    """Create the bash boot script shipped in the job input sandbox.

    The script downloads the tarball named by its first argument from the
    grid catalogue with lcg-cp, unpacks it, runs ``tarballScript.py`` with
    the remaining arguments and exits with that script's status.

    scriptname -- path of the boot script.  When a file already exists at
                  this path it is left untouched.

    The file is made executable (mode 755) with os.chmod rather than a
    shell 'chmod' command, so paths containing spaces or shell
    metacharacters are handled; an OSError is raised if that fails.
    """
    import stat

    if os.path.isfile(scriptname):
        return

    lines = [
        "#!/bin/bash",
        "lcg-cp -v --vo $VO_NAME lfn:/grid/$VO_NAME/$1 file:`pwd`/tarball.tar.gz",
        "tar xzf tarball.tar.gz ; rm -rf tarball.tar.gz",
        "export PATH=$PATH:./",
        # drop $1 (the tarball name): the rest goes to the Python script
        "shift",
        "",
        'echo "**************** date debut job code : " `date`',
        "python tarballScript.py $* 2>&1",
        "ret=$?",
        'echo "**************** date fin job code : " `date`',
        "exit $ret",
    ]

    pf = open(scriptname, 'w+')
    try:
        pf.write("\n".join(lines))
    finally:
        # close the handle even when the write fails
        pf.close()

    # rwxr-xr-x, spelled with stat constants to stay valid on every Python 2
    mode = (stat.S_IRWXU
            | stat.S_IRGRP | stat.S_IXGRP
            | stat.S_IROTH | stat.S_IXOTH)
    os.chmod(scriptname, mode)
3233
def copyTarBallOnSE(self, pathSE):
    """Build the tarball and copy it to pathSE on the SE.

    Only allowed while the job status is 'NotSubmit'; otherwise a warning
    is written to stdout and nothing else happens.
    """
    if self._Status == 'NotSubmit':
        # the work is delegated to the current state object, whose return
        # value becomes the new state object
        self._stateObj = self._stateObj.doTarBallAndCopy(self, pathSE)
        return
    sys.stdout.write("\nWARNING: call copyTarBallOnSE() before submit().\n")
3240 3241 3242 3243 ############################################################### 3244 # 3245 # Class JobClass : schedule one application 3246 # 3247 ############################################################### 3248
class JobClass:
    """Schedule one application.

    Couples an application description with the scheduler that runs it and
    forwards submit / status / wait / output requests to them.
    """

    # Every job handed to submit(), keyed by its APCScheduler id, so that the
    # jobs in progress can be cancelled when the user hits CTRL+C.
    dictJobSubmit = {}

    def __init__(self, MyAppli, MySched):
        """Bind application MyAppli to scheduler MySched.

        The application receives the configured local work directory, an id
        extended by _AddAlea(), the path of its 'ID' file and, when it has
        no main SE yet, the SE of the gLite configuration.
        """
        self._Appli = MyAppli
        self._Scheduler = MySched
        self._Appli._PathLocWD = MyConf().info.workDir

        # NOTE(review): _AddAlea presumably appends a random suffix of the
        # given length; no check is made against ids already present in the
        # work directory.
        self._Appli._APCSchedID = _AddAlea(self._Appli._APCSchedID, 6)
        self._Appli._FileID = self._Appli.getPathFile('ID')
        self._TimeOutToStart = "24:0:0"
        if self._Appli._MainSE == '':
            self._Appli._MainSE = MyConf().gLite.se

    def timerUpdateStatus(self, timeUpdate):
        """Set the delay between two status updates.

        timeUpdate -- '[[hh:]mm:]ss' string; ignored when it does not
                      convert to a strictly positive number of seconds.
        """
        sec = _ConvHHMMSSInSec(timeUpdate)
        if sec > 0:
            self._Appli._Timer = sec

    def addFullPath(self, File):
        """Return File prefixed with the local work directory."""
        return self._Appli._PathLocWD + File

    def copyTarBallOnSE(self, FileSE):
        """Create a tarball and copy it on SE. Call it before submit()"""
        return self._Appli.copyTarBallOnSE(FileSE)

    def name(self):
        """Return the APCScheduler id of the job."""
        return self._Appli._APCSchedID

    def status(self):
        """Ask the scheduler to refresh the status, then return it."""
        self._Scheduler.status(self._Appli)
        return self._Appli._Status

    def isFinish(self):
        """Return the application's isFinish() answer."""
        return self._Appli.isFinish()

    def isSubmit(self):
        """Return the application's isSubmit() answer."""
        return self._Appli.isSubmit()

    def isOk(self):
        """Return the application's isOk() answer."""
        return self._Appli.isOk()

    def submit(self, TimeOutToStart="24:0:0"):
        """Submit the job unless it was already submitted.

        TimeOutToStart -- '[[hh:]mm:]ss' string, converted to seconds and
                          stored on the application.
        """
        if G_verbose > 15:
            print('submit')
        self._Appli._TimeOutToStart = _ConvHHMMSSInSec(TimeOutToStart)
        if G_verbose > 10:
            print("TimeOutToStart= %s" % (self._Appli._TimeOutToStart,))
        if self._Appli._Status == 'NotSubmit':
            # register the job before submitting it
            JobClass.dictJobSubmit[self._Appli._APCSchedID] = self
            self._Scheduler.submit(self._Appli)
        else:
            print('Already submit !')

    def wait(self):
        """Delegate to the scheduler's wait() for this job."""
        self._Scheduler.wait(self._Appli)

    def submitAndWait(self, TimeOutToStart="24:0:0"):
        """Call submit(TimeOutToStart) then wait()."""
        self.submit(TimeOutToStart)
        self.wait()

    def fullNameStdOut(self):
        """Return the stdout file name prefixed with the work directory."""
        return self.addFullPath(self._Appli.getNameStdOut())

    def fullNameStdErr(self):
        """Return the stderr file name prefixed with the work directory."""
        return self.addFullPath(self._Appli.getNameStdErr())

    def fullNameOutput(self, Idx):
        """Return the full name of output file number Idx.

        Prints a message and returns None when Idx is past the end of the
        application's output list.
        """
        if Idx >= len(self._Appli._ListOutput):
            print("Idx too great")
            return None
        return self.addFullPath(self._Appli.getNameOutput(Idx))

    def stdOut(self):
        """Return the job's standard output as given by the scheduler."""
        return self._Scheduler.stdOut(self._Appli)

    def stdErr(self):
        """Return the job's standard error as given by the scheduler."""
        return self._Scheduler.stdErr(self._Appli)

    def output(self, Idx):
        """Return the content of output file number Idx.

        Returns None when Idx is out of range (fullNameOutput() has then
        already reported it) instead of handing None to _readFile.
        """
        path = self.fullNameOutput(Idx)
        if path is None:
            return None
        return _readFile(path)

    def _mailFileOrNotice(self, cmd, path, notice):
        """Feed file path to mail command cmd, or notice when path is missing."""
        if os.path.isfile(path):
            os.system(cmd + '< %s' % (path))
        else:
            os.system('echo "%s"|' % notice + cmd)

    def sendMailResult(self):
        """Mail the job result to the configured address.

        The subject carries the job id and status.  The body is the stdout
        file for 'FinishOK', the submit stderr file for 'FinishSubmitNOK',
        a blank line for 'FinishTimeOut' and the stderr file otherwise; a
        short notice replaces a missing file.  Without a configured address
        a hint is printed and nothing is sent.
        """
        if MyConf().info.mail == '':
            print('Define address mail with setMail() function')
            return
        cmd = ('mail -s "APCScheduler : %s status is %s" %s') % (
            self._Appli._APCSchedID, self._Appli._Status, MyConf().info.mail)
        state = self._Appli._Status
        if state == 'FinishOK':
            self._mailFileOrNotice(cmd, self.fullNameStdOut(),
                                   'No file stdout available')
        elif state == 'FinishSubmitNOK':
            self._mailFileOrNotice(cmd, self._Appli.getPathFile('SUBERR'),
                                   'No file submit stderr available')
        elif state == 'FinishTimeOut':
            os.system('echo " "|' + cmd)
        else:
            self._mailFileOrNotice(cmd, self.fullNameStdErr(),
                                   'No file stderr available')
3376 3377 3378 3379 ############################################################### 3380 # 3381 # Class MultiJobsClass : schedule N applications 3382 # 3383 ############################################################### 3384
class MultiJobsClass:
    """Schedule N applications: a run made of several jobs."""

    # Event the polling loops wait on, with self._Timer as time-out, between
    # two passes over the jobs.
    # NOTE(review): presumably set elsewhere in the module to wake the loop
    # early -- nothing in this class sets it.
    S_EventFinish = threading.Event()

    def __init__(self, name=""):
        """Create an empty run called name; polling period is 5 minutes."""
        self._ListJob = []
        self._Timer = 60*5
        self._name = name
        self._timeStart = time.asctime()

    def append(self, job):
        """Add a job to the Run"""
        self._ListJob.append(job)

    def timerUpdateStatus(self, timeUpdate):
        """Set the delay between two status updates.

        timeUpdate -- '[[hh:]mm:]ss' string; ignored when it does not
                      convert to a strictly positive number of seconds.
        """
        sec = _ConvHHMMSSInSec(timeUpdate)
        if sec > 0:
            self._Timer = sec

    def submitAndWaitAll(self, TimeOutToStart="24:0:0", MaxRunning=0):
        """Submit all jobs and wait for all of them.

        TimeOutToStart -- '[[hh:]mm:]ss' string handed to each job's
                          submit(); default value is 24 h, i.e. "24:0:0".
        MaxRunning     -- when > 0, at most that many jobs are kept in the
                          submitted state at a time; when <= 0 (default)
                          everything is submitted at once.
        """
        if MaxRunning <= 0:
            self.submitAll(TimeOutToStart)
            self.waitAll()
            return

        NbRun = 1
        while NbRun != 0:
            # refresh every job and count those currently submitted
            NbRun = 0
            for job in self._ListJob:
                job.status()
                if job.isSubmit():
                    NbRun += 1

            # submit if slot available
            if NbRun < MaxRunning:
                for job in self._ListJob:
                    if job._Appli._Status == 'NotSubmit':
                        job.submit(TimeOutToStart)
                        NbRun += 1
                        if NbRun >= MaxRunning:
                            break

            # Sleep if not finished
            if NbRun > 0:
                MultiJobsClass.S_EventFinish.wait(self._Timer)
                MultiJobsClass.S_EventFinish.clear()

    def submitAll(self, TimeOutToStart="24:0:0"):
        """Submit every job of the run.

        TimeOutToStart -- '[[hh:]mm:]ss' string handed to each job's
                          submit(); default value is 24 h, i.e. "24:0:0".
        """
        for job in self._ListJob:
            job.submit(TimeOutToStart)

    def waitAll(self):
        """wait all job in the run"""
        NbRun = 1
        while NbRun != 0:
            NbRun = 0
            for job in self._ListJob:
                if job.isFinish():
                    continue
                # refresh only the jobs that are not finished yet
                job.status()
                if not job.isFinish():
                    NbRun += 1
            # Sleep if not finished
            if NbRun > 0:
                MultiJobsClass.S_EventFinish.wait(self._Timer)
                MultiJobsClass.S_EventFinish.clear()

    def _resetConcatFile(self, FileConcat):
        """Replace FileConcat by an empty file."""
        if os.path.isfile(FileConcat):
            os.system('rm -rf %s' % FileConcat)
        os.system('touch %s ' % FileConcat)

    def _appendToConcat(self, src, FileConcat):
        """Append file src to FileConcat, tracing it at high verbosity."""
        os.system('cat %s >> %s' % (src, FileConcat))
        if G_verbose > 15:
            print('add file ' + src)

    def concatOutput(self, IdxOutput, FileConcat):
        """Concatenate output file number IdxOutput of every OK job.

        IdxOutput indexes the list defined by the application's setOutput()
        method; the result is written to FileConcat, which is emptied
        first.  Jobs for which the index is out of range are skipped.
        """
        self._resetConcatFile(FileConcat)
        for job in self._ListJob:
            if job.isOk():
                src = job.fullNameOutput(IdxOutput)
                if src is not None:
                    self._appendToConcat(src, FileConcat)

    def concatEachOutput(self, PrefixeFileConcat):
        """Concatenate each output file of the jobs.

        One file per entry of the first job's output list, named
        PrefixeFileConcat + '_' + <output name>.  Does nothing on an empty
        run.
        """
        if not self._ListJob:
            return
        names = self._ListJob[0]._Appli._ListOutput
        for i in range(len(names)):
            self.concatOutput(i, PrefixeFileConcat + '_' + names[i])

    def concatStdOut(self, FileConcat):
        """Concatenate the stdout of every OK job into FileConcat."""
        self._resetConcatFile(FileConcat)
        for job in self._ListJob:
            if job.isOk():
                self._appendToConcat(job.fullNameStdOut(), FileConcat)

    def concatStdErr(self, FileConcat):
        """Concatenate the stderr of every job into FileConcat."""
        self._resetConcatFile(FileConcat)
        for job in self._ListJob:
            self._appendToConcat(job.fullNameStdErr(), FileConcat)

    def concatId(self, FileConcat=""):
        """Concatenate the 'ID' file of every job into FileConcat.

        An empty FileConcat (the default) cannot name a file: an error is
        written to stderr and nothing is done.
        """
        if not FileConcat:
            sys.stderr.write('\nERROR concatId() method:\nno file name given\n')
            return
        self._resetConcatFile(FileConcat)
        for job in self._ListJob:
            os.system('cat %s >> %s' % (job._Appli.getPathFile('ID'), FileConcat))

    def result(self):
        """Return a text report of the run.

        The report gives the output directory, the number of jobs OK, NOK,
        timed out or failed for another reason, the scheduler's own summary
        and, for every job that is not OK, its status and stderr.  An empty
        run yields a minimal report instead of raising.
        """
        NbTot = len(self._ListJob)
        if NbTot == 0:
            return "Result:\n 0 job(s) OK on 0\n"

        NbOk = 0
        NbNOk = 0
        NbTimeOut = 0
        for job in self._ListJob:
            if job.isOk():
                NbOk += 1
                continue
            # one status() call per job: each call queries the scheduler
            state = job.status()
            if state.find("FinishNOK") >= 0:
                NbNOk += 1
            elif state == "FinishTimeOut":
                NbTimeOut += 1

        # the work directory is read from the last job of the list
        parts = ["output directory : %s\nResult:\n"
                 % self._ListJob[-1]._Appli._PathLocWD]
        parts.append(" %d job(s) OK on %d\n" % (NbOk, NbTot))
        if NbNOk > 0:
            parts.append(" %d job(s) NOK\n" % (NbNOk))
        if NbTimeOut > 0:
            parts.append(" %d job(s) time out to pass RUNNING\n" % (NbTimeOut))

        NbOtherNOK = NbTot - NbOk - NbNOk - NbTimeOut
        if NbOtherNOK > 0:
            parts.append(" %d job(s) failed to other reason\n" % NbOtherNOK)
        parts.append(self._ListJob[0]._Scheduler._resultListJob(self))

        if NbTot != NbOk:
            parts.append("\njob(s) failed information:\n--------------------------\n")
            for job in self._ListJob:
                if job.isOk():
                    continue
                state = job.status()
                parts.append("status job %s is %s" % (job.name(), state))
                if state == 'FinishNOK':
                    parts.append(" with status %d." % job._Appli._StatusAppli)
                parts.append("\n")
                err = job.stdErr()
                if err is not None and err != '':
                    parts.append("stderr:\n" + err + "\n")
                parts.append("===============================================================================\n\n")
        return "".join(parts)

    def printResult(self):
        """Print the report built by result()."""
        print(self.result())

    def sendMailResult(self):
        """Mail the report built by result() to the configured address.

        The report goes through the temporary file 'tmpresult.txt' in the
        current directory, removed afterwards.  Without a configured
        address an error is written to stderr and nothing is sent.
        """
        if MyConf().info.mail == '':
            sys.stderr.write('\nERROR sendMailResult() method:\nno address mail, set it with setMail() function')
            return
        FileResult = 'tmpresult.txt'
        pf = open(FileResult, 'w+')
        try:
            pf.write(self.result())
        finally:
            # close the handle even when building the report fails
            pf.close()
        cmd = 'mail -s "APCScheduler: result run %s started %s" %s < %s' % (
            self._name, self._timeStart, MyConf().info.mail, FileResult)
        os.system(cmd)
        time.sleep(1)
        os.system('rm -rf ' + FileResult)
3571