Source Code for Module starcluster.clustersetup

#!/usr/bin/env python

"""
clustersetup.py
"""

import os
import shutil
import tempfile

from starcluster.templates.sgeprofile import sgeprofile_template
from starcluster.templates.sgeinstall import sgeinstall_template
from starcluster.templates.sge_pe import sge_pe_template
from starcluster.logger import log

class ClusterSetup(object):
    """
    ClusterSetup Interface
    """
    def __init__(self, *args, **kwargs):
        pass

    def run(self, nodes, master, user, user_shell, volumes):
        """ Start cluster setup routines """
        raise NotImplementedError('run method not implemented')

class DefaultClusterSetup(ClusterSetup):
    """
    Default ClusterSetup implementation for StarCluster
    """
    def __init__(self):
        self._nodes = None
        self._master = None
        self._user = None
        self._user_shell = None
        self._volumes = None

    def _setup_cluster_user(self):
        """
        Create cluster user on all StarCluster nodes

        This method takes care to examine existing folders in /home and set
        the new cluster user's uid/gid accordingly. This is necessary for the
        case of EBS volumes containing /home with large amounts of data in
        them. It's much less expensive in this case to set the uid/gid of the
        new user to the existing uid/gid of the dir on EBS rather than
        chowning potentially terabytes of data.
        """
        mconn = self._master.ssh
        home_folder = '/home/%s' % self._user
        first_uid = 1000
        uid, gid = first_uid, first_uid
        if mconn.path_exists(home_folder):
            # get /home/user's owner/group uid and create user with that uid/gid
            s = mconn.stat(home_folder)
            uid = s.st_uid
            gid = s.st_gid
        else:
            # get highest uid/gid of dirs in /home/*, increment by 1 and
            # create user with that uid/gid
            uid_db = {}
            files = mconn.ls('/home')
            for file in files:
                if mconn.isdir(file):
                    f = mconn.stat(file)
                    uid_db[f.st_uid] = (file, f.st_gid)
            if uid_db.keys():
                max_uid = max(uid_db.keys())
                max_gid = uid_db[max_uid][1]
                uid, gid = max_uid + 1, max_gid + 1
            # never hand out a uid/gid below the first regular user id
            uid = max(uid, first_uid)
            gid = max(gid, first_uid)
        log.debug("Cluster user gid/uid: (%d, %d)" % (uid, gid))
        log.info("Creating cluster user: %s" % self._user)
        for node in self._nodes:
            nconn = node.ssh
            nconn.execute('groupadd -o -g %s %s' % (gid, self._user))
            nconn.execute('useradd -o -u %s -g %s -m -s `which %s` %s' %
                          (uid, gid, self._user_shell, self._user))

    def _setup_scratch(self):
        """ Configure scratch space on all StarCluster nodes """
        log.info("Configuring scratch space for user: %s" % self._user)
        for node in self._nodes:
            nconn = node.ssh
            nconn.execute('mkdir /mnt/%s' % self._user)
            nconn.execute('chown -R %(user)s:%(user)s /mnt/%(user)s' %
                          {'user': self._user})
            nconn.execute('mkdir /scratch')
            nconn.execute('ln -s /mnt/%s /scratch' % self._user)

    def _setup_etc_hosts(self):
        """ Configure /etc/hosts on all StarCluster nodes """
        log.info("Configuring /etc/hosts on each node")
        for node in self._nodes:
            conn = node.ssh
            host_file = conn.remote_file('/etc/hosts')
            print >> host_file, "# Do not remove the following line or programs that require network functionality will fail"
            print >> host_file, "127.0.0.1 localhost.localdomain localhost"
            # write an entry for every node (avoid shadowing the outer loop var)
            for n in self._nodes:
                print >> host_file, n.get_hosts_entry()
            host_file.close()

    def _setup_passwordless_ssh(self):
        """ Properly configure passwordless ssh for CLUSTER_USER on all StarCluster nodes """
        log.info("Configuring passwordless ssh for root")
        mconn = self._master.ssh
        # create ssh key for root on the master and copy it to a local tempdir
        # (remove any old keys first)
        mconn.execute('rm /root/.ssh/id_rsa*', ignore_exit_status=True)
        mconn.execute('ssh-keygen -q -t rsa -f /root/.ssh/id_rsa -P ""')
        tempdir = tempfile.mkdtemp(prefix="starcluster-")
        temprsa = os.path.join(tempdir, 'id_rsa')
        temprsa_pub = os.path.join(tempdir, 'id_rsa.pub')
        tempknown_hosts = os.path.join(tempdir, 'known_hosts')
        mconn.get('/root/.ssh/id_rsa', temprsa)
        mconn.get('/root/.ssh/id_rsa.pub', temprsa_pub)

        # copy newly generated id_rsa for root to each node
        for node in self._nodes:
            conn = node.ssh
            conn.put(temprsa, '/root/.ssh/id_rsa')
            conn.put(temprsa_pub, '/root/.ssh/id_rsa.pub')
            conn.execute('chmod 400 /root/.ssh/id_rsa*')
            conn.execute('cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys')

        # Now that root's passwordless ssh is set up, make an initial
        # connection to every node to skip host key checking on first use.
        # This populates /root/.ssh/known_hosts, which is copied to
        # CLUSTER_USER's ~/.ssh directory below.
        for node in self._nodes:
            for name in node.network_names.values():
                mconn.execute('ssh -o "StrictHostKeyChecking=no" %s hostname' % name)

        # fetch the newly generated known_hosts file and distribute it to the
        # rest of the nodes in /root/.ssh
        mconn.get('/root/.ssh/known_hosts', tempknown_hosts)
        for node in self._nodes:
            node.ssh.put(tempknown_hosts, '/root/.ssh/known_hosts')

        # the temp directory is no longer needed after copying over the newly
        # generated keys and known_hosts; leave it for now to debug
        #shutil.rmtree(tempdir)

        log.info("Configuring passwordless ssh for user: %s" % self._user)
        # only needed on master, nfs takes care of the rest
        mconn.execute('mkdir -p /home/%s/.ssh' % self._user)
        pkfiles_list = ("/home/%(user)s/.ssh/id_rsa /home/%(user)s/.ssh/id_rsa.pub" %
                        {'user': self._user}).split()
        # check whether both key files exist (ie private key and public key);
        # compare the remote command's output directly rather than eval'ing it
        pkfiles_exist = [mconn.execute('test -f %s && echo "True" || echo "False"' %
                                       file)[0].strip() == 'True'
                         for file in pkfiles_list]
        has_all_pkfiles = (pkfiles_exist.count(True) == len(pkfiles_list))
        pkfiles = zip(pkfiles_list, pkfiles_exist)

        if not has_all_pkfiles:
            # This handles the case of only id_rsa or id_rsa.pub existing (ie
            # not both, for whatever reason). In this case, remove whichever
            # exists by itself and generate new rsa keys.
            for file, exists in pkfiles:
                log.debug('Checking for orphaned key file: %s | exists = %s' %
                          (file, exists))
                if exists:
                    log.debug('Removing orphaned key file: %s' % file)
                    mconn.execute('rm %s' % file)
            log.info("Generating local RSA ssh keys for user: %s" % self._user)
            mconn.execute('ssh-keygen -q -t rsa -f /home/%s/.ssh/id_rsa -P ""' %
                          self._user)
        else:
            # an existing rsa key with matching pub key exists, no need to regenerate
            log.info("Using existing RSA ssh keys found for user: %s" %
                     self._user)

        mconn.execute('cp /root/.ssh/authorized_keys /home/%s/.ssh/' %
                      self._user)
        mconn.execute('cp /root/.ssh/known_hosts /home/%s/.ssh/' % self._user)
        mconn.execute('chown -R %(user)s:%(user)s /home/%(user)s/.ssh' %
                      {'user': self._user})
        mconn.execute('chmod 400 /home/%s/.ssh/id_rsa*' % self._user)
        mconn.execute('cat /home/%(user)s/.ssh/id_rsa.pub >> /home/%(user)s/.ssh/authorized_keys' %
                      {'user': self._user})

    def _setup_ebs_volume(self):
        """ Mount each EBS volume specified in ~/.starclustercfg on the master node """
        # setup /etc/fstab on master to use block device if specified
        for vol in self._volumes:
            vol = self._volumes[vol]
            vol_id = vol.get("volume_id")
            device = vol.get("device")
            volume_partition = vol.get('partition')
            mount_path = vol.get('mount_path')
            if vol_id and volume_partition and mount_path:
                log.info("Mounting EBS volume %s on %s..." % (vol_id, mount_path))
                mconn = self._master.ssh
                if not mconn.path_exists(device):
                    log.warn("Cannot find device %s for volume %s" % (device,
                                                                      vol_id))
                    log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                    log.warn("This usually means there was a problem "
                             "attaching the EBS volume to the master node")
                    continue
                if not mconn.path_exists(volume_partition):
                    log.warn("Cannot find partition %s on volume %s" %
                             (volume_partition, vol_id))
                    log.warn("Not mounting %s on %s" % (vol_id, mount_path))
                    log.warn("This either means that the volume has not been "
                             "partitioned or that the partition specified "
                             "does not exist on the volume")
                    continue
                master_fstab = mconn.remote_file('/etc/fstab', mode='a')
                print >> master_fstab, "%s %s auto noauto,defaults 0 0" % (
                    volume_partition, mount_path)
                master_fstab.close()
                mconn.execute('mkdir -p %s' % mount_path)
                mconn.execute('mount %s' % mount_path)

    def _setup_nfs(self):
        """ Share /home and /opt/sge6 via nfs to all nodes """
        log.info("Configuring NFS...")

        master = self._master
        mconn = master.ssh

        # copy fresh sge installation files to /opt/sge6 and make CLUSTER_USER the owner
        mconn.execute('rm -rf /opt/sge6')
        mconn.execute('cp -r /opt/sge6-fresh /opt/sge6')
        mconn.execute('chown -R %(user)s:%(user)s /opt/sge6' %
                      {'user': self._user})

        # setup /etc/exports and start nfsd on master node
        nfs_export_settings = "(async,no_root_squash,no_subtree_check,rw)"
        etc_exports = mconn.remote_file('/etc/exports')
        for node in self._nodes:
            if not node.is_master():
                etc_exports.write('/home ' + node.private_dns_name +
                                  nfs_export_settings + '\n')
                etc_exports.write('/opt/sge6 ' + node.private_dns_name +
                                  nfs_export_settings + '\n')
                for vol in self._volumes:
                    vol = self._volumes[vol]
                    mount_path = vol.get('mount_path')
                    if not mount_path in ['/home', '/opt/sge6']:
                        etc_exports.write(mount_path + ' ' +
                                          node.private_dns_name +
                                          nfs_export_settings + '\n')
        etc_exports.close()

        mconn.execute('/etc/init.d/portmap start')
        mconn.execute('mount -t rpc_pipefs sunrpc /var/lib/nfs/rpc_pipefs/')
        mconn.execute('/etc/init.d/nfs start')
        mconn.execute('/usr/sbin/exportfs -r')
        # fix for xterm/mpi printing to stdout
        mconn.execute('mount -t devpts none /dev/pts', ignore_exit_status=True)

        # setup /etc/fstab and mount /home and /opt/sge6 on each node
        for node in self._nodes:
            if not node.is_master():
                nconn = node.ssh
                nconn.execute('/etc/init.d/portmap start')
                nconn.execute('mkdir /opt/sge6')
                nconn.execute('chown -R %(user)s:%(user)s /opt/sge6' %
                              {'user': self._user})
                nconn.execute('echo "%s:/home /home nfs user,rw,exec 0 0" >> /etc/fstab' %
                              master.private_dns_name)
                nconn.execute('echo "%s:/opt/sge6 /opt/sge6 nfs user,rw,exec 0 0" >> /etc/fstab' %
                              master.private_dns_name)
                nconn.execute('mount /home')
                nconn.execute('mount /opt/sge6')
                # fix for xterm
                nconn.execute('mount -t devpts none /dev/pts',
                              ignore_exit_status=True)
                for vol in self._volumes:
                    vol = self._volumes[vol]
                    mount_path = vol.get('mount_path')
                    if not mount_path in ['/home', '/opt/sge6']:
                        nconn.execute(
                            'echo "%s:%s %s nfs user,rw,exec 0 0" >> /etc/fstab' %
                            (master.private_dns_name, mount_path,
                             mount_path))
                        nconn.execute('mkdir -p %s' % mount_path)
                        nconn.execute('mount %s' % mount_path)

    def _setup_sge(self):
        """ Install Sun Grid Engine with a default parallel environment on StarCluster """
        log.info("Installing Sun Grid Engine...")

        # generate /etc/profile.d/sge.sh for each node
        for node in self._nodes:
            conn = node.ssh
            sge_profile = conn.remote_file("/etc/profile.d/sge.sh")
            arch = conn.execute("/opt/sge6/util/arch")[0]
            print >> sge_profile, sgeprofile_template % {'arch': arch}
            sge_profile.close()

        # setup sge auto install file
        master = self._master
        mconn = master.ssh

        admin_list = ''
        for node in self._nodes:
            admin_list = admin_list + " " + node.private_dns_name

        exec_list = admin_list
        submit_list = admin_list
        ec2_sge_conf = mconn.remote_file("/opt/sge6/ec2_sge.conf")

        # todo: add sge section to config values for some of the below
        print >> ec2_sge_conf, sgeinstall_template % (admin_list, exec_list,
                                                      submit_list)
        ec2_sge_conf.close()

        # installs sge in /opt/sge6 and starts qmaster and schedd on master node
        mconn.execute('cd /opt/sge6 && TERM=rxvt ./inst_sge -m -x -auto ./ec2_sge.conf',
                      silent=True, only_printable=True)

        # set all.q shell to bash
        mconn.execute('source /etc/profile && qconf -mattr queue shell "/bin/bash" all.q')

        # create sge parallel environment
        # first iterate through each machine and count the number of processors
        num_processors = 0
        for node in self._nodes:
            conn = node.ssh
            num_procs = int(conn.execute('cat /proc/cpuinfo | grep processor | wc -l')[0])
            num_processors += num_procs

        parallel_environment = mconn.remote_file("/tmp/pe.txt")
        print >> parallel_environment, sge_pe_template % num_processors
        parallel_environment.close()
        mconn.execute("source /etc/profile && qconf -Ap %s" %
                      parallel_environment.name)

        mconn.execute('source /etc/profile && qconf -mattr queue pe_list "orte" all.q')

        # todo: clean up /tmp/pe.txt
        log.info("Done Configuring Sun Grid Engine")

    def run(self, nodes, master, user, user_shell, volumes):
        """ Start cluster configuration """
        self._nodes = nodes
        self._master = master
        self._user = user
        self._user_shell = user_shell
        self._volumes = volumes
        self._setup_ebs_volume()
        self._setup_cluster_user()
        self._setup_scratch()
        self._setup_etc_hosts()
        self._setup_nfs()
        self._setup_passwordless_ssh()
        self._setup_sge()
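
Example usage

The ClusterSetup class above defines the interface that custom setup code
implements: StarCluster constructs the object and calls run() with the
cluster's nodes, the master node, the cluster user, the user's shell, and the
configured volumes. Below is a minimal sketch of a subclass under those
assumptions; MyClusterSetup and the package it installs are hypothetical
illustrations, while node.ssh.execute() mirrors the connection API used
throughout DefaultClusterSetup above.

from starcluster.clustersetup import ClusterSetup
from starcluster.logger import log

class MyClusterSetup(ClusterSetup):
    """Hypothetical example: install an extra package on every node."""
    def run(self, nodes, master, user, user_shell, volumes):
        # 'nodes' includes the master; each node exposes an ssh connection
        # whose execute() method runs a remote command, just as in
        # DefaultClusterSetup above
        log.info("Installing htop on all nodes")
        for node in nodes:
            node.ssh.execute('apt-get -y install htop')

A class like this runs exactly as DefaultClusterSetup does: instantiate it and
call run() with the same five arguments.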