init_notebook.py (forked from Nealelab/cloudtools)
#!/usr/bin/env python
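"""Google Cloud Dataproc initialization action: on the master node, install
Anaconda, stage the Hail jar and zip from GCS, register a 'Hail' Jupyter
kernel with a custom Spark conf, and run the notebook server as a systemd
service on port 8123."""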
import os
import json
import time
from subprocess import call, Popen, PIPE
# get role of machine (master or worker)
role = Popen('/usr/share/google/get_metadata_value attributes/dataproc-role', shell=True, stdout=PIPE).communicate()[0]
# initialization actions to perform on master machine only
if role == 'Master':
    # download Anaconda Python 2.7 installation script
    call(['wget', '-P', '/home/anaconda2/', 'https://repo.continuum.io/archive/Anaconda2-4.3.1-Linux-x86_64.sh'])
    # install Anaconda in /home/anaconda2/ (-b runs the installer non-interactively,
    # -f tolerates an existing install prefix, -p sets the install prefix)
    call(['bash', '/home/anaconda2/Anaconda2-4.3.1-Linux-x86_64.sh', '-b', '-f', '-p', '/home/anaconda2/'])
    os.chmod('/home/anaconda2/', 0777)
    # additional packages to install: jupyter-spark shows Spark job progress in
    # the notebook; jgscm lets Jupyter store notebooks in Google Cloud Storage
    pkgs = [
        'lxml',
        'jupyter-spark',
        'jgscm'
    ]
    # use pip to install packages
    for pkg in pkgs:
        call(['/home/anaconda2/bin/pip', 'install', pkg])
    # get Hail hash and Spark version to use for Jupyter notebook, if set through cluster startup metadata
    spark = Popen('/usr/share/google/get_metadata_value attributes/SPARK', shell=True, stdout=PIPE).communicate()[0].strip()
    hash = Popen('/usr/share/google/get_metadata_value attributes/HASH', shell=True, stdout=PIPE).communicate()[0].strip()
    # default to Spark 2.0.2 if not otherwise specified through metadata
    if not spark:
        spark = '2.0.2'
    # default to latest Hail build if none specified through metadata
    if not hash:
        hash = Popen(['gsutil', 'cat', 'gs://hail-common/latest-hash-spark{}.txt'.format(spark)], stdout=PIPE, stderr=PIPE).communicate()[0].strip()
    # Hail jar
    jar = Popen('/usr/share/google/get_metadata_value attributes/JAR', shell=True, stdout=PIPE).communicate()[0].strip()
    if jar:
        hail_jar = jar.rsplit('/')[-1]
        jar_path = jar
    else:
        hail_jar = 'hail-hail-is-master-all-spark{0}-{1}.jar'.format(spark, hash)
        jar_path = 'gs://hail-common/' + hail_jar
    # Hail zip
    zip = Popen('/usr/share/google/get_metadata_value attributes/ZIP', shell=True, stdout=PIPE).communicate()[0].strip()
    if zip:
        hail_zip = zip.rsplit('/')[-1]
        zip_path = zip
    else:
        hail_zip = 'pyhail-hail-is-master-{}.zip'.format(hash)
        zip_path = 'gs://hail-common/' + hail_zip
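    # e.g. with spark = '2.0.2' and a hypothetical hash 'abc123', the defaults
    # resolve to hail_jar = 'hail-hail-is-master-all-spark2.0.2-abc123.jar' and
    # hail_zip = 'pyhail-hail-is-master-abc123.zip', both under gs://hail-common/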
    # make directory for Hail and Jupyter notebook related files
    os.mkdir('/home/hail/')
    os.chmod('/home/hail/', 0777)
    # copy Hail jar and zip to local directory on master node
    call(['gsutil', 'cp', jar_path, '/home/hail/'])
    call(['gsutil', 'cp', zip_path, '/home/hail/'])
    # create Jupyter kernel spec file
    kernel = {
        'argv': [
            '/home/anaconda2/bin/python',
            '-m',
            'ipykernel',
            '-f',
            '{connection_file}'
        ],
        'display_name': 'Hail',
        'language': 'python',
        'env': {
            'PYTHONHASHSEED': '0',
            'SPARK_HOME': '/usr/lib/spark/',
            'SPARK_CONF_DIR': '/home/hail/conf/',
            'PYTHONPATH': '/usr/lib/spark/python/:/usr/lib/spark/python/lib/py4j-0.10.3-src.zip:/home/hail/{}'.format(hail_zip)
        }
    }
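    # note: the py4j zip on PYTHONPATH must match the version bundled with the
    # installed Spark (0.10.3 corresponds to Spark 2.0.x); a different SPARK
    # metadata value would likely also require a different py4j path here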
    # write kernel spec file to default Jupyter kernel directory
    os.makedirs('/home/anaconda2/share/jupyter/kernels/hail/')
    with open('/home/anaconda2/share/jupyter/kernels/hail/kernel.json', 'wb') as f:
        json.dump(kernel, f)
    # make directory for custom Spark conf
    os.mkdir('/home/hail/conf')
    # copy conf files to custom directory
    call(['cp', '/etc/spark/conf/spark-defaults.conf', '/home/hail/conf/spark-defaults.conf'])
    call(['cp', '/etc/spark/conf/spark-env.sh', '/home/hail/conf/spark-env.sh'])
    # modify custom Spark conf file to reference Hail jar and zip
    with open('/home/hail/conf/spark-defaults.conf', 'ab') as f:
        opts = [
            'spark.files=/home/hail/{}'.format(hail_jar),
            'spark.submit.pyFiles=/home/hail/{}'.format(hail_zip),
            'spark.driver.extraClassPath=./{}'.format(hail_jar),
            'spark.executor.extraClassPath=./{}'.format(hail_jar)
        ]
        f.write('\n'.join(opts))
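    # spark.files ships the jar into each executor's working directory, which is
    # why the extraClassPath entries can reference it as './<jar>'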
    # add Spark variable designating Anaconda Python executable as the default on driver, in both custom and default conf files
    with open('/home/hail/conf/spark-env.sh', 'ab') as f_custom, open('/etc/spark/conf/spark-env.sh', 'ab') as f_default:
        f_custom.write('PYSPARK_DRIVER_PYTHON=/home/anaconda2/bin/python' + '\n')
        f_default.write('PYSPARK_DRIVER_PYTHON=/home/anaconda2/bin/python' + '\n')
    # create Jupyter configuration file
    call(['mkdir', '-p', '/home/anaconda2/etc/jupyter/'])
    with open('/home/anaconda2/etc/jupyter/jupyter_notebook_config.py', 'wb') as f:
        opts = [
            'c.Application.log_level = "DEBUG"',
            'c.NotebookApp.ip = "127.0.0.1"',
            'c.NotebookApp.open_browser = False',
            'c.NotebookApp.port = 8123',
            'c.NotebookApp.token = ""',
            'c.NotebookApp.contents_manager_class = "jgscm.GoogleStorageContentManager"'
        ]
        f.write('\n'.join(opts) + '\n')
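    # the server listens only on 127.0.0.1:8123 with no token, so it is meant to
    # be reached through an SSH tunnel, e.g. (hypothetical cluster name):
    #   gcloud compute ssh my-cluster-m -- -L 8123:localhost:8123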
    # set up the jupyter-spark extension
    call(['/home/anaconda2/bin/jupyter', 'serverextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/home/anaconda2/bin/jupyter', 'nbextension', 'install', '--user', '--py', 'jupyter_spark'])
    call(['/home/anaconda2/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'jupyter_spark'])
    call(['/home/anaconda2/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'widgetsnbextension'])
    # create systemd service file for Jupyter notebook server process
    with open('/lib/systemd/system/jupyter.service', 'wb') as f:
        opts = [
            '[Unit]',
            'Description=Jupyter Notebook',
            'After=hadoop-yarn-resourcemanager.service',
            '[Service]',
            'Type=simple',
            'User=root',
            'Group=root',
            'WorkingDirectory=/home/hail/',
            'ExecStart=/home/anaconda2/bin/python /home/anaconda2/bin/jupyter notebook',
            'Restart=always',
            'RestartSec=1',
            '[Install]',
            'WantedBy=multi-user.target'
        ]
        f.write('\n'.join(opts) + '\n')
    # add Jupyter service to autorun and start it
    call(['systemctl', 'daemon-reload'])
    call(['systemctl', 'enable', 'jupyter'])
    call(['service', 'jupyter', 'start'])
    # sleep for 30 seconds to allow the Jupyter notebook server to start
    time.sleep(30)
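# a minimal sketch of how this script is typically invoked, as an initialization
# action at cluster creation time (bucket name and hash are hypothetical):
#   gcloud dataproc clusters create my-cluster \
#     --initialization-actions gs://my-bucket/init_notebook.py \
#     --metadata 'SPARK=2.0.2,HASH=abc123'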