+*In[1]:*+ [source, ipython3] ---- import wmfdata as wmf ---- +*In[2]:*+ [source, ipython3] ---- session = wmf.spark.create_session(ship_python_env=True) ---- +*Out[2]:*+ ---- /home/neilpquinn-wmf/wmfdata-python/wmfdata/spark.py:139: FutureWarning: Spark 2 has been deprecated. Please migrate to Spark 3. See https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Spark/Migration_to_Spark_3 category=FutureWarning A conda environment is already packed at conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz. If you have recently installed new packages into your conda env, set force=True in conda_pack_kwargs and it will be repacked for you. Will ship conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz to remote Spark executors. PYSPARK_PYTHON=conda-2022-10-12T21.46.32_neilpquinn-wmf/bin/python3 SLF4J: Class path contains multiple SLF4J bindings. SLF4J: Found binding in [jar:file:/usr/lib/spark2/jars/slf4j-log4j12-1.7.16.jar!/org/slf4j/impl/StaticLoggerBinder.class] SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class] SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory] Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 22/11/17 20:51:02 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN). 22/11/17 20:51:03 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001. 22/11/17 20:51:03 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002. 22/11/17 20:51:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041. 22/11/17 20:51:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042. 
22/11/17 20:51:31 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13000. Attempting port 13001. 22/11/17 20:51:31 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13001. Attempting port 13002. ---- +*In[3]:*+ [source, ipython3] ---- session.stop() ---- +*In[4]:*+ [source, ipython3] ---- wmf.spark.get_active_session() ---- +*In[5]:*+ [source, ipython3] ---- wmf.spark.create_session(ship_python_env=True) ---- +*Out[5]:*+ ---- /home/neilpquinn-wmf/wmfdata-python/wmfdata/spark.py:139: FutureWarning: Spark 2 has been deprecated. Please migrate to Spark 3. See https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Spark/Migration_to_Spark_3 category=FutureWarning A conda environment is already packed at conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz. If you have recently installed new packages into your conda env, set force=True in conda_pack_kwargs and it will be repacked for you. 22/11/17 20:52:11 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN). 22/11/17 20:52:11 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001. 22/11/17 20:52:11 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002. 22/11/17 20:52:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041. 22/11/17 20:52:11 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042. 22/11/17 20:52:15 WARN Client: Same path resource file:/srv/home/neilpquinn-wmf/conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz#conda-2022-10-12T21.46.32_neilpquinn-wmf added multiple times to distributed cache. 22/11/17 20:52:15 ERROR SparkContext: Error initializing SparkContext. 
java.lang.IllegalArgumentException: Attempt to add (conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz#conda-2022-10-12T21.46.32_neilpquinn-wmf) multiple times to the distributed cache. at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:607) at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:598) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:598) at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:597) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:597) at org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:865) at org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:179) at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend.start(YarnClientSchedulerBackend.scala:57) at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:183) at org.apache.spark.SparkContext.<init>(SparkContext.scala:501) at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:423) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:238) at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80) at 
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.lang.Thread.run(Thread.java:750) 22/11/17 20:52:15 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered! 22/11/17 20:52:15 WARN MetricsSystem: Stopping a MetricsSystem that is not running --------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) /usr/lib/spark2/python/pyspark/sql/utils.py in deco(*a, **kw) 62 try: ---> 63 return f(*a, **kw) 64 except py4j.protocol.Py4JJavaError as e: /usr/lib/spark2/python/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 327 "An error occurred while calling {0}{1}{2}.\n". --> 328 format(target_id, ".", name), value) 329 else: Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext. : java.lang.IllegalArgumentException: Attempt to add (conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz#conda-2022-10-12T21.46.32_neilpquinn-wmf) multiple times to the distributed cache. 
at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:607) at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:598) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:598) at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:597) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:597) at org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:865) at org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:179) at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend.start(YarnClientSchedulerBackend.scala:57) at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:183) at org.apache.spark.SparkContext.<init>(SparkContext.scala:501) at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:423) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:238) at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80) at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.lang.Thread.run(Thread.java:750) During handling of the above 
exception, another exception occurred: IllegalArgumentException Traceback (most recent call last) /tmp/ipykernel_30551/3348695440.py in <module> ----> 1 wmf.spark.create_session(ship_python_env=True) ~/wmfdata-python/wmfdata/spark.py in create_session(type, app_name, extra_settings, ship_python_env) 251 app_name=app_name, 252 spark_config=config, --> 253 ship_python_env=ship_python_env 254 ) 255 ~/wmfdata-python/wmfdata/spark.py in create_custom_session(master, app_name, spark_config, ship_python_env, conda_pack_kwargs) 205 builder.config(k, v) 206 --> 207 return builder.getOrCreate() 208 209 def create_session( /usr/lib/spark2/python/pyspark/sql/session.py in getOrCreate(self) 171 for key, value in self._options.items(): 172 sparkConf.set(key, value) --> 173 sc = SparkContext.getOrCreate(sparkConf) 174 # This SparkContext may be an existing one. 175 for key, value in self._options.items(): /usr/lib/spark2/python/pyspark/context.py in getOrCreate(cls, conf) 365 with SparkContext._lock: 366 if SparkContext._active_spark_context is None: --> 367 SparkContext(conf=conf or SparkConf()) 368 return SparkContext._active_spark_context 369 /usr/lib/spark2/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls) 134 try: 135 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer, --> 136 conf, jsc, profiler_cls) 137 except: 138 # If an error occurs, clean up in order to allow future SparkContext creation: /usr/lib/spark2/python/pyspark/context.py in _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls) 196 197 # Create the Java SparkContext through Py4J --> 198 self._jsc = jsc or self._initialize_context(self._conf._jconf) 199 # Reset the SparkConf to the one actually used by the SparkContext in JVM. 
200 self._conf = SparkConf(_jconf=self._jsc.sc().conf()) /usr/lib/spark2/python/pyspark/context.py in _initialize_context(self, jconf) 304 Initialize SparkContext in function to allow subclass specific initialization 305 """ --> 306 return self._jvm.JavaSparkContext(jconf) 307 308 @classmethod /usr/lib/spark2/python/py4j/java_gateway.py in __call__(self, *args) 1523 answer = self._gateway_client.send_command(command) 1524 return_value = get_return_value( -> 1525 answer, self._gateway_client, None, self._fqn) 1526 1527 for temp_arg in temp_args: /usr/lib/spark2/python/pyspark/sql/utils.py in deco(*a, **kw) 77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace) 78 if s.startswith('java.lang.IllegalArgumentException: '): ---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace) 80 raise 81 return deco IllegalArgumentException: 'Attempt to add (conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz#conda-2022-10-12T21.46.32_neilpquinn-wmf) multiple times to the distributed cache.' ---- +*In[ ]:*+ [source, ipython3] ---- ----