Page Menu · Home · Phabricator
Paste P40122

(An Untitled Masterwork)
Active · Public

Authored by nshahquinn-wmf on Nov 17 2022, 8:58 PM.
Tags
None
Referenced Files
F35792080: raw-paste-data.txt
Nov 17 2022, 8:58 PM
Subscribers
None
+*In[1]:*+
[source, ipython3]
----
import wmfdata as wmf
----
+*In[2]:*+
[source, ipython3]
----
session = wmf.spark.create_session(ship_python_env=True)
----
+*Out[2]:*+
----
/home/neilpquinn-wmf/wmfdata-python/wmfdata/spark.py:139: FutureWarning:
Spark 2 has been deprecated. Please migrate to Spark 3.
See https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Spark/Migration_to_Spark_3
category=FutureWarning
A conda environment is already packed at conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz. If you have recently installed new packages into your conda env, set force=True in conda_pack_kwargs and it will be repacked for you.
Will ship conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz to remote Spark executors.
PYSPARK_PYTHON=conda-2022-10-12T21.46.32_neilpquinn-wmf/bin/python3
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark2/jars/slf4j-log4j12-1.7.16.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/11/17 20:51:02 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
22/11/17 20:51:03 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.
22/11/17 20:51:03 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.
22/11/17 20:51:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/11/17 20:51:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/11/17 20:51:31 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13000. Attempting port 13001.
22/11/17 20:51:31 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13001. Attempting port 13002.
----
+*In[3]:*+
[source, ipython3]
----
session.stop()
----
+*In[4]:*+
[source, ipython3]
----
wmf.spark.get_active_session()
----
+*In[5]:*+
[source, ipython3]
----
wmf.spark.create_session(ship_python_env=True)
----
+*Out[5]:*+
----
/home/neilpquinn-wmf/wmfdata-python/wmfdata/spark.py:139: FutureWarning:
Spark 2 has been deprecated. Please migrate to Spark 3.
See https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Spark/Migration_to_Spark_3
category=FutureWarning
A conda environment is already packed at conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz. If you have recently installed new packages into your conda env, set force=True in conda_pack_kwargs and it will be repacked for you.
22/11/17 20:52:11 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
22/11/17 20:52:11 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.
22/11/17 20:52:11 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.
22/11/17 20:52:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/11/17 20:52:11 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/11/17 20:52:15 WARN Client: Same path resource file:/srv/home/neilpquinn-wmf/conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz#conda-2022-10-12T21.46.32_neilpquinn-wmf added multiple times to distributed cache.
22/11/17 20:52:15 ERROR SparkContext: Error initializing SparkContext.
java.lang.IllegalArgumentException: Attempt to add (conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz#conda-2022-10-12T21.46.32_neilpquinn-wmf) multiple times to the distributed cache.
at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:607)
at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:598)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:598)
at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:597)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:597)
at org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:865)
at org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:179)
at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend.start(YarnClientSchedulerBackend.scala:57)
at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:183)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:501)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:750)
22/11/17 20:52:15 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
22/11/17 20:52:15 WARN MetricsSystem: Stopping a MetricsSystem that is not running
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
/usr/lib/spark2/python/pyspark/sql/utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
/usr/lib/spark2/python/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.IllegalArgumentException: Attempt to add (conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz#conda-2022-10-12T21.46.32_neilpquinn-wmf) multiple times to the distributed cache.
at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:607)
at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10$$anonfun$apply$6.apply(Client.scala:598)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:598)
at org.apache.spark.deploy.yarn.Client$$anonfun$prepareLocalResources$10.apply(Client.scala:597)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:597)
at org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:865)
at org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:179)
at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend.start(YarnClientSchedulerBackend.scala:57)
at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:183)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:501)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:750)
During handling of the above exception, another exception occurred:
IllegalArgumentException Traceback (most recent call last)
/tmp/ipykernel_30551/3348695440.py in <module>
----> 1 wmf.spark.create_session(ship_python_env=True)
~/wmfdata-python/wmfdata/spark.py in create_session(type, app_name, extra_settings, ship_python_env)
251 app_name=app_name,
252 spark_config=config,
--> 253 ship_python_env=ship_python_env
254 )
255
~/wmfdata-python/wmfdata/spark.py in create_custom_session(master, app_name, spark_config, ship_python_env, conda_pack_kwargs)
205 builder.config(k, v)
206
--> 207 return builder.getOrCreate()
208
209 def create_session(
/usr/lib/spark2/python/pyspark/sql/session.py in getOrCreate(self)
171 for key, value in self._options.items():
172 sparkConf.set(key, value)
--> 173 sc = SparkContext.getOrCreate(sparkConf)
174 # This SparkContext may be an existing one.
175 for key, value in self._options.items():
/usr/lib/spark2/python/pyspark/context.py in getOrCreate(cls, conf)
365 with SparkContext._lock:
366 if SparkContext._active_spark_context is None:
--> 367 SparkContext(conf=conf or SparkConf())
368 return SparkContext._active_spark_context
369
/usr/lib/spark2/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
134 try:
135 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
--> 136 conf, jsc, profiler_cls)
137 except:
138 # If an error occurs, clean up in order to allow future SparkContext creation:
/usr/lib/spark2/python/pyspark/context.py in _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls)
196
197 # Create the Java SparkContext through Py4J
--> 198 self._jsc = jsc or self._initialize_context(self._conf._jconf)
199 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
200 self._conf = SparkConf(_jconf=self._jsc.sc().conf())
/usr/lib/spark2/python/pyspark/context.py in _initialize_context(self, jconf)
304 Initialize SparkContext in function to allow subclass specific initialization
305 """
--> 306 return self._jvm.JavaSparkContext(jconf)
307
308 @classmethod
/usr/lib/spark2/python/py4j/java_gateway.py in __call__(self, *args)
1523 answer = self._gateway_client.send_command(command)
1524 return_value = get_return_value(
-> 1525 answer, self._gateway_client, None, self._fqn)
1526
1527 for temp_arg in temp_args:
/usr/lib/spark2/python/pyspark/sql/utils.py in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: 'Attempt to add (conda-2022-10-12T21.46.32_neilpquinn-wmf.tgz#conda-2022-10-12T21.46.32_neilpquinn-wmf) multiple times to the distributed cache.'
----
+*In[ ]:*+
[source, ipython3]
----
----