Submitting a .py script to Spark without a Hadoop installation
I have the following simple word-count Python script.
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)
from operator import add
f=sc.textFile("C:/Spark/spark-1.2.0/README.md")
wc=f.flatMap(lambda x: x.split(" ")).map(lambda x: (x,1)).reduceByKey(add)
print wc
wc.saveAsTextFile("wc_out.txt")
I launch the script with this command line:
spark-submit "C:/Users/Alexis/Desktop/SparkTest.py"
I am getting the following error:
Picked up _JAVA_OPTIONS: -Djava.net.preferIPv4Stack=true
15/04/20 18:58:01 WARN Utils: Your hostname, AE-LenovoUltra resolves to a loopback address: 127.0.1.2; using 192.168.1.63 instead (on interface net0)
15/04/20 18:58:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
15/04/20 18:58:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
15/04/20 18:58:11 ERROR Shell: Failed to locate the winutils binary in the hadoop binary path
java.io.IOException: Could not locate executable null\bin\winutils.exe in the Hadoop binaries.
        at org.apache.hadoop.util.Shell.getQualifiedBinPath(Shell.java:278)
        at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:300)
        at org.apache.hadoop.util.Shell.<clinit>(Shell.java:293)
        at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:867)
        at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:853)
        at org.apache.spark.util.Utils$.fetchFile(Utils.scala:411)
        at org.apache.spark.SparkContext.addFile(SparkContext.scala:969)
        at org.apache.spark.SparkContext$anonfun$12.apply(SparkContext.scala:280)
        at org.apache.spark.SparkContext$anonfun$12.apply(SparkContext.scala:280)
        at scala.collection.immutable.List.foreach(List.scala:318)
        at org.apache.spark.SparkContext.<init>(SparkContext.scala:280)
        at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:61)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
        at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
        at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
        at py4j.Gateway.invoke(Gateway.java:214)
        at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
        at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
        at py4j.GatewayConnection.run(GatewayConnection.java:207)
        at java.lang.Thread.run(Thread.java:745)
Traceback (most recent call last):
  File "C:/Users/Alexis/Desktop/SparkTest.py", line 3, in <module>
    sc = SparkContext(conf = conf)
  File "C:\Spark\spark-1.2.0\python\pyspark\context.py", line 105, in __init__
    conf, jsc)
  File "C:\Spark\spark-1.2.0\python\pyspark\context.py", line 153, in _do_init
    self._jsc = jsc or self._initialize_context(self._conf._jconf)
  File "C:\Spark\spark-1.2.0\python\pyspark\context.py", line 201, in _initialize_context
    return self._jvm.JavaSparkContext(jconf)
  File "C:\Spark\spark-1.2.0\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py", line 701, in __call__
  File "C:\Spark\spark-1.2.0\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.NullPointerException
        at java.lang.ProcessBuilder.start(ProcessBuilder.java:1010)
        at org.apache.hadoop.util.Shell.runCommand(Shell.java:404)
        at org.apache.hadoop.util.Shell.run(Shell.java:379)
        at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:589)
        at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:873)
        at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:853)
        at org.apache.spark.util.Utils$.fetchFile(Utils.scala:411)
        at org.apache.spark.SparkContext.addFile(SparkContext.scala:969)
        at org.apache.spark.SparkContext$anonfun$12.apply(SparkContext.scala:280)
        at org.apache.spark.SparkContext$anonfun$12.apply(SparkContext.scala:280)
        at scala.collection.immutable.List.foreach(List.scala:318)
        at org.apache.spark.SparkContext.<init>(SparkContext.scala:280)
        at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:61)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
        at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
        at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
        at py4j.Gateway.invoke(Gateway.java:214)
        at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
        at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
        at py4j.GatewayConnection.run(GatewayConnection.java:207)
        at java.lang.Thread.run(Thread.java:745)
To a Spark beginner like me, the problem appears to be this line: "ERROR Shell: Failed to locate the winutils binary in the hadoop binary path". However, the Spark documentation clearly states that a Hadoop installation is not required to run Spark in standalone mode.
What am I doing wrong?
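For reference, the workaround I keep seeing in other posts is to download a winutils.exe and point HADOOP_HOME at the folder that contains it before submitting; roughly like this (the C:\hadoop path is just an assumption, not my current setup):

set HADOOP_HOME=C:\hadoop
spark-submit "C:/Users/Alexis/Desktop/SparkTest.py"

But if standalone mode really does not need Hadoop, I would like to understand why a step like this would be needed on Windows at all.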