unionAll resultando em StackOverflow
Eu fiz alguns progressos com minha própria pergunta (como carregar um quadro de dados de um fluxo de solicitações python que está baixando um arquivo csv?) no StackOverflow, mas estou recebendo um erro do StackOverflow:
import requests
import numpy as np
import pandas as pd
import sys
if sys.version_info[0] < 3:
from StringIO import StringIO
else:
from io import StringIO
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
chunk_size = 1024
url = "https://{0}:8443/gateway/default/webhdfs/v1/{1}?op=OPEN".format(host, filepath)
r = requests.get(url, auth=(username, password),
verify=False, allow_redirects=True,
stream=True)
df = None
curr_line = 1
remainder = ''
for chunk in r.iter_content(chunk_size):
txt = remainder + chunk
[lines, remainder] = txt.rsplit('\n', 1)
pdf = pd.read_csv(StringIO(lines), sep='|', header=None)
if df == None:
df = sqlContext.createDataFrame(pdf)
else:
df = df.unionAll(sqlContext.createDataFrame(pdf))
print df.count()
O stacktrace está aqui:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-4-b3a89df3c7d8> in <module>()
36 df = sqlContext.createDataFrame(pdf)
37 else:
---> 38 df = df.unionAll(sqlContext.createDataFrame(pdf))
39
40 #curr_line = curr_line + 1
/usr/local/src/spark160master/spark/python/pyspark/sql/dataframe.py in unionAll(self, other)
993 This is equivalent to `UNION ALL` in SQL.
994 """
--> 995 return DataFrame(self._jdf.unionAll(other._jdf), self.sql_ctx)
996
997 @since(1.3)
/usr/local/src/spark160master/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/local/src/spark160master/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/local/src/spark160master/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o19563.unionAll.
: java.lang.StackOverflowError
Não sei como consertar isso. Algumas dicas apreciadas.