unionAll resultando en StackOverflow
He progresado con mi propia pregunta (¿Cómo cargar un marco de datos desde una secuencia de solicitudes de Python que está descargando un archivo CSV?) en StackOverflow, pero recibo un error de StackOverflow:
import requests
import numpy as np
import pandas as pd
import sys
if sys.version_info[0] < 3:
from StringIO import StringIO
else:
from io import StringIO
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
chunk_size = 1024
url = "https://{0}:8443/gateway/default/webhdfs/v1/{1}?op=OPEN".format(host, filepath)
r = requests.get(url, auth=(username, password),
verify=False, allow_redirects=True,
stream=True)
df = None
curr_line = 1
remainder = ''
for chunk in r.iter_content(chunk_size):
txt = remainder + chunk
[lines, remainder] = txt.rsplit('\n', 1)
pdf = pd.read_csv(StringIO(lines), sep='|', header=None)
if df == None:
df = sqlContext.createDataFrame(pdf)
else:
df = df.unionAll(sqlContext.createDataFrame(pdf))
print df.count()
El stacktrace está aquí:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-4-b3a89df3c7d8> in <module>()
36 df = sqlContext.createDataFrame(pdf)
37 else:
---> 38 df = df.unionAll(sqlContext.createDataFrame(pdf))
39
40 #curr_line = curr_line + 1
/usr/local/src/spark160master/spark/python/pyspark/sql/dataframe.py in unionAll(self, other)
993 This is equivalent to `UNION ALL` in SQL.
994 """
--> 995 return DataFrame(self._jdf.unionAll(other._jdf), self.sql_ctx)
996
997 @since(1.3)
/usr/local/src/spark160master/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/local/src/spark160master/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/local/src/spark160master/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o19563.unionAll.
: java.lang.StackOverflowError
No estoy seguro de cómo solucionar esto. Cualquier consejo apreciado.