¿Cómo ordenar archivos enormes con Python?
Encontré algo de este código prometedor en activestate.com para ordenar archivos enormes. Estoy intentando ejecutarlo en el intérprete predeterminado de Python 2.6.5 en Ubuntu 10.04. Cuando intento ejecutarlo en un pequeño archivo de prueba, obtengo el siguiente rastreo de error. Pedí ayuda en activestate.com, pero este hilo ha estado en silencio durante más de 18 meses. ¿Hay alguien aquí que vea una solución obvia?
Gracias.
## {{{ http://code.activestate.com/recipes/576755/ (r3)
# based on Recipe 466302: Sorting big files the Python 2.4 way
# by Nicolas Lehuen
import os
from tempfile import gettempdir
from itertools import islice, cycle
from collections import namedtuple
import heapq
Keyed = namedtuple("Keyed", ["key", "obj"])
def merge(key=None, *iterables):
# based on code posted by Scott David Daniels in c.l.p.
# http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d
if key is None:
keyed_iterables = iterables
else:
keyed_iterables = [(Keyed(key(obj), obj) for obj in iterable)
for iterable in iterables]
for element in heapq.merge(*keyed_iterables):
yield element.obj
def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None):
if tempdirs is None:
tempdirs = []
if not tempdirs:
tempdirs.append(gettempdir())
chunks = []
try:
with open(input,'rb',64*1024) as input_file:
input_iterator = iter(input_file)
for tempdir in cycle(tempdirs):
current_chunk = list(islice(input_iterator,buffer_size))
if not current_chunk:
break
current_chunk.sort(key=key)
output_chunk = open(os.path.join(tempdir,'%06i'%len(chunks)),'w+b',64*1024)
chunks.append(output_chunk)
output_chunk.writelines(current_chunk)
output_chunk.flush()
output_chunk.seek(0)
with open(output,'wb',64*1024) as output_file:
output_file.writelines(merge(key, *chunks))
finally:
for chunk in chunks:
try:
chunk.close()
os.remove(chunk.name)
except Exception:
pass
Traza de error:
Traceback (most recent call last):
File "./batch_sort.py", line 108, in <module>
batch_sort(args[0],args[1],options.key,options.buffer_size,options.tempdirs)
File "./batch_sort.py", line 54, in batch_sort
output_file.writelines(merge(key, *chunks))
File "./batch_sort.py", line 30, in merge
yield element.obj
AttributeError: 'str' object has no attribute 'obj'