Otimizando código Python usando o SQLite3 + Mutagen
Estou no processo de melhorar um banco de dados de música de código aberto, que lê músicas da minha coleção e as armazena em um banco de dados SQLite. Por sua vez, sou capaz de alavancar o banco de dados para encontrar duplicatas, executar consultas na minha coleção e (se assim o desejar) encontrar músicas duplicadas na coleção. Para ler os metadados dos arquivos de música, estou aproveitando o Mutagen library e, para armazenar os metadados, estou usando o SQLite
Eu queria testar o código que eu havia criado em uma coleção considerável, então entrei em contato com colegas e familiares e me deparei com um tamanho total de teste de cerca de 90.000. Isso também consiste em informações mistas - as músicas são do formato .mp3, .ogg ou .flac.
Meu principal problema é com a velocidade - o código que eu tenho funciona, mas é inadequadamente lento. Em seu estado atual, ele percorre o tamanho do teste em aproximadamente 35: 45. Minha principal pergunta: o que posso fazer para melhorar o desempenho desse bloco de código? Acho que está diretamente relacionado a algo dentro do Mutagen diretamente ou do SQLite3, embora eu esteja aberto a sugestões sobre qual seria a abordagem ideal para melhorar a eficiência daqu
Passei por duas iterações para melhorar essa parte crítica do código. A primeira melhoria levou a um tempo de execução reduzido de 21: 30, mas isso ainda é horrível. Decidi refatorar o código para reduzir o número de chamadas de função feitas e uma tentativa de melhorar o desempenho. O resultado, no entanto, é uma regressão do desempenho, mas umenorm redução no número de chamadas de função feitas - o segundo lote é executado perto de 51: 51, o que é simplesmente inaceitáve
O que se segue em termos de código é para o tempo de execução "aprimorado" e o conjunto refatorado. Também estão anexados perfis separados para cada pedaço de códig
Bloco 1: Tempo de execução "aprimorado"def walk(self, d):
'''Walk down the file structure iteratively, gathering file names to be read in.'''
d = os.path.abspath(d)
dirpath = os.walk(d)
for folder in dirpath:
for f in folder[2]: # for each file in the folder...
supported = 'mp3', 'ogg', 'flac'
if f.split('.')[-1] in supported:
try:
self.parse(os.path.join(folder[0], f))
, if self.filecount == 2000 or self.leftover:
self.filecount = 0
try:
self.db.execute_batch_insert_statement(u"INSERT INTO song VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", self.buf)
except Exception, e:
print e.__unicode__()
finally:
del self.buf
self.buf = [] # wipe the buffers clean so we can repeat a batch parse again.
except Exception, e:
print e.__unicode__()
try:
self.db.execute_batch_insert_statement(u"INSERT INTO song VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", self.buf)
except Exception, e:
print e.__unicode__()
finally:
del self.buf
self.buf = [] # wipe the buffers clean so we can repeat a batch parse again.
def parse(self, filename):
'''Process and parse the music file to extract desired information.
It may be the case that, in the future, we require more information from a song than is provided at this time.
Examine all tags that can be retrieved from a mutagen.File object, and adjust the database's schema accordingly.'''
if ".ogg" in filename:
song = OggVorbis(filename)
elif ".mp3" in filename:
song = MP3(filename)
elif ".flac" in filename:
song = FLAC(filename)
else:
raise InvalidSongException(u"Song is not supported by K'atun at this time.")
filename = u'filename'
#song = mutagen.File(filename, easy=True)
artist, title, genre, track, album, bitrate, year, month = '', '', '', '', '', '', '', ''
try:
artist = song['artist'][0]
title = song['title'][0]
except Exception:
raise InvalidSongException(u"Cannot read " + filename + ": missing critical song information.")
if 'genre' in song:
genre = song['genre'][0]
else:
genre = u'Unknown'
if 'tracknumber' in song:
track = song['tracknumber'][0]
else:
track = 0
if 'album' in song:
album = song['album'][0]
else:
album = u'Unknown'
if 'date' in song:
year = song['date'][0]
else:
year = 'Unknown'
try:
bitrate = int(song.info.bitrate)
except AttributeError: # Likely due to us messing with FLAC
bitrate = 999999 # Set to a special flag value, to indicate that this is a lossless file.
self.buf.append((filename, artist, filename.split('.')[-1], title, genre, track, album, bitrate, year, time.time()))
self.filecount += 1
Sat Dec 24 21:24:23 2011 modified.dat
70626027 function calls (70576436 primitive calls) in 1290.127 CPU seconds
Ordered by: cumulative time
List reduced from 666 to 28 due to restriction <28>
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.033 0.033 1290.127 1290.127 parser.py:6(<module>)
1 0.000 0.000 1290.090 1290.090 parser.py:90(main)
1 0.000 0.000 1286.493 1286.493 parser.py:24(__init__)
1 1.826 1.826 1286.335 1286.335 parser.py:35(walk)
90744 2.376 0.000 1264.788 0.014 parser.py:55(parse)
90744 11.840 0.000 1250.401 0.014 lib/mutagen/__init__.py:158(File)
376019 613.881 0.002 613.881 0.002 {method 'seek' of 'file' objects}
90744 1.231 0.000 580.143 0.006 /usr/lib/pymodules/python2.7/mutagen/apev2.py:458(score)
671848 530.346 0.001 530.346 0.001 {method 'read' of 'file' objects}
90742 0.846 0.000 242.337 0.003 /usr/lib/pymodules/python2.7/mutagen/__init__.py:68(__init__)
63944 2.471 0.000 177.050 0.003 /usr/lib/pymodules/python2.7/mutagen/id3.py:1973(load)
63944 0.526 0.000 119.326 0.002 /usr/lib/pymodules/python2.7/mutagen/easyid3.py:161(__init__)
63944 4.649 0.000 118.077 0.002 /usr/lib/pymodules/python2.7/mutagen/id3.py:89(load)
26782 1.073 0.000 64.435 0.002 /usr/lib/pymodules/python2.7/mutagen/ogg.py:434(load)
127531 0.464 0.000 59.314 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:76(__fullread)
63944 1.078 0.000 54.060 0.001 /usr/lib/pymodules/python2.7/mutagen/mp3.py:68(__init__)
26782 0.638 0.000 53.613 0.002 /usr/lib/pymodules/python2.7/mutagen/ogg.py:379(find_last)
66487 3.167 0.000 50.136 0.001 /usr/lib/pymodules/python2.7/mutagen/mp3.py:106(__try)
855079 6.415 0.000 33.237 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:279(__read_frames)
816987 0.904 0.000 24.491 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:321(__load_framedata)
816987 2.805 0.000 23.587 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:1023(fromData)
60803/11257 0.370 0.000 19.036 0.002 /usr/lib/python2.7/os.py:209(walk)
11256 14.651 0.001 14.651 0.001 {posix.listdir}
816973 3.265 0.000 13.140 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:996(_readData)
879103 4.936 0.000 11.473 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:964(__init__)
872462 0.967 0.000 11.336 0.000 /usr/lib/pymodules/python2.7/mutagen/__init__.py:78(__getitem__)
63944 1.969 0.000 10.871 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:443(update_to_v24)
619380 1.396 0.000 8.521 0.000 /usr/lib/pymodules/python2.7/mutagen/easyid3.py:175(__getitem__)
Bloco 2: Código Refatoradodef walk(self, d):
'''Walk down the file structure iteratively, gathering file names to be read in.'''
d = os.path.abspath(d)
dirpath = os.walk(d)
parsecount = 0
start = time.time()
for folder in dirpath:
for f in folder[2]: # for each file in the folder...
filetype = f.split('.')[-1].lower()
if filetype == 'mp3':
try:
self.read_mp3(os.path.join(folder[0], f).decode('utf_8'))
except Exception, e:
print e.__unicode__()
elif filetype == 'ogg':
try:
self.read_vorbis(os.path.join(folder[0], f).decode('utf_8'))
except Exception, e:
print e.__unicode__()
elif filetype == 'flac':
try:
self.read_flac(os.path.join(folder[0], f).decode('utf_8'))
except Exception, e:
print e.__unicode__()
else:
continue
if self.filecount == 2000 or self.leftover:
self.filecount = 0
print "Time differential: %1.4f s" % (time.time() - start)
self.batch_commit()
try:
print "Wrapping up"
self.batch_commit()
except Exception, e:
print e.__unicode__()
finally:
print "Elapsed time: " + str(time.time()-start)
def batch_commit(self):
'''Insert new values into the database in large quantities.'''
self.db.execute_batch_insert_statement(u"INSERT INTO song VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", self.buf)
self.buf = []
def read_mp3(self, filename):
'''Read and extract an MP3 file's tags. This makes use of the ID3 standard, not the easy ID3 tag system.'''
artist, title, genre, track, album, bitrate, year = '', '', '', '', '', 0, ''
song = MP3(filename)
keys = song.keys()
try:
artist = song['TPE1'].__unicode__()
title = song['TIT2'].__unicode__()
except KeyError, e:
raise InvalidSongException(u"Cannot read " + filename + ": missing critical song information.")
genre = song['TCON'].__unicode__() if "TCON" in keys else u'Unknown'
track = song['TRCK'].__unicode__() if "TRCK" in keys else u'0'
album = song['TALB'].__unicode__() if "TALB" in keys else u'Unknown'
bitrate = int(song.info.bitrate)
year = song['TDRC'].__unicode__() if "TDRC" in keys else u'Unknown'
self.buf.append((filename, artist, filename.split('.')[-1], title, genre, track, album, bitrate, year, time.time()))
self.filecount += 1
def read_vorbis(self, filename):
'''Read and extract an Ogg Vorbis file's tags.'''
song = OggVorbis(filename)
artist, title, genre, track, album, bitrate, year = '', '', '', '', '', 0, ''
try:
artist = song['artist'][0]
title = song['title'][0]
except KeyError, e:
raise InvalidSongException(u"Cannot read " + filename + ": missing critical song information.")
genre = song['genre'][0] if genre else u'Unknown'
track = song['tracknumber'][0] if 'tracknumber' in song else u'0'
album = song['album'][0] if 'album' in song else u'Unknown'
bitrate = int(song.info.bitrate)
year = song['date'][0] if 'date' in song else 'Unknown'
self.buf.append((filename, artist, filename.split('.')[-1], title, genre, track, album, bitrate, year, time.time()))
self.filecount += 1
def read_flac(self, filename):
'''Read and extract a FLAC file's tags.'''
song = FLAC(filename)
artist, title, genre, track, album, bitrate, year = '', '', '', '', '', 0, ''
try:
artist = song['artist'][0]
title = song['title'][0]
except KeyError, e:
raise InvalidSongException(u"Cannot read " + filename + ": missing critical song information.")
genre = song['genre'][0] if genre else u'Unknown'
track = song['tracknumber'][0] if 'tracknumber' in song else u'0'
album = song['album'][0] if 'album' in song else u'Unknown'
bitrate = 999999 # Special flag for K'atun; will know that this is a lossless file
year = song['date'][0] if 'date' in song else 'Unknown'
self.buf.append((filename, artist, filename.split('.')[-1], title, genre, track, album, bitrate, year, time.time()))
self.filecount += 1
Mon Dec 26 03:22:34 2011 refactored.dat
59939763 function calls (59890172 primitive calls) in 3111.490 CPU seconds
Ordered by: cumulative time
List reduced from 559 to 28 due to restriction <28>
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.001 0.001 3111.490 3111.490 parser.py:6(<module>)
1 0.000 0.000 3111.477 3111.477 parser.py:138(main)
1 0.000 0.000 3108.242 3108.242 parser.py:27(__init__)
1 1.760 1.760 3108.062 3108.062 parser.py:40(walk)
46 0.103 0.002 2220.618 48.274 parser.py:78(batch_commit)
46 0.002 0.000 2220.515 48.272 db_backend.py:127(execute_batch_insert_statement)
46 2184.900 47.498 2184.900 47.498 {method 'executemany' of 'sqlite3.Connection' objects}
90747 0.515 0.000 845.343 0.009 /usr/lib/pymodules/python2.7/mutagen/__init__.py:68(__init__)
426651 640.459 0.002 640.459 0.002 {method 'read' of 'file' objects}
63945 1.847 0.000 582.267 0.009 parser.py:83(read_mp3)
63945 2.372 0.000 577.245 0.009 /usr/lib/pymodules/python2.7/mutagen/id3.py:1973(load)
63945 0.307 0.000 514.927 0.008 /usr/lib/pymodules/python2.7/mutagen/id3.py:72(__init__)
63945 0.256 0.000 514.620 0.008 /usr/lib/pymodules/python2.7/mutagen/_util.py:103(__init__)
63945 0.225 0.000 514.363 0.008 /usr/lib/pymodules/python2.7/mutagen/__init__.py:35(__init__)
63945 4.188 0.000 514.139 0.008 /usr/lib/pymodules/python2.7/mutagen/id3.py:89(load)
127533 0.802 0.000 455.713 0.004 /usr/lib/pymodules/python2.7/mutagen/id3.py:76(__fullread)
63945 1.029 0.000 432.574 0.007 /usr/lib/pymodules/python2.7/mutagen/id3.py:202(__load_header)
26786 0.504 0.000 270.216 0.010 parser.py:102(read_vorbis)
26786 1.095 0.000 267.578 0.010 /usr/lib/pymodules/python2.7/mutagen/ogg.py:434(load)
26782 0.627 0.000 143.492 0.005 /usr/lib/pymodules/python2.7/mutagen/ogg.py:379(find_last)
221337 121.448 0.001 121.448 0.001 {method 'seek' of 'file' objects}
97603 1.797 0.000 118.799 0.001 /usr/lib/pymodules/python2.7/mutagen/ogg.py:66(__init__)
26786 0.342 0.000 114.656 0.004 lib/mutagen/oggvorbis.py:40(__init__)
63945 0.646 0.000 58.809 0.001 lib/mutagen/mp3.py:68(__init__)
66480 3.377 0.000 57.489 0.001 lib/mutagen/mp3.py:106(__try)
47 35.609 0.758 35.609 0.758 {method 'commit' of 'sqlite3.Connection' objects}
855108 6.184 0.000 32.181 0.000 /usr/lib/pymodules/python2.7/mutagen/id3.py:279(__read_frames)
60803/11257 0.385 0.000 31.885 0.003 /usr/lib/python2.7/os.py:209(walk)