Python: cómo extraer URL (texto sin formato / html, quoted-printable / base64 / 7bit) de un archivo de correo electrónico
He buscado en muchos lugares pero no he encontrado una lógica / script que extraiga las URL de los correos electrónicos correctamente. Entonces, estoy presentando lo que se me ocurrió. Esto está funcionando perfectamente para mí.
Esto puede manejar los tipos de contenido de texto sin formato y de texto html, y admite las codificaciones quoted-printable (imprimible entrecomillada), base64 y 7 bits.
NOTA: Escribí esto como parte de otra tarea, puede que tenga que ajustarlo para satisfacer sus necesidades. Publique cualquier pregunta, y puedo ayudar a responder.
Módulos para importar para que esto funcione:
import traceback
import BeautifulSoup
import re
from sets import Set
import email
import quopri, base64
Aquí están las API que escribí que harán este trabajo:
def decode_quote_printable_part(self, quo_pri_part):
    """
    Decode a quoted-printable encoded MIME part.

    :param quo_pri_part: MIME message part whose payload is quoted-printable
    :return: the decoded payload, or "" if decoding fails
    """
    try:
        quo_pri_payload = quo_pri_part.get_payload()
        return quopri.decodestring(quo_pri_payload)
    except Exception as err:
        # Best-effort: a malformed part must not abort the whole email scan.
        # print(...) call form works on both Python 2 and Python 3 (the
        # original `print "..."` statement is a syntax error on Python 3).
        print("ERROR - Exception when decoding quoted printable: %s" % err)
        return ""
def decode_base64_part(self, base64_part):
"""
Decodes base64 encoded MIME object
:param base64_part: MIME msg part
:return: decoded text, null if exception
"""
try:
decoded_part = base64.b64decode(base64_part)
return decoded_part
except Exception as err:
print "ERROR - Exception when decoding base64 part: %s" % err
return ""
def get_urls_from_html_part(self, html_code):
    """
    Parse HTML text and extract the href links from its <a> tags.

    The input must already be decoded.

    :param html_code: decoded HTML text
    :return: list of href links that contain "http"; empty list on error.
             Note: plain mailto: / javascript: / anchor links are filtered
             out by the "http" check below (the original docstring claimed
             mailto: links were included, which the code contradicts).
    """
    try:
        soup = BeautifulSoup.BeautifulSoup(html_code)
        html_urls = []
        for link in soup.findAll("a"):
            url = link.get("href")
            # Keep only http(s)-style links; skip empty hrefs and
            # non-http schemes.
            if url and "http" in url:
                html_urls.append(url)
        return html_urls
    except Exception as err:
        # Best-effort: broken markup must not abort the whole email scan.
        print("ERROR - Exception when parsing the html body: %s" % err)
        return []
def get_urls_from_plain_part(self, email_data):
    """
    Parse plain text and extract every http:// and https:// URL from it.

    A URL runs from its scheme prefix up to the first character that is
    not in the allowed URL-character set.

    :param email_data: plain text to parse
    :return: deduplicated list of URLs (order not guaranteed); empty list
             if no URLs are found or on error
    """
    try:
        # Characters accepted inside a URL; the scan stops at the first
        # character (compared case-insensitively) outside this set.
        pattern = "abcdefghijklmnopqrstuvwxyz0123456789./\~#%&()_-+=;?:[]!$*,@'^`<{|\""
        # Start offsets of every scheme occurrence.  They MUST be sorted:
        # http:// and https:// matches come from two separate finditer
        # passes, so the concatenated list is not in document order, and
        # unsorted offsets produced empty slices and dropped URLs.
        starts = [m.start() for m in re.finditer('http://', email_data)]
        starts.extend(n.start() for n in re.finditer('https://', email_data))
        starts.sort()
        if not starts:
            return []
        urls = []
        # Each URL is scanned from its own start offset up to the next
        # URL's start (or end of text for the last one).
        for begin, end in zip(starts, starts[1:] + [len(email_data)]):
            url = ""
            for ch in email_data[begin:end]:
                if ch.lower() in pattern:
                    url += ch
                else:
                    break
            urls.append(url)
        # Deduplicate with the built-in set; the `sets` module the
        # original used was removed in Python 3.
        return list(set(urls))
    except Exception as err:
        # Best-effort: bad input must not abort the whole email scan.
        print("ERROR - Exception when parsing plain text for urls: %s" % err)
        return []
def get_urls_list(self, msg):
    """
    Collect all the URLs from an email message.

    Walks every MIME part, decodes quoted-printable / base64 transfer
    encodings, then extracts URLs from text/plain and text/html parts.

    :param msg: email.message.Message object
    :return: a dictionary of URLs => {'http': [...], 'https': [...]}
    """
    urls = []
    for part in msg.walk():
        decoded_part = part.get_payload()
        # Message.get() is the idiomatic header lookup (returns None when
        # the header is absent) — the original called __getitem__ directly,
        # and looked the header up twice.
        encoding = part.get("Content-Transfer-Encoding")
        if encoding == "quoted-printable":
            # NOTE(review): this helper takes the part object, while the
            # base64 helper takes the raw payload — asymmetric but kept
            # to preserve the existing call contract.
            decoded_part = self.decode_quote_printable_part(part)
        elif encoding == "base64":
            decoded_part = self.decode_base64_part(part.get_payload())
        if part.get_content_subtype() == "plain":
            urls.extend(self.get_urls_from_plain_part(decoded_part))
        elif part.get_content_subtype() == "html":
            urls.extend(self.get_urls_from_html_part(decoded_part))
    # Split into http vs https buckets; any URL without "http://" falls
    # into the https bucket (extraction only yields http-containing URLs).
    final_urls = {'http': [], 'https': []}
    for url in urls:
        if "http://" in url:
            final_urls['http'].append(url)
        else:
            final_urls['https'].append(url)
    return final_urls
Aquí es cómo llamar a esta API:
try:
    with open(filename, 'r') as f:
        data = f.read()
    msg = email.message_from_string(data)
    final_urls = self.get_urls_list(msg)
except Exception as err:
    # Report the failure instead of the original bare `except: pass`,
    # which silently swallowed every error (including SystemExit and
    # KeyboardInterrupt) and hid real parsing bugs.
    print("ERROR - Could not extract URLs from %s: %s" % (filename, err))