Python: cómo extraer URL (texto sin formato / HTML, quoted-printable / base64 / 7bit) de un archivo de correo electrónico

He buscado en muchos lugares pero no he encontrado una lógica / script que extraiga las URL de los correos electrónicos correctamente. Entonces, estoy presentando lo que se me ocurrió. Esto está funcionando perfectamente para mí.

Puede manejar los tipos de contenido de texto sin formato (text/plain) y HTML (text/html), y admite las codificaciones de transferencia quoted-printable, base64 y 7bit.

NOTA: Escribí esto como parte de otra tarea, puede que tenga que ajustarlo para satisfacer sus necesidades. Publique cualquier pregunta, y puedo ayudar a responder.

Módulos para importar para que esto funcione:

import traceback
import BeautifulSoup
import re
from sets import Set
import email
import quopri, base64

Estos son los métodos que escribí para hacer este trabajo:

   def decode_quote_printable_part(self, quo_pri_part):
        """
        Decode the payload of a quoted-printable encoded MIME part.

        :param quo_pri_part: MIME message part (any object exposing
            ``get_payload()``)
        :return: the payload as decoded by ``quopri.decodestring`` (a byte
            string), or an empty string ``""`` if any exception is raised
        """
        try:
            return quopri.decodestring(quo_pri_part.get_payload())
        except Exception as err:
            # Best-effort decoding: report the problem and hand back an empty
            # payload instead of aborting the caller's scan.
            # The single-argument parenthesised form prints the same text on
            # Python 2 and is also valid syntax on Python 3.
            print("ERROR - Exception when decoding quoted printable: %s" % err)
            return ""

    def decode_base64_part(self, base64_part):
        """
        Decode a base64 encoded payload.

        :param base64_part: the base64 text to decode. Note this is the raw
            payload (the caller in this file passes ``part.get_payload()``),
            not the MIME part object itself.
        :return: the decoded byte string, or an empty string ``""`` if any
            exception is raised (e.g. bad padding or a non-string input)
        """
        try:
            return base64.b64decode(base64_part)
        except Exception as err:
            # Best-effort decoding: report and return an empty payload.
            # The single-argument parenthesised form prints the same text on
            # Python 2 and is also valid syntax on Python 3.
            print("ERROR - Exception when decoding base64 part: %s" % err)
            return ""

    def get_urls_from_html_part(self, html_code):
        """
        Parse the given HTML text and collect the ``href`` values of its
        ``<a>`` tags.

        The input should already be decoded. Only hrefs that contain the
        substring ``"http"`` are kept, so links such as a plain ``mailto:``
        address are dropped. The result is not deduplicated.

        :param html_code: decoded HTML text
        :return: list of the matching href values, in the order ``findAll``
            yields the anchors; an empty list if any exception is raised
        """
        try:
            soup = BeautifulSoup.BeautifulSoup(html_code)
            hrefs = (link.get("href") for link in soup.findAll("a"))
            # `url and ...` skips anchors without an href (None) or with an
            # empty one.
            return [url for url in hrefs if url and "http" in url]
        except Exception as err:
            # Best-effort parsing: report and return no URLs.
            # The single-argument parenthesised form prints the same text on
            # Python 2 and is also valid syntax on Python 3.
            print("ERROR - Exception when parsing the html body: %s" % err)
            return []

    def get_urls_from_plain_part(self, email_data):
        """
        Parse the given plain text and extract the URLs out of it.

        Every occurrence of ``http://`` or ``https://`` (case-sensitive)
        starts a URL. The URL extends until the first character outside the
        allowed set below, or until the next scheme occurrence, whichever
        comes first (so ``http://a/?u=http://b`` yields two URLs).

        :param email_data: plain text to parse; a Python 3 ``bytes`` object
            is accepted and decoded as latin-1 first
        :return: list of URLs, deduplicated, in order of first appearance;
            an empty list if nothing is found or any exception is raised
        """
        # Characters (compared after lower-casing) that may continue a URL.
        # Same character set as before; the backslash is now escaped
        # explicitly instead of relying on the invalid "\~" escape.
        allowed = frozenset(
            "abcdefghijklmnopqrstuvwxyz0123456789./\\~#%&()_-+=;?:[]!$*,@'^`<{|\""
        )
        try:
            # On Python 3 the transfer decoders return bytes, which a str
            # pattern cannot search. latin-1 maps every byte to one character
            # and non-ASCII characters end a URL anyway. On Python 2 `bytes`
            # is `str`, so this branch is skipped.
            if isinstance(email_data, bytes) and not isinstance(email_data, str):
                email_data = email_data.decode("latin-1")

            # One scan yields the start offsets in positional order. Collecting
            # "http://" and "https://" separately left them unsorted, which
            # produced empty slices and dropped URLs.
            starts = [m.start() for m in re.finditer(r"https?://", email_data)]

            urls = []
            seen = set()
            # Pair each start with the next one; the last URL may run to the
            # end of the text.
            for begin, end in zip(starts, starts[1:] + [len(email_data)]):
                url_chars = []
                for ch in email_data[begin:end]:
                    if ch.lower() not in allowed:
                        break
                    url_chars.append(ch)
                url = "".join(url_chars)
                if url not in seen:
                    seen.add(url)
                    urls.append(url)
            return urls
        except Exception as err:
            # Best-effort parsing: report and return no URLs.
            print("ERROR - Exception when parsing plain text for urls: %s" % err)
            return []

    def get_urls_list(self, msg):
        """
        Collect all the URLs from an email.

        Walks every part of the message, undoes the quoted-printable or
        base64 transfer encoding when one is declared, and scans ``plain``
        and ``html`` subtype parts for URLs. Parts of any other subtype are
        ignored.

        :param msg: email message object
        :return: A dictionary of URLs => final_urls = {'http': [], 'https': []}
        """
        urls = []
        for part in msg.walk():
            # A multipart container's payload is a list of sub-parts, which
            # walk() visits on its own.
            if part.is_multipart():
                continue
            subtype = part.get_content_subtype()
            if subtype not in ("plain", "html"):
                # Nothing is extracted from other parts, so do not decode them.
                continue

            # Encoding tokens are case-insensitive (RFC 2045), so "Base64"
            # and "Quoted-Printable" must match too. A missing header is None.
            encoding = str(part["Content-Transfer-Encoding"] or "").strip().lower()
            if encoding == "quoted-printable":
                decoded_part = self.decode_quote_printable_part(part)
            elif encoding == "base64":
                decoded_part = self.decode_base64_part(part.get_payload())
            else:
                decoded_part = part.get_payload()

            if subtype == "plain":
                urls.extend(self.get_urls_from_plain_part(decoded_part))
            else:
                urls.extend(self.get_urls_from_html_part(decoded_part))

        final_urls = {'http': [], 'https': []}
        for url in urls:
            # Classify by whichever scheme appears first, so a URL such as
            # "https://a/?next=http://b" is filed under 'https'. Anything
            # without "http://" still falls into 'https', as before.
            http_pos = url.find("http://")
            https_pos = url.find("https://")
            if http_pos != -1 and (https_pos == -1 or http_pos < https_pos):
                final_urls['http'].append(url)
            else:
                final_urls['https'].append(url)
        return final_urls

Así es como se llama a esta API:

# Usage example: read a raw email file from disk and collect its URLs.
# `filename` and `self` come from the surrounding code (not shown here).
# Start from an empty result so `final_urls` is defined even on failure.
final_urls = {'http': [], 'https': []}
try:
    with open(filename, 'r') as f:
        data = f.read()
    msg = email.message_from_string(data)
    final_urls = self.get_urls_list(msg)
except Exception as err:
    # Still best-effort, but report the failure instead of hiding it. Unlike
    # a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
    print("ERROR - Exception when extracting urls from email file: %s" % err)

Respuestas a la pregunta(0)

Su respuesta a la pregunta