Hacer mi propio rastreador web en Python que muestra la idea principal del rango de página
Estoy tratando de hacer un rastreador web que muestre una idea básica del rango de la página. Y el código para mí parece estar bien para mí, pero me devuelve errores e.x.
`Traceback (most recent call last):
File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 89, in <module>
webpages()
File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 17, in webpages
get_single_item_data(href)
File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 23, in get_single_item_data
source_code = requests.get(item_url)
File "C:\Python34\lib\site-packages\requests\api.py", line 65, in get
return request('get', url, **kwargs)
File "C:\Python34\lib\site-packages\requests\api.py", line 49, in request
response = session.request(method=method, url=url, **kwargs)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 447, in request
prep = self.prepare_request(req)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 378, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Python34\lib\site-packages\requests\models.py", line 303, in prepare
self.prepare_url(url, params)
File "C:\Python34\lib\site-packages\requests\models.py", line 360, in prepare_url
"Perhaps you meant http://{0}?".format(url))
requests.exceptions.MissingSchema: Invalid URL '//www.hm.com/lv/logout': No schema supplied. Perhaps you meant http:////www.hm.com/lv/logout?`
y la última fila de código que Python me devuelve después de ejecutarlo es:
//www.hm.com/lv/logout
Tal vez el problema es con dos//
pero estoy seguro, de todos modos, cuando intento rastrear otras páginas web e.x.http://en.wikipedia.org/wiki/Wiki me devuelveNone
y los mismos errores.
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from operator import itemgetter
all_links = defaultdict(int)
def webpages():
url = 'http://www.hm.com/lv/'
source_code = requests.get(url)
text = source_code.text
soup = BeautifulSoup(text)
for link in soup.findAll ('a'):
href = link.get('href')
print(href)
get_single_item_data(href)
return all_links
def get_single_item_data(item_url):
#if not item_url.startswith('http'):
#item_url = 'http' + item_url
source_code = requests.get(item_url)
text = source_code.text
soup = BeautifulSoup(text)
for link in soup.findAll('a'):
href = link.get('href')
if href and href.startswith('http://www.'):
if href:
all_links[href] += 1
print(href)
def sort_algorithm(list):
for index in range(1,len(list)):
value= list[index]
i = index - 1
while i>=0:
if value < list[i]:
list[i+1] = list[i]
list[i] = value
i=i -1
else:
break
vieni = ["", "viens", "divi", "tris", "cetri", "pieci",
"sesi", "septini", "astoni", "devini"]
padsmiti = ["", "vienpadsmit", "divpadsmit", "trispadsmit", "cetrpadsmit",
"piecpadsmit", 'sespadsmit', "septinpadsmit", "astonpadsmit", "devinpadsmit"]
desmiti = ["", "desmit", "divdesmit", "trisdesmit", "cetrdesmit",
"piecdesmit", "sesdesmit", "septindesmit", "astondesmit", "devindesmit"]
def num_to_words(n):
words = []
if n == 0:
words.append("zero")
else:
num_str = "{}".format(n)
groups = (len(num_str) + 2) // 3
num_str = num_str.zfill(groups * 3)
for i in range(0, groups * 3, 3):
h = int(num_str[i])
t = int(num_str[i + 1])
u = int(num_str[i + 2])
print()
print(vieni[i])
g = groups - (i // 3 + 1)
if h >= 1:
words.append(vieni[h])
words.append("hundred")
if int(num_str) % 100:
words.append("and")
if t > 1:
words.append(desmiti[t])
if u >= 1:
words.append(vieni[u])
elif t == 1:
if u >= 1:
words.append(padsmiti[u])
else:
words.append(desmiti[t])
else:
if u >= 1:
words.append(vieni[u])
return " ".join(words)
webpages()
for k, v in sorted(webpages().items(),key=itemgetter(1),reverse=True):
print(k, num_to_words(v))