Machen meines eigenen Webcrawlers in Python, der die Hauptidee des Page Rank zeigt
Ich versuche, einen Webcrawler zu erstellen, der die Grundidee des Page Rank zeigt. Und Code für mich scheint in Ordnung zu sein, gibt mir aber Fehler zurück e.x.
`Traceback (most recent call last):
File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 89, in <module>
webpages()
File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 17, in webpages
get_single_item_data(href)
File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 23, in get_single_item_data
source_code = requests.get(item_url)
File "C:\Python34\lib\site-packages\requests\api.py", line 65, in get
return request('get', url, **kwargs)
File "C:\Python34\lib\site-packages\requests\api.py", line 49, in request
response = session.request(method=method, url=url, **kwargs)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 447, in request
prep = self.prepare_request(req)
File "C:\Python34\lib\site-packages\requests\sessions.py", line 378, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Python34\lib\site-packages\requests\models.py", line 303, in prepare
self.prepare_url(url, params)
File "C:\Python34\lib\site-packages\requests\models.py", line 360, in prepare_url
"Perhaps you meant http://{0}?".format(url))
requests.exceptions.MissingSchema: Invalid URL '//www.hm.com/lv/logout': No schema supplied. Perhaps you meant http:////www.hm.com/lv/logout?`
und die letzte Codezeile, die mir Python nach dem Ausführen zurückgibt, ist:
//www.hm.com/lv/logout
Möglicherweise Problem ist mit zwei//
aber ich bin mir trotzdem sicher, wenn ich versuche andere Webseiten e.x aufzurufen.http: //en.wikipedia.org/wiki/Wik es gibt mir zurückNone
und dieselben Fehler.
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from operator import itemgetter
all_links = defaultdict(int)
def webpages():
url = 'http://www.hm.com/lv/'
source_code = requests.get(url)
text = source_code.text
soup = BeautifulSoup(text)
for link in soup.findAll ('a'):
href = link.get('href')
print(href)
get_single_item_data(href)
return all_links
def get_single_item_data(item_url):
#if not item_url.startswith('http'):
#item_url = 'http' + item_url
source_code = requests.get(item_url)
text = source_code.text
soup = BeautifulSoup(text)
for link in soup.findAll('a'):
href = link.get('href')
if href and href.startswith('http://www.'):
if href:
all_links[href] += 1
print(href)
def sort_algorithm(list):
for index in range(1,len(list)):
value= list[index]
i = index - 1
while i>=0:
if value < list[i]:
list[i+1] = list[i]
list[i] = value
i=i -1
else:
break
vieni = ["", "viens", "divi", "tris", "cetri", "pieci",
"sesi", "septini", "astoni", "devini"]
padsmiti = ["", "vienpadsmit", "divpadsmit", "trispadsmit", "cetrpadsmit",
"piecpadsmit", 'sespadsmit', "septinpadsmit", "astonpadsmit", "devinpadsmit"]
desmiti = ["", "desmit", "divdesmit", "trisdesmit", "cetrdesmit",
"piecdesmit", "sesdesmit", "septindesmit", "astondesmit", "devindesmit"]
def num_to_words(n):
words = []
if n == 0:
words.append("zero")
else:
num_str = "{}".format(n)
groups = (len(num_str) + 2) // 3
num_str = num_str.zfill(groups * 3)
for i in range(0, groups * 3, 3):
h = int(num_str[i])
t = int(num_str[i + 1])
u = int(num_str[i + 2])
print()
print(vieni[i])
g = groups - (i // 3 + 1)
if h >= 1:
words.append(vieni[h])
words.append("hundred")
if int(num_str) % 100:
words.append("and")
if t > 1:
words.append(desmiti[t])
if u >= 1:
words.append(vieni[u])
elif t == 1:
if u >= 1:
words.append(padsmiti[u])
else:
words.append(desmiti[t])
else:
if u >= 1:
words.append(vieni[u])
return " ".join(words)
webpages()
for k, v in sorted(webpages().items(),key=itemgetter(1),reverse=True):
print(k, num_to_words(v))