Python: Zapisuje tylko ostatni wiersz wyjścia
Próbuję napisać program, który wyodrębnia adresy URL ze strony internetowej. Wynik jest dobry, ale gdy próbuję zapisać wynik do pliku, zapisywany jest tylko ostatni rekord. Oto kod:
import re
import urllib.request
# Retrieves URLs from the HTML source code of a website
def extractUrls(url, unique=True, sort=True, restrictToTld=None):
# Prepend "www." if not present
if url[0:4] != "www.":
url = "".join(["www.",url])
# Open a connection
with urllib.request.urlopen("http://" + url) as h:
# Grab the headers
headers = h.info()
# Default charset
charset = "ISO-8859-1"
# If a charset is in the headers then override the default
for i in headers:
match = re.search(r"charset=([\w\-]+)", headers[i], re.I)
if match != None:
charset = match.group(1).lower()
break
# Grab and decode the source code
source = h.read().decode(charset)
# Find all URLs in the source code
matches = re.findall(r"http\:\/\/(www.)?([a-z0-9\-\.]+\.[a-z]{2,6})\b", source, re.I)
# Abort if no URLs were found
if matches == None:
return None
# Collect URLs
collection = []
# Go over URLs one by one
for url in matches:
url = url[1].lower()
# If there are more than one dot then the URL contains
# subdomain(s), which we remove
if url.count(".") > 1:
temp = url.split(".")
tld = temp.pop()
url = "".join([temp.pop(),".",tld])
# Restrict to TLD if one is set
if restrictToTld:
tld = url.split(".").pop()
if tld != restrictToTld:
continue
# If only unique URLs should be returned
if unique:
if url not in collection:
collection.append(url)
# Otherwise just add the URL to the collection
else:
collection.append(url)
# Done
return sorted(collection) if sort else collection
# Test
url = "msn.com"
print("Parent:", url)
for x in extractUrls(url):
print("-", x)
f = open("f2.txt", "w+", 1)
f.write( x )
f.close()
Dane wyjściowe to:
Parent: msn.com
- 2o7.net
- atdmt.com
- bing.com
- careerbuilder.com
- delish.com
- discoverbing.com
- discovermsn.com
- facebook.com
- foxsports.com
- foxsportsarizona.com
- foxsportssouthwest.com
- icra.org
- live.com
- microsoft.com
- msads.net
- msn.com
- msnrewards.com
- myhomemsn.com
- nbcnews.com
- northjersey.com
- outlook.com
- revsci.net
- rsac.org
- s-msn.com
- scorecardresearch.com
- skype.com
- twitter.com
- w3.org
- yardbarker.com
[Finished in 0.8s]
Tylko plik „yardbarker.com” jest zapisywany w pliku. Doceniam pomoc, dziękuję.