#!/usr/bin/python3
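# Recursively crawl www.kubuntu.org starting from the front page, collecting
# internal links only, and write the list of visited pages to a text file.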
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urlparse
allLinks = []      # every internal link discovered so far
visitedPages = []  # pages that have already been fetched and parsed
baseUrl = 'http://www.kubuntu.org'

def isLinkStored(link, lst):
    # Plain membership test; equivalent to the original linear scan.
    return link in lst

def parsePage(pageUrl, depth):
    # Fetch one page, collect its internal links, then recurse into each
    # link that has not been visited yet.  `depth` only labels log output.
    links = []
    visitedPages.append(pageUrl)
    pageUrl = baseUrl + pageUrl
    page = urlopen(pageUrl)
    html = page.read()
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        if not link.has_attr('href'):
            continue
        link = link['href']
        parsed = urlparse(link)
        if parsed.scheme in ('http', 'https'):
            # Absolute URL: keep only links that point back at kubuntu.org,
            # and store just the path so it can be re-joined with baseUrl.
            if parsed.netloc in ('kubuntu.org', 'www.kubuntu.org'):
                link = parsed.path
                if len(link) > 0 and not isLinkStored(link, allLinks):
                    allLinks.append(link)
                    links.append(link)
                    print('adding link ' + link)
        elif link.startswith('/'):
            # Site-relative URL: store it as-is.
            if not isLinkStored(link, allLinks):
                allLinks.append(link)
                links.append(link)
                print('adding link ' + link)
    for lnk in links:
        if isLinkStored(lnk, visitedPages):
            continue
        print('parsing page ' + lnk + ' / depth: ' + str(depth))
        parsePage(lnk, depth + 1)

# Start the crawl at the site root, then dump every visited path to a file.
parsePage('/', 0)
with open('Kubuntu_current_site_links.txt', 'w') as f:
    for link in visitedPages:
        f.write(link + '\n')
        print(link)
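
# NOTE: depth is tracked but never used to stop the recursion, so a large
# site can exhaust Python's recursion limit.  A minimal, hedged sketch of a
# cap (MAX_DEPTH is a hypothetical constant, not part of the original):
#
#     MAX_DEPTH = 5
#     def parsePage(pageUrl, depth):
#         if depth > MAX_DEPTH:
#             return
#         ...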