Ubuntu Pastebin

Paste from yofel at Sun, 15 Mar 2015 16:03:41 +0000

Download as text
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/python3

from bs4 import BeautifulSoup
from urllib.request import urlopen

# Module-level crawl state, shared by the recursive parsePage() calls.
allLinks = []  # every unique site-relative link discovered so far
visitedPages = []  # pages already fetched, in visit order (written out at the end)
baseUrl = 'http://www.kubuntu.org'  # prefixed to site-relative paths before fetching

def isLinkStored(link, lst):
  """Return True if *link* is already present in *lst*, else False.

  link: a link string; lst: a list of link strings.
  """
  # Python's membership operator does exactly what the original
  # hand-rolled linear scan did, at C speed.
  return link in lst

def parsePage(pageUrl, depth):
  """Fetch baseUrl+pageUrl, record new kubuntu.org links, recurse into them.

  pageUrl: site-relative path (e.g. '/'); depth: recursion depth, used only
  for progress logging.
  Side effects: appends to the module-level allLinks and visitedPages lists
  and prints progress to stdout.
  """
  links = []  # new links found on this page only; drives the recursion below
  visitedPages.append(pageUrl)
  page = urlopen(baseUrl + pageUrl)
  html = page.read()
  page.close()  # release the socket once the body has been read

  # Explicit parser avoids bs4's "no parser specified" warning and keeps
  # parsing behavior consistent across installations.
  soup = BeautifulSoup(html, 'html.parser')

  for anchor in soup.find_all("a"):
    if not anchor.has_attr('href'):
      continue
    link = anchor['href']
    if link.find('http://') != -1:
      # Absolute URL: only follow it if it points back at kubuntu.org.
      if link.find('http://kubuntu.org') != -1 or link.find('http://www.kubuntu.org') != -1:
        # Reduce the absolute URL to its site-relative path.
        link = link.split('kubuntu.org')[1]
        if len(link) > 0 and link not in allLinks:
          allLinks.append(link)
          links.append(link)
          print('adding link ' + link)
    elif link.startswith('/'):
      # Site-relative URL: record it if unseen.
      if link not in allLinks:
        allLinks.append(link)
        links.append(link)
        print('adding link ' + link)

  # Depth-first crawl of the links discovered on this page.
  for lnk in links:
    if lnk in visitedPages:
      continue
    print('parsing page ' + lnk + "/ depth: " + str(depth))
    parsePage(lnk, depth + 1)

# Crawl starting from the site root; fills the visitedPages list.
parsePage('/', 0)

# Dump every visited page path, one per line. The context manager
# guarantees the file is closed even if a write or print raises.
with open('Kubuntu_current_site_links.txt', 'w') as f:
  for link in visitedPages:
    f.write(link + '\n')
    print(link)
Download as text