A way to explore Wikipedia and store the results in a database. A function lets me search in the keywords, but ideally I would also like to search the end of the URL, i.e. the article title (see this page : https://stackoverflow.com/questions...a-url-string-up-into-separate-parts-in-python); a rough sketch of that idea is below the code. This code was put together from pieces of code written by other people; I just assembled them.
Python:
# Standard library
import base64
import csv
import heapq
import io
import logging
import math
import platform
import random
import re
import socket
import sqlite3
import string
import subprocess
import sys
import time
import urllib.request as urllib2
from random import choice
from urllib.parse import urljoin

# HTTP / HTML
import httpx
import requests
import socks
from bs4 import BeautifulSoup

# NLP and keyword extraction
import nltk
from gensim.summarization.summarizer import summarize
from gensim.summarization.textcleaner import split_sentences
from googletrans import Translator
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize, word_tokenize
from paraphraser import paraphrase
from rake_nltk import Rake

# Machine learning
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import cosine_similarity

# Misc
import colorama
from colorama import Fore, Back, Style, init
from googlesearch import search
init()
timeout = httpx.Timeout(5)
conn = sqlite3.connect('crawled.db')
c = conn.cursor()
# create the target table if it does not exist yet (columns assumed from the INSERT below)
c.execute("create table if not exists crawl_copy (url text, data text, keyword text)")
def listToString(s):
    # join the list items into one space-separated string
    return " ".join(s)
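# Pool of desktop user-agent strings; one is picked at random for each request.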
desktop_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0']
def random_headers():
    # pick a random user agent plus a standard Accept header
    return {'User-Agent': choice(desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
logging.basicConfig(
format='%(asctime)s %(levelname)s:%(message)s',
level=logging.INFO)
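# Remove text inside square brackets, mainly Wikipedia citation markers such as [1] or [note 2].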
def remove_text_inside_brackets(text, brackets="[]"):
count = [0] * (len(brackets) // 2) # count open/close brackets
saved_chars = []
for character in text:
for i, b in enumerate(brackets):
if character == b: # found bracket
kind, is_close = divmod(i, 2)
count[kind] += (-1)**is_close # `+1`: open, `-1`: close
if count[kind] < 0: # unbalanced bracket
count[kind] = 0 # keep it
else: # found bracket to remove
break
else: # character is not a [balanced] bracket
if not any(count): # outside brackets
saved_chars.append(character)
return ''.join(saved_chars)
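# Simple breadth-first crawler: download a page, store its text and keywords, queue its links.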
class Crawler:
    def __init__(self, urls=None):
        # avoid a mutable default argument; start from the given seed URLs
        self.visited_urls = []
        self.urls_to_visit = list(urls) if urls else []
    def download_url(self, url):
        # fetch the page through the local Tor SOCKS proxy with a random user agent
        proxies = {'http': 'socks5://localhost:9150', 'https': 'socks5://localhost:9150'}
        res = requests.get(url, proxies=proxies, headers=random_headers(), timeout=30)
        pagetext = res.text
        wiki = BeautifulSoup(pagetext, 'html.parser')
        # collect the text of every paragraph, stripping citation markers like [1]
        paragraphs = [remove_text_inside_brackets(p.getText()) for p in wiki.select('p')]
        clear_response = ' '.join(paragraphs)
        if not clear_response.strip():
            return pagetext  # nothing to index on this page
        # RAKE gives candidate keyphrases; TF-IDF + KMeans then picks the top terms
        r = Rake()
        r.extract_keywords_from_text(clear_response)
        ranked_phrases = r.get_ranked_phrases()[0:20]
        vectorizer = TfidfVectorizer(
            sublinear_tf=True, encoding='latin-1', stop_words='english')
        X = vectorizer.fit_transform(ranked_phrases)
        true_k = 1
        km = KMeans(n_clusters=true_k, init='k-means++',
                    max_iter=100, n_init=1, random_state=1)
        km.fit(X)
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        # get_feature_names() was removed in recent scikit-learn releases
        terms = vectorizer.get_feature_names_out()
        listkwd = []
        for i in range(true_k):
            for ind in order_centroids[i, :10]:
                listkwd.append(' %s' % terms[ind])
        newkeyword = listToString(listkwd)
        # store the URL and page text base64-encoded, plus the extracted keywords
        encodedStr = base64.b64encode(url.encode("utf-8")).decode("utf-8")
        encodedbot = base64.b64encode(clear_response.encode("utf-8")).decode("utf-8")
        c.execute("insert into crawl_copy (url, data, keyword) values (?, ?, ?)",
                  (encodedStr, encodedbot, newkeyword))
        conn.commit()
        return pagetext
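    # yield an absolute URL for every relative link found on the page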
def get_linked_urls(self, url, html):
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
path = link.get('href')
if path and path.startswith('/'):
path = urljoin(url, path)
yield path
def add_url_to_visit(self, url):
if url not in self.visited_urls and url not in self.urls_to_visit:
self.urls_to_visit.append(url)
def crawl(self, url):
html = self.download_url(url)
for url in self.get_linked_urls(url, html):
self.add_url_to_visit(url)
def run(self):
while self.urls_to_visit:
url = self.urls_to_visit.pop(0)
logging.info(f'Crawling: {url}')
try:
self.crawl(url)
except Exception:
logging.exception(f'Failed to crawl: {url}')
finally:
self.visited_urls.append(url)
if __name__ == '__main__':
Crawler(urls=['https://en.wikipedia.org/wiki/']).run()
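For the title search, one option is to take the last segment of the URL path and compare it against the base64-encoded url column that the crawler writes. Below is a rough sketch under those assumptions; title_from_url and search_by_title are just example names, not part of the code above.
Python:
import base64
import sqlite3
from urllib.parse import urlparse, unquote

def title_from_url(url):
    # 'https://en.wikipedia.org/wiki/Alan_Turing' -> 'Alan Turing'
    path = urlparse(url).path                # '/wiki/Alan_Turing'
    last = path.rstrip('/').split('/')[-1]   # 'Alan_Turing'
    return unquote(last).replace('_', ' ')   # 'Alan Turing'

def search_by_title(term):
    # decode the stored base64 URLs and keep those whose title contains the term
    conn = sqlite3.connect('crawled.db')
    c = conn.cursor()
    results = []
    for (encoded_url,) in c.execute("select url from crawl_copy"):
        url = base64.b64decode(encoded_url).decode('utf-8')
        if term.lower() in title_from_url(url).lower():
            results.append(url)
    conn.close()
    return results

print(search_by_title('turing'))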