PoC: Stackoverflow parser
Load the requests and BeautifulSoup libraries that will be used for scraping the webpage.
import sys
import re
import requests
from bs4 import BeautifulSoup as bs
As a proof of concept, we will grab a random Stack Overflow question that has more than one answer, an accepted answer, and code as part of an answer, to try and cover the major cases (images still need to be checked). One such example is https://stackoverflow.com/questions/357307/how-to-call-a-parent-class-function-from-derived-class-function
We’ll download the webpage for further scrutiny using the requests.get function and then parse it with BeautifulSoup.
url = sys.argv[1]
# url = "https://math.stackexchange.com/questions/1339709/how-to-derive-the-weak-form-of-the-pde"
base_url = url.split("/")[2]
response = requests.get(url)
response.raise_for_status()  # fail early so soup is never left undefined
soup = bs(response.text, features="html.parser")
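As an aside, splitting on "/" assumes a well-formed https://host/... URL; if that ever becomes brittle, the standard library's urllib.parse offers a safer route. The extract_host helper below is a hypothetical name, not part of the script:

# A minimal sketch of host extraction via the standard library;
# extract_host is a hypothetical helper, not used by the parser above.
from urllib.parse import urlparse

def extract_host(url):
    # netloc handles schemes, ports, and trailing paths for us
    return urlparse(url).netloc

# extract_host("https://stackoverflow.com/questions/357307/...") == "stackoverflow.com"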
There are two main classes of objects of interest: 1. the question and 2. the answers (one possibly carrying an accepted marker). Looking at the page source tells us that there are question and answer properties that can be used to filter them out. We’ll take a stab at finding the question first.
A point to note: we cannot directly search for question-hyperlink links because there are other questions on the page too, so we need to first extract the question header, from which we can get the required title.
question_header = soup.find('div', {'id': 'question-header'})
question_anchor = question_header.find('a', {'class': 'question-hyperlink'})
question_title = question_anchor.text.strip()
question_link = f"https://{base_url}{question_anchor.get('href')}"
# print((question_title, question_link))
Now that we have the title and the link to the question, we parse the question content itself.
prog = re.compile(r"<a .*?/a>")

def parse_post(post_banner):
    elements = post_banner.find_all(recursive=False)
    name_set = {'p', 'pre'}
    full_post = []
    for element in elements:
        if element.name in name_set:
            if element.name == "p":
                post = str(element)
                post = post.removeprefix("<p>").removesuffix("</p>")
                # Wrap code blocks in ~
                post = post.replace("<code>", "~").replace("</code>", "~")
                # Wrap emphasis in /
                post = post.replace("<em>", "/").replace("</em>", "/")
                # Wrap bold in *
                post = post.replace("<strong>", "*").replace("</strong>", "*")
                # Change href links to org links
                result = prog.finditer(post)
                for res in result:
                    temp = bs(res.group(0), features="html.parser")
                    href_link = temp.find('a').get('href')
                    href_text = temp.text
                    post = prog.sub(f"[[{href_link}][{href_text}]]", post, count=1)
                full_post.append(f"{post}\n")
            else:
                codeblock = element.find('code')
                # get('class') returns a list of classes, so index before splitting
                lang = codeblock.get('class', ["lang-"])[0].rsplit('-')[-1]
                begin_tag = f"#+begin_example {lang}"
                end_tag = "#+end_example"
                code = codeblock.text
                full_post.append(f"{begin_tag}\n{code}\n{end_tag}\n")
    post_text = ''.join(full_post)
    return post_text

question = soup.find('div', {'id': 'question'})
post_banner = question.find('div', {'class': 's-prose'})
question_text = parse_post(post_banner)
# print(question_text)
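To sanity-check the conversion, we can feed parse_post a small hand-written snippet; the HTML below is made up for illustration and not taken from Stack Overflow:

# Hand-written sample HTML, made up purely for illustration
sample_html = ('<div>'
               '<p>Use <code>super()</code>, see <a href="https://docs.python.org">the docs</a>.</p>'
               '<pre><code class="lang-py">super().foo()</code></pre>'
               '</div>')
sample_banner = bs(sample_html, features="html.parser").find('div')
print(parse_post(sample_banner))
# Use ~super()~, see [[https://docs.python.org][the docs]].
# #+begin_example py
# super().foo()
# #+end_example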
Each question also contains comments that can be really useful, so we will try to parse them too.
def parse_comments(cell):
    comments = []
    for comment in cell.find_all('div', {'class': 'comment-body'}):
        comment_str = comment.find('span').text
        try:
            comment_user = comment.find('a', {'class': 'comment-user'}).text
        except AttributeError:
            # some users render as a plain span instead of a profile link
            comment_user = comment.find('span', {'class': 'comment-user'}).text
        comments.append(f"+ (/{comment_user}/) {comment_str}")
    comments_text = '\n'.join(comments)
    return comments_text

question_comments = parse_comments(question)
# print(question_comments)
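A similar quick check for parse_comments, again with made-up markup:

# Hand-written comment markup, made up purely for illustration
sample_comment_html = ('<div><div class="comment-body">'
                       '<span>Nice question!</span>'
                       '<a class="comment-user">jane</a>'
                       '</div></div>')
print(parse_comments(bs(sample_comment_html, features="html.parser")))
# + (/jane/) Nice question!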
We can now combine all the parts to create a complete org question string.
question_org_str = f"* [[{question_link}][{question_title}]]\n\n{question_text}\nComments\n{question_comments}"
# print(question_org_str)
Now that we have covered most of the question components, we will start parsing the answers.
answers = []
for i, answer in enumerate(soup.find_all('div', {'class': 'answer'})):
    answer_post = answer.find('div', {'class': 'post-layout'})
    answer_cell = answer_post.find('div', {'class': 'answercell'}).find('div', {'class': 's-prose'})
    answer_str = parse_post(answer_cell)
    answer_comments = parse_comments(answer_post)
    answers.append(f"** Answer {i+1}\n\n{answer_str}\n Comments:\n{answer_comments}\n")
answer_org_str = '\n'.join(answers)
# print(answer_org_str)
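One thing the loop above does not do yet is flag the accepted answer. Going by the page source, the accepted answer's div appears to also carry an accepted-answer class, so a sketch along those lines (worth verifying across Stack Exchange sites) could be:

# Sketch: flag the accepted answer, assuming its <div> also carries
# an "accepted-answer" class (an assumption worth verifying per site)
for i, answer in enumerate(soup.find_all('div', {'class': 'answer'})):
    if 'accepted-answer' in answer.get('class', []):
        print(f"Answer {i+1} is the accepted one")
        # in the loop above this could become e.g.
        # answers.append(f"** Answer {i+1} (accepted)\n\n...")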
We now have all the required components to write an org file for the Stack Overflow question. Let’s compile everything and write it to a file.
import os

title_lowercase = question_title.lower()
filename = f"{os.path.dirname(__file__)}/{'_'.join(title_lowercase.split()).removesuffix('?')}.org"
full_file_str = (f":PROPERTIES:\n"
                 f":ID: {'-'.join(title_lowercase.split())}\n"
                 f":END:\n"
                 f"#+title: {question_title}\n\n"
                 f"{question_org_str}\n"
                 f"{answer_org_str}")
with open(filename, "w") as f:
    f.write(full_file_str)
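Question titles can contain characters that are unfriendly in filenames and org IDs (slashes, colons, quotes), so a small sanitizing helper may be worth adding at some point. slugify below is a hypothetical name reusing the re module imported earlier, not something the script uses yet:

# Hypothetical helper: collapse every non-alphanumeric run into a
# single underscore so titles with "/", ":" etc. stay filename-safe
def slugify(title):
    return re.sub(r"[^a-z0-9]+", "_", title.lower()).strip("_")

# slugify("How to call a parent class function?") == "how_to_call_a_parent_class_function"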
We can now create a bookmarklet to install in the browser. Then we can just click it on whatever Stack Overflow page we want to save for future reference.
$.ajax({
    type: "POST",
    url: "/Users/cmehta/Documents/org/roam/org/stackoverflow_parser.py " + document.URL,
    crossDomain: true,
}).done(function() {
    alert("Saved to roam!");
});
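Note that a browser cannot POST to a filesystem path directly, so something has to be listening locally and invoking the script. A minimal sketch of such an endpoint with Python's standard library follows; the port (8765) and URL layout are assumptions, not part of the original setup:

# Minimal sketch of a local endpoint the bookmarklet could POST to;
# the port and URL layout here are assumptions, not the original setup.
import subprocess
from http.server import BaseHTTPRequestHandler, HTTPServer

class SaveHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # expect the question URL in the request path,
        # e.g. POST /https://stackoverflow.com/questions/...
        target = self.path.lstrip("/")
        subprocess.run(["python", "stackoverflow_parser.py", target])
        self.send_response(200)
        self.end_headers()

HTTPServer(("localhost", 8765), SaveHandler).serve_forever()

With something like that running, the bookmarklet's url field would point at "http://localhost:8765/" + document.URL instead of the script path.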