Source Code

src/blog/scraper.py
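
"""Scrape a Bear blog: read the post list view, fetch each article,
and write it out as Markdown with YAML frontmatter."""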

import os
import re
import requests
from bs4 import BeautifulSoup


class BearScraper:
    def __init__(self, base_url, output_dir="content/blog/scrape"):
        self.base_url = base_url.rstrip("/")
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_description_from_html(self, content_html, max_chars=200):
        """Return a short plain-text description for the frontmatter:
        the first sentence of the content, or a truncated prefix."""
        text = BeautifulSoup(content_html, "html.parser").get_text(
            separator=" ", strip=True
        )
        if not text:
            return ""

        # Prefer the first complete sentence, provided it fits the cap.
        m = re.search(r'(.+?[.!?])(\s|$)', text)
        if m and len(m.group(1).strip()) <= max_chars:
            return m.group(1).strip()

        desc = text[:max_chars].rstrip()
        if len(text) > max_chars:
            desc += "..."
        return desc

    def indent_block(self, text, indent="  "):
        """Indent every line of text for use in a YAML block scalar."""
        if not text:
            # A bare indent keeps the "description: |" block scalar valid.
            return indent
        return "\n".join(indent + line for line in text.splitlines())

    def fetch(self, url):
        """GET a URL and return the body, raising on HTTP errors."""
        # requests never times out by default; cap it so a dead host
        # can't stall the whole scrape.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        return r.text

    def parse_list_view(self):
        """Collect url, title, and date for every post in the list view."""
        html = self.fetch(self.base_url)
        soup = BeautifulSoup(html, "html.parser")
        items = []

        for li in soup.select("ul.embedded.blog-posts li"):
            a = li.find("a")
            time_el = li.find("time")
            if not a or not time_el:
                continue

            href = a["href"]
            # Resolve relative links against the blog's base URL.
            full_url = self.base_url + href if href.startswith("/") else href

            items.append({
                "url": full_url,
                "title": a.text.strip(),
                # "2024-01-31T00:00:00" -> "2024-01-31"
                "date": time_el["datetime"].split("T")[0],
            })

        return items

    def parse_article(self, url):
        """Fetch one post and return its title and cleaned-up body HTML."""
        html = self.fetch(url)
        soup = BeautifulSoup(html, "html.parser")

        h1 = soup.select_one("main h1")
        title = h1.text.strip() if h1 else None

        main = soup.find("main")
        if not main:
            return {"title": title, "content_html": ""}

        # Strip elements that duplicate the frontmatter or don't belong
        # in the Markdown body: the heading (already captured as the
        # title), the leading date paragraph, forms, scripts, and tags.
        first_h1 = main.find("h1")
        if first_h1:
            first_h1.decompose()

        first_p = main.find("p")
        if first_p:
            first_p.decompose()

        for el in main.find_all(["form", "script"]):
            el.decompose()

        for tag_block in main.find_all(class_="tags"):
            tag_block.decompose()

        content_html = main.decode_contents().strip()

        return {
            "title": title,
            "content_html": content_html,
        }

    def write_markdown(self, title, date, content_html):
        """Write one post to disk as Markdown with YAML frontmatter."""
        # Note: filenames are date-only, so two posts published on the
        # same day will overwrite each other.
        filename = f"{date}.md"
        path = os.path.join(self.output_dir, filename)

        description = self.extract_description_from_html(content_html)
        desc_block = self.indent_block(description)

        # Quote the title so YAML-special characters (colons, quotes,
        # leading "#") can't break the frontmatter.
        safe_title = title.replace("\\", "\\\\").replace('"', '\\"')

        frontmatter = (
            "---\n"
            f'title: "{safe_title}"\n'
            "description: |\n"
            f"{desc_block}\n"
            f"date: {date}\n"
            "categories:\n"
            "  - news\n"
            "authors:\n"
            "  - nobo\n"
            f"source: {self.base_url}\n"
            "---\n\n"
        )

        content = frontmatter + content_html

        with open(path, "w", encoding="utf-8") as f:
            f.write(content)

        return path

    def run(self):
        """Scrape every post from the list view and write it to disk."""
        articles = self.parse_list_view()

        for info in articles:
            article = self.parse_article(info["url"])
            # Prefer the title from the article page itself; fall back
            # to the list-view link text.
            title = article["title"] or info["title"]

            self.write_markdown(
                title=title,
                date=info["date"],
                content_html=article["content_html"],
            )