import os
import re
import requests
from bs4 import BeautifulSoup
class BearScraper:
    """Scrape posts from a Bear-style blog and mirror them as Markdown files.

    Reads the blog's list view at ``base_url``, follows each post link,
    strips boilerplate (forms, scripts, tag lists, leading h1/p) from the
    article HTML, and writes one ``<date>.md`` file per post with YAML
    frontmatter into ``output_dir``.
    """

    # Seconds to wait for any single HTTP request. Without a timeout,
    # requests.get() can block forever on an unresponsive server.
    REQUEST_TIMEOUT = 30

    def __init__(self, base_url, output_dir="content/blog/scrape"):
        """
        Args:
            base_url: Root URL of the blog; a trailing slash is stripped.
            output_dir: Target directory for Markdown output; created on
                demand if it does not exist.
        """
        self.base_url = base_url.rstrip("/")
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_description_from_html(self, content_html, max_chars=200):
        """Derive a short plain-text description from article HTML.

        Prefers the first sentence (terminated by ., ! or ?); otherwise
        falls back to the first ``max_chars`` characters, appending "..."
        when the text was truncated. Returns "" for empty content.
        """
        text = BeautifulSoup(content_html, "html.parser").get_text(
            separator=" ", strip=True
        )
        if not text:
            return ""
        # NOTE: a first sentence longer than max_chars is returned whole;
        # the length cap only applies to the sentence-less fallback below.
        m = re.search(r'(.+?[.!?])(\s|$)', text)
        if m:
            return m.group(1).strip()
        desc = text[:max_chars].rstrip()
        if len(text) > max_chars:
            desc += "..."
        return desc

    def indent_block(self, text, indent=" "):
        """Prefix every line of *text* with *indent*.

        Used to render the description as a YAML block scalar; an empty
        *text* yields just *indent* so the line under ``description: |``
        is never completely empty.
        """
        if not text:
            return indent
        return "\n".join(indent + line for line in text.splitlines())

    def fetch(self, url):
        """GET *url* and return the response body as text.

        Raises:
            requests.HTTPError: on a 4xx/5xx response.
        """
        # Bug fix: original call had no timeout and could hang indefinitely.
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        r.raise_for_status()
        return r.text

    def parse_list_view(self):
        """Return ``[{url, title, date}, ...]`` for every post in the list view."""
        html = self.fetch(self.base_url)
        soup = BeautifulSoup(html, "html.parser")
        items = []
        for li in soup.select("ul.embedded.blog-posts li"):
            a = li.find("a")
            time_el = li.find("time")
            if not a or not time_el:
                continue
            # Robustness fix: skip anchors without an href instead of
            # raising KeyError on a["href"].
            href = a.get("href")
            if not href:
                continue
            full_url = self.base_url + href if href.startswith("/") else href
            items.append({
                "url": full_url,
                "title": a.text.strip(),
                # 'datetime' is ISO 8601; keep only the YYYY-MM-DD part.
                "date": time_el["datetime"].split("T")[0],
            })
        return items

    def parse_article(self, url):
        """Fetch one post and return ``{'title': ..., 'content_html': ...}``.

        Removes the leading h1 (title lives in the frontmatter instead),
        the first paragraph (presumably Bear's date line — TODO confirm),
        and all forms, scripts, and elements with class "tags".
        """
        html = self.fetch(url)
        soup = BeautifulSoup(html, "html.parser")
        h1 = soup.select_one("main h1")
        title = h1.text.strip() if h1 else None
        main = soup.find("main")
        if not main:
            # No <main> element: nothing to extract beyond the title.
            return {"title": title, "content_html": ""}
        first_h1 = main.find("h1")
        if first_h1:
            first_h1.decompose()
        first_p = main.find("p")
        if first_p:
            first_p.decompose()
        for form in main.find_all("form"):
            form.decompose()
        for script in main.find_all("script"):
            script.decompose()
        for tag_block in main.find_all(class_="tags"):
            tag_block.decompose()
        content_html = main.decode_contents().strip()
        return {
            "title": title,
            "content_html": content_html,
        }

    def write_markdown(self, title, date, content_html):
        """Write one post as ``<date>.md`` with YAML frontmatter; return the path.

        NOTE(review): two posts published on the same day overwrite each
        other, and an unquoted *title* containing ':' would produce invalid
        YAML — confirm both are acceptable before changing the output format.
        """
        filename = f"{date}.md"
        path = os.path.join(self.output_dir, filename)
        description = self.extract_description_from_html(content_html)
        desc_block = self.indent_block(description)
        frontmatter = (
            "---\n"
            f"title: {title}\n"
            "description: |\n"
            f"{desc_block}\n"
            f"date: {date}\n"
            "categories:\n"
            " - news\n"
            "authors:\n"
            " - nobo\n"
            f"source: {self.base_url}\n"
            "---\n\n"
        )
        content = frontmatter + content_html
        with open(path, "w", encoding="utf-8") as f:
            f.write(content)
        return path

    def run(self):
        """Scrape every listed post and write it to disk."""
        articles = self.parse_list_view()
        for info in articles:
            article = self.parse_article(info["url"])
            # Prefer the article page's own h1; fall back to the list title.
            title = article["title"] or info["title"]
            self.write_markdown(
                title=title,
                date=info["date"],
                content_html=article["content_html"],
            )