# File listing metadata: 91 lines, 2.5 KiB, Python
# pip install langgraph langchain langchain-community langchain-ollama
import json
import os
import re
import string
from html.parser import HTMLParser

import requests
class MyHTMLParser(HTMLParser):
    """Collect the human-readable text of an HTML document.

    Character data found inside ``<script>`` and ``<style>`` elements is
    ignored.  Every other text fragment is stripped of surrounding
    whitespace and, unless it is shorter than two characters or starts
    with ``{`` / ``[``, appended to ``crawled_text`` followed by a newline.

    Attributes:
        skip_content: True while the parser is inside a script/style element.
        crawled_text: Text collected so far, one fragment per line.
    """

    # Elements whose character data is never collected.
    _SKIPPED_TAGS = ('script', 'style')
    # Fragments starting with one of these are treated as JSON-like junk.
    _JUNK_PREFIXES = ('{', '[')

    def __init__(self):
        super().__init__()
        self.skip_content = False  # Skip content inside <script> and <style>
        self.crawled_text = ''

    def reset(self):
        """Reset the parser and discard collected text and the skip flag.

        ``HTMLParser`` calls this implicitly at instantiation time; calling
        it again makes the same instance safe to reuse for another document.
        """
        super().reset()
        self.skip_content = False
        self.crawled_text = ''

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Start skipping character data when a script/style element opens."""
        if tag in self._SKIPPED_TAGS:
            self.skip_content = True

    def handle_endtag(self, tag: str) -> None:
        """Resume collecting character data once a script/style element closes."""
        if tag in self._SKIPPED_TAGS:
            self.skip_content = False

    def handle_data(self, data: str) -> None:
        """Append one cleaned text fragment to ``crawled_text``.

        Nothing is appended while inside a script/style element, for
        fragments shorter than two characters after stripping, or for
        fragments that start with ``{`` or ``[``.
        """
        if self.skip_content:
            return

        text = data.strip()

        # Covers the empty string as well as single stray characters.
        if len(text) < 2:
            return

        # Skip common junk: JSON-like patterns.
        if text.startswith(self._JUNK_PREFIXES):
            return

        self.crawled_text += text + '\n'
# Fetch bookmarks from a linkding instance and print a summary of each one.

# Alternative endpoints kept for reference:
#_url = "http://192.168.50.14:9090/api/bookmarks/"
#_url = "https://linkding.hal.se/api/bookmarks/5/"
_url = "https://linkding.hal.se/api/bookmarks/?added_since=2026-04-01T00:00:00Z"

# SECURITY: an API token is committed in this file. Set LINKDING_TOKEN in the
# environment to override it; the literal is kept only as a fallback so the
# script behaves as before. TODO: rotate the token and drop the fallback.
headers = {
    "Authorization": "Token "
    + os.environ.get("LINKDING_TOKEN", "fa54dee2ccbcad80a0c6259bdbbed896581e1423"),
}

htmlParser = MyHTMLParser()
#MyHTMLParser.convert_charrefs=False
#MyHTMLParser.scripting=True

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() turns HTTP errors into an explicit exception instead
# of a confusing JSON/KeyError failure further down.
response = requests.get(_url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()
#print(data)
#for key, value in data.items():
#    print(f"{key}: {value}")

# NOTE(review): only the 'results' of this single response are processed;
# confirm whether following the API's pagination is needed.
for bookmark in data['results']:
    bookmark_id = bookmark['id']  # named to avoid shadowing the builtin id()
    url = bookmark['url']
    title = bookmark['title']
    print(
        f"Id: {bookmark_id}, Title: {title}\n"
        f"URL: {url}\n"
        f"date_added: {bookmark['date_added']}\n"
        f"description: {bookmark['description']}\n"
        f"tag_names: {bookmark['tag_names']}\n"
    )
    # Disabled: crawl each bookmarked page and print its visible text.
    #raw_html = requests.get(url.strip()).text
    ##print(raw_html[:500])
    #
    #htmlParser.feed(raw_html)
    #
    #print(htmlParser.crawled_text) #[:500])
    #htmlParser.crawled_text = '' #reset for next bookmark
# linkding REST API reference: https://linkding.link/api/
#
# Endpoints that could be exposed as tools:
#
# Archive a bookmark:
#   POST /api/bookmarks/<id>/archive/
#
# List tags:
#   GET /api/tags/
#
# Update a bookmark:
#   PATCH /api/bookmarks/<id>/
#   Example payload:
#   {
#     "tag_names": [
#       "tag1",
#       "tag2"
#     ]
#   }