feat: add awesomelist auto-sync script

This commit is contained in:
jae 2026-04-04 02:55:50 +00:00
parent 12f6fad160
commit dd1d5adef5

275
api/awesomelist_sync.py Normal file
View file

@ -0,0 +1,275 @@
#!/usr/bin/env python3
"""AWESOMELIST Auto-Sync — Pulls latest trackawesomelist data and rebuilds JSON"""
import os, re, json, subprocess, sys
from datetime import datetime
from pathlib import Path
# --- Configuration -----------------------------------------------------------
# Upstream data source: the trackawesomelist aggregation repo.
REPO_URL = "https://github.com/trackawesomelist/trackawesomelist.git"
# Local working clone of the upstream repo (created on first run).
REPO_DIR = "/opt/awesomelist-source"
# Per-list JSON output, one file per awesome list (keyed by slug).
OUTPUT_DIR = "/var/www/jaeswift-homepage/api/data/awesomelist"
# Aggregated sector index consumed by the homepage API.
OUTPUT_INDEX = "/var/www/jaeswift-homepage/api/data/awesomelist_index.json"
# Append-only sync log written by log().
LOG = "/var/log/awesomelist-sync.log"
# Sector taxonomy: each 'PRP-*' code maps to a display name, an icon, and the
# tag keywords classify_list() uses to file a list under that sector.
# Codes are checked in insertion order; PRP-028 (empty tag list) is the
# catch-all for lists matching nothing else.
SECTOR_MAP = {
'PRP-001': {'name': 'PROGRAMMING LANGUAGES', 'icon': '💻', 'tags': ['python', 'go', 'rust', 'javascript', 'typescript', 'ruby', 'java', 'kotlin', 'swift', 'dart', 'elixir', 'erlang', 'haskell', 'lua', 'perl', 'php', 'scala', 'clojure', 'crystal', 'nim', 'zig', 'v', 'ocaml', 'r', 'julia', 'fortran', 'pascal', 'ada', 'c', 'cpp', 'csharp', 'fsharp', 'groovy', 'elm', 'purescript', 'idris', 'coq', 'vala', 'actionscript', 'autohotkey', 'autoit', 'commonlisp', 'clojurescript', 'qsharp', 'd', 'eta', 'frege']},
'PRP-002': {'name': 'WEB FRONTEND', 'icon': '🌐', 'tags': ['react', 'vue', 'angular', 'svelte', 'css', 'html', 'tailwind', 'bootstrap', 'sass', 'less', 'webpack', 'vite', 'nextjs', 'nuxt', 'gatsby', 'preact', 'ember', 'backbone', 'knockout', 'cyclejs', 'choo', 'mithril', 'polymer', 'lit', 'storybook', 'draft-js', 'redux', 'relay', 'graphql', 'webcomponent', 'ant-design', 'material-ui', 'flexbox', 'web-animation', 'motion-ui', 'progressive-web', 'service-worker', 'web-extension', 'browserify', 'yew', 'seed-rs', 'aurelia', 'marionette', 'dojo', 'jquery', 'inertia']},
'PRP-003': {'name': 'WEB BACKEND', 'icon': '⚙️', 'tags': ['nodejs', 'django', 'flask', 'rails', 'laravel', 'symfony', 'express', 'fastapi', 'fiber', 'vapor', 'phoenix', 'spring', 'dropwizard', 'vert.x', 'play1', 'cakephp', 'phalcon', 'lumen', 'slim', 'pyramid', 'wagtail', 'directus', 'refinery', 'umbraco', 'sitecore', 'drupal', 'wordpress', 'plone', 'silverstripe', 'craft', 'magento', 'rest', 'microservice', 'serverless', 'jamstack', 'meteor', 'deno', 'npm', 'gulp', 'eslint']},
'PRP-004': {'name': 'MOBILE DEVELOPMENT', 'icon': '📱', 'tags': ['android', 'ios', 'flutter', 'react-native', 'ionic', 'cordova', 'capacitor', 'xamarin', 'appium', 'swift-playground']},
'PRP-005': {'name': 'GAMING & GAME DEV', 'icon': '🎮', 'tags': ['gamedev', 'godot', 'unity', 'libgdx', 'love2d', 'pico-8', 'chip-8', 'flame', 'playcanvas', 'haxe-gamedev', 'gideros', 'game-engine', 'game-dataset', 'game-remake', 'open-source-game', 'games-of-coding', 'game-talk', 'ironsworn', 'minecraft', 'board-game', 'pokemon', 'chess', 'esports', 'pixel-art', 'gbdev', 'dos', 'frc']},
'PRP-006': {'name': 'AI & MACHINE LEARNING', 'icon': '🤖', 'tags': ['machine-learning', 'deep-learning', 'tensorflow', 'pytorch', 'jax', 'nlp', 'computer-vision', 'chatgpt', 'gpt3', 'generative', 'langchain', 'ai-tool', 'ai-finance', 'ai4lam', 'coreml', 'artificial-intelligence', 'deep-vision', 'xai', 'awesome-ai', 'gemini-cli']},
'PRP-007': {'name': 'DATA SCIENCE & ANALYTICS', 'icon': '📊', 'tags': ['datascience', 'data-engineering', 'bigdata', 'analytics', 'streaming', 'spark', 'hadoop', 'polars', 'dash', 'jupyter', 'dataviz', 'json', 'csv', 'json-dataset', 'information-retrieval', 'quantified-self', 'quant']},
'PRP-008': {'name': 'CLOUD & DEVOPS', 'icon': '☁️', 'tags': ['docker', 'kubernetes', 'terraform', 'ansible', 'aws', 'azure', 'gcp', 'cloudflare', 'digitalocean', 'ibmcloud', 'heroku', 'ci', 'cd', 'sre', 'devsecops', 'saltstack', 'vagrant', 'kustomize', 'opentofu', 'cdk', 'k6', 'pulumi', 'container']},
'PRP-009': {'name': 'DATABASES', 'icon': '🗄️', 'tags': ['postgres', 'mysql', 'mongodb', 'redis', 'neo4j', 'cassandra', 'couchdb', 'rethinkdb', 'influxdb', 'hbase', 'tdengine', 'nosql', 'db-tool', 'sql']},
'PRP-010': {'name': 'SECURITY & PRIVACY', 'icon': '🔒', 'tags': ['security', 'hacking', 'pentest', 'ctf', 'malware', 'honeypot', 'incident-response', 'crypto', 'cryptography', 'privacy', 'appsec', 'vehicle-security', 'web-security', 'lockpicking', 'osint', 'fuzzing', 'evm-security', 'blueteam', 'gdpr', 'pci-dss']},
'PRP-011': {'name': 'SYSTEMS & PLATFORMS', 'icon': '🖥️', 'tags': ['linux', 'macos', 'windows', 'bsd', 'dos', 'raspberry-pi', 'wsl', 'nix', 'arch', 'kde', 'gnome', 'qgis', 'qubes', 'amazon-alexa', 'actions-on-google', 'home-assistant', 'smart-tv', 'fuse', 'ros2']},
'PRP-012': {'name': 'DEVELOPER TOOLS', 'icon': '🛠️', 'tags': ['git', 'vim', 'neovim', 'emacs', 'vscode', 'atom', 'jetbrains', 'sublime', 'devenv', 'devtools', 'shell', 'zsh', 'fish', 'tmux', 'cli-app', 'terminal', 'powershell', 'bash', 'dtrace', 'cmake', 'composer', 'alfred', 'scriptable', 'pinned-gist', 'code-review', 'git-addon', 'git-hook', 'github']},
'PRP-013': {'name': 'PACKAGE MANAGERS & BUILD', 'icon': '📦', 'tags': ['npm', 'webpack', 'gulp', 'rollup', 'esbuild', 'micro-npm', 'npm-script', 'awesome-lint']},
'PRP-014': {'name': 'TESTING & QA', 'icon': '🧪', 'tags': ['testing', 'selenium', 'playwright', 'ava', 'tap', 'regression', 'gatling', 'jmeter', 'static-analysis', 'qa']},
'PRP-015': {'name': 'SOFTWARE ARCHITECTURE', 'icon': '🏗️', 'tags': ['design-pattern', 'ddd', 'software-architecture', 'microservice', 'functional-programming', 'recursion-scheme']},
'PRP-016': {'name': 'IoT & HARDWARE', 'icon': '🔌', 'tags': ['iot', 'embedded', 'arduino', 'esp', 'circuitpython', 'adafruit', 'micropython', 'raspberry', 'robot', 'lidar', 'open-hardware', 'electronics', 'beacon', 'mqtt', 'fpga']},
'PRP-017': {'name': 'BLOCKCHAIN & CRYPTO', 'icon': '⛓️', 'tags': ['blockchain', 'bitcoin', 'ethereum', 'solana', 'algorand', 'ripple', 'corda', 'substrate', 'stacks-chain', 'golem', 'eosio', 'waves', 'non-financial-blockchain', 'crypto-paper', 'coin']},
'PRP-018': {'name': 'SCIENCE & RESEARCH', 'icon': '🧬', 'tags': ['science', 'math', 'physics', 'bioinformatics', 'computational-biology', 'neuroscience', 'cheminformatics', 'bioie', 'parasite', 'agriculture', 'cropsteering', 'scientific-computing', 'scientific-writing', 'research', 'latex', 'tikz']},
'PRP-019': {'name': 'EDUCATION & LEARNING', 'icon': '📚', 'tags': ['education', 'learn', 'courses', 'tutorial', 'programming-for-kids', 'educational-game', 'computer-science', 'competitive-programming', 'algorithm', 'kata', 'interview', 'roadmap', 'free-programming-book', 'beginner', 'talk', 'tech-video']},
'PRP-020': {'name': 'DESIGN & UI/UX', 'icon': '🎨', 'tags': ['design', 'ui', 'ux', 'design-system', 'design-principle', 'web-design', 'product-design', 'sketch', 'framer', 'creative-coding', 'canvas', 'webgl', 'vulkan', 'opengl', 'charting', 'd3', 'colorful', 'font', 'icon']},
'PRP-021': {'name': 'MEDIA & CONTENT', 'icon': '🎬', 'tags': ['video', 'audio', 'music', 'podcast', 'broadcasting', 'ffmpeg', 'vlc', 'webaudio', 'audio-visualization', 'photography', 'gif', 'creative-tech', 'audiovisual', 'pixel-art']},
'PRP-022': {'name': 'BUSINESS & CAREER', 'icon': '💼', 'tags': ['business', 'startup', 'indie', 'product-management', 'project-management', 'okr', 'leading', 'managing', 'remote-job', 'job-board', 'internship', 'freelance', 'marketing', 'billing', 'amazon-seller', 'social-enterprise', 'open-company', 'speaking', 'developer-first']},
'PRP-023': {'name': 'COMMUNITY & CULTURE', 'icon': '🌍', 'tags': ['diversity', 'for-girls', 'mental-health', 'accessibility', 'humane-tech', 'earth', 'clean-tech', 'veganism', 'theravada', 'uncopyright', 'ad-free', 'free-software', 'open-source-supporter', 'maintainer', 'patreon', 'naming', 'falsehood', 'answer', 'ama', 'speaker', 'event', 'conference', 'italy-event', 'netherlands-event', 'european-tech']},
'PRP-024': {'name': 'NETWORKING & COMMS', 'icon': '📡', 'tags': ['network', 'sdn', 'pcap', 'snmp', 'irc', 'mastodon', 'slack', 'discord', 'email', 'rtc', 'connectivity', 'ssh', 'radio', 'hacker-news', 'chatops', 'chat', 'bot']},
'PRP-025': {'name': 'UTILITIES & PRODUCTIVITY', 'icon': '🔧', 'tags': ['productivity', 'selfhosted', 'sysadmin', 'tool', 'lowcode', 'no-login', 'calculator', 'userscript', 'boilerplate', 'building-block', 'pagespeed', 'readme', 'htaccess', 'stock-resource', 'creative-commons', 'ponyfill', 'promise', 'observable', 'workflow-automation', 'distraction-blocker']},
'PRP-026': {'name': 'CONTENT MANAGEMENT', 'icon': '📄', 'tags': ['cms', 'markdown', 'text-editing', 'book-authoring', 'blog', 'newsletter', 'rss', 'web-archiving', 'digital-history', 'open-source-document']},
'PRP-027': {'name': 'HEALTH & WELLNESS', 'icon': '🏥', 'tags': ['health', 'healthcare', 'glp1', 'mental-health', 'biomedical', 'digital-health']},
'PRP-028': {'name': 'MISCELLANEOUS', 'icon': '📦', 'tags': []},
}
def log(msg):
    """Print *msg* with a timestamp and append it to the sync log file.

    Logging is best-effort: if the log file cannot be written (missing
    directory, permissions), the message still goes to stdout and the
    sync continues instead of crashing.
    """
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line)
    try:
        with open(LOG, "a") as f:
            f.write(line + "\n")
    except OSError:
        # An unwritable log file must never abort the sync run.
        pass
def clean_name(name):
    """Drop a leading 'Awesome'/'awesome' prefix (plus separators) from a title.

    Falls back to the original name when stripping would leave nothing.
    """
    stripped = re.sub(r'^[Aa]wesome[- _]*', '', name).strip()
    if not stripped:
        return name
    return stripped
def pull_repo():
    """Clone the trackawesomelist repo on first run, otherwise hard-sync it.

    NOTE(review): the reset target is hard-coded to origin/main — assumes the
    remote's default branch is 'main'; confirm if the upstream ever renames it.
    """
    git_meta = os.path.join(REPO_DIR, '.git')
    if os.path.isdir(git_meta):
        log("Pulling latest changes...")
        # Discard any local drift and mirror the remote exactly.
        for args in (
            ['git', '-C', REPO_DIR, 'fetch', '--all'],
            ['git', '-C', REPO_DIR, 'reset', '--hard', 'origin/main'],
        ):
            subprocess.run(args, check=True, capture_output=True)
    else:
        log("Cloning repository...")
        os.makedirs(REPO_DIR, exist_ok=True)
        subprocess.run(
            ['git', 'clone', '--depth', '1', REPO_URL, REPO_DIR],
            check=True, capture_output=True,
        )
    log("Repository updated.")
def parse_readme(filepath):
    """Parse a single awesome list README.md into structured data.

    Returns a dict with the cleaned title, a short description, the list of
    non-empty subcategories (each ``{'name', 'parent', 'entries'}``) and
    summary counts, or ``None`` when the file cannot be read.
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
    except OSError:
        # Unreadable file (permissions, race with the git checkout): skip it.
        # Narrowed from a bare `except:` which also swallowed KeyboardInterrupt.
        return None
    lines = content.split('\n')
    subcategories = []
    current_sub = None
    title = ''
    description = ''
    github_url = ''  # never derived from the README body; kept for schema stability
    stars = ''       # likewise — always empty here
    # Title: the first level-1 heading.
    for line in lines:
        if line.startswith('# '):
            title = line[2:].strip()
            break
    # Description: first plain-text line after the title, stopping at the
    # next heading. Link-/image-only lines (badges) are skipped.
    in_desc = False
    for line in lines:
        if line.startswith('# '):
            in_desc = True
            continue
        if in_desc:
            stripped = line.strip()
            if stripped and not stripped.startswith(('#', '[', '!')):
                description = stripped
                break
            elif stripped.startswith('#'):
                break
    # Sections (## .. ######) and their bullet-list link entries.
    link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    skip_headers = ('contents', 'table of contents', 'toc', 'license', 'contributing', 'footnotes')
    for line in lines:
        stripped = line.strip()
        header_match = re.match(r'^(#{2,6})\s+(.+)', stripped)
        if header_match:
            header_name = header_match.group(2).strip()
            # Drop trailing markdown links (e.g. back-to-top anchors) from the header.
            header_name = re.sub(r'\s*\[.*?\]\(.*?\)', '', header_name).strip()
            if header_name and header_name.lower() not in skip_headers:
                current_sub = {'name': header_name, 'parent': '', 'entries': []}
                subcategories.append(current_sub)
            continue
        # List items that contain at least one markdown link.
        if stripped.startswith(('-', '*')) and '[' in stripped and '](' in stripped:
            matches = link_pattern.findall(stripped)
            if matches:
                entry_name, entry_url = matches[0]
                entry_desc = ''
                # Free text after the first closing paren, minus a leading dash/colon.
                desc_match = re.search(r'\)\s*[-–—:]?\s*(.+)', stripped)
                if desc_match:
                    entry_desc = desc_match.group(1).strip()
                if current_sub is None:
                    # Entries before any section header land in a synthetic bucket.
                    current_sub = {'name': 'General', 'parent': '', 'entries': []}
                    subcategories.append(current_sub)
                current_sub['entries'].append({'name': entry_name, 'url': entry_url, 'description': entry_desc})
    # Keep only sections that actually collected entries.
    subcategories = [s for s in subcategories if s['entries']]
    total_entries = sum(len(s['entries']) for s in subcategories)
    return {
        'title': clean_name(title),
        'description': description[:300],
        'github_url': github_url,
        'stars': stars,
        'entry_count': total_entries,
        'subcategory_count': len(subcategories),
        'subcategories': subcategories,
        'name': ''
    }
def classify_list(slug, title, description):
    """Assign a list to a sector based on slug/title/description matching.

    Tags are matched as whole words (bounded by non-alphanumerics) rather
    than raw substrings: substring matching mis-filed nearly everything,
    since single-letter tags like 'r', 'c' or 'd' occur inside almost any
    text and e.g. 'go' matches inside 'django'.

    Returns the first sector code (in SECTOR_MAP order) with a matching
    tag, or 'PRP-028' when nothing matches.
    """
    text = f"{slug} {title} {description}".lower()
    for code, sector in SECTOR_MAP.items():
        if code == 'PRP-028':  # Misc is the fallback, never matched by tag
            continue
        for tag in sector['tags']:
            # Boundary lookarounds instead of \b so hyphenated tags
            # ('react-native') still match as a unit.
            if re.search(rf'(?<![a-z0-9]){re.escape(tag)}(?![a-z0-9])', text):
                return code
    return 'PRP-028'  # Miscellaneous fallback
def build_data():
    """Rebuild all per-list JSON files and the sector index from the local clone.

    Walks REPO_DIR/content/<org>/<repo>/README.md, parses each readme,
    classifies it into a sector, writes one JSON document per list into
    OUTPUT_DIR, then writes the aggregated index to OUTPUT_INDEX.
    Returns True on success, False when the content directory is missing.
    """
    content_root = Path(REPO_DIR) / 'content'
    if not content_root.is_dir():
        log(f"ERROR: content dir not found: {content_root}")
        return False
    out_root = Path(OUTPUT_DIR)
    out_root.mkdir(parents=True, exist_ok=True)
    # Remove stale output from previous runs so deleted lists disappear.
    for stale in out_root.iterdir():
        if stale.name.endswith('.json'):
            stale.unlink()
    # One accumulator per sector, carrying over the static metadata.
    sectors = {
        code: dict(info, code=code, lists=[], list_count=0, total_entries=0)
        for code, info in SECTOR_MAP.items()
    }
    total_lists = 0
    total_entries = 0
    for org_path in sorted(content_root.iterdir()):
        if not org_path.is_dir():
            continue
        for repo_path in sorted(org_path.iterdir()):
            readme = repo_path / 'README.md'
            if not readme.is_file():
                # Some repos ship a lowercase readme.md instead.
                readme = repo_path / 'readme.md'
                if not readme.is_file():
                    continue
            slug = f"{org_path.name}--{repo_path.name}"
            parsed = parse_readme(str(readme))
            if parsed is None or parsed['entry_count'] == 0:
                continue
            parsed['slug'] = slug
            sector_code = classify_list(slug, parsed['title'], parsed['description'])
            parsed['tag'] = sector_code
            # One JSON document per list, addressed by slug.
            with open(out_root / f"{slug}.json", 'w') as fh:
                json.dump(parsed, fh)
            bucket = sectors[sector_code]
            bucket['lists'].append({
                'slug': slug,
                'title': parsed['title'],
                'description': parsed['description'][:200],
                'stars': parsed.get('stars', ''),
                'entry_count': parsed['entry_count'],
                'subcategory_count': parsed['subcategory_count']
            })
            bucket['list_count'] += 1
            bucket['total_entries'] += parsed['entry_count']
            total_lists += 1
            total_entries += parsed['entry_count']
    # Biggest lists first within each sector.
    for bucket in sectors.values():
        bucket['lists'].sort(key=lambda item: item['entry_count'], reverse=True)
    # Only sectors that actually received lists appear in the index.
    populated = sorted(
        (s for s in sectors.values() if s['list_count'] > 0),
        key=lambda s: s['code'],
    )
    index = {
        'total_lists': total_lists,
        'total_entries': total_entries,
        'sector_count': len(populated),
        'sectors': populated
    }
    with open(OUTPUT_INDEX, 'w') as fh:
        json.dump(index, fh)
    log(f"Built {total_lists} lists, {total_entries} entries across {len(populated)} sectors")
    return True
def restart_api():
    """Best-effort restart of the homepage API so it picks up the new JSON.

    A failed restart is logged as a warning but never raises to the caller.
    """
    log("Restarting API service...")
    try:
        subprocess.run(
            ['systemctl', 'restart', 'jaeswift-api'],
            check=True, capture_output=True,
        )
    except Exception as exc:
        log(f"WARNING: Could not restart API: {exc}")
    else:
        log("API restarted.")
def main():
    """Run one full sync cycle: pull the repo, rebuild the data, bounce the API.

    Any unexpected exception logs a failure banner and exits with status 1.
    """
    log("=" * 60)
    log("AWESOMELIST SYNC STARTED")
    try:
        pull_repo()
        if not build_data():
            log("SYNC FAILED - build error")
        else:
            restart_api()
            log("SYNC COMPLETED SUCCESSFULLY")
    except Exception as exc:
        log(f"SYNC FAILED: {exc}")
        sys.exit(1)

if __name__ == '__main__':
    main()