feat: add awesomelist auto-sync script

This commit is contained in:
jae 2026-04-04 02:55:50 +00:00
parent 12f6fad160
commit dd1d5adef5

275
api/awesomelist_sync.py Normal file
View file

@ -0,0 +1,275 @@
#!/usr/bin/env python3
"""AWESOMELIST Auto-Sync — Pulls latest trackawesomelist data and rebuilds JSON"""
import os, re, json, subprocess, sys
from datetime import datetime
from pathlib import Path
# --- Configuration -----------------------------------------------------------
# Upstream data source: the trackawesomelist aggregation repo.
REPO_URL = "https://github.com/trackawesomelist/trackawesomelist.git"
# Local working clone of the upstream repo (created on first run).
REPO_DIR = "/opt/awesomelist-source"
# Per-list JSON output, one file per awesome list (keyed by slug).
OUTPUT_DIR = "/var/www/jaeswift-homepage/api/data/awesomelist"
# Aggregated sector index consumed by the homepage API.
OUTPUT_INDEX = "/var/www/jaeswift-homepage/api/data/awesomelist_index.json"
# Append-only sync log written by log().
LOG = "/var/log/awesomelist-sync.log"
# Sector taxonomy: each 'PRP-*' code maps to a display name, an icon, and the
# tag keywords classify_list() uses to file a list under that sector.
# Codes are checked in insertion order; PRP-028 (empty tag list) is the
# catch-all for lists matching nothing else.
SECTOR_MAP = {
'PRP-001': {'name': 'PROGRAMMING LANGUAGES', 'icon': '💻', 'tags': ['python', 'go', 'rust', 'javascript', 'typescript', 'ruby', 'java', 'kotlin', 'swift', 'dart', 'elixir', 'erlang', 'haskell', 'lua', 'perl', 'php', 'scala', 'clojure', 'crystal', 'nim', 'zig', 'v', 'ocaml', 'r', 'julia', 'fortran', 'pascal', 'ada', 'c', 'cpp', 'csharp', 'fsharp', 'groovy', 'elm', 'purescript', 'idris', 'coq', 'vala', 'actionscript', 'autohotkey', 'autoit', 'commonlisp', 'clojurescript', 'qsharp', 'd', 'eta', 'frege']},
'PRP-002': {'name': 'WEB FRONTEND', 'icon': '🌐', 'tags': ['react', 'vue', 'angular', 'svelte', 'css', 'html', 'tailwind', 'bootstrap', 'sass', 'less', 'webpack', 'vite', 'nextjs', 'nuxt', 'gatsby', 'preact', 'ember', 'backbone', 'knockout', 'cyclejs', 'choo', 'mithril', 'polymer', 'lit', 'storybook', 'draft-js', 'redux', 'relay', 'graphql', 'webcomponent', 'ant-design', 'material-ui', 'flexbox', 'web-animation', 'motion-ui', 'progressive-web', 'service-worker', 'web-extension', 'browserify', 'yew', 'seed-rs', 'aurelia', 'marionette', 'dojo', 'jquery', 'inertia']},
'PRP-003': {'name': 'WEB BACKEND', 'icon': '⚙️', 'tags': ['nodejs', 'django', 'flask', 'rails', 'laravel', 'symfony', 'express', 'fastapi', 'fiber', 'vapor', 'phoenix', 'spring', 'dropwizard', 'vert.x', 'play1', 'cakephp', 'phalcon', 'lumen', 'slim', 'pyramid', 'wagtail', 'directus', 'refinery', 'umbraco', 'sitecore', 'drupal', 'wordpress', 'plone', 'silverstripe', 'craft', 'magento', 'rest', 'microservice', 'serverless', 'jamstack', 'meteor', 'deno', 'npm', 'gulp', 'eslint']},
'PRP-004': {'name': 'MOBILE DEVELOPMENT', 'icon': '📱', 'tags': ['android', 'ios', 'flutter', 'react-native', 'ionic', 'cordova', 'capacitor', 'xamarin', 'appium', 'swift-playground']},
'PRP-005': {'name': 'GAMING & GAME DEV', 'icon': '🎮', 'tags': ['gamedev', 'godot', 'unity', 'libgdx', 'love2d', 'pico-8', 'chip-8', 'flame', 'playcanvas', 'haxe-gamedev', 'gideros', 'game-engine', 'game-dataset', 'game-remake', 'open-source-game', 'games-of-coding', 'game-talk', 'ironsworn', 'minecraft', 'board-game', 'pokemon', 'chess', 'esports', 'pixel-art', 'gbdev', 'dos', 'frc']},
'PRP-006': {'name': 'AI & MACHINE LEARNING', 'icon': '🤖', 'tags': ['machine-learning', 'deep-learning', 'tensorflow', 'pytorch', 'jax', 'nlp', 'computer-vision', 'chatgpt', 'gpt3', 'generative', 'langchain', 'ai-tool', 'ai-finance', 'ai4lam', 'coreml', 'artificial-intelligence', 'deep-vision', 'xai', 'awesome-ai', 'gemini-cli']},
'PRP-007': {'name': 'DATA SCIENCE & ANALYTICS', 'icon': '📊', 'tags': ['datascience', 'data-engineering', 'bigdata', 'analytics', 'streaming', 'spark', 'hadoop', 'polars', 'dash', 'jupyter', 'dataviz', 'json', 'csv', 'json-dataset', 'information-retrieval', 'quantified-self', 'quant']},
'PRP-008': {'name': 'CLOUD & DEVOPS', 'icon': '☁️', 'tags': ['docker', 'kubernetes', 'terraform', 'ansible', 'aws', 'azure', 'gcp', 'cloudflare', 'digitalocean', 'ibmcloud', 'heroku', 'ci', 'cd', 'sre', 'devsecops', 'saltstack', 'vagrant', 'kustomize', 'opentofu', 'cdk', 'k6', 'pulumi', 'container']},
'PRP-009': {'name': 'DATABASES', 'icon': '🗄️', 'tags': ['postgres', 'mysql', 'mongodb', 'redis', 'neo4j', 'cassandra', 'couchdb', 'rethinkdb', 'influxdb', 'hbase', 'tdengine', 'nosql', 'db-tool', 'sql']},
'PRP-010': {'name': 'SECURITY & PRIVACY', 'icon': '🔒', 'tags': ['security', 'hacking', 'pentest', 'ctf', 'malware', 'honeypot', 'incident-response', 'crypto', 'cryptography', 'privacy', 'appsec', 'vehicle-security', 'web-security', 'lockpicking', 'osint', 'fuzzing', 'evm-security', 'blueteam', 'gdpr', 'pci-dss']},
'PRP-011': {'name': 'SYSTEMS & PLATFORMS', 'icon': '🖥️', 'tags': ['linux', 'macos', 'windows', 'bsd', 'dos', 'raspberry-pi', 'wsl', 'nix', 'arch', 'kde', 'gnome', 'qgis', 'qubes', 'amazon-alexa', 'actions-on-google', 'home-assistant', 'smart-tv', 'fuse', 'ros2']},
'PRP-012': {'name': 'DEVELOPER TOOLS', 'icon': '🛠️', 'tags': ['git', 'vim', 'neovim', 'emacs', 'vscode', 'atom', 'jetbrains', 'sublime', 'devenv', 'devtools', 'shell', 'zsh', 'fish', 'tmux', 'cli-app', 'terminal', 'powershell', 'bash', 'dtrace', 'cmake', 'composer', 'alfred', 'scriptable', 'pinned-gist', 'code-review', 'git-addon', 'git-hook', 'github']},
'PRP-013': {'name': 'PACKAGE MANAGERS & BUILD', 'icon': '📦', 'tags': ['npm', 'webpack', 'gulp', 'rollup', 'esbuild', 'micro-npm', 'npm-script', 'awesome-lint']},
'PRP-014': {'name': 'TESTING & QA', 'icon': '🧪', 'tags': ['testing', 'selenium', 'playwright', 'ava', 'tap', 'regression', 'gatling', 'jmeter', 'static-analysis', 'qa']},
'PRP-015': {'name': 'SOFTWARE ARCHITECTURE', 'icon': '🏗️', 'tags': ['design-pattern', 'ddd', 'software-architecture', 'microservice', 'functional-programming', 'recursion-scheme']},
'PRP-016': {'name': 'IoT & HARDWARE', 'icon': '🔌', 'tags': ['iot', 'embedded', 'arduino', 'esp', 'circuitpython', 'adafruit', 'micropython', 'raspberry', 'robot', 'lidar', 'open-hardware', 'electronics', 'beacon', 'mqtt', 'fpga']},
'PRP-017': {'name': 'BLOCKCHAIN & CRYPTO', 'icon': '⛓️', 'tags': ['blockchain', 'bitcoin', 'ethereum', 'solana', 'algorand', 'ripple', 'corda', 'substrate', 'stacks-chain', 'golem', 'eosio', 'waves', 'non-financial-blockchain', 'crypto-paper', 'coin']},
'PRP-018': {'name': 'SCIENCE & RESEARCH', 'icon': '🧬', 'tags': ['science', 'math', 'physics', 'bioinformatics', 'computational-biology', 'neuroscience', 'cheminformatics', 'bioie', 'parasite', 'agriculture', 'cropsteering', 'scientific-computing', 'scientific-writing', 'research', 'latex', 'tikz']},
'PRP-019': {'name': 'EDUCATION & LEARNING', 'icon': '📚', 'tags': ['education', 'learn', 'courses', 'tutorial', 'programming-for-kids', 'educational-game', 'computer-science', 'competitive-programming', 'algorithm', 'kata', 'interview', 'roadmap', 'free-programming-book', 'beginner', 'talk', 'tech-video']},
'PRP-020': {'name': 'DESIGN & UI/UX', 'icon': '🎨', 'tags': ['design', 'ui', 'ux', 'design-system', 'design-principle', 'web-design', 'product-design', 'sketch', 'framer', 'creative-coding', 'canvas', 'webgl', 'vulkan', 'opengl', 'charting', 'd3', 'colorful', 'font', 'icon']},
'PRP-021': {'name': 'MEDIA & CONTENT', 'icon': '🎬', 'tags': ['video', 'audio', 'music', 'podcast', 'broadcasting', 'ffmpeg', 'vlc', 'webaudio', 'audio-visualization', 'photography', 'gif', 'creative-tech', 'audiovisual', 'pixel-art']},
'PRP-022': {'name': 'BUSINESS & CAREER', 'icon': '💼', 'tags': ['business', 'startup', 'indie', 'product-management', 'project-management', 'okr', 'leading', 'managing', 'remote-job', 'job-board', 'internship', 'freelance', 'marketing', 'billing', 'amazon-seller', 'social-enterprise', 'open-company', 'speaking', 'developer-first']},
'PRP-023': {'name': 'COMMUNITY & CULTURE', 'icon': '🌍', 'tags': ['diversity', 'for-girls', 'mental-health', 'accessibility', 'humane-tech', 'earth', 'clean-tech', 'veganism', 'theravada', 'uncopyright', 'ad-free', 'free-software', 'open-source-supporter', 'maintainer', 'patreon', 'naming', 'falsehood', 'answer', 'ama', 'speaker', 'event', 'conference', 'italy-event', 'netherlands-event', 'european-tech']},
'PRP-024': {'name': 'NETWORKING & COMMS', 'icon': '📡', 'tags': ['network', 'sdn', 'pcap', 'snmp', 'irc', 'mastodon', 'slack', 'discord', 'email', 'rtc', 'connectivity', 'ssh', 'radio', 'hacker-news', 'chatops', 'chat', 'bot']},
'PRP-025': {'name': 'UTILITIES & PRODUCTIVITY', 'icon': '🔧', 'tags': ['productivity', 'selfhosted', 'sysadmin', 'tool', 'lowcode', 'no-login', 'calculator', 'userscript', 'boilerplate', 'building-block', 'pagespeed', 'readme', 'htaccess', 'stock-resource', 'creative-commons', 'ponyfill', 'promise', 'observable', 'workflow-automation', 'distraction-blocker']},
'PRP-026': {'name': 'CONTENT MANAGEMENT', 'icon': '📄', 'tags': ['cms', 'markdown', 'text-editing', 'book-authoring', 'blog', 'newsletter', 'rss', 'web-archiving', 'digital-history', 'open-source-document']},
'PRP-027': {'name': 'HEALTH & WELLNESS', 'icon': '🏥', 'tags': ['health', 'healthcare', 'glp1', 'mental-health', 'biomedical', 'digital-health']},
'PRP-028': {'name': 'MISCELLANEOUS', 'icon': '📦', 'tags': []},
}
def log(msg):
    """Print *msg* with a timestamp and append it to the sync log file.

    Logging is best-effort: if the log file cannot be written (missing
    directory, permissions), the message still goes to stdout and the
    sync continues instead of crashing.
    """
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line)
    try:
        with open(LOG, "a") as f:
            f.write(line + "\n")
    except OSError:
        # An unwritable log file must never abort the sync run.
        pass
def clean_name(name):
    """Drop a leading 'Awesome'/'awesome' prefix (plus separators) from a title.

    Falls back to the original name when stripping would leave nothing.
    """
    stripped = re.sub(r'^[Aa]wesome[- _]*', '', name).strip()
    if not stripped:
        return name
    return stripped
def pull_repo():
    """Clone the trackawesomelist repo on first run, otherwise hard-sync it.

    NOTE(review): the reset target is hard-coded to origin/main — assumes the
    remote's default branch is 'main'; confirm if the upstream ever renames it.
    """
    git_meta = os.path.join(REPO_DIR, '.git')
    if os.path.isdir(git_meta):
        log("Pulling latest changes...")
        # Discard any local drift and mirror the remote exactly.
        for args in (
            ['git', '-C', REPO_DIR, 'fetch', '--all'],
            ['git', '-C', REPO_DIR, 'reset', '--hard', 'origin/main'],
        ):
            subprocess.run(args, check=True, capture_output=True)
    else:
        log("Cloning repository...")
        os.makedirs(REPO_DIR, exist_ok=True)
        subprocess.run(
            ['git', 'clone', '--depth', '1', REPO_URL, REPO_DIR],
            check=True, capture_output=True,
        )
    log("Repository updated.")
def parse_readme(filepath):
    """Parse a single awesome list README.md into structured data.

    Returns a dict with the cleaned title, a short description, the list of
    non-empty subcategories (each ``{'name', 'parent', 'entries'}``) and
    summary counts, or ``None`` when the file cannot be read.
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
    except OSError:
        # Unreadable file (permissions, race with the git checkout): skip it.
        # Narrowed from a bare `except:` which also swallowed KeyboardInterrupt.
        return None
    lines = content.split('\n')
    subcategories = []
    current_sub = None
    title = ''
    description = ''
    github_url = ''  # never derived from the README body; kept for schema stability
    stars = ''       # likewise — always empty here
    # Title: the first level-1 heading.
    for line in lines:
        if line.startswith('# '):
            title = line[2:].strip()
            break
    # Description: first plain-text line after the title, stopping at the
    # next heading. Link-/image-only lines (badges) are skipped.
    in_desc = False
    for line in lines:
        if line.startswith('# '):
            in_desc = True
            continue
        if in_desc:
            stripped = line.strip()
            if stripped and not stripped.startswith(('#', '[', '!')):
                description = stripped
                break
            elif stripped.startswith('#'):
                break
    # Sections (## .. ######) and their bullet-list link entries.
    link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    skip_headers = ('contents', 'table of contents', 'toc', 'license', 'contributing', 'footnotes')
    for line in lines:
        stripped = line.strip()
        header_match = re.match(r'^(#{2,6})\s+(.+)', stripped)
        if header_match:
            header_name = header_match.group(2).strip()
            # Drop trailing markdown links (e.g. back-to-top anchors) from the header.
            header_name = re.sub(r'\s*\[.*?\]\(.*?\)', '', header_name).strip()
            if header_name and header_name.lower() not in skip_headers:
                current_sub = {'name': header_name, 'parent': '', 'entries': []}
                subcategories.append(current_sub)
            continue
        # List items that contain at least one markdown link.
        if stripped.startswith(('-', '*')) and '[' in stripped and '](' in stripped:
            matches = link_pattern.findall(stripped)
            if matches:
                entry_name, entry_url = matches[0]
                entry_desc = ''
                # Free text after the first closing paren, minus a leading dash/colon.
                desc_match = re.search(r'\)\s*[-–—:]?\s*(.+)', stripped)
                if desc_match:
                    entry_desc = desc_match.group(1).strip()
                if current_sub is None:
                    # Entries before any section header land in a synthetic bucket.
                    current_sub = {'name': 'General', 'parent': '', 'entries': []}
                    subcategories.append(current_sub)
                current_sub['entries'].append({'name': entry_name, 'url': entry_url, 'description': entry_desc})
    # Keep only sections that actually collected entries.
    subcategories = [s for s in subcategories if s['entries']]
    total_entries = sum(len(s['entries']) for s in subcategories)
    return {
        'title': clean_name(title),
        'description': description[:300],
        'github_url': github_url,
        'stars': stars,
        'entry_count': total_entries,
        'subcategory_count': len(subcategories),
        'subcategories': subcategories,
        'name': ''
    }
def classify_list(slug, title, description):
    """Assign a list to a sector based on slug/title/description matching.

    Tags are matched as whole words (bounded by non-alphanumerics) rather
    than raw substrings: substring matching mis-filed nearly everything,
    since single-letter tags like 'r', 'c' or 'd' occur inside almost any
    text and e.g. 'go' matches inside 'django'.

    Returns the first sector code (in SECTOR_MAP order) with a matching
    tag, or 'PRP-028' when nothing matches.
    """
    text = f"{slug} {title} {description}".lower()
    for code, sector in SECTOR_MAP.items():
        if code == 'PRP-028':  # Misc is the fallback, never matched by tag
            continue
        for tag in sector['tags']:
            # Boundary lookarounds instead of \b so hyphenated tags
            # ('react-native') still match as a unit.
            if re.search(rf'(?<![a-z0-9]){re.escape(tag)}(?![a-z0-9])', text):
                return code
    return 'PRP-028'  # Miscellaneous fallback
def build_data():
    """Rebuild all per-list JSON files and the sector index from the local clone.

    Walks REPO_DIR/content/<org>/<repo>/README.md, parses each readme,
    classifies it into a sector, writes one JSON document per list into
    OUTPUT_DIR, then writes the aggregated index to OUTPUT_INDEX.
    Returns True on success, False when the content directory is missing.
    """
    content_root = Path(REPO_DIR) / 'content'
    if not content_root.is_dir():
        log(f"ERROR: content dir not found: {content_root}")
        return False
    out_root = Path(OUTPUT_DIR)
    out_root.mkdir(parents=True, exist_ok=True)
    # Remove stale output from previous runs so deleted lists disappear.
    for stale in out_root.iterdir():
        if stale.name.endswith('.json'):
            stale.unlink()
    # One accumulator per sector, carrying over the static metadata.
    sectors = {
        code: dict(info, code=code, lists=[], list_count=0, total_entries=0)
        for code, info in SECTOR_MAP.items()
    }
    total_lists = 0
    total_entries = 0
    for org_path in sorted(content_root.iterdir()):
        if not org_path.is_dir():
            continue
        for repo_path in sorted(org_path.iterdir()):
            readme = repo_path / 'README.md'
            if not readme.is_file():
                # Some repos ship a lowercase readme.md instead.
                readme = repo_path / 'readme.md'
                if not readme.is_file():
                    continue
            slug = f"{org_path.name}--{repo_path.name}"
            parsed = parse_readme(str(readme))
            if parsed is None or parsed['entry_count'] == 0:
                continue
            parsed['slug'] = slug
            sector_code = classify_list(slug, parsed['title'], parsed['description'])
            parsed['tag'] = sector_code
            # One JSON document per list, addressed by slug.
            with open(out_root / f"{slug}.json", 'w') as fh:
                json.dump(parsed, fh)
            bucket = sectors[sector_code]
            bucket['lists'].append({
                'slug': slug,
                'title': parsed['title'],
                'description': parsed['description'][:200],
                'stars': parsed.get('stars', ''),
                'entry_count': parsed['entry_count'],
                'subcategory_count': parsed['subcategory_count']
            })
            bucket['list_count'] += 1
            bucket['total_entries'] += parsed['entry_count']
            total_lists += 1
            total_entries += parsed['entry_count']
    # Biggest lists first within each sector.
    for bucket in sectors.values():
        bucket['lists'].sort(key=lambda item: item['entry_count'], reverse=True)
    # Only sectors that actually received lists appear in the index.
    populated = sorted(
        (s for s in sectors.values() if s['list_count'] > 0),
        key=lambda s: s['code'],
    )
    index = {
        'total_lists': total_lists,
        'total_entries': total_entries,
        'sector_count': len(populated),
        'sectors': populated
    }
    with open(OUTPUT_INDEX, 'w') as fh:
        json.dump(index, fh)
    log(f"Built {total_lists} lists, {total_entries} entries across {len(populated)} sectors")
    return True
def restart_api():
    """Best-effort restart of the homepage API so it picks up the new JSON.

    A failed restart is logged as a warning but never raises to the caller.
    """
    log("Restarting API service...")
    try:
        subprocess.run(
            ['systemctl', 'restart', 'jaeswift-api'],
            check=True, capture_output=True,
        )
    except Exception as exc:
        log(f"WARNING: Could not restart API: {exc}")
    else:
        log("API restarted.")
def main():
    """Run one full sync cycle: pull the repo, rebuild the data, bounce the API.

    Any unexpected exception logs a failure banner and exits with status 1.
    """
    log("=" * 60)
    log("AWESOMELIST SYNC STARTED")
    try:
        pull_repo()
        if not build_data():
            log("SYNC FAILED - build error")
        else:
            restart_api()
            log("SYNC COMPLETED SUCCESSFULLY")
    except Exception as exc:
        log(f"SYNC FAILED: {exc}")
        sys.exit(1)

if __name__ == '__main__':
    main()