552 lines
18 KiB
Python
552 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""JAESWIFT Visitor Intelligence Endpoints
|
|
Provides /api/visitor/* endpoints:
|
|
/api/visitor/scan - scan current visitor (IP/geo/UA/device/threat)
|
|
/api/visitor/recent-arcs - last N visitor lat/lon pairs for globe traffic arcs
|
|
"""
|
|
import os, re, time, socket, json
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from collections import OrderedDict, Counter
|
|
from functools import lru_cache
|
|
|
|
from flask import Blueprint, jsonify, request
|
|
|
|
# Flask blueprint that carries every route defined in this module.
visitor_bp = Blueprint('visitor', __name__)

# Directory holding bundled data files, resolved relative to this module.
DATA_DIR = Path(__file__).parent / 'data'
# nginx access log parsed by the arcs and leaderboard endpoints.
NGINX_LOG = '/var/log/nginx/access.log'
# Candidate MaxMind databases, probed in order; the first usable file wins.
# NOTE(review): a Country database is listed ahead of the City one, so when
# both are installed the City data (city name, lat/lon) is never used -
# confirm that this ordering is intended.
GEOIP_PATHS = [
    '/usr/share/GeoIP/GeoLite2-Country.mmdb',
    '/usr/share/GeoIP/GeoLite2-City.mmdb',
    '/var/lib/GeoIP/GeoLite2-Country.mmdb',
]
# JSON file with fallback coordinates, read by _load_centroids().
CENTROIDS_FILE = DATA_DIR / 'country_centroids.json'
|
|
|
|
# ─── Lazy imports (optional deps) ──────────────────────
|
|
_geoip_reader = None      # cached geoip2 reader; None until one has been opened
_geoip_has_city = False   # True when the cached reader was opened from a City database


def _get_geoip():
    """Return a cached GeoIP reader, opening one on first use.

    ``geoip2`` is an optional dependency and is imported lazily. The paths in
    ``GEOIP_PATHS`` are probed in order and the first database that exists
    and opens successfully is cached in ``_geoip_reader``; ``_geoip_has_city``
    records whether its file name marks it as a City database.

    Returns:
        The cached reader, or ``None`` when ``geoip2`` cannot be imported or
        no database could be opened. A ``None`` result is not cached, so a
        later call probes the paths again.
    """
    global _geoip_reader, _geoip_has_city
    if _geoip_reader is not None:
        return _geoip_reader
    try:
        import geoip2.database
    except Exception:
        # Optional dependency not installed (or broken): geo lookups are disabled.
        return None
    for p in GEOIP_PATHS:
        if not os.path.exists(p):
            continue
        try:
            reader = geoip2.database.Reader(p)
        except Exception:
            # An unreadable/corrupt file must not stop us trying the next path.
            continue
        _geoip_reader = reader
        # Look at the file name only, so a directory called "City" cannot
        # make a Country database look like a City one.
        _geoip_has_city = 'City' in os.path.basename(p)
        return _geoip_reader
    return None
|
|
|
|
|
|
def _parse_ua(ua_string):
    """Parse a User-Agent string into browser, OS and device fields.

    Uses the optional ``user_agents`` library when it can be imported; on any
    failure (library missing, parse error) it falls back to the crude
    substring parser in ``_parse_ua_fallback``.

    Args:
        ua_string: Raw ``User-Agent`` header value; ``None`` or ``''`` is allowed.

    Returns:
        dict with keys ``browser``, ``browser_version``, ``os``,
        ``os_version``, ``device`` and ``is_bot``.
    """
    try:
        from user_agents import parse

        ua = parse(ua_string or '')
        if ua.is_mobile:
            device = 'Mobile'
        elif ua.is_tablet:
            device = 'Tablet'
        elif ua.is_bot:
            device = 'Bot'
        else:
            device = 'Desktop'
        return {
            'browser': ua.browser.family or 'Unknown',
            # Only major.minor is reported.
            'browser_version': '.'.join(str(x) for x in ua.browser.version[:2] if x is not None),
            'os': ua.os.family or 'Unknown',
            'os_version': '.'.join(str(x) for x in ua.os.version[:2] if x is not None),
            'device': device,
            'is_bot': bool(ua.is_bot),
        }
    except Exception:
        # Best effort by design: never let UA parsing break the request.
        return _parse_ua_fallback(ua_string)


def _parse_ua_fallback(ua_string):
    """Crude substring/regex User-Agent parser used when ``user_agents`` fails."""
    s = (ua_string or '').lower()

    # Order matters: Edge and Chrome UAs also contain "safari", and Edge
    # UAs also contain "chrome".
    browser, bver, ver_pat = 'Unknown', '', None
    if 'firefox' in s:
        browser, ver_pat = 'Firefox', r'firefox/([0-9.]+)'
    elif 'edg/' in s:
        browser, ver_pat = 'Edge', r'edg/([0-9.]+)'
    elif 'chrome' in s:
        browser, ver_pat = 'Chrome', r'chrome/([0-9.]+)'
    elif 'safari' in s:
        browser = 'Safari'
    if ver_pat:
        m = re.search(ver_pat, s)
        bver = m.group(1)[:6] if m else ''

    is_bot = any(k in s for k in ('bot', 'crawl', 'spider', 'wget', 'curl'))

    # iPhone/iPad must be tested before macOS: their UAs contain
    # "like Mac OS X" and would otherwise be reported as macOS.
    if 'windows' in s:
        os_family = 'Windows'
    elif 'iphone' in s or 'ipad' in s:
        os_family = 'iOS'
    elif 'mac os' in s or 'macintosh' in s:
        os_family = 'macOS'
    elif 'android' in s:
        os_family = 'Android'
    elif 'ios' in s:
        os_family = 'iOS'
    elif 'linux' in s:
        os_family = 'Linux'
    else:
        os_family = 'Unknown'

    if 'mobile' in s or 'android' in s or 'iphone' in s:
        device = 'Mobile'
    elif is_bot:
        device = 'Bot'
    else:
        device = 'Desktop'

    return {
        'browser': browser, 'browser_version': bver,
        'os': os_family, 'os_version': '',
        'device': device, 'is_bot': is_bot,
    }
|
|
|
|
|
|
def _mask_ip(ip):
    """Return a partially redacted form of *ip* that is safe to display.

    * Falsy input            -> ``'UNKNOWN'``
    * IPv4 ``a.b.c.d``       -> ``'a.***.***.d'``
    * IPv6 with 3+ groups    -> ``'first:****:****:last'``; when the last
      group is an embedded IPv4 address (``::ffff:a.b.c.d``) that address is
      masked too, so the full IPv4 is not leaked
    * Anything else is returned unchanged.
    """
    if not ip:
        return 'UNKNOWN'

    def mask_v4(addr):
        octets = addr.split('.')
        if len(octets) == 4:
            return f"{octets[0]}.***.***.{octets[3]}"
        return addr

    if ':' in ip:
        groups = ip.split(':')
        if len(groups) < 3:
            return ip
        # mask_v4 is a no-op for an ordinary hex group (it contains no dots).
        return f"{groups[0]}:****:****:{mask_v4(groups[-1])}"
    return mask_v4(ip)
|
|
|
|
|
|
def _client_ip():
    """Return the best-guess client IP for the current Flask request.

    Preference order: first non-empty entry of ``X-Forwarded-For``, then
    ``X-Real-IP``, then ``request.remote_addr``. Returns ``''`` when none of
    them yields a value.

    NOTE(review): both headers are client-controlled unless the fronting
    proxy overwrites them, so the value can be spoofed (which also lets a
    caller dodge the per-IP rate limit) - confirm the proxy configuration.
    """
    forwarded = request.headers.get('X-Forwarded-For', '')
    for candidate in forwarded.split(','):
        candidate = candidate.strip()
        if candidate:
            # Skip blank entries instead of returning '' for ", 1.2.3.4".
            return candidate
    real_ip = request.headers.get('X-Real-IP', '')
    if real_ip:
        return real_ip.strip()
    return request.remote_addr or ''
|
|
|
|
|
|
def _reverse_dns(ip, timeout=1.0):
    """Return the reverse-DNS host name for *ip*, or ``''`` on failure/timeout.

    The lookup runs in a daemon thread that is joined for at most *timeout*
    seconds. ``socket.setdefaulttimeout`` is deliberately not used: it only
    sets the default for newly created socket objects, so it does not bound
    ``gethostbyaddr``, and changing it alters process-wide state.

    Args:
        ip: Address to resolve; falsy input returns ``''`` without a lookup.
        timeout: Maximum number of seconds to wait for the resolver.
    """
    import threading

    if not ip:
        return ''

    found = []

    def lookup():
        try:
            found.append(socket.gethostbyaddr(ip)[0])
        except Exception:
            pass  # unresolvable or invalid address: leave `found` empty

    worker = threading.Thread(target=lookup, daemon=True)
    worker.start()
    worker.join(timeout)
    # If the resolver is still running it is abandoned; the daemon thread
    # finishes (or dies with the process) on its own.
    return found[0] if found else ''
|
|
|
|
|
|
# Substring -> provider name, checked in insertion order against the
# lower-cased reverse-DNS host name. Built once instead of on every call.
_KNOWN_ISPS = {
    'bt.com': 'British Telecommunications',
    'btcentralplus': 'British Telecommunications',
    'virginm.net': 'Virgin Media',
    'sky.com': 'Sky Broadband',
    'talktalk': 'TalkTalk',
    'plus.net': 'Plusnet',
    'vodafone': 'Vodafone',
    'three.co.uk': 'Three UK',
    'ee.co.uk': 'EE',
    'comcast': 'Comcast',
    'verizon': 'Verizon',
    'amazonaws': 'Amazon AWS',
    'googleusercontent': 'Google Cloud',
    'googlebot': 'Google (Bot)',
    'azure': 'Microsoft Azure',
    'hetzner': 'Hetzner',
    'digitalocean': 'DigitalOcean',
    'ovh': 'OVH',
    'cloudflare': 'Cloudflare',
    'linode': 'Linode',
    'deutsche-telekom': 'Deutsche Telekom',
    'telekom': 'Deutsche Telekom',
    'orange.fr': 'Orange',
    'free.fr': 'Free',
}

# Generic second-level labels that sit under a two-letter country TLD
# ("co.uk", "com.au", ...); the registrable name is then one label further left.
_SECOND_LEVEL_LABELS = frozenset({'co', 'com', 'net', 'org', 'ac', 'gov', 'edu'})


def _isp_guess(hostname):
    """Guess the ISP from a reverse-DNS host name - crude but free.

    Known provider substrings are tried first. Otherwise the registrable
    domain is returned with its first label capitalised, e.g.
    ``host.example.net`` -> ``Example.net`` and ``host.example.co.uk`` ->
    ``Example.co.uk``.

    Returns:
        The guessed provider string, or ``'REDACTED'`` when *hostname* is
        empty or has fewer than two labels.
    """
    if not hostname:
        return 'REDACTED'
    # A fully-qualified name may end with a dot; drop it so it does not
    # become an empty last label.
    h = hostname.lower().rstrip('.')
    for key, val in _KNOWN_ISPS.items():
        if key in h:
            return val
    parts = h.split('.')
    if len(parts) < 2:
        return 'REDACTED'
    if len(parts) >= 3 and len(parts[-1]) == 2 and parts[-2] in _SECOND_LEVEL_LABELS:
        return parts[-3].capitalize() + '.' + parts[-2] + '.' + parts[-1]
    return parts[-2].capitalize() + '.' + parts[-1]
|
|
|
|
|
|
def _country_flag(cc):
    """Return the flag emoji for a two-letter country code, or ``''``.

    The emoji is the pair of Unicode regional-indicator symbols matching the
    two letters (case-insensitive). Anything that is not exactly two ASCII
    letters - ``None``, ``''``, ``'GBR'``, ``'12'``, non-strings - yields ``''``.
    """
    if not isinstance(cc, str) or len(cc) != 2:
        return ''
    # Reject digits/punctuation/non-ASCII, which would map to unrelated code points.
    if not (cc.isascii() and cc.isalpha()):
        return ''
    offset = 0x1F1E6 - ord('A')  # U+1F1E6 is REGIONAL INDICATOR SYMBOL LETTER A
    return ''.join(chr(offset + ord(ch)) for ch in cc.upper())
|
|
|
|
|
|
# ─── Rate limiting (in-memory, simple) ─────────────────
|
|
# ip -> time.time() of the last accepted request, kept oldest-first.
_rate_cache = OrderedDict()
_RATE_LIMIT_SECS = 10


def _rate_limited(ip):
    """Return True when *ip* was already served within ``_RATE_LIMIT_SECS``.

    A request that is not limited is recorded as the new "last accepted"
    time for *ip*. The table is in-memory and per-process, and holds at most
    1000 entries (oldest evicted first).
    """
    now = time.time()

    # Entries are ordered by acceptance time, so expired ones sit at the
    # front; stop at the first live entry instead of scanning the table.
    while _rate_cache:
        oldest = next(iter(_rate_cache))
        if now - _rate_cache[oldest] <= _RATE_LIMIT_SECS:
            break
        del _rate_cache[oldest]

    last = _rate_cache.get(ip)
    if last is not None and now - last < _RATE_LIMIT_SECS:
        return True

    _rate_cache[ip] = now
    # Re-assigning an existing key keeps its old position; move it so the
    # oldest-first ordering relied on above stays true.
    _rate_cache.move_to_end(ip)

    # cap size
    while len(_rate_cache) > 1000:
        _rate_cache.popitem(last=False)
    return False
|
|
|
|
|
|
# ─── Country centroids (lat/lon) for arcs ──────────────
|
|
_CENTROIDS = None   # parsed centroid table; None until the first load attempt


def _load_centroids():
    """Return the country-centroid table, loading ``CENTROIDS_FILE`` once.

    The parsed JSON is cached in ``_CENTROIDS``. If the file is missing,
    unreadable, not valid JSON, or does not hold a JSON object, an empty
    dict is cached instead, so the file is only ever attempted once per
    process.
    """
    global _CENTROIDS
    if _CENTROIDS is not None:
        return _CENTROIDS
    try:
        with open(CENTROIDS_FILE, encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file or malformed JSON: run without centroids.
        data = None
    # Callers index the table by country code, so anything but a dict is unusable.
    _CENTROIDS = data if isinstance(data, dict) else {}
    return _CENTROIDS
|
|
|
|
|
|
# ─── /api/visitor/scan ─────────────────────────────────
|
|
@visitor_bp.route('/api/visitor/scan')
def visitor_scan():
    """Describe the current visitor: masked IP, geo, ISP guess, UA and threat level.

    Returns a JSON object, or HTTP 429 with a ``Retry-After`` header when the
    same client IP has been served within ``_RATE_LIMIT_SECS``.
    """
    ip = _client_ip()
    ua_string = request.headers.get('User-Agent', '')
    # First language tag only, without any ";q=" weight.
    lang = request.headers.get('Accept-Language', 'en').split(',')[0].split(';')[0].strip()

    if _rate_limited(ip):
        body = jsonify({'error': 'rate_limited', 'retry_after': _RATE_LIMIT_SECS})
        return body, 429, {'Retry-After': str(_RATE_LIMIT_SECS)}

    country = 'UNKNOWN'
    country_code = 'XX'
    city = ''
    latlon = None  # filled from a City database; not part of the response

    reader = _get_geoip()
    if reader and ip:
        try:
            if _geoip_has_city:
                resp = reader.city(ip)
                city = resp.city.name or ''
                if resp.location.latitude is not None:
                    latlon = [resp.location.latitude, resp.location.longitude]
            else:
                resp = reader.country(ip)
            country = resp.country.name or 'UNKNOWN'
            country_code = resp.country.iso_code or 'XX'
        except Exception:
            # Lookup failed (e.g. address not in the database): keep the
            # placeholder values set above.
            pass

    ua_info = _parse_ua(ua_string)

    # ISP via reverse DNS (1s timeout)
    hostname = (_reverse_dns(ip, timeout=1.0) if ip else '') or ''
    isp = _isp_guess(hostname)

    # Threat heuristic: bots and TOR exits are AMBER; TOR wins when both apply.
    threat_level = 'GREEN'
    threat_reason = 'TRUSTED OPERATOR'
    if ua_info.get('is_bot'):
        threat_level = 'AMBER'
        threat_reason = 'AUTOMATED AGENT DETECTED'
    # crude TOR heuristic via hostname
    host_lc = hostname.lower()
    is_tor = 'tor-exit' in host_lc or 'torproject' in host_lc
    if is_tor:
        threat_level = 'AMBER'
        threat_reason = 'TOR EXIT NODE'

    return jsonify({
        'ip_masked': _mask_ip(ip),
        'country': country,
        'country_code': country_code,
        'country_flag': _country_flag(country_code),
        'city': city,
        'isp': isp,
        'hostname': hostname,
        'browser': ua_info['browser'],
        'browser_version': ua_info['browser_version'],
        'os': ua_info['os'],
        'os_version': ua_info['os_version'],
        'device': ua_info['device'],
        'language': lang,
        'threat_level': threat_level,
        'threat_reason': threat_reason,
        'is_tor': is_tor,
        'is_bot': ua_info['is_bot'],
        'timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
    })
|
|
|
|
|
|
# ─── /api/visitor/recent-arcs ──────────────────────────
|
|
# Last computed arcs plus the time.time() at which they were built.
_arcs_cache = {'ts': 0, 'data': []}
_ARCS_CACHE_SECS = 300

# Requests that are not page views and therefore produce no arc.
_ARC_SKIP_PREFIXES = ('/api/', '/css/', '/js/', '/assets/', '/fonts/', '/mascot/')
_ARC_SKIP_SUFFIXES = ('.ico', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp',
                      '.woff', '.woff2', '.map')

# Typical combined log:
# IP - - [dd/Mmm/yyyy:HH:MM:SS +0000] "METHOD /path HTTP/1.1" status size "ref" "ua"
_ARC_LOG_PAT = re.compile(r'^(\S+) .* \[([^\]]+)\] "(\S+) (\S+) \S+" (\d+)')


def _parse_nginx_recent_ips(limit=20000, log_path=None):
    """Read the tail of the nginx log and return recent page-view requests.

    Args:
        limit: Number of trailing log lines to inspect.
        log_path: Log file to read; defaults to ``NGINX_LOG``.

    Returns:
        List of ``(ip, timestamp, path)`` tuples in log order, restricted to
        GET requests that are not API/asset requests. ``path`` is returned
        as logged (query string included). Returns ``[]`` when the log is
        missing or ``tail`` fails.
    """
    import subprocess

    log_path = NGINX_LOG if log_path is None else log_path
    if not os.path.exists(log_path):
        return []
    try:
        # errors='replace': one undecodable byte in the log must not raise
        # and throw away the whole result.
        proc = subprocess.run(['tail', '-n', str(limit), str(log_path)],
                              capture_output=True, text=True, errors='replace',
                              timeout=8)
    except Exception:
        return []

    rows = []
    for line in proc.stdout.split('\n'):
        m = _ARC_LOG_PAT.match(line)
        if not m:
            continue
        ip, ts, method, path, _status = m.groups()
        if method != 'GET':
            continue
        # skip asset/api requests for arc relevance
        if path.startswith(_ARC_SKIP_PREFIXES):
            continue
        # Test the extension on the path without its query string, so
        # "/logo.png?v=2" is still recognised as an asset.
        if path.split('?', 1)[0].endswith(_ARC_SKIP_SUFFIXES):
            continue
        rows.append((ip, ts, path))
    return rows
|
|
|
|
|
|
@visitor_bp.route('/api/visitor/recent-arcs')
def visitor_recent_arcs():
    """Return up to 50 recent visitor locations for the globe's traffic arcs.

    Each element has ``country_code``, ``country_name``, ``lat``, ``lon``,
    ``timestamp`` and ``page_viewed``. The result (including an empty one)
    is cached for ``_ARCS_CACHE_SECS`` seconds.
    """
    import random

    now = time.time()
    # Keyed on the timestamp, not the data: an empty result is cached too,
    # so the log is not re-read on every request when nothing can be located.
    if _arcs_cache['ts'] and now - _arcs_cache['ts'] < _ARCS_CACHE_SECS:
        return jsonify(_arcs_cache['data'])

    rows = _parse_nginx_recent_ips(20000)
    reader = _get_geoip()
    centroids = _load_centroids()

    # Last-seen request per IP, ordered by last appearance in the log.
    seen_ips = OrderedDict()
    for ip, ts, path in rows:
        seen_ips[ip] = (ts, path)
        # Assignment alone keeps a repeat visitor at its first-seen position.
        seen_ips.move_to_end(ip)

    # Build arcs - most recent 50 unique, drawn from the 200 latest IPs.
    arcs = []
    for ip, (ts, path) in reversed(list(seen_ips.items())[-200:]):
        if len(arcs) >= 50:
            break
        cc = 'XX'
        country_name = 'Unknown'
        lat = lon = None
        if reader:
            try:
                if _geoip_has_city:
                    r = reader.city(ip)
                    if r.location.latitude is not None:
                        lat, lon = r.location.latitude, r.location.longitude
                else:
                    r = reader.country(ip)
                cc = r.country.iso_code or 'XX'
                country_name = r.country.name or 'Unknown'
            except Exception:
                # Address could not be looked up: no arc for it.
                continue
        if lat is None and cc in centroids:
            try:
                lat, lon = float(centroids[cc][0]), float(centroids[cc][1])
            except (TypeError, ValueError, IndexError, KeyError):
                # Malformed centroid entry: treat as "no location".
                lat = lon = None
        if lat is None or lon is None:
            continue
        # jitter slightly so multiple from same country don't overlap exactly,
        # keeping the result inside valid latitude/longitude ranges
        lat = max(-90.0, min(90.0, lat + (random.random() - 0.5) * 1.5))
        lon = (lon + (random.random() - 0.5) * 1.5 + 180.0) % 360.0 - 180.0
        arcs.append({
            'country_code': cc,
            'country_name': country_name,
            'lat': round(lat, 3),
            'lon': round(lon, 3),
            'timestamp': ts,
            'page_viewed': path,
        })

    _arcs_cache['ts'] = now
    _arcs_cache['data'] = arcs
    return jsonify(arcs)
|
|
|
|
|
|
# ─── /api/leaderboards ─────────────────────────────────
|
|
# Last computed leaderboard payload plus the time.time() at which it was built.
_lb_cache = {'ts': 0, 'data': None}
_LB_CACHE_SECS = 60

# Browser token pattern for User-Agent strings.
# NOTE: with ``.search`` the leftmost token wins, and an Edge User-Agent
# carries "Chrome/" before "Edg/", so this pattern alone reports Edge as Chrome.
_UA_BROWSER_PAT = re.compile(r'(Firefox|Edg|Chrome|Safari|Opera|DuckDuckGo|SamsungBrowser|MSIE|Trident)/?([0-9.]*)', re.I)

# nginx "combined" format: ip, timestamp, method, path, status, referer, user-agent.
_LB_LOG_PAT = re.compile(r'^(\S+) \S+ \S+ \[([^\]]+)\] "(\S+) (\S+) \S+" (\d+) \S+ "([^"]*)" "([^"]*)"')


def _lb_parse_log(limit_lines=50000, log_path=None):
    """Parse the tail of the nginx log for leaderboard data.

    Args:
        limit_lines: Number of trailing log lines to inspect.
        log_path: Log file to read; defaults to ``NGINX_LOG``.

    Returns:
        List of dicts with keys ``ip``, ``ts``, ``method``, ``path``,
        ``status`` (int), ``referer`` and ``ua``, in log order. Lines that do
        not match the combined format are skipped. Returns ``[]`` when the
        log is missing or ``tail`` fails.
    """
    import subprocess

    log_path = NGINX_LOG if log_path is None else log_path
    if not os.path.exists(log_path):
        return []
    try:
        # errors='replace': one undecodable byte in the log must not raise
        # and throw away the whole result.
        proc = subprocess.run(['tail', '-n', str(limit_lines), str(log_path)],
                              capture_output=True, text=True, errors='replace',
                              timeout=10)
    except Exception:
        return []

    rows = []
    for line in proc.stdout.split('\n'):
        m = _LB_LOG_PAT.match(line)
        if not m:
            continue
        ip, ts, method, path, status, referer, ua = m.groups()
        rows.append({
            'ip': ip, 'ts': ts, 'method': method, 'path': path,
            'status': int(status) if status.isdigit() else 0,
            'referer': referer, 'ua': ua,
        })
    return rows
|
|
|
|
|
|
def _parse_nginx_ts(ts_str):
    """Parse an nginx ``$time_local`` value into a naive UTC ``datetime``.

    ``'10/Oct/2023:13:55:36 +0200'`` becomes ``datetime(2023, 10, 10, 11, 55, 36)``:
    the offset is applied and then dropped, so the result can be compared
    with a naive UTC "now". A value without an offset is parsed as-is.

    Returns:
        A naive ``datetime``, or ``None`` when the value cannot be parsed.
    """
    try:
        aware = datetime.strptime(ts_str.strip(), '%d/%b/%Y:%H:%M:%S %z')
    except (ValueError, TypeError, AttributeError):
        aware = None
    if aware is not None:
        return aware.astimezone(timezone.utc).replace(tzinfo=None)
    # No usable offset: keep the historical behaviour of parsing the
    # date/time part alone.
    try:
        return datetime.strptime(ts_str.split()[0], '%d/%b/%Y:%H:%M:%S')
    except Exception:
        return None
|
|
|
|
|
|
# Requests that are not page views and are left out of "top pages".
_LB_SKIP_PREFIXES = ('/api/', '/css/', '/js/', '/assets/', '/fonts/', '/mascot/')
_LB_SKIP_SUFFIXES = ('.ico', '.png', '.jpg', '.jpeg', '.svg', '.webp', '.woff',
                     '.woff2', '.gif', '.map')

# Host part of an absolute http(s) referrer URL.
_LB_REF_HOST_PAT = re.compile(r'https?://([^/]+)')

# (lower-case token, label), most specific first: Edge/Opera/Samsung UAs
# also contain "chrome", and Chrome UAs also contain "safari".
_LB_BROWSER_TOKENS = (
    ('edg/', 'Edge'), ('edga/', 'Edge'), ('edgios/', 'Edge'), ('edge/', 'Edge'),
    ('opr/', 'Opera'), ('opera', 'Opera'),
    ('samsungbrowser', 'SamsungBrowser'),
    ('duckduckgo', 'DuckDuckGo'),
    ('firefox', 'Firefox'), ('fxios', 'Firefox'),
    ('chrome', 'Chrome'), ('crios', 'Chrome'),
    ('safari', 'Safari'),
    ('msie', 'MSIE'),
    ('trident', 'Trident'),
)


def _lb_browser_name(ua):
    """Classify a User-Agent string into a browser label, 'Bot/CLI' or 'Other'."""
    s = (ua or '').lower()
    for token, label in _LB_BROWSER_TOKENS:
        if token in s:
            return label
    if any(k in s for k in ('bot', 'crawl', 'spider', 'curl', 'wget', 'python')):
        return 'Bot/CLI'
    return 'Other'


def _lb_ago(seconds):
    """Format an age in seconds as 'Ns ago' / 'Nm ago' / 'Nh ago' / 'Nd ago'."""
    seconds = max(0, int(seconds))  # clock skew must not produce "-5s ago"
    if seconds < 60:
        return f"{seconds}s ago"
    if seconds < 3600:
        return f"{seconds // 60}m ago"
    if seconds < 86400:
        return f"{seconds // 3600}h ago"
    return f"{seconds // 86400}d ago"


@visitor_bp.route('/api/leaderboards')
def leaderboards():
    """Return traffic leaderboards aggregated from the nginx access log.

    The payload holds request totals (24h / 7d), top countries, top pages,
    peak hours and browser breakdown for the last 24 hours, and top
    referrers and top client IPs (masked) for the last 7 days. It is cached
    for ``_LB_CACHE_SECS`` seconds.
    """
    now = time.time()
    if _lb_cache['data'] and now - _lb_cache['ts'] < _LB_CACHE_SECS:
        return jsonify(_lb_cache['data'])

    rows = _lb_parse_log(50000)
    reader = _get_geoip()
    # Naive UTC "now", to match the naive datetimes from _parse_nginx_ts.
    now_dt = datetime.now(timezone.utc).replace(tzinfo=None)

    # split 24h / 7d (every 24h row is also a 7d row)
    rows_24h = []
    rows_7d = []
    for r in rows:
        dt = _parse_nginx_ts(r['ts'])
        if not dt:
            continue
        age_h = (now_dt - dt).total_seconds() / 3600
        if age_h > 24 * 7:
            continue
        row = {**r, 'dt': dt}
        rows_7d.append(row)
        if age_h <= 24:
            rows_24h.append(row)

    # Top Countries (24h) - one GeoIP lookup per distinct IP, weighted by hits
    country_counter = Counter()
    if reader:
        lookup = reader.city if _geoip_has_city else reader.country
        for ip, hits in Counter(r['ip'] for r in rows_24h).items():
            try:
                resp = lookup(ip)
                cc = resp.country.iso_code or 'XX'
                name = resp.country.name or 'Unknown'
            except Exception:
                # Address could not be looked up: not counted.
                continue
            country_counter[(cc, name)] += hits
    top_countries = [{'code': cc, 'name': name, 'count': c, 'flag': _country_flag(cc)}
                     for (cc, name), c in country_counter.most_common(15)]

    # Top Pages (24h) - successful GETs only, exclude api/assets
    pages_counter = Counter()
    for r in rows_24h:
        if r['method'] != 'GET' or r['status'] >= 400:
            continue
        p = r['path'].split('?', 1)[0]
        if p.startswith(_LB_SKIP_PREFIXES) or p.endswith(_LB_SKIP_SUFFIXES):
            continue
        pages_counter[p] += 1
    top_pages = [{'path': p, 'count': c} for p, c in pages_counter.most_common(20)]

    # Top Referrers (7d) - exclude self/empty
    ref_counter = Counter()
    for r in rows_7d:
        ref = r['referer']
        if not ref or ref == '-':
            continue
        m = _LB_REF_HOST_PAT.match(ref)
        if not m:
            continue
        host = m.group(1).lower()
        # Decide "self" on the host alone (without userinfo/port), so a
        # foreign URL that merely mentions the site in its path or query
        # string is still counted.
        bare = host.rsplit('@', 1)[-1].split(':', 1)[0]
        if bare == 'jaeswift.xyz' or bare.endswith('.jaeswift.xyz'):
            continue
        ref_counter[host] += 1
    top_referrers = [{'host': h, 'count': c} for h, c in ref_counter.most_common(10)]

    # Peak Hours (24h)
    hour_counter = Counter(r['dt'].hour for r in rows_24h)
    peak_hours = [{'hour': h, 'count': hour_counter.get(h, 0)} for h in range(24)]

    # Browser Breakdown (24h)
    browser_counter = Counter(_lb_browser_name(r['ua']) for r in rows_24h)
    browsers = [{'name': n, 'count': c} for n, c in browser_counter.most_common(10)]

    # Operator Leaderboard (7d) - top IPs
    ip_counter = Counter()
    ip_last_seen = {}
    for r in rows_7d:
        ip_counter[r['ip']] += 1
        ip_last_seen[r['ip']] = r['dt']  # rows are in log order, so the last one wins
    top_ops = [{
        'ip_masked': _mask_ip(ip),
        'count': c,
        'last_seen': _lb_ago((now_dt - ip_last_seen[ip]).total_seconds()),
    } for ip, c in ip_counter.most_common(10)]

    data = {
        'generated_at': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        'total_requests_24h': len(rows_24h),
        'total_requests_7d': len(rows_7d),
        'top_countries': top_countries,
        'top_pages': top_pages,
        'top_referrers': top_referrers,
        'peak_hours': peak_hours,
        'browsers': browsers,
        'top_operators': top_ops,
    }
    _lb_cache['ts'] = now
    _lb_cache['data'] = data
    return jsonify(data)
|