Spaces:
Building
Building
from flask import Flask, render_template, Response | |
from sonatoki.ilo import Ilo | |
from sonatoki.Configs import PrefConfig, CorpusConfig | |
from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message | |
from atproto import CAR, models | |
import json | |
import re | |
import emoji | |
import queue | |
import threading | |
from werkzeug.serving import run_simple | |
from threading import Lock | |
# STL | |
from typing import List, Type, TypedDict | |
# PDM | |
from typing_extensions import NotRequired | |
# LOCAL | |
from sonatoki.types import Number | |
from sonatoki.Filters import ( | |
Or, | |
And, | |
Len, | |
Not, | |
Filter, | |
PuName, | |
Numeric, | |
Syllabic, | |
NimiUCSUR, | |
Alphabetic, | |
NimiKuLili, | |
NimiKuSuli, | |
ProperName, | |
Punctuation, | |
LongSyllabic, | |
Miscellaneous, | |
LongAlphabetic, | |
LongProperName, | |
FalsePosSyllabic, | |
NimiLinkuByUsage, | |
NimiLinkuObscure, | |
NimiLinkuSandbox, | |
NimiLinkuUncommon, | |
FalsePosAlphabetic, | |
) | |
from sonatoki.Scorers import Scorer, Soften, Voting, PassFail, SoftScaling, SoftPassFail | |
from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates | |
from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe | |
from sonatoki.Preprocessors import ( | |
RECOMMENDED_PREPROCESSORS, | |
URLs, | |
Emoji, | |
Codeblock, | |
Reference, | |
Preprocessor, | |
AngleBracketObject, | |
Emails | |
) | |
# Words that are phonotactically valid Toki Pona but are far more common in
# other languages, so they are poor evidence that a text is Toki Pona.
# NOTE(review): nothing in the visible part of this file reads this set --
# confirm whether it is still needed or should be wired into the filters.
__DICT_PHONOMATCHES = {
    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
    # In this case, all of these appear more often in English by a factor of at least 10.
    "aka",  # also known as
    "an",  # article
    "api",  # API
    "i",  # 1st person
    "je",  # 1st person pronoun, french
    "kana",  # japanese script
    "me",  # 1st person singular, english
    "ne",  # "no" in several languages
    "nu",  # "new" in english, "now" in dutch
    "omen",  # ominous
    "se",  # spanish particle, english "see"
    "sole",  # singular, of shoe
    "take",  # acquire, perhaps forcefully or without permission
    "ten",  # 10
    "to",  # to, too
    "u",  # no u
    "we",  # 1st person plural, english
    "wi",  # wii and discussions of syllables
    # unexplored candidates for removal
    # "papa",  # father
    # "lo",  # "lo" and "loo"
    # "ewe",  # sheep
    # "pa",  # father- eh?
}
# The WSGI application object that serves the page and the event stream.
app = Flask(__name__)

# Language-detection pipeline: strip noise, ignore tokens that carry no
# signal, then score the remaining tokens against the filters below.
ilo = Ilo(
    preprocessors=[URLs, Emails, Emoji],
    cleaners=[ConsecutiveDuplicates],
    ignoring_filters=[Numeric, Punctuation],
    scoring_filters=[
        Len(Or(NimiLinkuByUsage(30), NimiUCSUR), max=15),
        Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
        # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
        Len(ProperName, min=2, max=24),
        Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
    ],
    scorer=SoftScaling,
    passing_score=0.8,
)
class JSONExtra(json.JSONEncoder):
    """JSON encoder that falls back to ``repr()`` for unsupported objects."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        The base implementation signals an unsupported type by raising
        ``TypeError``; only that case is translated into ``repr(obj)``.
        Anything else (for example ``KeyboardInterrupt``) propagates
        instead of being silently swallowed by a bare ``except``.
        """
        try:
            return super().default(obj)
        except TypeError:
            return repr(obj)
def clean_text(text: str) -> str:
    """Return *text* with emoji and http(s) URLs removed and the ends trimmed."""
    without_emoji = emoji.replace_emoji(text, replace='')
    without_urls = re.sub(r'https?://\S+', '', without_emoji)
    return without_urls.strip()
# Guards every read and write of ``clients``.
clients_lock = Lock()
# One queue per connected SSE client.
clients = []


def broadcast_message(msg: dict):
    """Put *msg* on the queue of every currently registered SSE client.

    The lock is held for the whole fan-out so the client list cannot
    change while it is being iterated.
    """
    with clients_lock:
        for subscriber_queue in clients:
            subscriber_queue.put(msg)
def process_firehose():
    """Consume the repo firehose and broadcast posts that score as Toki Pona.

    Blocks inside ``client.start`` for as long as the subscription runs, so
    it is meant to be executed on its own thread.
    """
    firehose = FirehoseSubscribeReposClient()

    def on_message_handler(message):
        """Inspect one firehose frame and broadcast any matching new posts."""
        commit = parse_subscribe_repos_message(message)
        # Only commit frames carry record operations.
        if not isinstance(commit, models.ComAtprotoSyncSubscribeRepos.Commit):
            return

        car = CAR.from_bytes(commit.blocks)
        for op in commit.ops:
            # Only newly created records that reference a block are of interest.
            if not (op.action == "create" and op.cid):
                continue

            raw = car.blocks.get(op.cid)
            cooked = models.get_or_create(raw, strict=False)
            if not cooked:
                continue
            if cooked.py_type != "app.bsky.feed.post":
                continue

            # Posts with fewer than three whitespace-separated words after
            # cleaning are skipped.
            cleaned_text = clean_text(raw.get("text", ""))
            if not cleaned_text:
                continue
            if len(cleaned_text.split()) < 3:
                continue

            scorecard = ilo._is_toki_pona(ilo.preprocess(cleaned_text))
            is_toki_pona = scorecard["cleaned"] and scorecard["score"] >= 0.8
            if not is_toki_pona:
                continue

            # The second path segment of the op is used as the post id in the link.
            url = f'https://bsky.app/profile/{commit.repo}/post/{op.path.split("/")[1]}'
            broadcast_message({'text': raw.get("text", ""), 'url': url})

    firehose.start(on_message_handler)
def generate_sse(keepalive_seconds: float = 15.0):
    """Yield Server-Sent Events for one connected client.

    A private queue is registered in ``clients`` when iteration starts and
    removed again when the generator is closed.

    Args:
        keepalive_seconds: Maximum time to wait for a broadcast before
            emitting an SSE comment line instead.

    Yields:
        ``data: <json>`` frames for each broadcast message, and
        ``: keep-alive`` comment frames while the stream is idle.
    """
    # Each client gets its own queue.
    q = queue.Queue()
    with clients_lock:
        clients.append(q)
    try:
        while True:
            try:
                message = q.get(timeout=keepalive_seconds)
            except queue.Empty:
                # A disconnect is only noticed when a write fails. Without a
                # periodic write, an idle stream would keep a dead client's
                # queue registered (and filling up) indefinitely. Lines that
                # start with ":" are comments that EventSource ignores.
                yield ": keep-alive\n\n"
                continue
            yield f"data: {json.dumps(message)}\n\n"
    finally:
        with clients_lock:
            clients.remove(q)
@app.route("/")
def index():
    """Serve the single-page viewer that renders the live event stream.

    The view is registered on ``/`` so the application actually answers
    requests for the page. The page script builds each message with DOM
    nodes and ``textContent`` rather than ``innerHTML``: post text comes
    from arbitrary network users and must never be parsed as HTML.
    """
    return """<!DOCTYPE html>
<html>
<head>
    <title>Toki Pona Live Stream</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .message {
            background: white;
            padding: 15px;
            margin: 10px 0;
            border-radius: 5px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        a {
            color: #0066cc;
            text-decoration: none;
        }
        h1 {
            text-align: center;
        }
    </style>
</head>
<body>
    <h1>Toki Pona Live Stream</h1>
    <div id="messages"></div>
    <script>
        const evtSource = new EventSource("/stream");
        const messages = document.getElementById('messages');
        evtSource.onmessage = function(event) {
            const data = JSON.parse(event.data);
            const messageDiv = document.createElement('div');
            messageDiv.className = 'message';

            const textP = document.createElement('p');
            textP.textContent = data.text;

            const link = document.createElement('a');
            link.href = data.url;
            link.target = '_blank';
            link.rel = 'noopener noreferrer';
            link.textContent = 'View on Bluesky';

            messageDiv.appendChild(textP);
            messageDiv.appendChild(link);
            messages.insertBefore(messageDiv, messages.firstChild);
            if (messages.children.length > 50) {
                messages.removeChild(messages.lastChild);
            }
        };
    </script>
</body>
</html>"""
@app.route("/stream")
def stream():
    """Open a Server-Sent Events response fed by ``generate_sse``.

    Registered on ``/stream``, the URL the page script passes to
    ``EventSource``. Caching is disabled so intermediaries do not hold
    back or replay events.
    """
    return Response(
        generate_sse(),
        mimetype='text/event-stream',
        headers={'Cache-Control': 'no-cache'},
    )
if __name__ == '__main__':
    # Start the firehose processing in a separate thread
    threading.Thread(target=process_firehose, daemon=True).start()
    # Use run_simple with threading enabled to allow multiple clients.
    # The reloader is off because it runs this script in a second process,
    # which would start a second firehose thread with no clients attached.
    # The debugger is off because the server listens on all interfaces and
    # the interactive debugger allows arbitrary code execution.
    run_simple('0.0.0.0', 7860, app, use_reloader=False, use_debugger=False, threaded=True)