johnpaulbin's picture
Update app.py
725f05c verified
from flask import Flask, render_template, Response
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig, CorpusConfig
from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message
from atproto import CAR, models
import json
import re
import emoji
import queue
import threading
from werkzeug.serving import run_simple
from threading import Lock
# STL
from typing import List, Type, TypedDict
# PDM
from typing_extensions import NotRequired
# LOCAL
from sonatoki.types import Number
from sonatoki.Filters import (
Or,
And,
Len,
Not,
Filter,
PuName,
Numeric,
Syllabic,
NimiUCSUR,
Alphabetic,
NimiKuLili,
NimiKuSuli,
ProperName,
Punctuation,
LongSyllabic,
Miscellaneous,
LongAlphabetic,
LongProperName,
FalsePosSyllabic,
NimiLinkuByUsage,
NimiLinkuObscure,
NimiLinkuSandbox,
NimiLinkuUncommon,
FalsePosAlphabetic,
)
from sonatoki.Scorers import Scorer, Soften, Voting, PassFail, SoftScaling, SoftPassFail
from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
from sonatoki.Preprocessors import (
RECOMMENDED_PREPROCESSORS,
URLs,
Emoji,
Codeblock,
Reference,
Preprocessor,
AngleBracketObject,
Emails
)
# Words that are far more common in English (or other languages) than in
# Toki Pona; each entry carries a short note on the meaning it is mistaken for.
# NOTE(review): nothing in the visible part of this file reads this set - it
# looks like a copy of a sonatoki-internal table; confirm whether it is needed.
__DICT_PHONOMATCHES = {
# Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
# In this case, all of these appear more often in English by a factor of at least 10.
"aka", # also known as
"an", # article
"api", # API
"i", # 1st person
"je", # 1st person pronoun, french
"kana", # japanese script
"me", # 1st person singular, english
"ne", # "no" in several languages
"nu", # "new" in english, "now" in dutch
"omen", # ominous
"se", # spanish particle, english "see"
"sole", # singular, of shoe
"take", # acquire, perhaps forcefully or without permission
"ten", # 10
"to", # to, too
"u", # no u
"we", # 1st person plural, english
"wi", # wii and discussions of syllables
# unexplored candidates for removal
# "papa", # father
# "lo", # "lo" and "loo"
# "ewe", # sheep
# "pa", # father- eh?
}
# Flask application that serves the landing page and the SSE stream.
app = Flask(__name__)

# Toki Pona detector shared by the firehose handler.
ilo = Ilo(
    # Strip URLs, e-mail addresses and emoji before tokenizing.
    preprocessors=[URLs, Emails, Emoji],
    cleaners=[ConsecutiveDuplicates],
    # Numbers and punctuation are ignored rather than scored.
    ignoring_filters=[Numeric, Punctuation],
    scoring_filters=[
        Len(Or(NimiLinkuByUsage(30), NimiUCSUR), max=15),
        Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
        # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
        Len(ProperName, min=2, max=24),
        Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
    ],
    scorer=SoftScaling,
    passing_score=0.8,
)
class JSONExtra(json.JSONEncoder):
    """JSON encoder that falls back to ``repr()`` for unserializable objects."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        The base encoder is tried first; when it rejects the object with
        ``TypeError`` the object's ``repr()`` string is emitted instead.

        Args:
            obj: A value the encoder could not serialize natively.

        Returns:
            Whatever the base implementation returns, or ``repr(obj)``.
        """
        try:
            return super().default(obj)
        except TypeError:
            # Only the "not serializable" error is handled; a bare except
            # would also swallow KeyboardInterrupt and SystemExit.
            return repr(obj)
def clean_text(text: str) -> str:
    """Return *text* with emoji and http(s) URLs removed and ends trimmed.

    Other characters (punctuation, digits, non-Latin letters) are left in
    place.
    """
    without_emoji = emoji.replace_emoji(text, replace='')
    without_urls = re.sub(r'https?://\S+', '', without_emoji)
    return without_urls.strip()
# One queue per connected SSE client; guarded by ``clients_lock`` because the
# list is touched from the firehose thread and from request threads.
clients = []
clients_lock = Lock()


def broadcast_message(msg: dict) -> None:
    """Put *msg* on the queue of every currently registered SSE client.

    The same object is enqueued for each client; it is not copied.
    """
    with clients_lock:
        for subscriber_queue in clients:
            subscriber_queue.put(msg)
def process_firehose():
    """Consume the repo firehose and broadcast posts that score as Toki Pona.

    Creates a ``FirehoseSubscribeReposClient`` and hands it a handler;
    ``client.start`` is the last statement, so this function returns only
    when that call does.
    """
    firehose = FirehoseSubscribeReposClient()

    def on_message_handler(message):
        """Inspect one firehose frame and broadcast each qualifying new post."""
        commit = parse_subscribe_repos_message(message)
        if not isinstance(commit, models.ComAtprotoSyncSubscribeRepos.Commit):
            return

        car = CAR.from_bytes(commit.blocks)
        for op in commit.ops:
            # Only newly created records that carry a CID are of interest.
            if op.action != "create" or not op.cid:
                continue

            raw = car.blocks.get(op.cid)
            cooked = models.get_or_create(raw, strict=False)
            if not cooked or cooked.py_type != "app.bsky.feed.post":
                continue

            post_text = raw.get("text", "")
            cleaned_text = clean_text(post_text)
            # An empty string splits to zero words, so this also rejects it.
            if len(cleaned_text.split()) < 3:
                continue

            scorecard = ilo._is_toki_pona(ilo.preprocess(cleaned_text))
            if not (scorecard["cleaned"] and scorecard["score"] >= 0.8):
                continue

            # The second segment of the op path identifies the post in the URL.
            record_key = op.path.split("/")[1]
            url = f'https://bsky.app/profile/{commit.repo}/post/{record_key}'
            broadcast_message({'text': post_text, 'url': url})

    firehose.start(on_message_handler)
def generate_sse(keepalive_seconds: float = 15.0):
    """Yield Server-Sent Events frames for one connected client.

    A private queue is registered in ``clients`` when iteration starts and
    removed again when the generator is closed or fails.

    Args:
        keepalive_seconds: Longest time to wait for a broadcast before an SSE
            comment frame is sent instead. Without this the generator would
            block indefinitely while the stream is quiet, so nothing is ever
            written to a client that has gone away and its queue stays
            registered until the next broadcast.

    Yields:
        ``"data: <json>\\n\\n"`` for every broadcast message, and
        ``": keep-alive\\n\\n"`` whenever the wait times out.
    """
    subscriber = queue.Queue()
    with clients_lock:
        clients.append(subscriber)
    try:
        while True:
            try:
                message = subscriber.get(timeout=keepalive_seconds)
            except queue.Empty:
                # Lines starting with ':' are comments that EventSource
                # ignores, so the page never sees these frames.
                yield ": keep-alive\n\n"
                continue
            yield f"data: {json.dumps(message)}\n\n"
    finally:
        with clients_lock:
            clients.remove(subscriber)
@app.route('/')
def index():
    """Serve the single-page viewer that renders the live ``/stream`` feed.

    The embedded script opens an ``EventSource`` on ``/stream``, prepends one
    card per message and keeps at most 50 cards. Post text originates from
    arbitrary Bluesky users, so the card is built with ``textContent`` and
    property assignment rather than ``innerHTML``; markup inside a post is
    shown as text instead of being parsed as HTML.

    Returns:
        The page's HTML as a string.
    """
    return """<!DOCTYPE html>
<html>
<head>
    <title>Toki Pona Live Stream</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .message {
            background: white;
            padding: 15px;
            margin: 10px 0;
            border-radius: 5px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        a {
            color: #0066cc;
            text-decoration: none;
        }
        h1 {
            text-align: center;
        }
    </style>
</head>
<body>
    <h1>Toki Pona Live Stream</h1>
    <div id="messages"></div>
    <script>
        const evtSource = new EventSource("/stream");
        const messages = document.getElementById('messages');
        evtSource.onmessage = function(event) {
            const data = JSON.parse(event.data);
            const messageDiv = document.createElement('div');
            messageDiv.className = 'message';

            const text = document.createElement('p');
            text.textContent = data.text;
            messageDiv.appendChild(text);

            const link = document.createElement('a');
            link.href = data.url;
            link.target = '_blank';
            link.rel = 'noopener noreferrer';
            link.textContent = 'View on Bluesky';
            messageDiv.appendChild(link);

            messages.insertBefore(messageDiv, messages.firstChild);
            if (messages.children.length > 50) {
                messages.removeChild(messages.lastChild);
            }
        };
    </script>
</body>
</html>"""
@app.route('/stream')
def stream():
    """Open a Server-Sent Events response fed by a fresh ``generate_sse()``."""
    event_feed = generate_sse()
    return Response(event_feed, mimetype='text/event-stream')
if __name__ == '__main__':
    import os

    # The Werkzeug debugger lets anyone who can reach the server run code in
    # this process, and the server binds to all interfaces, so the debugger
    # and reloader are opt-in (APP_DEBUG=1) instead of always on.
    debug_mode = os.environ.get("APP_DEBUG", "").lower() in {"1", "true", "yes"}

    # With the reloader enabled this script runs twice: once in the watcher
    # parent and once in the child that actually serves requests (marked by
    # WERKZEUG_RUN_MAIN=true). Only the serving process should consume the
    # firehose, otherwise the parent holds a second, useless subscription.
    is_serving_process = (
        not debug_mode or os.environ.get("WERKZEUG_RUN_MAIN") == "true"
    )
    if is_serving_process:
        # Start the firehose processing in a separate thread
        threading.Thread(target=process_firehose, daemon=True).start()

    # Use run_simple with threading enabled to allow multiple clients
    run_simple(
        '0.0.0.0',
        7860,
        app,
        use_reloader=debug_mode,
        use_debugger=debug_mode,
        threaded=True,
    )