from flask import Flask, render_template, Response from sonatoki.ilo import Ilo from sonatoki.Configs import PrefConfig, CorpusConfig from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message from atproto import CAR, models import json import re import emoji import queue import threading from werkzeug.serving import run_simple from threading import Lock # STL from typing import List, Type, TypedDict # PDM from typing_extensions import NotRequired # LOCAL from sonatoki.types import Number from sonatoki.Filters import ( Or, And, Len, Not, Filter, PuName, Numeric, Syllabic, NimiUCSUR, Alphabetic, NimiKuLili, NimiKuSuli, ProperName, Punctuation, LongSyllabic, Miscellaneous, LongAlphabetic, LongProperName, FalsePosSyllabic, NimiLinkuByUsage, NimiLinkuObscure, NimiLinkuSandbox, NimiLinkuUncommon, FalsePosAlphabetic, ) from sonatoki.Scorers import Scorer, Soften, Voting, PassFail, SoftScaling, SoftPassFail from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe from sonatoki.Preprocessors import ( RECOMMENDED_PREPROCESSORS, URLs, Emoji, Codeblock, Reference, Preprocessor, AngleBracketObject, Emails ) __DICT_PHONOMATCHES = { # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3. # In this case, all of these appear more often in English by a factor of at least 10. "aka", # also known as "an", # article "api", # API "i", # 1st person "je", # 1st person pronoun, french "kana", # japanese script "me", # 1st person singular, english "ne", # "no" in several languages "nu", # "new" in english, "now" in dutch "omen", # ominous "se", # spanish particle, english "see" "sole", # singular, of shoe "take", # acquire, perhaps forcefully or without permission "ten", # 10 "to", # to, too "u", # no u "we", # 1st person plural, english "wi", # wii and discussions of syllables # unexplored candidates for removal # "papa", # father # "lo", # "lo" and "loo" # "ewe", # sheep # "pa", # father- eh? } app = Flask(__name__) ilo = Ilo(**{ "preprocessors": [ URLs, Emails, Emoji, ], "cleaners": [ConsecutiveDuplicates], "ignoring_filters": [Numeric, Punctuation], "scoring_filters": [ Len(Or(NimiLinkuByUsage(30), NimiUCSUR), max=15), Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24), # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong Len(ProperName, min=2, max=24), Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24), ], "scorer": SoftScaling, "passing_score": 0.8, }) class JSONExtra(json.JSONEncoder): def default(self, obj): try: return json.JSONEncoder.default(self, obj) except: return repr(obj) def clean_text(text: str) -> str: text = emoji.replace_emoji(text, replace='') text = re.sub(r'https?://\S+', '', text) #text = re.sub(r'[^A-Za-z\s]', '', text) text = text.strip() return text clients = [] clients_lock = Lock() def broadcast_message(msg: dict): # Send the given message to all connected SSE clients with clients_lock: for q in clients: q.put(msg) def process_firehose(): client = FirehoseSubscribeReposClient() def on_message_handler(message): commit = parse_subscribe_repos_message(message) if not isinstance(commit, models.ComAtprotoSyncSubscribeRepos.Commit): return car = CAR.from_bytes(commit.blocks) for op in commit.ops: if op.action == "create" and op.cid: raw = car.blocks.get(op.cid) cooked = models.get_or_create(raw, strict=False) if not cooked: continue if cooked.py_type == "app.bsky.feed.post": #if ilo.is_toki_pona(raw.get("text", "")): # print(raw.get("text", "")) cleaned_text = clean_text(raw.get("text", "")) if not cleaned_text or len(cleaned_text.split()) < 3: continue msg = ilo.preprocess(cleaned_text) scorecard = ilo._is_toki_pona(msg) result = scorecard["cleaned"] and scorecard["score"] >= 0.8 if not result: continue url = f'https://bsky.app/profile/{commit.repo}/post/{op.path.split("/")[1]}' broadcast_message({'text': raw.get("text", ""), 'url': url}) client.start(on_message_handler) def generate_sse(): # Each client gets its own queue. q = queue.Queue() with clients_lock: clients.append(q) try: while True: message = q.get() # Blocking until a new message is broadcast yield f"data: {json.dumps(message)}\n\n" finally: with clients_lock: clients.remove(q) @app.route('/') def index(): return """