#!/bin/python3
'''
Runs standard input through a part-of-speech tagger, then translates to
Shavian.  This resolves most heteronyms, but do still check the output
for @ signs and fix them by hand.

Each line of a dictionary consists of an English word, a space, a Shavian
translation, and no comments.  Special notations are:

    ^word 𐑢𐑻𐑛      word is a prefix
    $word 𐑢𐑻𐑛      word is a suffix
    Word 𐑢𐑻𐑛       always use a namer dot
    word_ 𐑢𐑻𐑛      never use a namer dot
    word_VB 𐑢𐑻𐑛    shave this way when tagged as a verb
    word. 𐑢𐑻𐑛      shave this way when no suffix is present
    word .𐑢𐑻𐑛      word takes no prefixes
    word 𐑢𐑻𐑛.      word takes no suffixes
    word 𐑢𐑻𐑛:      suffixes do not alter the root, e.g. "𐑑𐑾" or "𐑕𐑾"
                   palatalizing to "𐑖𐑩" or "𐑠𐑩"
    word .         delete this word from the dictionary

Words are matched case-sensitively when possible, e.g. US/us, WHO/who,
Job/job, Nice/nice, Polish/polish.

shaw.py does not care about the order of dictionary entries.
shaw.c requires a highly specific order not described here.
'''
import re
import os
import sys
import html
from html.parser import HTMLParser

apostrophe = "'"    # whatever you want for apostrophe, e.g. "’" or ""
merge_ia = True     # True: 𐑣𐑨𐑐𐑽, 𐑣𐑨𐑐𐑾𐑕𐑑  False: 𐑣𐑨𐑐𐑦𐑼, 𐑣𐑨𐑐𐑦𐑩𐑕𐑑
runic_vee = "ᚡ"     # could use ᚠ, ᚹ, ᚢ, or ᚠ\u200dᚠ
dot_entire_name = True
if os.path.exists('config.py'):
    from config import *

script = postag = alphabet = False
dict = { }      # English word -> Shavian translation
htags = { }     # token index -> text passed through untranslated
tokens = [ "." ]
units = { "ms":"𐑥𐑕","bc":"𐑚𐑰𐑕𐑰","psi":"𐑐𐑕𐑦","pc":"𐑐𐑕","mi":"𐑥𐑲" }
contr = [ "'d","'ll","'m","n't","'re","'s","'ve" ]
abbrev = [ "abbr","acad","al","alt","apr","assn","at","aug","ave","b","c","ca","cf",
    "capt","cent","chm","chmn","co","col","comdr","corp","cpl","cpt","d","dec","dept","dist",
    "div","dr","ed","esq","est","etc","feb","fl","gen","gov","hon","inc",
    "inst","jan","jr","lat","lib","lt","ltd","mar","mr","mrs","ms","msgr",
    "mt","mts","mus","nov","oct","pg","phd","pl","pop","pp","prof","pseud","pt",
    "rev","sept","ser","sgt","sr","st","uninc","univ","vol","vs","wt" ]

# Remove diacritics from Latin letters, break up ligatures, and do nothing else.
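# A quick illustrative check of the intended behavior (these strings are my own
# examples, not test data from the project):
#   unaccent("café ﬁord Æsop")  ->  "cafe fiord AEsop"
# Diacritics are stripped via the tables below, ligatures expand to ASCII, and
# everything else passes through untouched.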
def unaccent (str):
    # One output character per code point: map covers U+00C0..U+024F and ext
    # covers U+1E00..U+1EFF.  A space means "leave the character alone", which
    # lets the lig table below expand the ligatures these tables skip.
    map = "AAAAAA CEEEEIIIIDNOOOOO OUUUUY  aaaaaa ceeeeiiiidnooooo ouuuuy yAaAaAaCcCcCcCcDdDdEeEeEeEeEeGgGgGgGgHhHhIiIiIiIiIi  JjKkkLlLlLlLlLlNnNnNn   OoOoOo  RrRrRrSsSsSsSsTtTtTtUuUuUuUuUuUuWwYyYZzZzZz bBBb   CcDDDdd  EFfG  IIKkl  NnOOo  Pp     tTtTUuYVYyZz    255      Ǳǲǳ      AaIiOoUuUuUuUuUu AaAaÆæGgGgKkOoOo  j   Gg  NnAaÆæOoAaAaEeEeIiIiOoOoRrRrUuUuSsTt  HhNd  ZzAaEeOoOoOoOoYylntj  ACcLTsz  BU EeJjqqRrYy"
    ext = "AaBbBbBbCcDdDdDdDdDdEeEeEeEeEeFfGgHhHhHhHhHhIiIiKkKkKkLlLlLlLlMmMmMmNnNnNnNnOoOoOoOoPpPpRrRrRrRrSsSsSsSsSsTtTtTtTtUuUuUuUuUuVvVvWwWwWwWwWwXxXxYyZzZzZzhtwyasssSẟAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYyLlVvYy"
    lig = { 'Æ':'AE','æ':'ae','Ǳ':'DZ','ǲ':'Dz','ǳ':'dz','Ĳ':'Ij','ĳ':'ij','Ǉ':'LJ','ǈ':'Lj','ǉ':'lj','Ǌ':'NJ','ǋ':'Nj','ǌ':'nj','Œ':'OE','œ':'oe','Ƣ':'OI','ƣ':'oi','ß':'ss','ﬀ':'ff','ﬁ':'fi','ﬂ':'fl','ﬃ':'ffi','ﬄ':'ffl','ﬅ':'st','ﬆ':'st' }
    ret = ""
    for char in str:
        n = ord(char)
        if n >= 0xc0 and n < 0x250 and map[n-0xc0] != ' ':
            char = map[n-0xc0]
        if n >= 0x1e00 and n < 0x1f00:
            char = ext[n-0x1e00]
        if n >= 0x300 and n < 0x370:
            char = ""               # bare combining marks are dropped
        if char in lig:
            char = lig[char]
        ret += char
    return ret

def notrans (str):
    global htags
    if toki in htags:
        htags[toki] += str
    else:
        htags[toki] = str

def tokenize (skr):
    global tokens, toki
    if not isinstance(skr,str):
        skr = ""
    skr = " "+unaccent(html.unescape(skr))+" "
    old = 0
    for i in range(1,len(skr)-1):
        new = 0
        if skr[i].isalpha():
            new = 1
        if skr[i].isdigit():
            new = 2
        if skr[i] == " " and tokens[-1][0].isalpha() and skr[i+1].isalpha():
            new = 0
        if skr[i] == "'" and skr[i-1].isalpha() and skr[i+1].isalpha():
            new = 1
        if skr[i] in ",." and skr[i-1].isdigit() and skr[i+1].isdigit():
            new = 2
        if skr[i] == "." and new == 0 and tokens[-1].lower() in abbrev:
            continue
        if old and old == new:
            tokens[-1] += skr[i]
        else:
            for c in contr:     # break up contractions so PoS tagging works
                s = len(tokens[-1]) - len(c)
                if s < 1:
                    continue
                low = tokens[-1][s:].lower()
                if c == low:
                    tokens[-1] = tokens[-1][:s]
                    tokens.append(low)
            tokens.append(skr[i])
            toki += 1
        old = new
        if tokens[-1].isspace() or not tokens[-1].isprintable() or ord(tokens[-1][0]) | 15 == 0xfe0f:
            toki -= 1              # Whitespace tokens break NLTK and variation
            notrans (tokens.pop()) # selectors break Flair.  Move these to htags.

class MyHTMLParser(HTMLParser):
    def handle_starttag (self, tag, attrs):
        global script
        out = '<'+tag
        for at in attrs:
            if at[0] == 'charset':
                at = ('charset', 'UTF-8')
            if at[0] == 'content':
                at = ('content', 'text/html; charset=UTF-8')
            out += ' '+at[0]
            if at[0] in ('alt','title'):
                notrans (out+'="')
                tokenize (at[1])
                out = '"'
            elif type(at[1]) == str:
                out += '="'+at[1]+'"'
        out += '>'
        if tag == 'noscript' or tag == 'script' or tag == 'style':
            script = True
        notrans (out)
    def handle_endtag (self, tag):
        global script
        notrans ('</'+tag+'>')
        if tag == 'noscript' or tag == 'script' or tag == 'style':
            script = False
    def handle_data (self, data):
        if script:
            notrans (data)
        else:
            tokenize (data)

# Search all the ways a word might appear in the dictionary
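# Illustrative trace (the word is my example, not from the source): looking up
# ("Polish", "VBD") with aflag == 0 tries polish_VBD, polish_VB, Polish.,
# Polish, Polish_, polish., polish, polish_, polish_NN, polish_NNS in order
# (upp duplicates word for already-capitalized input) and stops at the first
# usable entry, so tagged and case-sensitive forms outrank the bare word.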
def lookup (word, pos):
    ret = ""
    low = word.lower()
    upp = word[0].upper()+low[1:]
    pos = "_" + pos
    if aflag & 2:
        list = [ low+pos, low+pos[:3], word, word+"_", low, low+"_", low+"_NN", low+"_NNS", upp ]
    else:
        list = [ low+pos, low+pos[:3], word+".", upp+".", word, word+"_", low+".", low, low+"_", low+"_NN", low+"_NNS", upp ]
    for look in list:
        if look in dict:
            ret = dict[look]
            if aflag & 1 and ret[0] == '.' \
            or aflag & 2 and ret[-1] == '.':
                ret = ""
            ret = ret.replace(".","")
            if not ret:
                continue
            if (word[0].isupper() or look[0].isupper()) \
            and (look[-1] != "_" or aflag) and not re.search("[A-Z]",ret):
                ret = "·" + ret
            break
    return ret

def suffix_split (inp, pos, adj):
    global aflag
    long = len(inp)
    root = lookup (inp, pos)
    if (root):
        return ((long+adj)**2, root)
    low = inp.lower()
    best = (0, "")
    for split in range(2,long):
        suff = "$"+low[split:]
        if not suff in dict:
            continue
        if low[split] == low[split-1]:
            if long-split == 1 or low[split] in "eos":
                continue
            if low[split-1] == low[split-2]:
                if low[split:] == "ess" and low[split-1] in "ln":
                    continue
        else:
            if low[split:] == "ry" and low[split-1] in "aeiouf": continue
            if low[split:] == "ha" and low[split-1] in "cpst": continue
            if low[split:] == "th" and low[split-1] in "e": continue
            if low[split:] == "d" and low[split-1] in "adeiou": continue
            if low[split:] == "w" and low[split-1] in "aeo": continue
            if low[split:] == "t" and low[split-1] in "aeioust": continue
            if low[split:] == "k" and low[split-1] in "aceino": continue
            if low[split:] == "r" and low[split-1] in "aeiou": continue
            if low[split:] == "m" and low[split-1] in "eis": continue
            if low[split:] == "z" and low[split-1] in "i": continue
            if low[split:] == "n" and low[split-1] in "eio": continue
            if low[split:] == "es" and low[split-1] not in "hiosuxz": continue
        suff = dict[suff]
        if aflag & 2 and suff[-1] == '.':
            continue
        for pess in range(2):
            if pess:
                word = inp[:split]
            elif low[split-1] == 'i' and low[split] not in "cfikmpsv":
                word = inp[:split-1] + 'y'
            elif low[split] in "aeiouy'" and low[split-1] not in "aeio" \
            and low[split:split+2] not in [ "ub","up" ]:
                if low[split-1] == low[split-2] and low[split-1] not in "hsw":
                    word = inp[:split-1]
                elif ( low[split-1] in "cdghlsuvz" or low[split] == 'e' or low[split-2] in "aeiousy" ) \
                and (low[split-1] not in "cg" or low[split] not in "aou"):
                    word = inp[:split] + "e"
                else:
                    continue
            elif low[split-2:split] == "dg":
                word = inp[:split] + "e"
            else:
                continue
            sflag = aflag
            aflag &= ~2
            if inp[split] != "'" or word != inp[:split]:
                aflag |= 2
            root = suffix_split (word, "UNK", split-len(word))
            score = (long-split+adj)**2 + root[0] if root[0] else 0
            aflag = sflag
            if score:
                if low[split] == "'" and pess == 0:
                    score /= 2
                if low[split:] in [ "call" ]:
                    score = 1
                if low[split:] in [ "bed","can","cat","cent","dance","ine","kin","one","pal","path","ster","tie","tied","ties","tying","wing","x" ]:
                    score = max (1, score - 9)
            if (score <= best[0]):
                continue
            root = root[1]
            if low[split-1] == 'e' and low[split-2] not in "aegiou" and low[split] in "aou" \
            and (split+1 == long or low[split+1] in "dlmnprstu") \
            and low[split:] not in [ "arm","out","und","up" ]:
                if root[-1] in "𐑦𐑰":
                    root = root[:-1]
                root += "𐑦"
            if root[-1] == "𐑓" and suff[0] > "𐑗" and word[-1] in "VWvw" and low[split:] != "s":
                root = root[:-1] + "𐑝"
            if root[-2:] == "𐑩𐑤" and suff == "𐑦" and word[-2:].lower() == "le":
                root = root[:-2] + "𐑤"
            if root[-3:] in ["𐑞𐑩𐑥","𐑟𐑩𐑥"] and suff not in ["'","𐑛:","𐑟:","𐑦𐑙"]:
                root = root[:-2] + "𐑥"
            if root[-2:] in ["𐑩𐑤","𐑭𐑤","𐑾𐑤"] and suff == "𐑦𐑑𐑦":
                mid = "𐑦" if root[-3] in "𐑖𐑗𐑠𐑡" or root[-2] == "𐑾" else ""
                root = root[:-2] + mid + "𐑨𐑤"
            mid = root[-1] + suff[0]
            if merge_ia:
                if mid == "𐑦𐑩":
                    mid = "𐑾"
                if mid == "𐑦𐑼":
                    mid = "𐑽"
            if mid in ["𐑤𐑤","𐑯𐑯"] and len(suff) < 3:
                mid = mid[0]
            best = (score, root[:-1] + mid + suff[1:])
    if long > 1 and low[-1] == low[-2] and low[-2] not in "aeiosu":
        aflag |= 2
        root = suffix_split (inp[:-1], "UNK", 0)
        if best[0] < root[0]:
            best = root
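    # The block below resolves the ':' placeholders left by assimilating
    # suffixes such as the regular past and plural.  Reading the tables: a
    # trailing 𐑛: gains an epenthetic 𐑩 after 𐑑/𐑛, as does 𐑟: after a sibilant
    # (𐑕𐑖𐑗𐑟𐑠𐑡), while after one of the eight voiceless consonants 𐑐..𐑗 the
    # ending devoices, 𐑛 -> 𐑑 and 𐑟 -> 𐑕, each exactly ten code points lower.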
    if len(best[1]) > 1:
        word = best[1][:-2]
        end = best[1][-2:]
        if end in [ "𐑛:","𐑟:" ]:
            tail = -1
            while word[tail] in ["'",":"]:
                tail -= 1
            if word[tail] in {"𐑛:":"𐑑𐑛","𐑟:":"𐑕𐑖𐑗𐑟𐑠𐑡"}[end]:
                word += "𐑩"
            elif word[tail] >= "𐑐" and word[tail] < "𐑘":
                end = chr(ord(end[0])-10)+":"
        word += end
        if word[-4:] == "𐑒𐑩𐑤𐑦" and word[-5] in "𐑦𐑩":
            word = word[:-4] + "𐑒𐑤𐑦"
        if word[-4:-2] == "𐑑𐑵" and word[-2] in "𐑩𐑱𐑺𐑼":
            word = word[:-4] + "𐑗" + word[-3:]
        pal = word[::-1].replace("𐑩𐑦","𐑾",1)[::-1].split("𐑾")
        if len(pal) > 1 and len(pal[-2]) > 1 and \
           pal[-2][-1] in "𐑑𐑕𐑟" and pal[-1] in ["𐑕","𐑤","𐑯",""]:
            mid = "𐑖"
            if pal[-2][-1] == "𐑑":
                if pal[-2][-2] == "𐑕":
                    mid = "𐑗"
                elif pal[-1] in ["𐑯",""] and pal[-2][-2] in "𐑰𐑱𐑴𐑵𐑷𐑻𐑿":
                    mid = "𐑠"
            pal[-2] = pal[-2][:-1] + mid + "𐑩" + pal[-1]
            word = "𐑾".join(pal[:-1])
        best = (best[0], word)
    return best

def prefix_split (word, pos, ms):
    global aflag
    best = suffix_split (word, pos, 0)
    if best[0] == len(word)**2:
        return best
    for split in range(len(word)-2,ms,-1):
        pref = "^"+word[:split].lower()
        if not pref in dict:
            continue
        if word[:split+1].lower() == "un":
            continue
        if pref == "^z" and word[split].lower() in "aeiouy":
            continue
        aflag = word[split-1] != "'"
        root = prefix_split (word[split:], pos, 1)
        score = split**2 + root[0] if root[0] else 0
        if pref == "^la":
            score -= 4
        if score > best[0]:
            dot = "·" if word[0].isupper() else ""
            pref = dict[pref]
            init = root[1][0]
            if (init == "·"):
                init = root[1][1]
            if pref[-1] == init and pref[-1] in "𐑤𐑥𐑮𐑯" and pref[-2] in '𐑦𐑧' \
            or pref == "𐑥𐑩𐑒" and init == "𐑒":
                pref = pref[:-1]
            best = (score, pref + dot + root[1])
    return best

if len(sys.argv) < 2:
    print ("Usage:",sys.argv[0],"file1.dict file2.dict ...")
    exit()

first = True
for fname in sys.argv[1:]:
    if postag == -1:
        postag = int(fname)
        continue
    if alphabet == -1:
        alphabet = int(fname)
        continue
    if fname == "-p":       # Choose part-of-speech tagger, default none
        postag = -1
        continue
    if fname == "-a":       # Choose output alphabet, default Shavian
        alphabet = -1
        continue
    with open (fname, 'r', encoding="utf-8") as df:
        for line in df:
            word = line.split()
            if alphabet == 8 and word[0][-1] in "vw" and word[1][-1] == "𐑓":
                word[1] = word[1][:-1] + "𐑝"
            if not postag:
                word[0] = re.sub('_[A-Z]+','',word[0])
            if word[0]+'_' in dict:
                word[0] += '_'
            if first and word[0] in dict:
                if word[1] not in dict[word[0]].split('@'):
                    dict[word[0]] += "@"+word[1]
            else:
                dict[word[0]] = word[1]
            if word[1] == ".":
                del dict[word[0]]
            elif not first:     # Allow extra dicts to force dotting
                low = word[0].lower()
                if low != word[0] and low in dict:
                    del dict[low]
    first = False

if alphabet:
    merge_ia = False
if merge_ia:
    for word in dict:
        dict[word] = dict[word].replace("𐑦𐑩","𐑾").replace("𐑦𐑼","𐑽")
if alphabet == 3:
    for word in dict:
        if word[:2] == "$u" and dict[word][0] == "𐑿":
            dict[word] = "ᛡ𐑵" + dict[word][1:]
        dict[word] = re.sub('([𐑐𐑑𐑒𐑓𐑔𐑕𐑖𐑗𐑚𐑛𐑜𐑝𐑞𐑟𐑠𐑡𐑣𐑤𐑥𐑮𐑯])𐑘','\\1ᛡ',dict[word].replace("𐑿","𐑘𐑵"))

text = sys.stdin.read()
# Any morpheme containing periods, hyphens, or slashes needs special treatment here
for tup in [("([ʻˈ‘’ʼ´`ʿ]|&(#8217|rsquo);)","'"),("[\u00ad\u200b]",""),
    (r'\be\.g\.','igz'),(r'\bi\.e\.','ie'),(r'\bph\.d\.','phd'),
    (r'\[([a-z])\]',r'\1'),(r'\bde-','dee-'),(r'\bt-','tee-'),("'tee-","'t-"),
    (r'\bw/o\b','without'),(r'\bw/','with '),('vis-[aà]-vis','vis-ah-vee'),
    ('sine qua','sinna qua')]:
    text = re.sub(tup[0],tup[1],text,0,re.I)

toki = 1
parser = MyHTMLParser()
parser.feed (text)

tags = []
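# -p selects the part-of-speech tagger (1 = NLTK, 2 = spaCy, 3 = Flair; with no
# -p every token stays tagged UNK), and -a selects one of the alternate output
# mappings applied below.  A typical invocation might look like this (the
# dictionary and file names are purely illustrative):
#   python3 shaw.py -p 2 main.dict extra.dict < page.html > shaved.html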
if postag == 1:         # Do part-of-speech tagging
    import nltk
    tags = nltk.pos_tag (tokens)
elif postag == 2:
    import spacy
    spaCy = spacy.load("en_core_web_sm")
    tags = [(w.text, w.tag_) for w in spaCy(spacy.tokens.Doc(spaCy.vocab, tokens))]
elif postag == 3:
    save = sys.stdout
    sys.stdout = sys.stderr
    from flair.data import Sentence
    from flair.models import SequenceTagger
    flair = SequenceTagger.load("flair/pos-english-fast")
    sys.stdout = save
    sen = []
    for tok in tokens + ['.']:
        sen.append(tok)
        if tok in ['.','?','!']:
            sen = Sentence(sen)
            flair.predict(sen)
            for tok in sen:
                tags.append((tok.text, tok.get_label('pos').value))
            sen = []
    del tags[-1]
else:
    tags = [(tok,"UNK") for tok in tokens]

jtags = []              # Re-join broken contractions
for token in tags:
    if token[0].lower() in contr:
        jtags[-1] = (jtags[-1][0]+token[0],jtags[-1][1]+"+"+token[1])
    else:
        jtags.append(token)

out = []                # Translate to Shavian
prev = (".",'.')
toki = 1
initial = maydot = True
# Spelled-out forms of the single-letter abbreviation words (to, for, of,
# and, the), remapped per output alphabet.
map = { "𐑑":"𐑑𐑩","𐑓":"𐑓𐑹","𐑝":"𐑩𐑝","𐑯":"𐑩𐑯𐑛","𐑞":"𐑞𐑩" }
if alphabet == 1:
    map = { "𐑑":"𐑑𐑵","𐑓":"𐑓𐑹","𐑝":"𐑪𐑝","𐑯":"𐑨𐑯𐑛","𐑩𐑯":"𐑨𐑯","𐑩":"𐑭" }
if alphabet == 3:
    apostrophe = "ᛌ"
    map.update({"𐑘𐑧𐑩":"𐑘𐑧"})
if alphabet == 6:
    map = { "𐑑":"𐑑𐑵","𐑓":"𐑓𐑹","𐑝":"𐑪𐑝","𐑯":"𐑨𐑯𐑛","𐑞":"𐑞𐑦","𐑩𐑯":"𐑨𐑯","𐑩":"𐑧" }
if alphabet == 7:
    map = { "𐑦𐑑":"𐑦","𐑦𐑟":"𐑟","𐑚𐑰":"𐑚" }
if alphabet == 11:
    apostrophe = ""

for token in jtags[1:]:
    if toki in htags:
        out.append(htags[toki])
        if htags[toki].lower().find("