diff --git a/README.md b/README.md
index b1f48483237d91f9f95e3ac69c133b51e165ec91..7328b6ff2d9da31cd02262c248df2b54b9d18fee 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,10 @@
 ---
 title: SyntaSpeech
-emoji: 📊
-colorFrom: red
-colorTo: blue
+emoji: 🤗
+colorFrom: yellow
+colorTo: orange
 sdk: gradio
-sdk_version: 2.9.4
-app_file: app.py
+app_file: "inference/tts/gradio/infer.py"
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
diff --git a/data/binary/ljspeech/phone_set.json b/data/binary/ljspeech/phone_set.json
new file mode 100644
index 0000000000000000000000000000000000000000..62bc3f6943c3d69a8060b9bd6a9e8d9ce93d5bdd
--- /dev/null
+++ b/data/binary/ljspeech/phone_set.json
@@ -0,0 +1 @@
+["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"]
\ No newline at end of file
diff --git a/data/binary/ljspeech/spk_map.json b/data/binary/ljspeech/spk_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..161106099d8879092aaad3305fec7d371597e168
--- /dev/null
+++ b/data/binary/ljspeech/spk_map.json
@@ -0,0 +1 @@
+{"<SINGLE_SPK>": 0}
\ No newline at end of file
diff --git a/data/binary/ljspeech/word_set.json b/data/binary/ljspeech/word_set.json
new file mode 100644
index 0000000000000000000000000000000000000000..2351fad49bbe42f14c19490553b02d7efe780769
--- /dev/null
+++ b/data/binary/ljspeech/word_set.json
@@ -0,0 +1 @@
+["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "a", "abandon", "abandoned", "abandonment", "abash", "abbey", "abduction", "abetting", "abeyance", "abiding", "abilities", "ability", "abject", "able", "ablution", "ablutions", "ably", "abode", "abolished", "abolition", "abominable", "abound", "about", "above", "abraham", "abrahams", "abroad", "abruptly", "abscond", "absconded", "absence", "absent", "absolute", "absolutely", "absorb", "absorbed", "absorbing", "absorbs", "absorption", "abstained", "abstracted", "abstraction", "abstractions", "absurdity", "abundance", "abundant", "abundantly", "abuse", "abused", "abuses", "academy", "accent", "accentuated", "accept", "acceptance", "acceptances", "accepted", "accepting", "access", "accession", "accessory", "accident", "accidentally", "accidents", "accommodate", "accommodated", "accommodating", "accommodation", "accommodations", "accompanied", "accompany", "accompanying", "accomplice", "accomplices", "accomplish", "accomplished", "accomplishing", "accomplishment", "accord", "accordance", "accorded", "according", "accordingly", "accosted", "account", "accountant", "accounted", "accounts", "accumulate", "accumulated", "accumulation", "accuracy", "accurate", "accurately", "accusation", "accused", "accustomed", "achieve", "achieved", "achievements", "acid", "acknowledged", "acquaintance", "acquaintances", "acquaintanceship", "acquainted", "acquire", "acquired", "acquiring", "acquittal", "acquitted", "acres", "across", "act", "acted", "acting", "action", "actions", "active", "actively", "activities", "activity", "actor", "actors", "acts", "actual", "actually", "acutely", "ad", "adam", "adams", "adaptation", "adaptations", "adapted", "adaptive", "add", "added", 
"addicted", "adding", "addison", "addition", "additional", "additions", "address", "addressed", "addresses", "addressing", "adduced", "adept", "adepts", "adequacy", "adequate", "adequately", "adhered", "adherence", "adhesive", "adjacent", "adjoining", "adjudged", "adjusted", "adjustment", "adlai", "administer", "administered", "administering", "administration", "administrative", "admirable", "admirably", "admiral", "admiralty", "admired", "admission", "admissions", "admit", "admits", "admitted", "admitting", "admonition", "adolph", "adolphus", "adopt", "adopted", "adoption", "adult", "adults", "advance", "advanced", "advancement", "advances", "advantage", "advantages", "adventure", "adventures", "adventurous", "adverse", "adversity", "advert", "advertised", "advertisement", "advertising", "advice", "advise", "advised", "advising", "advocated", "aerial", "affair", "affairs", "affected", "affecting", "affection", "affections", "affidavit", "affinities", "affirmed", "affirms", "affixed", "afflicted", "afford", "afforded", "affording", "aforesaid", "afraid", "after", "afternoon", "afternoons", "afterward", "afterwards", "again", "against", "agar", "age", "aged", "agencies", "agency", "agencys", "agent", "agents", "ages", "aggravated", "aggravation", "aggregate", "aggregations", "aggressive", "aggrieved", "agitate", "agitated", "agitation", "ago", "agony", "agree", "agreed", "agreement", "agreements", "agrees", "agriculture", "ahead", "aid", "aiding", "aim", "aimed", "aiming", "aims", "air", "aircraft", "aired", "aires", "airing", "airplane", "airport", "airway", "airy", "aisle", "aisles", "akermans", "akin", "alan", "alarm", "alarmed", "albans", "albert", "album", "alderman", "aldermen", "aldermens", "aldus", "ale", "alek", "alert", "alerted", "alertness", "alexander", "alfred", "alias", "aliases", "alighted", "alighting", "aligning", "alike", "alimentary", "alive", "all", "allegations", "alleged", "allegiance", "alley", "alliance", "allied", "allington", "allnutt", "allotment", "allotted", "allow", "allowance", "allowances", "allowed", "allowing", "alluded", "almighty", "almost", "alms", "alone", "along", "alongside", "aloud", "already", "also", "alter", "alterations", "altered", "altering", "alternate", "alternated", "alternative", "altgens", "although", "altogether", "always", "am", "amass", "amassed", "amateur", "amateurs", "ambassador", "ambitious", "ameliorated", "ameliorating", "amelioration", "amend", "amended", "amendment", "amendments", "america", "american", "americanized", "americans", "amid", "amidst", "ammonia", "ammunition", "among", "amongst", "amos", "amount", "amounted", "amounting", "amounts", "amphibian", "amphibians", "ample", "amplified", "amputation", "amused", "amusement", "amusing", "an", "anabolism", "analogous", "analysis", "anarchist", "anatomy", "ancestors", "ancestral", "ancient", "ancients", "and", "anderson", "andrews", "angeles", "angelini", "anger", "angle", "angles", "anglesea", "angling", "angry", "animadversion", "animadvert", "animadverted", "animadverting", "animal", "animals", "annals", "annex", "annexe", "announced", "annoy", "annoyance", "annoyances", "annual", "annually", "annum", "anomalous", "another", "anoura", "answer", "answered", "answering", "answers", "antecedents", "anthony", "anti", "anticipated", "anticipating", "anticipation", "anticipations", "antics", "antimony", "antlers", "antonio", "anxiety", "anxious", "anxiously", "any", "anybody", "anyone", "anything", "anywhere", "apace", "apart", "apartment", "apartments", "apathy", "aperture", 
"apertures", "apiece", "apothecaries", "apothecary", "appalling", "apparatus", "apparel", "appareled", "apparent", "apparently", "appeal", "appealed", "appeals", "appear", "appearance", "appearances", "appeared", "appearing", "appears", "appendix", "appetite", "appleby", "appliances", "application", "applications", "applied", "apply", "appoint", "appointed", "appointment", "apprehended", "apprehension", "apprentice", "apprised", "approach", "approached", "approaches", "approaching", "appropriate", "appropriated", "appropriately", "appropriation", "approval", "approve", "approved", "approver", "approximate", "approximated", "approximately", "april", "apron", "aptitude", "arab", "arabian", "arabic", "arabs", "arachtu", "arbor", "arcade", "arch", "archbishop", "arches", "architect", "architects", "architectural", "architecture", "arduous", "are", "area", "areas", "argue", "argued", "arguing", "argument", "arguments", "arise", "arises", "arising", "aristocracy", "aristocratic", "arm", "armed", "armless", "armpit", "arms", "army", "arnold", "arose", "around", "aroused", "arraigned", "arraignment", "arrange", "arranged", "arrangement", "arrangements", "arrayed", "arrest", "arrested", "arresting", "arrests", "arrival", "arrivals", "arrive", "arrived", "arrives", "arriving", "arrogant", "arsenic", "arson", "art", "arteries", "arthur", "article", "articles", "artificial", "artisans", "artist", "artistic", "artistically", "artlessness", "arts", "as", "ascend", "ascendancy", "ascendant", "ascended", "ascent", "ascertain", "ascertained", "ascot", "ashamed", "ashes", "ashley", "ashy", "aside", "ask", "asked", "askern", "asking", "asleep", "aspect", "aspects", "aspirants", "assailant", "assailed", "assassin", "assassinate", "assassinated", "assassinating", "assassination", "assassins", "assault", "assemblage", "assembled", "assembly", "assert", "asserted", "assiduously", "assigned", "assignment", "assignments", "assist", "assistance", "assistant", "assistants", "assisted", "assisting", "assize", "assizes", "associate", "associated", "associates", "association", "associations", "assume", "assumed", "assuming", "assumption", "assurance", "assurances", "assurbanipal", "assure", "assured", "assyrians", "astonished", "asunder", "asylum", "at", "ate", "atlantic", "atmosphere", "atmospheric", "atop", "atrocious", "atrocity", "attach", "attached", "attachment", "attack", "attacked", "attacks", "attainments", "attempt", "attempted", "attempting", "attempts", "attend", "attendance", "attendant", "attendants", "attended", "attending", "attends", "attention", "attentions", "attentive", "attentively", "attest", "attic", "attitude", "attorney", "attorneys", "attract", "attracted", "attracting", "attribute", "attributed", "atwell", "auburn", "audible", "audience", "auditors", "augmented", "augsburg", "august", "auspices", "austin", "australia", "authentic", "authoritative", "authoritatively", "authorities", "authority", "authorize", "authorized", "authors", "autobiographical", "automatic", "automatically", "automation", "automobile", "autopsy", "autumn", "avail", "available", "availed", "avenue", "avenues", "average", "averaged", "averted", "avocation", "avoid", "avoidance", "avoided", "awaited", "awaiting", "awake", "awakened", "awakening", "aware", "awareness", "away", "awful", "awkward", "axe", "axis", "b", "babel", "baby", "babylon", "babylonia", "babylonian", "babylonians", "back", "backboned", "backed", "backfire", "backgammon", "background", "backwards", "bad", "badge", "badly", "badness", "bag", "bags", 
"bail", "bailey", "bake", "baker", "bakers", "baking", "balance", "balanced", "balances", "balcony", "bald", "ball", "ballad", "ballistics", "balls", "bambridge", "band", "bandied", "baneful", "bank", "banker", "bankers", "bankes", "banking", "bankrupt", "bankruptcy", "banks", "banquets", "baptist", "bar", "barbara", "barbariously", "barbarous", "barber", "barbers", "bare", "barely", "bargaining", "barman", "barnett", "baron", "baronet", "baronial", "barrack", "barrel", "barrett", "barrier", "barriers", "barry", "bars", "bart", "barthelemy", "bartholomew", "base", "based", "basement", "bases", "bashour", "basic", "basically", "basin", "basis", "baskerville", "basket", "basle", "bat", "batch", "batchelor", "bateman", "bates", "bath", "bathed", "bathing", "bathroom", "baths", "batter", "battered", "battle", "baxter", "bay", "be", "beam", "beams", "bean", "bear", "bearing", "bears", "beast", "beasts", "beat", "beaten", "beating", "beaumont", "beautiful", "beauty", "became", "because", "beckley", "become", "becomes", "becoming", "bed", "bedclothes", "bedding", "bedroom", "beds", "bedstead", "bedsteads", "bedtime", "been", "beer", "before", "began", "begged", "begging", "begin", "beginner", "beginning", "beginnings", "begins", "begun", "behalf", "behaved", "behaves", "behavior", "behind", "behn", "behold", "being", "beings", "bel", "belgium", "belgrave", "belian", "belief", "beliefs", "believe", "believed", "believes", "bell", "bellingham", "bells", "belmont", "belonged", "belonging", "belongings", "beloved", "below", "belshazzar", "belsize", "belt", "belus", "ben", "benavides", "bench", "benches", "beneath", "beneficial", "benefit", "benefits", "benevolence", "benevolent", "benign", "benjamin", "bennet", "bennett", "bent", "bentham", "bequest", "bequests", "berlin", "bermondsey", "berth", "berwick", "beside", "besides", "besotted", "best", "bestow", "bethesda", "bethlehem", "betrayed", "betraying", "better", "betting", "between", "beverage", "beware", "beyond", "bible", "bibles", "bid", "bidden", "bidwell", "big", "bigamy", "bill", "billings", "billions", "billows", "bills", "billy", "bins", "biological", "biology", "birch", "bird", "birmingham", "birth", "birthday", "biscuit", "bishop", "bishops", "bit", "bitten", "bitter", "bitterly", "bitterness", "bitters", "bitumen", "black", "blackburn", "blacksmith", "blame", "blamed", "blanched", "blank", "blanket", "blasphemous", "bleating", "bledsoe", "bleed", "bleeding", "blemish", "blind", "blinded", "blinked", "block", "blocked", "blocks", "blond", "blood", "blooded", "bloodthirsty", "bloody", "blow", "blowing", "blows", "blue", "board", "boarded", "boarding", "boards", "boast", "boasted", "boat", "boats", "bob", "bodies", "bodily", "bodoni", "body", "bogus", "boiled", "boiling", "boldly", "boldness", "bolster", "bolt", "bolted", "bolts", "bomb", "bona", "bond", "bonded", "bonding", "bonds", "bone", "bones", "bonnet", "bonneted", "bonnets", "book", "books", "booming", "boon", "boot", "booty", "bordeaux", "border", "bordered", "bore", "born", "borne", "borough", "boroughs", "borrowed", "boston", "botany", "both", "bottle", "bottom", "bouck", "bought", "bouhe", "boulevard", "boulogne", "bound", "boundary", "bounds", "bourne", "bousfield", "bow", "bowed", "bowl", "box", "boxes", "boxing", "boy", "boyfriend", "boys", "brace", "bracelet", "bradawls", "bradshaws", "brain", "brains", "branch", "branches", "brass", "brave", "bravery", "breach", "bread", "breadth", "break", "breaker", "breakfast", "breaking", "breaks", "breast", "breath", "breathe", 
"breathed", "breathing", "bred", "breeches", "brennan", "brennans", "brethren", "brewer", "brewery", "brian", "bribe", "bribery", "brick", "bricklayers", "bricks", "brickwork", "bride", "bridewell", "bridge", "bridges", "brief", "briefly", "briefs", "briggs", "bright", "brilliant", "brilliants", "bring", "bringing", "bringuier", "bristol", "britain", "british", "broad", "broadcast", "broader", "broadly", "broadmoor", "broadsheet", "brochure", "broke", "broken", "bronx", "bronze", "brooks", "brooms", "brother", "brothers", "brought", "brown", "brownrigg", "browns", "brows", "bruce", "bruised", "brunt", "brutal", "brutality", "brutalized", "brutally", "brute", "bubble", "bubbletop", "buckingham", "budget", "buell", "buenos", "buff", "build", "builder", "builders", "building", "buildings", "built", "bulge", "bulk", "bulky", "bullet", "bulletproof", "bullets", "bullion", "bulls", "bumble", "buncher", "bunchers", "bundle", "bungling", "buoyed", "buranelli", "burchell", "burdens", "bureau", "bureaucracy", "bureaus", "burgess", "burglar", "burglaries", "burglars", "burglary", "burial", "buried", "buries", "burke", "burking", "burkley", "burn", "burned", "burnett", "burning", "burnt", "burst", "burthen", "bury", "bus", "buses", "bushes", "bushy", "busily", "business", "businessmen", "busy", "but", "butcher", "butler", "butlers", "butt", "butter", "button", "buxton", "buxtons", "buy", "buying", "by", "bye", "bystanders", "c", "cab", "cabin", "cabinet", "cabins", "cabman", "cadigan", "caducibranch", "cage", "cajoled", "cakes", "calais", "calamities", "calamitous", "calcraft", "calcrafts", "calculated", "calculating", "calculation", "calendar", "calendars", "caliber", "call", "callaway", "called", "callers", "calligraphy", "calling", "callous", "callousness", "calm", "calmly", "camberwell", "came", "cameo", "camera", "campaign", "campbell", "campbells", "can", "canada", "canal", "canceled", "candidate", "candles", "canning", "cannings", "cannon", "cannot", "canonicals", "cant", "canterbury", "cap", "capabilities", "capability", "capable", "capacities", "capacity", "capias", "capital", "capitalism", "capitalist", "capitally", "caps", "capstan", "capt", "captain", "captains", "captive", "captors", "capture", "captured", "car", "caravans", "carbine", "carbohydrate", "carbohydrates", "carbon", "carbonic", "carcano", "card", "cardboard", "cardiac", "carding", "cards", "care", "cared", "career", "careful", "carefully", "careless", "carelessly", "carelessness", "cargo", "carlisle", "carlos", "carousing", "carpenter", "carpenters", "carpet", "carpets", "carriage", "carriages", "carrico", "carried", "carro", "carry", "carrying", "cars", "cart", "carter", "cartilage", "carton", "cartons", "cartridge", "cartridges", "carver", "case", "caseload", "casements", "cases", "cash", "cashed", "cashman", "casket", "caslon", "caslons", "caspar", "caspars", "cast", "casting", "castle", "castlereagh", "castro", "castros", "casually", "catabolic", "catabolism", "catastrophe", "catastrophes", "catch", "catching", "categories", "category", "catherine", "catholics", "catnach", "cato", "cattle", "caught", "caulked", "cause", "caused", "causes", "causing", "caustic", "cautioned", "cautiously", "cavern", "cavities", "cavity", "caxtons", "cease", "ceased", "ceiling", "ceilings", "celebrated", "cell", "cellar", "cells", "cellular", "cellulose", "cement", "center", "centered", "centers", "centimeter", "centimeters", "central", "cents", "centuries", "century", "centurys", "ceremonial", "ceremonials", "ceremony", "certain", 
"certainly", "certainty", "certificate", "certificates", "certified", "certify", "cetera", "chaff", "chain", "chained", "chains", "chair", "chairman", "chalked", "challenge", "chamber", "chambering", "chambers", "champagne", "champions", "chance", "chancery", "chances", "chandler", "chandlers", "change", "changed", "changes", "channel", "chaos", "chapel", "chapels", "chaplain", "chaplains", "chapter", "chapters", "character", "characteristic", "characteristics", "characterize", "characterized", "characterless", "characters", "charge", "charged", "charges", "charging", "charitable", "charities", "charity", "charles", "charlotte", "charter", "chartists", "chase", "chatham", "chats", "cheap", "cheapest", "cheapside", "check", "checked", "checking", "checklist", "checkpoint", "checks", "cheek", "cheeked", "cheer", "cheered", "cheerful", "cheerless", "cheers", "cheese", "chemical", "chemists", "cheque", "cheques", "cheshire", "chest", "chester", "chevaux", "chicago", "chicanery", "chief", "chiefly", "chiefs", "child", "childish", "childless", "children", "chill", "chilly", "chimney", "chimneys", "chin", "chinamen", "chinese", "chisel", "chiswick", "chlorophyll", "choice", "cholera", "choose", "chose", "chosen", "christian", "christmas", "christs", "chronicle", "chronicles", "chubbs", "chummage", "church", "cia", "cigar", "cigarettes", "cigars", "circle", "circles", "circuit", "circular", "circulars", "circulate", "circulated", "circulating", "circulation", "circulatory", "circumference", "circumstance", "circumstances", "circumstantial", "cissian", "cistern", "cited", "cities", "citizen", "citizens", "citizenship", "city", "citys", "civil", "civilian", "civilization", "clad", "claim", "claimed", "claims", "clamp", "clamps", "clanking", "clarence", "clarify", "clarifying", "clark", "clasped", "clasps", "class", "classed", "classes", "classics", "classification", "classified", "classify", "clause", "clay", "clean", "cleaning", "cleanliness", "cleansed", "clear", "cleared", "clearly", "clearness", "clemency", "clench", "clenched", "clergyman", "clergymen", "clerical", "clerk", "clerkenwell", "clerks", "clever", "cleverly", "click", "clients", "cliff", "clifton", "climb", "climbed", "climbing", "clinging", "clings", "clinton", "clipboard", "clipperton", "clipping", "cloak", "clock", "close", "closed", "closely", "closer", "closest", "closing", "cloth", "clothes", "clothing", "club", "clubs", "clue", "clumsy", "clun", "co", "coach", "coachman", "coadjutor", "coal", "coals", "coarse", "coast", "coat", "cobbett", "cobham", "cock", "cocked", "code", "codes", "coffee", "coffin", "coffins", "coiled", "coin", "coincide", "coiners", "coining", "colchicum", "cold", "coldbath", "cole", "coleman", "coles", "collapse", "collapsed", "collar", "collared", "colleagues", "collected", "collection", "collections", "collective", "collector", "college", "collegians", "collins", "collusion", "cologne", "colonel", "colonies", "colony", "color", "colorado", "colored", "coloring", "colors", "colossal", "column", "columns", "combatants", "combination", "combined", "combustible", "come", "comely", "comer", "comes", "comfort", "comfortable", "comforts", "coming", "command", "commanded", "commander", "commanding", "commence", "commenced", "commencement", "commencing", "commendable", "commendation", "commented", "comments", "commerce", "commercial", "commercialism", "commission", "commissioner", "commissioners", "commissions", "commit", "commitment", "committal", "committals", "committed", "committee", "committees", 
"committing", "commodious", "common", "commonest", "commonly", "commonplace", "commons", "commotion", "communicate", "communicated", "communicating", "communication", "communications", "communion", "communism", "communist", "communities", "community", "commutation", "commuted", "companies", "companion", "companions", "companionship", "company", "companys", "comparable", "comparative", "comparatively", "compare", "compared", "comparing", "comparison", "comparisons", "compartment", "compass", "compassed", "compassion", "compatible", "compel", "compelled", "compelling", "compels", "compensate", "competent", "competition", "competitors", "complacency", "complain", "complained", "complaint", "complaints", "complete", "completed", "completely", "completing", "completion", "complex", "complexion", "compliance", "complicated", "complicity", "comply", "composed", "composites", "composition", "compound", "compounds", "comprehensive", "compression", "comprised", "comprising", "compromised", "compter", "compters", "compulsory", "comrades", "conceal", "concealed", "conceded", "conceit", "conceivable", "conceive", "conceived", "concentrate", "concentration", "concept", "conception", "concern", "concerned", "concerning", "concerns", "concession", "conclude", "concluded", "concludes", "concluding", "conclusion", "conclusions", "conclusively", "concomitant", "concomitants", "concourse", "concrete", "concurrence", "condemnation", "condemned", "condition", "conditions", "conduced", "conducive", "conduct", "conducted", "conducting", "confederate", "confederates", "conference", "conferred", "confess", "confessed", "confessedly", "confession", "confessions", "confided", "confidence", "confident", "confidential", "confidently", "confiding", "configuration", "confine", "confined", "confinement", "confirm", "confirmed", "conflict", "conflicting", "conflicts", "conformity", "confronted", "confuse", "confused", "confusion", "congregate", "congregated", "congregation", "congress", "conjunction", "connally", "connallys", "connect", "connected", "connection", "connived", "conquering", "conscience", "conscious", "consciousness", "consecutive", "consented", "consequence", "consequences", "consequent", "consequently", "conservation", "consider", "considerable", "considerably", "considerate", "consideration", "considerations", "considered", "considering", "considers", "consign", "consist", "consisted", "consistent", "consistently", "consisting", "consists", "consolation", "consolatory", "console", "consolidate", "consolidated", "consolidation", "consoling", "consols", "consort", "conspiracy", "conspirators", "constable", "constables", "constant", "constantly", "consternation", "constituent", "constituents", "constitute", "constituted", "constitutes", "constitution", "constitutional", "construct", "constructed", "construction", "constructive", "construed", "consul", "consultants", "consulted", "consumed", "consumer", "consummate", "contact", "contacts", "contagion", "contagious", "contain", "contained", "containing", "contains", "contaminating", "contamination", "conte", "contemplated", "contemporaneous", "contemporaries", "contemporary", "contempt", "contemptuous", "contend", "contended", "content", "contentedly", "contents", "contested", "context", "continent", "continental", "contingency", "contingents", "continually", "continue", "continued", "continues", "continuing", "continuous", "continuously", "contract", "contracted", "contractile", "contraction", "contractors", "contracts", "contrary", "contrast", "contrasting", 
"contrasts", "contravention", "contributed", "contributes", "contribution", "contrite", "contrition", "contrivance", "contriver", "contriving", "control", "controlled", "controlling", "controversial", "controversy", "convenience", "convenient", "conveniently", "conventionality", "conversation", "converse", "conversed", "conversing", "convert", "convertible", "convey", "conveyance", "conveyed", "conveying", "convict", "convicted", "conviction", "convictions", "convicts", "convinced", "convinces", "convincing", "convulsions", "convulsive", "convulsively", "cook", "cooked", "cooking", "cooks", "cool", "coolly", "coolness", "cooperate", "cooperating", "cooperation", "coordinate", "coordinated", "coordinating", "coordination", "cop", "cope", "copenhagen", "copes", "copied", "copies", "copper", "copy", "cord", "corn", "corner", "corners", "cornhill", "corporate", "corporation", "corporations", "corporeal", "corps", "corpse", "correct", "correction", "correctly", "correctness", "correspondence", "corroborated", "corroboration", "corrupt", "corrupted", "corruption", "cost", "costing", "costliness", "costly", "costs", "cot", "cottage", "cotton", "cough", "coughing", "could", "couldnt", "council", "councilmen", "councilors", "counsel", "counselor", "counsels", "count", "counted", "countenance", "counteract", "counteracted", "countered", "counterfeit", "counterfeiting", "counties", "countries", "country", "counts", "county", "couple", "coupled", "couples", "coupon", "courage", "courageous", "courier", "course", "court", "courts", "courvoisier", "courvoisiers", "coventry", "cover", "covered", "covering", "coveted", "cowardly", "cracked", "craft", "crafts", "craftsmen", "craig", "craigs", "crammed", "crane", "crank", "cranks", "crape", "crater", "crawford", "crawled", "craze", "create", "created", "creating", "creation", "creature", "creatures", "credible", "credit", "creditable", "creditors", "credulity", "crept", "crescent", "crevice", "crew", "crib", "cribbage", "cried", "cries", "crime", "crimes", "criminal", "criminality", "criminals", "crippled", "crisis", "criteria", "critic", "critical", "criticism", "crop", "cropped", "cross", "crossed", "crosshairs", "crossing", "crowd", "crowded", "crowding", "crowds", "crown", "crowned", "crucial", "cruel", "cruelty", "cruikshanks", "crushed", "cruz", "cry", "crying", "crystal", "ctesias", "cuba", "cuban", "cubans", "cubits", "cuff", "culminated", "culpable", "culprit", "culprits", "culture", "cumbered", "cumbrous", "cummings", "cunningham", "cup", "cupboard", "cupboards", "cupidity", "cups", "curb", "cure", "curiosities", "curiosity", "curious", "current", "curry", "curse", "curses", "cursing", "curtailment", "curtain", "curves", "custodians", "custody", "custom", "customary", "customers", "customs", "cut", "cutdowns", "cutlasses", "cuts", "cutting", "cuvier", "cuviers", "cylinder", "cyrus", "d", "dagger", "daily", "dallas", "damage", "damaged", "damages", "damascus", "damn", "damnatory", "damp", "dance", "danger", "dangerous", "dangers", "daniel", "danish", "dare", "daring", "darius", "dark", "darker", "darkness", "darwin", "dasent", "dasset", "dastardly", "data", "date", "dated", "dates", "dating", "daughter", "daughters", "daulby", "david", "davidson", "davis", "davison", "day", "daylight", "days", "daytime", "dazzling", "de", "dead", "deadly", "deal", "dealer", "dealers", "dealey", "dealing", "dealings", "deals", "dealt", "death", "deaths", "debarred", "debasing", "debate", "debated", "debates", "debauched", "debauchery", "debt", "debtor", "debtors", 
"debts", "debutant", "decade", "decades", "decapitation", "decaying", "deceased", "deceives", "december", "decency", "decent", "deception", "decide", "decided", "deciding", "decision", "decisions", "decisively", "deck", "declaration", "declare", "declared", "declaring", "decline", "declined", "decomposition", "decorous", "decorum", "decreed", "dedicated", "deduced", "deducted", "deduction", "deed", "deeds", "deem", "deemed", "deep", "deeper", "deepest", "deeply", "deer", "defacing", "defalcations", "default", "defaulter", "defeated", "defeats", "defect", "defected", "defection", "defective", "defectively", "defectors", "defects", "defendants", "defended", "defense", "defenses", "defensive", "deferred", "defiance", "defiant", "deficiencies", "deficiency", "defied", "define", "defined", "defining", "definite", "definitely", "definition", "deformed", "defraud", "defrauding", "defy", "degenerated", "degradation", "degraded", "degree", "degrees", "dejection", "delano", "delarue", "delay", "delayed", "delays", "delectable", "delgado", "deliberate", "deliberately", "deliberation", "delicacy", "delicate", "delightful", "delinquency", "delinquent", "deliver", "delivered", "deliverer", "delivery", "delustered", "demand", "demanded", "demanding", "demands", "demarcation", "demeanor", "democracy", "democratic", "demonstrably", "demonstrate", "demonstrated", "demonstrates", "demonstrating", "demonstration", "demonstrations", "demonstrators", "demoralizing", "den", "denial", "denials", "denied", "denison", "denizens", "denoted", "denouncing", "dense", "densely", "denunciations", "deny", "denying", "depart", "departed", "department", "departments", "departure", "depend", "depended", "dependence", "dependent", "depends", "deplorable", "deployment", "deposed", "deposited", "depositors", "depository", "depraved", "deprecated", "deprecates", "depredators", "depression", "deprivation", "deprived", "depth", "depths", "deputy", "derision", "derive", "derived", "descend", "descended", "descent", "describe", "described", "describes", "describing", "description", "descriptions", "desert", "deserted", "deserve", "deserved", "deserving", "design", "designate", "designated", "designed", "designs", "desirable", "desire", "desired", "desiring", "desk", "desolate", "despair", "desperadoes", "desperate", "desperation", "despite", "despotism", "destined", "destiny", "destitute", "destroy", "destroyed", "destruction", "destructive", "detached", "detachment", "detail", "detailed", "details", "detain", "detained", "detect", "detected", "detection", "detective", "detectives", "detention", "deter", "deteriorated", "deterioration", "determination", "determine", "determined", "determining", "deterred", "deterrent", "detestable", "develop", "developed", "developing", "development", "developmental", "developments", "device", "devices", "devised", "devoid", "devote", "devoted", "devoting", "devotion", "devotional", "dexterous", "diabolical", "diagnosis", "dials", "diameter", "diamonds", "diapers", "diary", "dice", "dickens", "dictator", "did", "didnt", "didot", "die", "died", "diem", "diet", "dietaries", "dietary", "differ", "differed", "difference", "differences", "different", "differently", "differs", "difficult", "difficulties", "difficulty", "diffuse", "diffusion", "digested", "digestion", "digestive", "digging", "dignitaries", "dignity", "digression", "diligently", "dillon", "dimensions", "diminish", "diminished", "diminishing", "diminution", "dinas", "dining", "dinner", "dint", "dioxide", "direct", "directed", "directing", 
"direction", "directions", "directly", "director", "directors", "dirt", "dirty", "disability", "disagree", "disagreeable", "disappear", "disappearance", "disappeared", "disappointed", "disapproval", "disapproved", "disapproving", "disassemble", "disassembled", "disastrous", "disbeliever", "disbursed", "discarded", "discharge", "discharged", "disciplinary", "discipline", "disclosed", "disclosure", "disclosures", "discomfort", "discontent", "discontented", "discontinuance", "discontinued", "discordant", "discount", "discounted", "discourage", "discouragements", "discouraging", "discourse", "discourses", "discover", "discovered", "discovering", "discovery", "discreditable", "discretion", "discrimination", "discuss", "discussed", "discussion", "discussions", "disease", "disenchanted", "disfigured", "disgrace", "disgraced", "disgraceful", "disgraces", "disguise", "disguised", "disgusted", "disgusting", "disheartening", "dishonest", "dishonesty", "disinclination", "disinterested", "dislike", "dismay", "dismembered", "dismemberment", "dismissed", "disorder", "disordered", "disorderly", "disorders", "dispatch", "dispatched", "dispatcher", "dispensed", "dispensing", "disperses", "display", "displayed", "disporting", "disposal", "dispose", "disposed", "disposing", "disposition", "dispositions", "dispute", "disputes", "disregard", "disregarded", "disreputable", "dissatisfaction", "dissatisfied", "dissecting", "dissection", "dissections", "disseminating", "dissenters", "dissenting", "dissipate", "dissipation", "dissolute", "dissolved", "distance", "distances", "distant", "distemper", "distill", "distinct", "distinction", "distinctions", "distinctive", "distinctly", "distinguish", "distinguishable", "distinguished", "distinguishing", "distracted", "distracting", "distress", "distresses", "distressing", "distributed", "distributing", "distribution", "distributor", "district", "districts", "disturbance", "disturbances", "disturbed", "disturbing", "disuse", "ditch", "diverse", "diversified", "diversions", "diversity", "diverted", "divest", "divided", "dividend", "dividends", "divine", "division", "divisions", "divorce", "dixon", "do", "dobson", "dock", "docks", "doctor", "doctors", "doctrine", "document", "documents", "dodd", "doers", "does", "doesnt", "dog", "dogs", "doing", "dollar", "dollars", "dollimore", "domestic", "domination", "domingo", "dominoes", "don", "donation", "donations", "done", "donovan", "dont", "doom", "doomed", "door", "doors", "dose", "dot", "double", "doubled", "doubly", "doubt", "doubted", "doubtful", "doubtfulness", "doubtless", "doubts", "dough", "dougherty", "dover", "down", "downgrade", "downstairs", "downtown", "downward", "downwards", "dozen", "dr", "drafts", "dragged", "dragons", "drain", "dramatic", "dramatically", "draw", "drawer", "drawers", "drawing", "drawings", "drawn", "dread", "dreaded", "dreadful", "dreadfully", "dreams", "dreamy", "dress", "dressed", "dresser", "dresses", "dressing", "drew", "dried", "drill", "drink", "drinking", "drittal", "drive", "driven", "driver", "drivers", "driveway", "driving", "dromedary", "droops", "drop", "dropped", "drove", "drowning", "drs", "drugged", "drugstore", "drunk", "drunkenness", "drury", "dry", "dryad", "dublin", "duchess", "ducked", "ducking", "due", "dues", "dug", "duke", "dulany", "duly", "dumaine", "dumplings", "dundas", "dungeon", "dungeons", "dupe", "duplicate", "duplicates", "durability", "durance", "duranno", "durham", "during", "dusk", "dust", "dutch", "duties", "duty", "dwarfed", "dwelling", "dwelt", "dwyer", 
"dyeing", "dying", "e", "each", "eager", "eagerly", "ear", "earlene", "earlier", "earliest", "early", "earned", "earnest", "earnestly", "earnestness", "earning", "earnings", "ears", "earth", "ease", "easement", "easier", "easily", "east", "eastern", "easy", "eat", "eating", "ebullitions", "ecclesiastical", "economic", "economy", "edgar", "edge", "edges", "edgeware", "edgewise", "edifice", "edifying", "edinburgh", "edited", "edition", "editions", "editor", "editorials", "editors", "edmund", "edmunds", "educated", "educates", "education", "edward", "edwards", "edwin", "effect", "effected", "effective", "effectively", "effects", "effectual", "effectually", "efficacious", "efficiency", "efficient", "effluvia", "effort", "efforts", "effrontery", "effusion", "egg", "egress", "egypt", "eight", "eighteen", "eighteenth", "eighth", "eighths", "eighties", "eighty", "either", "ejected", "ekdahl", "eke", "eked", "eking", "el", "elaborate", "elapse", "elapsed", "elated", "elbow", "elder", "elderly", "elders", "elect", "elected", "election", "electric", "electrical", "elegance", "element", "elements", "elevated", "elevator", "elevators", "eleven", "eleventh", "eliminate", "eliminated", "eliminating", "elimination", "ellen", "elm", "eloquent", "else", "elsewhere", "eluded", "eluding", "elzevirs", "embankments", "embark", "embarked", "embarrassment", "embassy", "embellishment", "embezzlement", "embodied", "embodying", "embraced", "embracing", "embryology", "emerged", "emergencies", "emergency", "emigrate", "eminence", "emissaries", "emma", "emmanuel", "emoluments", "emotion", "emotional", "emotionally", "emotions", "emperor", "emphasis", "emphasize", "emphasized", "empire", "employ", "employed", "employee", "employees", "employer", "employers", "employment", "empowered", "empowering", "empty", "emptying", "emulated", "en", "enable", "enabled", "enacted", "enactment", "enactments", "encamped", "enclose", "enclosed", "enclosing", "enclosure", "encounter", "encountered", "encourage", "encouraged", "encouragement", "end", "endanger", "endangering", "endeavor", "endeavored", "endeavoring", "ended", "ending", "endorsed", "ends", "endure", "endured", "enduring", "enemies", "enemy", "energies", "energy", "enforce", "enforced", "enforcement", "engage", "engaged", "engendered", "engine", "england", "english", "engrafted", "engraved", "engraver", "engraving", "enhancing", "enjoy", "enjoyed", "enjoying", "enlarged", "enlargement", "enlightened", "enlightenment", "enlist", "enlisted", "enlivened", "ennui", "enormous", "enormously", "enough", "ensconced", "ensued", "ensuing", "entailed", "enter", "entered", "entering", "enterprise", "enterprises", "enters", "entertain", "entertained", "enthusiasm", "enthusiastic", "enthusiasts", "enticed", "entire", "entirely", "entirety", "entitled", "entrance", "entrances", "entrapped", "entrapper", "entreaties", "entries", "entrust", "entrusted", "entry", "enumerated", "enunciated", "enunciation", "envelope", "enviable", "environment", "enzyme", "epidemic", "epidermis", "episcopal", "episode", "epitomized", "epoch", "equal", "equally", "equipment", "equipped", "equivalent", "erased", "ere", "erect", "erected", "erecting", "erection", "erred", "error", "ervay", "esagil", "esarhaddon", "escape", "escaped", "escapes", "escort", "escorted", "escorting", "escritoire", "especial", "especially", "espoused", "esprit", "esq", "essence", "essential", "essentially", "essex", "establish", "established", "establishes", "establishing", "establishment", "establishments", "estate", "estates", 
"esther", "estimate", "estimated", "estimates", "et", "etc", "eternal", "eternity", "euins", "eulogized", "euphemistically", "euphrates", "europe", "european", "evaded", "evaluate", "evaluated", "evaluating", "evaluation", "evans", "evaporation", "evasion", "evasive", "eve", "even", "evening", "evenly", "event", "events", "eventually", "ever", "every", "everybody", "everyday", "everyone", "everything", "everywhere", "evidence", "evidenced", "evidences", "evident", "evidently", "evil", "evils", "evince", "evinced", "evoked", "evolution", "evolved", "ex", "exact", "exacted", "exacting", "exaction", "exactions", "exactly", "exaggerated", "exaltation", "exalted", "examination", "examine", "examined", "examining", "example", "examples", "excavated", "exceeded", "exceedingly", "excellence", "excellent", "except", "exception", "exceptional", "exceptions", "excess", "excessive", "excessively", "exchange", "exchanged", "exchequer", "excite", "excited", "excitement", "exciting", "exclaiming", "exclamations", "exclude", "excluded", "exclusion", "exclusive", "exclusively", "excretion", "excretions", "excuse", "excuses", "execration", "execrations", "executed", "executing", "execution", "executioner", "executioners", "executions", "executive", "executor", "executors", "exempted", "exercise", "exercised", "exercises", "exercising", "exertion", "exertions", "exeter", "exhalation", "exhales", "exhaustion", "exhaustive", "exhibit", "exhibited", "exhibiting", "exhibition", "exhortation", "exhortations", "exiles", "exist", "existed", "existence", "existing", "exists", "exit", "exorbitant", "expand", "expect", "expectation", "expected", "expecting", "expedient", "expend", "expended", "expenditure", "expenditures", "expense", "expenses", "expensive", "experience", "experienced", "experiences", "experiencing", "experiment", "experimental", "experiments", "expert", "experts", "expired", "explain", "explained", "explaining", "explanation", "explicit", "exploiting", "exploits", "explored", "explosion", "explosive", "expose", "exposed", "exposes", "exposure", "exposures", "express", "expressed", "expresses", "expressing", "expression", "expressions", "expressly", "extant", "extend", "extended", "extending", "extension", "extensive", "extensively", "extent", "external", "extinct", "extinguished", "extorted", "extorting", "extortion", "extra", "extract", "extracted", "extracts", "extraordinary", "extravagance", "extravagant", "extreme", "extremely", "extremity", "exuberant", "eye", "eyes", "eyewitness", "eyewitnesses", "ezida", "f", "fabricate", "fabrication", "face", "faced", "faces", "facilitate", "facilitating", "facilities", "facility", "facing", "fact", "factitious", "factor", "factors", "factory", "facts", "faded", "faggots", "fail", "failed", "failing", "fails", "failure", "failures", "fain", "fains", "faint", "fainted", "faintest", "fainting", "fair", "fairest", "fairly", "faith", "faithful", "fall", "fallacious", "fallen", "falling", "falls", "false", "falsely", "fame", "familiar", "familiarity", "families", "family", "famine", "famous", "fancied", "fancier", "fantasy", "far", "fare", "farm", "farmer", "farmers", "farming", "farringdon", "farther", "fascist", "fashion", "fashioned", "fasson", "fast", "fastened", "fat", "fatal", "fate", "father", "fathers", "fats", "fault", "fauntleroy", "fauntleroys", "favor", "favorable", "favored", "favorite", "favorites", "favoritism", "fbi", "fbis", "fear", "feared", "fearful", "fearless", "feasibility", "feasible", "feasting", "feature", "features", "february", "fed", 
"federal", "fee", "feel", "feeling", "feelings", "feels", "fees", "feet", "feigning", "fell", "fellow", "fellows", "felon", "felonies", "felonious", "feloniously", "felons", "felony", "felt", "feltham", "female", "females", "femoral", "fence", "fenning", "ferocity", "ferrari", "ferrers", "fertile", "fervently", "festered", "festive", "fetched", "fettered", "fever", "few", "fewer", "fiber", "fibers", "fiction", "fictitious", "fide", "fidel", "fidelity", "field", "fields", "fiendish", "fiercely", "fifteen", "fifteenth", "fifth", "fifty", "fight", "fighting", "fights", "figure", "figured", "figures", "filaments", "filched", "file", "filed", "files", "fill", "filled", "films", "filthiness", "filthy", "fin", "final", "finally", "finance", "financial", "find", "finding", "findings", "finds", "fine", "fines", "finest", "finger", "fingerprint", "fingerprints", "fingers", "finish", "finished", "finisher", "finishing", "finishings", "fire", "firearms", "firecracker", "fired", "fireplace", "fireplaces", "firers", "fireside", "firing", "firm", "firmest", "firmly", "firmness", "firms", "first", "fiscal", "fischer", "fish", "fished", "fisher", "fishes", "fist", "fit", "fitness", "fits", "fitted", "fitting", "fitz", "fitzroy", "five", "fives", "fix", "fixed", "fixing", "flag", "flagitious", "flagrant", "flames", "flanking", "flash", "flattened", "flaw", "fled", "flee", "fleet", "fleeting", "flemish", "flesh", "fletcher", "flew", "flight", "flings", "flintshire", "flock", "flocked", "flooded", "floor", "flooring", "floors", "flour", "floured", "flow", "flower", "flowery", "flown", "fluctuated", "fluctuations", "flue", "fluid", "fluids", "fluted", "flux", "fly", "flying", "foal", "focus", "fold", "folded", "folkestone", "follow", "followed", "following", "follows", "fond", "fondly", "food", "foods", "fooled", "foolhardy", "foolish", "foolishness", "foot", "foothold", "footing", "footmen", "footsteps", "for", "forbear", "forbearance", "forbid", "forbidden", "force", "forced", "forces", "forcible", "forcibly", "forcing", "ford", "forde", "fore", "foregoing", "forehead", "foreign", "foreigners", "forerunners", "forest", "forethought", "forfeited", "forfeiting", "forged", "forger", "forgeries", "forgers", "forgery", "forget", "forging", "forgiveness", "forgot", "forgotten", "fork", "forks", "form", "formal", "formality", "formally", "formation", "formations", "formed", "former", "formerly", "forming", "forms", "forrest", "forrester", "forster", "fort", "forth", "forthcoming", "forthwith", "fortitude", "fortnight", "fortress", "fortunate", "fortunately", "fortune", "fortunes", "forty", "forward", "forwarded", "forwards", "fostered", "fostering", "fouad", "fought", "foul", "found", "foundation", "foundations", "founded", "founder", "founders", "founts", "four", "fourpence", "fourpenny", "fourteen", "fourteenth", "fourth", "foxen", "fpcc", "fracture", "fragment", "fragments", "frame", "framed", "framers", "frames", "framing", "france", "francis", "franklin", "frantic", "fraud", "frauds", "fraudulent", "frazier", "fraziers", "frederick", "free", "freed", "freedom", "freely", "freemen", "freer", "freeway", "freight", "french", "frenchman", "frenchmen", "frenchwoman", "frenzy", "frequency", "frequent", "frequented", "frequently", "fresh", "freshly", "freshness", "friday", "fried", "friend", "friendless", "friends", "friendships", "frightened", "frightful", "frise", "fritz", "fro", "frog", "from", "front", "fronts", "frowns", "fruit", "fruitful", "frustration", "fry", "frys", "fuel", "fugitive", "fugitives", 
"fulfilled", "fulfillment", "full", "fullest", "fully", "fumigated", "fumigating", "function", "functionary", "functioning", "functions", "fund", "fundamental", "fundamentally", "funds", "funeral", "fungi", "funny", "furies", "furious", "furlongs", "furnace", "furnish", "furnished", "furniture", "further", "furtherance", "furthered", "furthermore", "futile", "futility", "future", "g", "gaily", "gain", "gained", "gains", "gale", "galleries", "gallery", "gallon", "gallons", "gallows", "galvanized", "gamble", "gambler", "gambling", "game", "games", "gaming", "gamins", "gang", "gangs", "gape", "garage", "garb", "gardelle", "garden", "gardens", "garnish", "garret", "garter", "gas", "gaseous", "gases", "gasoline", "gate", "gates", "gatesman", "gatesmen", "gathered", "gathering", "gave", "gaze", "gazed", "gee", "gen", "general", "generally", "generals", "generate", "generation", "generations", "generosity", "generous", "generously", "gentleman", "gentlemanly", "gentlemans", "gentlemen", "gently", "genuine", "geographic", "george", "gerald", "gerard", "gering", "german", "germans", "germany", "germs", "get", "getting", "ghastly", "ghent", "gibbet", "gibbon", "gibes", "giddy", "giesecke", "gift", "gifts", "gigantic", "gills", "giltspur", "gin", "ginger", "girl", "girls", "give", "given", "givens", "gives", "giving", "glad", "gladly", "glance", "glanced", "glances", "glaring", "glass", "glasses", "glazed", "gleeson", "glimpse", "globe", "gloom", "gloomy", "glories", "glory", "gloucester", "glove", "glucose", "gluttonously", "go", "goaded", "goal", "goals", "gobrias", "god", "godmanchester", "gods", "goes", "going", "gold", "gone", "good", "goode", "goodness", "goods", "goodwill", "gordon", "goree", "goring", "gossip", "got", "gothic", "gotten", "govern", "governed", "governing", "government", "governmental", "governments", "governor", "governors", "gown", "gowrie", "grab", "grabbed", "grabbing", "grace", "gracious", "grade", "gradually", "graham", "grain", "grains", "grand", "grandchildren", "grant", "granted", "graphic", "grasps", "grass", "grated", "grating", "gratings", "gratitude", "gratuitous", "grave", "gravely", "graves", "gravest", "gray", "graze", "grease", "greased", "greaser", "great", "greater", "greatest", "greatly", "greatness", "greed", "greek", "greeks", "green", "greenacre", "greenacres", "greer", "greet", "greeted", "gregory", "grenades", "gretna", "grew", "grey", "greyhound", "grief", "grievance", "grievous", "griffith", "griffiths", "grimms", "grinding", "grip", "grist", "groan", "grog", "grooming", "grooves", "gross", "grossness", "grosvenor", "grotesque", "ground", "grounded", "grounds", "group", "groups", "grow", "growing", "grown", "grows", "growth", "grudge", "gruel", "guarantee", "guard", "guarded", "guarding", "guards", "guessed", "guests", "guidance", "guide", "guidelines", "guides", "guildford", "guileless", "guilt", "guilty", "guinea", "guineas", "guinyard", "gun", "gunman", "guns", "gunshot", "gunsmith", "gunther", "gurney", "gutenberg", "guy", "h", "habit", "habits", "habitual", "habitually", "hackney", "had", "hadnt", "haggerty", "hair", "hairy", "haled", "half", "halfpence", "halfpenny", "halfway", "halifax", "hall", "hallway", "halted", "halter", "halters", "halting", "hammer", "hammock", "hampers", "hampshire", "hampstead", "hams", "hand", "handbill", "handbills", "handbook", "handcuffed", "handed", "handhold", "handicapped", "handicrafts", "handing", "handiwork", "handkerchief", "handkerchiefs", "handle", "handled", "handling", "handmade", "handprinting", 
"hands", "handsome", "handspikes", "handwriting", "hanfield", "hang", "hanged", "hanging", "hangman", "hangmans", "hanover", "haphazard", "happen", "happened", "happens", "happily", "happiness", "happy", "harass", "harassed", "harboring", "hard", "hardened", "hardihood", "hardly", "hardwicke", "hare", "harkness", "harm", "harmless", "harmony", "harold", "harris", "harrowby", "harrowbys", "harry", "harsh", "hartogs", "harvest", "harvey", "harwood", "has", "haste", "hastened", "hastily", "hat", "hatched", "hate", "hated", "hatfield", "hatred", "hats", "hatters", "hatton", "have", "having", "hawkins", "hayes", "haynau", "hazard", "hazards", "he", "head", "headed", "heading", "headline", "headlong", "headquarters", "heads", "health", "healthful", "healthy", "heap", "hear", "heard", "hearing", "heart", "hearted", "hearts", "hearty", "heat", "heated", "heath", "heating", "heaved", "heaven", "heavier", "heaviest", "heavily", "heaving", "heavy", "hebrew", "heed", "heel", "height", "heinous", "held", "helen", "helicopter", "hell", "helmet", "help", "helped", "helpful", "helpless", "hemp", "hence", "henry", "her", "herald", "herbert", "herded", "here", "hereditary", "heredity", "hereford", "heres", "heretofore", "heritage", "hermetically", "herodotus", "herself", "hertford", "hesitate", "hesitated", "hesitation", "heterogeneous", "hibbert", "hibner", "hickey", "hicks", "hid", "hidden", "hideel", "hidell", "hideous", "hiding", "higgledy", "high", "higher", "highest", "highly", "highway", "highwayman", "highwaymen", "hill", "hillah", "him", "himself", "hind", "hinder", "hindered", "hindrance", "hindsight", "hinges", "hinted", "hire", "hired", "hiring", "his", "hissing", "historic", "histories", "history", "hit", "hitherto", "hits", "hitting", "hoare", "hoary", "hobart", "hocker", "hockers", "hogan", "hogshead", "hold", "holder", "holders", "holding", "hole", "holes", "hollow", "holloway", "holmes", "holster", "home", "homely", "homes", "homicidal", "homicide", "homologies", "homologous", "homology", "hon", "honest", "honestly", "honesty", "honor", "honorable", "honored", "hook", "hooked", "hoover", "hope", "hoped", "hopeless", "hopelessly", "hopes", "horizon", "horn", "horncastle", "horns", "horrible", "horrid", "horror", "horrors", "horse", "horseback", "horsemonger", "horses", "hospital", "hospitals", "host", "hostile", "hostility", "hosts", "hosty", "hostys", "hot", "hotel", "hour", "hours", "house", "housebreakers", "household", "housekeeper", "housemaid", "houses", "houston", "how", "howard", "howards", "however", "howlett", "howse", "hoxton", "huddled", "huge", "huggin", "hughes", "hulks", "human", "humane", "humanitarianism", "humanity", "humanly", "humble", "humiliation", "hump", "hundred", "hundreds", "hung", "hunt", "hunting", "hunton", "hurried", "hurriedly", "hurry", "hurt", "husband", "husbands", "hush", "huts", "huxley", "hydrogen", "hymn", "hymns", "hysterical", "hysterics", "i", "idea", "ideal", "ideals", "ideas", "identical", "identifiable", "identification", "identifications", "identified", "identify", "identifying", "identity", "ideological", "idle", "idleness", "idolaters", "if", "ignominy", "ignorance", "ignorant", "ignored", "ikey", "ilchester", "ill", "illegal", "illegibility", "illegitimate", "illiberal", "illinois", "illiterate", "illness", "illuminated", "illustrate", "illustration", "illustrations", "im", "image", "imaginable", "imaginary", "imagination", "imaginative", "imagined", "imbibed", "imgur", "imitated", "imitates", "imitating", "imitation", "imitators", "immature", 
"immediate", "immediately", "immense", "immigrants", "imminent", "immoral", "immorality", "immured", "impact", "impairment", "impartial", "imparting", "impassioned", "impatient", "impecunious", "impeded", "impediment", "impediments", "impeding", "impelled", "impending", "impenitent", "imperative", "imperfect", "imperfectly", "imperial", "imperiled", "imperious", "implements", "implicated", "implied", "implored", "importance", "important", "imported", "importer", "imposed", "imposing", "imposition", "impossibility", "impossible", "impounded", "impoverished", "imprecations", "impregnable", "impress", "impressed", "impression", "impressions", "impressive", "imprisoned", "imprisonment", "improper", "improperly", "improve", "improved", "improvement", "improvements", "improvident", "improving", "impudently", "impugned", "impulse", "imputation", "imputed", "in", "inability", "inadequacy", "inadequate", "inasmuch", "inauguration", "inc", "incapable", "incarcerated", "incarceration", "incentive", "incentives", "incessant", "inch", "inches", "incident", "incidentally", "incidents", "incited", "inclement", "inclination", "inclined", "include", "included", "includes", "including", "income", "incomers", "incommodious", "incomplete", "inconceivable", "inconceivably", "inconclusive", "inconsiderable", "inconsistent", "inconvenience", "inconvenient", "incorporated", "incorrect", "incorrigible", "increase", "increased", "increasing", "increasingly", "incredible", "incriminating", "incumbent", "incurred", "indecision", "indeed", "indefatigable", "indefinitely", "independence", "independent", "independently", "indescribable", "index", "indexed", "indian", "indicate", "indicated", "indicates", "indicating", "indication", "indications", "indicted", "indictment", "indifference", "indifferent", "indignantly", "indignation", "indirect", "indirectly", "indiscriminate", "indiscriminately", "indispensable", "indispensably", "indisputable", "indistinguishable", "individual", "individuality", "individuals", "induce", "induced", "inducements", "indulge", "indulged", "indulgence", "industrial", "industries", "industrious", "industry", "inefficiency", "inefficient", "inequalities", "inevitable", "inevitably", "inexperienced", "inexpressible", "infant", "infantry", "infants", "infectious", "inference", "inferior", "infernal", "inferred", "infinite", "infirm", "infirmaries", "infirmary", "infirmity", "inflamed", "inflammation", "inflated", "inflict", "inflicted", "inflicting", "infliction", "influence", "influenced", "influences", "influential", "influx", "informal", "information", "informed", "informer", "infuse", "ingenious", "ingenuity", "ingest", "ings", "inhabitants", "inherent", "inherited", "inhuman", "iniquitous", "initial", "initials", "initiated", "initiative", "injure", "injured", "injuries", "injurious", "injury", "injustice", "injustices", "ink", "inmate", "inmates", "inn", "inner", "innocence", "innocent", "innovation", "innumerable", "inordinately", "inorganic", "inquest", "inquire", "inquired", "inquiries", "inquiry", "inquisitive", "insane", "insanity", "inscribed", "inscription", "inscriptions", "insect", "insecure", "insecurity", "insensible", "inseparable", "insert", "inserted", "inside", "insidious", "insight", "insignificant", "insist", "insisted", "insistence", "insolent", "insolvent", "inspect", "inspected", "inspection", "inspections", "inspector", "inspectors", "inspired", "instability", "instance", "instances", "instant", "instantaneous", "instantly", "instead", "instigation", "instigator", 
"instituted", "institution", "institutions", "instruct", "instructed", "instruction", "instructions", "instructor", "instructors", "instrument", "instrumentality", "instruments", "insufficiency", "insufficient", "insulted", "insulting", "insuperable", "insurance", "insurances", "insure", "insured", "intact", "integrity", "intellect", "intelligence", "intelligent", "intemperance", "intemperate", "intend", "intended", "intense", "intensified", "intensify", "intensive", "intent", "intention", "intentions", "interagency", "interceded", "intercept", "intercourse", "interest", "interested", "interesting", "interests", "interfere", "interfered", "interference", "interfering", "interim", "interior", "interment", "intermingled", "intermittently", "intermix", "internal", "international", "interpretation", "interpreted", "interpreter", "interrogated", "interrogation", "interruption", "intersection", "interstate", "interval", "intervals", "intervening", "intervention", "interview", "interviewed", "interviews", "intimacy", "intimate", "intimated", "intimately", "intimation", "into", "intolerant", "intoxicated", "intoxicating", "intoxication", "intricate", "introduce", "introduced", "introducing", "introduction", "intrusted", "invariable", "invariably", "inveigled", "inveigling", "invented", "invention", "inverse", "invest", "invested", "investigate", "investigated", "investigating", "investigation", "investigations", "investigative", "investigators", "investigatory", "investment", "investor", "investors", "invitation", "invitations", "invited", "invoice", "involve", "involved", "involvement", "involves", "involving", "inward", "inwards", "ionic", "ipswich", "irksome", "iron", "ironed", "ironing", "irons", "irrational", "irregularities", "irregularity", "irremediable", "irresistible", "irresistibly", "irrespective", "irresponsible", "irrevocable", "irrevocably", "irritable", "irritated", "irving", "is", "islands", "islington", "isolated", "isolation", "issue", "issued", "issues", "isthe", "it", "italian", "italy", "item", "items", "itinerant", "itinerary", "its", "itself", "ive", "j", "jack", "jacket", "jackson", "jacobite", "jacobus", "jacques", "jail", "jailer", "jails", "james", "jamess", "jane", "january", "japan", "jarman", "jarred", "jarrow", "jaws", "jealous", "jeanette", "jeanne", "jebb", "jeered", "jeers", "jefferson", "jem", "jenkins", "jenson", "jensons", "jeopardized", "jeremy", "jerking", "jerome", "jersey", "jerusalem", "jesse", "jests", "jew", "jewel", "jeweler", "jewelers", "jewelery", "jewels", "jews", "job", "jobs", "john", "johnny", "johnson", "johnsons", "join", "joined", "joining", "joint", "jointed", "joints", "joke", "jollity", "jones", "jordan", "joseph", "joshua", "journal", "journalists", "journals", "journey", "journeys", "jowl", "joy", "jr", "judge", "judged", "judges", "judgment", "judice", "judicial", "judiciary", "judicious", "juices", "julian", "july", "jumbled", "jump", "jumped", "june", "junior", "jupiter", "juries", "jurisdiction", "jurisdictions", "jurisprudence", "jury", "just", "justice", "justices", "justification", "justified", "justify", "justinian", "justly", "juvenile", "juveniles", "kaiser", "kate", "katherine", "kay", "kean", "keen", "keenest", "keenness", "keep", "keeper", "keepers", "keeping", "keeps", "keith", "kellerman", "kelly", "kendal", "kennedy", "kennedys", "kennel", "kenneth", "kennington", "kent", "kept", "ker", "kerp", "kers", "ketch", "ketchs", "key", "keys", "khrushchev", "kicked", "kidderminster", "kilburn", "kill", "killed", "killer", 
"killing", "kind", "kindled", "kindly", "kindness", "kinds", "kinetic", "king", "kingdom", "kingdoms", "kings", "kingston", "kitchen", "kleins", "knack", "knead", "knee", "kneel", "kneeling", "knees", "knell", "knelt", "knew", "knife", "knight", "knives", "knock", "knocked", "knots", "knotted", "know", "knowing", "knowledge", "known", "knows", "krapps", "l", "labor", "laboratory", "labored", "laborers", "lace", "lack", "lad", "ladder", "ladies", "lading", "lads", "lady", "ladys", "lagged", "laid", "lain", "lake", "lamar", "lamb", "lambeth", "lambs", "lamentable", "lamentation", "lancaster", "land", "landed", "landing", "landlady", "lands", "lane", "language", "languid", "languished", "lap", "lapels", "lapse", "lapses", "lapsing", "large", "largely", "larger", "largest", "last", "lasted", "lasts", "late", "lately", "latent", "later", "lateral", "laterally", "latest", "lath", "latin", "latona", "latonas", "latter", "latterly", "latters", "laudable", "laugh", "laughed", "laughing", "laughter", "launched", "laundry", "lavender", "laverstock", "lavish", "law", "lawful", "lawn", "lawrence", "laws", "lawson", "lawsons", "lawyers", "lax", "laxity", "lay", "layer", "layers", "laying", "le", "leach", "lead", "leaden", "leader", "leaders", "leadership", "leading", "leads", "leaf", "leaned", "leant", "leap", "learn", "learned", "learning", "learnt", "lears", "least", "leather", "leave", "leaves", "leaving", "lectures", "led", "lee", "leeds", "lees", "leew", "left", "leg", "legacies", "legacy", "legal", "legend", "legends", "legged", "legibility", "legible", "legislating", "legislation", "legislative", "legislature", "legitimate", "legs", "leicester", "length", "lengthened", "lengths", "lengthy", "leniency", "lennie", "leon", "lerigo", "less", "lessened", "lesser", "lesson", "lessons", "lest", "let", "lethal", "lets", "letter", "letterpress", "letters", "letting", "level", "leveled", "lever", "levied", "levity", "levy", "levying", "lewdness", "lewis", "liabilities", "liable", "liaison", "libels", "liberal", "liberate", "liberties", "liberty", "libitum", "librivox", "lie", "lied", "lies", "lieutenancy", "lieutenant", "life", "lift", "lifted", "lifting", "lifts", "light", "lighted", "lighter", "lighting", "lightly", "lights", "like", "liked", "likelihood", "likely", "likeness", "likewise", "limbo", "limbs", "lime", "limit", "limitation", "limited", "limiting", "limits", "limousine", "lincoln", "lincolnshire", "line", "lineal", "lined", "linen", "lines", "lineup", "lineups", "lingered", "lingering", "lining", "link", "linked", "linking", "links", "linnie", "lion", "lions", "lip", "liquid", "liquids", "liquor", "liquorpond", "liquors", "list", "listed", "listen", "listened", "listening", "listing", "literally", "literary", "literature", "little", "live", "lived", "livelihood", "lively", "liver", "liveries", "liverpool", "lives", "living", "load", "loaded", "loading", "loads", "loaf", "loans", "loathsome", "loaves", "lobby", "local", "localities", "locality", "locally", "locate", "located", "location", "lock", "locked", "locking", "locomotion", "locomotive", "lodge", "lodged", "lodger", "lodging", "lodgings", "loft", "lofty", "log", "london", "lonely", "long", "longer", "longest", "longman", "look", "looked", "looking", "looks", "loomed", "looms", "loop", "loops", "loose", "loosen", "loosened", "lopez", "lord", "lords", "lordship", "lordships", "los", "lose", "loses", "losing", "loss", "lost", "lot", "loud", "louder", "loudon", "louisiana", "love", "lovelady", "loving", "low", "lowe", "lower", "lowered", 
"lowest", "lowing", "loyal", "loyalty", "lt", "lubeck", "luck", "luckless", "lucky", "ludgate", "ludicrously", "luigi", "lukewarm", "lump", "lumped", "lunacy", "lunatic", "lunatics", "lunch", "luncheon", "lunchroom", "lundy", "lung", "lungs", "lurched", "luxuries", "luxurious", "luxury", "lying", "lymph", "lyndon", "lyons", "m", "machine", "machinery", "machines", "mackay", "mackintosh", "mad", "madame", "maddened", "made", "madness", "mae", "magazines", "magic", "magistracy", "magistrate", "magistrates", "magnates", "magnified", "magnifying", "magnitude", "maid", "maiden", "maidservant", "maidstone", "mail", "mailbox", "mailing", "mailroom", "maimed", "maiming", "main", "mainly", "maintain", "maintained", "maintaining", "maintains", "maintenance", "maintz", "maison", "maj", "majesty", "majestys", "major", "majority", "make", "makes", "making", "malady", "malcolm", "male", "malefactors", "males", "maliciously", "malignant", "mall", "malpractices", "maltby", "mammal", "man", "manacled", "manacling", "manage", "manageable", "managed", "management", "manager", "managers", "manchester", "mandate", "mandella", "manifest", "manifestation", "manifestations", "manifestly", "manifold", "manipulation", "manned", "manner", "mannered", "manners", "manning", "mannings", "mannlicher", "manor", "manpower", "mans", "mansion", "manslaughter", "manual", "manufactory", "manufacture", "manufactured", "manufacturers", "manufactures", "manumitted", "many", "map", "march", "marched", "marching", "marduk", "margaret", "margin", "marguerite", "marguerites", "marie", "marina", "marine", "marines", "marion", "mark", "marked", "market", "markets", "markham", "markhams", "marks", "marksman", "marksmanship", "marksmen", "marley", "marquis", "marriage", "married", "marrs", "marry", "marrying", "mars", "marsalis", "marsh", "marshal", "marshals", "marshalsea", "marsolino", "mart", "martha", "martial", "martin", "marvelous", "marwood", "marxism", "marxist", "mary", "marylebone", "mask", "masonry", "masons", "mass", "massacre", "massage", "masses", "massive", "master", "masters", "mat", "matched", "matches", "mate", "material", "materially", "materials", "mates", "matrices", "matron", "mats", "matter", "mattered", "matters", "matthew", "matting", "mattress", "mature", "maturity", "maudlin", "maximum", "may", "maybe", "maynard", "mayor", "mcbride", "mcclelland", "mcdonald", "mckinley", "mcwatters", "me", "meager", "meal", "meals", "mean", "meandering", "meaning", "meaningful", "meaningless", "means", "meant", "meanwhile", "measure", "measured", "measures", "measuring", "meat", "mechanism", "mediation", "medical", "medically", "medicine", "meditation", "meditations", "mediterranean", "meek", "meet", "meeting", "meetings", "melancholy", "mell", "melted", "member", "members", "membership", "memorable", "memoranda", "memorandum", "memorial", "memorials", "memory", "men", "menaced", "menial", "mens", "mental", "mentelin", "mention", "mentioned", "mentioning", "mentions", "merchant", "merchants", "mercies", "merciful", "mercifully", "mercy", "mere", "merely", "merionethshire", "merited", "meritorious", "merits", "merrily", "merriment", "message", "messenger", "messengers", "messrs", "met", "metabolism", "metal", "metallic", "method", "methods", "metropolis", "metropolitan", "mexico", "michael", "michaelis", "michaelmas", "microscope", "microscopic", "middle", "middlesex", "midnight", "midst", "midway", "midweek", "might", "mighty", "milan", "mild", "mile", "miles", "militant", "military", "militia", "milk", "mill", "millbank", 
"miller", "millimeter", "million", "millions", "mills", "millstones", "milton", "mince", "mind", "minded", "minds", "mine", "mineral", "mingled", "minimize", "minimum", "minister", "ministers", "ministration", "ministrations", "minor", "minsk", "mint", "minute", "minutely", "minutes", "minutest", "minver", "misapplication", "misapprehension", "misappropriated", "misappropriating", "miscellaneous", "mischief", "misconduct", "miscreant", "misdeeds", "misdemeanant", "misdemeanants", "misdemeanors", "misdirected", "misemployment", "miserable", "miserably", "misery", "mishap", "mismanagement", "misplaced", "misrepresentations", "miss", "missal", "missals", "missed", "misseurs", "missile", "missiles", "missing", "mission", "misspelling", "mistake", "mistaken", "mistress", "misunderstood", "misuse", "mitigate", "mitigating", "mitigation", "mix", "mixed", "mixing", "moat", "mob", "mobbs", "mobility", "mockery", "mode", "model", "moderate", "modern", "modes", "modification", "modifications", "modified", "mohrenschildt", "moisture", "molasses", "molded", "moment", "moments", "monastery", "monday", "monetary", "money", "monger", "monies", "monitor", "monopolize", "monopolized", "monopoly", "monotonous", "monsters", "montgomery", "month", "months", "moon", "moonshine", "moorfields", "mop", "mops", "moral", "morally", "morals", "morbidly", "more", "moreover", "morning", "mornings", "morpeth", "morphology", "mortally", "mortals", "mortar", "mortem", "mortgages", "moscow", "moses", "moss", "most", "mostly", "mother", "mothers", "motion", "motionless", "motivated", "motivation", "motive", "motives", "motorcade", "motorcycle", "motorcycles", "mould", "mound", "mounds", "mount", "mountain", "mountainous", "mounted", "mounting", "mouth", "mouths", "movable", "move", "moved", "movement", "movements", "mover", "moves", "moving", "mr", "mrs", "mss", "much", "mud", "mule", "mules", "mullay", "muller", "multiple", "multiplied", "multitude", "municipal", "murder", "murdered", "murderer", "murderers", "murderess", "murdering", "murderous", "murders", "murmur", "murphy", "murret", "muscle", "muscles", "muscular", "museum", "museums", "must", "mustered", "mute", "mutilated", "mutineers", "mutter", "muttering", "mutual", "mutually", "muzzle", "mwddy", "my", "myself", "mysterious", "mystery", "n", "nabonidus", "nabopolassar", "nagging", "nails", "naked", "nakedness", "name", "named", "namely", "names", "narrative", "narrow", "narrowed", "narrowing", "narrowly", "nasty", "nation", "national", "nations", "native", "natives", "natural", "naturally", "nature", "nauseous", "naval", "navigate", "navy", "ne", "near", "nearby", "nearer", "nearest", "nearly", "neat", "neatly", "nebuchadnezzar", "nebuchadnezzars", "necessaries", "necessarily", "necessary", "necessities", "necessity", "neches", "neck", "necks", "need", "needed", "needing", "needlessly", "needs", "needy", "neely", "nefarious", "negative", "negatively", "negatives", "neglect", "neglected", "negotiate", "negotiating", "neighbor", "neighborhood", "neighboring", "neighbors", "neild", "neilds", "neither", "nerve", "nerves", "nervous", "nest", "net", "netherlands", "netting", "network", "neurological", "never", "nevertheless", "new", "newest", "newgate", "newly", "newman", "newmans", "news", "newsman", "newspaper", "newspapers", "next", "nice", "nicholas", "nicholson", "nickname", "nicol", "niggardliness", "night", "nights", "nimitti", "nine", "nineteen", "nineteenth", "ninety", "nineveh", "ninth", "nitrates", "nitrogen", "nixon", "no", "noble", "nobody", "nocturnal", 
"nodded", "noise", "noisy", "nominally", "nominated", "non", "noncommissioned", "none", "nonexistent", "nonsense", "noon", "noose", "nor", "norfolk", "normal", "normally", "norman", "north", "northeast", "northern", "northwest", "norwich", "nose", "noses", "not", "notable", "notably", "note", "notebook", "noted", "notes", "nothing", "notice", "noticeable", "noticed", "notification", "notified", "notify", "notion", "notorieties", "notoriety", "notorious", "notoriously", "notwithstanding", "nought", "nourished", "nova", "novel", "novels", "novelty", "november", "novo", "now", "nowhere", "noxious", "noyes", "number", "numbered", "numberless", "numbers", "numerals", "numerous", "nurse", "nursery", "nutriment", "nutrition", "nutritive", "o", "oak", "oakum", "oath", "oaths", "obey", "obeyed", "object", "objected", "objection", "objectionable", "objections", "objective", "objects", "obligation", "obliged", "obliterated", "obliteration", "obscene", "obscenity", "obscure", "obscured", "observable", "observance", "observation", "observations", "observe", "observed", "observer", "observers", "observes", "observing", "obsolete", "obstacles", "obtain", "obtained", "obtaining", "obtains", "obvious", "obviously", "occasion", "occasional", "occasionally", "occasioned", "occasions", "occupant", "occupants", "occupation", "occupations", "occupied", "occupy", "occupying", "occur", "occurred", "occurrences", "occurs", "ocean", "oclock", "oconnor", "oconnors", "october", "odd", "oddity", "odell", "odonnell", "of", "off", "offender", "offenders", "offense", "offenses", "offensive", "offer", "offered", "offering", "offers", "office", "officer", "officers", "offices", "official", "officially", "officials", "often", "oftener", "oh", "oil", "oiled", "old", "older", "oldest", "olympic", "omally", "omissions", "omit", "omitted", "on", "once", "one", "ones", "only", "onto", "ontogeny", "onus", "onwards", "opaque", "open", "opened", "opening", "openly", "operate", "operated", "operates", "operating", "operation", "operations", "operator", "opinion", "opinions", "opponents", "opportunities", "opportunity", "oppose", "opposed", "opposing", "opposite", "opposition", "oppression", "oppressive", "or", "oral", "orange", "oranges", "ordeal", "order", "ordered", "ordering", "orderly", "orders", "ordinarily", "ordinary", "ordinarys", "org", "organ", "organic", "organism", "organisms", "organization", "organizations", "organize", "organized", "organs", "oriental", "origin", "original", "originally", "originated", "orleans", "ornament", "ornamentation", "ornamented", "orphans", "orthodox", "osborne", "osmosis", "ostensibly", "oswald", "oswalds", "other", "others", "otherwise", "ought", "ounces", "our", "ours", "ourselves", "out", "outbreak", "outdone", "outer", "outfitted", "outlay", "outlet", "outlined", "outrage", "outraged", "outrageous", "outrages", "outright", "outset", "outside", "outskirts", "outspoken", "outstanding", "oven", "over", "overall", "overbearing", "overboard", "overcome", "overcrowded", "overcrowding", "overdo", "overend", "overflowed", "overflowing", "overhaul", "overhead", "overleap", "overlooked", "overlooking", "overpass", "overpasses", "overseers", "oversight", "overtake", "overthrow", "overthrown", "overtones", "overtook", "overwhelming", "owen", "owing", "own", "owned", "owner", "owners", "ownership", "owning", "ox", "oxenford", "oxford", "oxfords", "oxidation", "oxon", "oxygen", "oxygenation", "p", "pace", "pacing", "package", "packages", "packed", "packing", "page", "pages", "paget", "paid", "pail", 
"pain", "paine", "paines", "painful", "pains", "painted", "painters", "pair", "pal", "palace", "palaces", "pale", "paleness", "palliation", "palm", "palmer", "palmers", "palmprint", "palmprints", "paltry", "pampering", "pamphlet", "pan", "panels", "panes", "pangs", "panic", "pannartz", "pans", "pantaloons", "pantry", "paper", "papers", "parade", "paraded", "parades", "paraffin", "paragraph", "parallel", "paralyzed", "paramount", "paramours", "paranoid", "parcel", "parceled", "parcels", "pardon", "pardoned", "parentage", "parents", "paris", "parish", "park", "parked", "parking", "parkland", "parliament", "parliamentary", "parma", "part", "partial", "partially", "participate", "participated", "particular", "particularly", "particulars", "parties", "partitioned", "partly", "partners", "partook", "partridge", "parts", "party", "paso", "pass", "passage", "passages", "passbook", "passed", "passenger", "passengers", "passes", "passing", "passion", "passionate", "passions", "passive", "passport", "past", "paste", "pasties", "patch", "pate", "path", "pathological", "patient", "patiently", "patrick", "patriotic", "patriotism", "patriots", "patrol", "patrolled", "patrolman", "patrolmen", "patron", "patronized", "pattern", "patterson", "patton", "paul", "pause", "pauses", "pawnbrokers", "pay", "paying", "payment", "payments", "pea", "peace", "peaceably", "peaceful", "peacefully", "pear", "pearson", "peculiar", "peculiarities", "pecuniary", "peel", "peer", "peered", "peers", "pegsworth", "pell", "pembroke", "penal", "penalties", "penalty", "pence", "pending", "penitent", "penitentiary", "penman", "pennant", "pennsylvania", "penny", "pennyworth", "pens", "pension", "pensions", "pentonville", "penultimate", "penury", "people", "peoples", "pepper", "per", "percent", "perception", "percival", "peremptorily", "peremptory", "perennibranch", "perennibranchs", "perfect", "perfection", "perfectly", "perforations", "perform", "performance", "performed", "performing", "perfunctory", "perhaps", "perilous", "period", "periodic", "periodical", "periodically", "periods", "peripheral", "perish", "perished", "perjury", "permanent", "permanently", "permissible", "permission", "permissive", "permit", "permitted", "pernicious", "perpetrated", "perpetration", "perpetrator", "perpetrators", "perpetual", "perpetually", "perpetuated", "perpetuation", "perquisite", "perry", "persecuted", "persecution", "perseverance", "persevered", "persia", "persian", "persians", "persistent", "persistently", "person", "personage", "personages", "personal", "personalities", "personality", "personally", "personated", "personnel", "persons", "perspective", "persuaded", "persuasion", "persuasive", "pertinently", "peter", "peters", "petition", "petitioned", "petitioner", "petitions", "petrified", "petticoats", "petty", "petworth", "pew", "phase", "phases", "phenomena", "phial", "philanthropic", "philanthropist", "philanthropists", "philanthropy", "phillips", "phipoe", "phoebe", "phone", "photograph", "photographed", "photographer", "photographers", "photographic", "photographs", "photography", "phrase", "physical", "physically", "physician", "physiology", "pic", "pick", "picked", "pickets", "picking", "pickup", "pics", "picture", "pictures", "pie", "piece", "pieces", "pieman", "piemen", "pierce", "piercingly", "pies", "piety", "piggledy", "pilasters", "pile", "piled", "pilgrim", "pillage", "pillory", "pillow", "pills", "pilot", "pimlico", "pin", "pinched", "pinfold", "pinioned", "pinioning", "pint", "pious", "pipe", "piper", "pipes", "pirates", 
"pistol", "pistols", "pit", "pitch", "pitchforked", "pitiable", "pittance", "pity", "placards", "place", "placed", "placement", "places", "placing", "plain", "plainly", "plaintiffs", "plan", "plane", "plank", "planks", "planned", "planning", "plans", "plant", "planted", "plants", "plasterers", "plastic", "plate", "plateau", "plates", "platform", "plausible", "play", "played", "players", "playing", "plays", "plaza", "plea", "plead", "pleaded", "pleadings", "pleas", "pleasant", "pleasanter", "please", "pleased", "pleasure", "pleasures", "pledged", "plentiful", "plight", "plot", "plotting", "plowed", "plunged", "plus", "pocket", "pocketbook", "pocketed", "pockets", "poe", "poetical", "poetry", "point", "pointed", "pointing", "points", "poison", "poisoned", "poisoner", "poisoning", "poisons", "poker", "police", "policeman", "policemen", "policies", "policy", "political", "politician", "polluted", "pompous", "pond", "pool", "poor", "poorer", "populace", "popular", "population", "populous", "porch", "pork", "port", "porter", "porters", "portion", "portions", "portland", "portrait", "posed", "position", "positions", "positive", "positively", "possess", "possessed", "possessing", "possession", "possessions", "possessor", "possibilities", "possibility", "possible", "possibly", "post", "postage", "postal", "posted", "postilions", "posting", "postponed", "posts", "pot", "potato", "potatoes", "potential", "potentially", "potman", "poultry", "pound", "pounding", "pounds", "pour", "poverty", "powder", "power", "powerful", "powerless", "powers", "powerscourt", "practical", "practically", "practice", "practiced", "practices", "practicing", "practitioner", "practitioners", "praecipe", "praise", "praises", "praiseworthy", "pray", "prayer", "prayers", "praying", "preached", "preacher", "preachers", "preaches", "preaching", "preamble", "precarious", "precautions", "preceded", "precedence", "precedent", "preceding", "precinct", "precincts", "precious", "precipitancy", "precise", "precisely", "precision", "preclude", "predecessor", "predecessors", "prefer", "preference", "preferred", "preferring", "prefers", "prejudice", "preliminary", "premeditated", "premeditation", "premises", "premising", "preparation", "preparations", "preparatory", "prepare", "prepared", "preparing", "preponderance", "prerogative", "prescribed", "prescription", "presence", "present", "presentation", "presented", "presenting", "presently", "presentment", "presentments", "presents", "preservation", "preserve", "preserved", "preserver", "presided", "presidency", "president", "presidential", "presidents", "press", "pressed", "presses", "pressing", "pressure", "presumably", "presumed", "pretended", "pretense", "pretension", "pretext", "pretty", "prevail", "prevailed", "prevailing", "prevalent", "prevent", "prevented", "preventing", "preventive", "previous", "previously", "prey", "preying", "price", "prices", "pricking", "pride", "pries", "priests", "primarily", "primary", "prime", "primer", "prince", "principal", "principally", "principals", "principle", "principles", "print", "printed", "printer", "printers", "printing", "prints", "prior", "priority", "priory", "prison", "prisoner", "prisoners", "prisons", "privacy", "private", "privately", "privation", "privilege", "privileged", "privileges", "privy", "prize", "pro", "probabilities", "probability", "probable", "probably", "probation", "probative", "probert", "proberts", "problem", "problems", "procedure", "procedures", "proceed", "proceeded", "proceeding", "proceedings", "proceeds", 
"process", "processed", "processes", "processing", "procession", "proclaimed", "proclivities", "procrastination", "proctors", "procurable", "procure", "procured", "produce", "produced", "produces", "product", "production", "productions", "productive", "profane", "professed", "professes", "profession", "professional", "profit", "profitable", "profits", "profligacy", "profound", "profoundest", "profoundly", "profusely", "program", "programs", "progress", "prohibited", "prohibitory", "project", "projected", "projecting", "projections", "projects", "prolong", "prolonged", "prominence", "prominent", "prominently", "promiscuous", "promise", "promised", "promising", "promissory", "promote", "promoted", "promoters", "prompt", "prompted", "promptly", "promulgation", "prongs", "pronounced", "proof", "proofs", "propaganda", "propagated", "propensities", "propensity", "proper", "properly", "property", "proportion", "proportionately", "proportions", "proposal", "proposals", "propose", "proposed", "proposition", "propositions", "prosecuted", "prosecution", "prospect", "prospective", "prosper", "prospered", "prosperity", "prosperous", "prosperously", "prostitutes", "protect", "protected", "protecting", "protection", "protective", "protector", "proteid", "proteids", "protein", "protest", "protested", "protests", "protoplasm", "protracted", "proud", "prove", "proved", "proves", "provide", "provided", "providentially", "provides", "providing", "province", "provinces", "provincial", "provision", "provisional", "provisions", "proviso", "prs", "prudery", "psalters", "psychiatric", "psychiatrist", "psychological", "public", "publican", "publication", "publications", "publicity", "publicized", "publicly", "published", "publishers", "publishing", "puerto", "pull", "pulled", "pulling", "pulpit", "pulse", "pump", "pumps", "punched", "punches", "punctiliously", "punctually", "pungent", "punish", "punishable", "punished", "punishment", "punishments", "puppy", "purchase", "purchased", "purchases", "purchasing", "pure", "purely", "purging", "purity", "purlieus", "purported", "purporting", "purpose", "purposely", "purposes", "pursuance", "pursue", "pursued", "pursuing", "pursuit", "purveyors", "pushed", "puss", "put", "putrid", "putting", "python", "quadrangle", "quaker", "qualification", "qualifications", "qualified", "qualifying", "qualities", "quality", "quantities", "quantity", "quarrel", "quarreling", "quarrels", "quart", "quarter", "quarterly", "quarters", "queen", "queens", "quelled", "queries", "question", "questioned", "questioning", "questions", "quick", "quickening", "quicklime", "quickly", "quid", "quiet", "quieted", "quietly", "quigley", "quigleys", "quills", "quit", "quite", "quote", "quoted", "quotes", "r", "rabbit", "race", "raced", "rackets", "radiated", "radiation", "radical", "radically", "radio", "radioed", "raff", "rage", "ragged", "rags", "railing", "railroad", "railroads", "railway", "rain", "raise", "raised", "raising", "raked", "ralph", "rambler", "rambling", "ramifications", "rampart", "ran", "randle", "rang", "range", "ranged", "rank", "ranking", "rankled", "rapacity", "rapid", "rapidly", "rare", "rarely", "rash", "ratcliffe", "rate", "rates", "rathbone", "rather", "ratification", "ratified", "ration", "rational", "rations", "rats", "raw", "ray", "re", "reach", "reached", "reaches", "reaching", "reacted", "reaction", "read", "reader", "readers", "readily", "readiness", "reading", "ready", "real", "reality", "realize", "realized", "realizing", "really", "realm", "reappeared", "reappropriate", 
"rear", "rearranged", "reason", "reasonable", "reasonably", "reasoning", "reasons", "rebelled", "rebellion", "rebuild", "rebuilding", "rebuilt", "rebuke", "recall", "recalled", "recapitulation", "recapture", "recaptured", "receipt", "receipts", "receive", "received", "receiver", "receivers", "receives", "receiving", "recent", "recently", "receptacle", "reception", "recipient", "recites", "reckless", "recklessly", "recklessness", "reckoned", "recognition", "recognize", "recognized", "recognizes", "recognizing", "recollectedness", "recollection", "recommence", "recommend", "recommendation", "recommendations", "recommended", "recommends", "reconstruct", "reconstructed", "reconstructing", "reconstruction", "record", "recorded", "recorder", "recorders", "recording", "records", "recover", "recovered", "recovery", "recruited", "recur", "recurrence", "red", "redeem", "redesdales", "redpath", "redpaths", "reduce", "reduced", "reduction", "reed", "reeds", "reels", "refer", "reference", "references", "referral", "referrals", "referred", "referring", "refers", "reflect", "reflected", "reflection", "reflects", "reflex", "reform", "reformation", "reformatory", "reformers", "reforms", "refractory", "refrain", "refreshed", "refuge", "refuges", "refuse", "refused", "refusing", "refuted", "regained", "regard", "regarded", "regarding", "regardless", "regards", "regime", "register", "registers", "registrar", "registration", "regret", "regular", "regularly", "regulated", "regulation", "regulations", "reid", "reids", "reign", "reigned", "reigns", "reinforced", "reiterate", "reiterating", "reiteration", "rejected", "rejecting", "rejection", "rejoiced", "relapse", "relapsed", "relapsing", "relate", "related", "relating", "relation", "relations", "relationship", "relationships", "relative", "relatively", "relatives", "relax", "relaxation", "relaxed", "release", "released", "relegated", "reliable", "reliance", "relied", "relief", "reliefs", "relieve", "relieved", "religion", "religious", "relish", "relocation", "reluctance", "reluctantly", "rely", "remain", "remainder", "remained", "remaining", "remains", "remark", "remarkable", "remarkably", "remarked", "remarks", "remediable", "remedial", "remedied", "remedies", "remedy", "remember", "remembered", "remind", "reminded", "remington", "remissness", "remonstrance", "remonstrate", "remorse", "remoteness", "removal", "remove", "removed", "removing", "remunerated", "remunerative", "render", "rendered", "renders", "renewed", "renounce", "rent", "rental", "rentals", "rented", "reopened", "reorganization", "repair", "repaired", "repealed", "repeat", "repeated", "repeatedly", "repelled", "repentance", "repetition", "replace", "replaced", "replica", "replied", "replies", "reply", "report", "reported", "reporter", "reporting", "reports", "reprehended", "reprehensible", "reprehension", "representations", "representative", "representatives", "represented", "representing", "represents", "repressed", "reprieve", "reprieved", "reproach", "reprobation", "reproduced", "reproduction", "reproves", "reptile", "repudiated", "repugnance", "repulsive", "reputation", "repute", "reputed", "request", "requested", "requesting", "requests", "require", "required", "requirement", "requirements", "requires", "requiring", "requisition", "rescue", "rescued", "research", "researches", "resemblance", "resemblances", "resemble", "resembled", "resembling", "resented", "reserve", "reserved", "reserves", "reside", "resided", "residence", "resident", "residents", "residing", "resignation", "resigned", 
"resist", "resistance", "resistant", "resisted", "resolution", "resolutions", "resolve", "resolved", "resorted", "resounded", "resources", "respect", "respectability", "respectable", "respected", "respecting", "respective", "respectively", "respects", "respiration", "respiratory", "respite", "respited", "respond", "responded", "response", "responses", "responsibilities", "responsibility", "responsible", "rest", "rested", "resting", "restless", "restoration", "restore", "restored", "restrain", "restraining", "restraint", "restraints", "restricted", "restrictions", "restrictive", "result", "resulted", "resulting", "results", "resume", "resumed", "resurrectionist", "retailed", "retain", "retained", "retaining", "retains", "reticent", "retire", "retired", "retirement", "retiring", "retorted", "retouched", "retouching", "retrace", "retrenched", "retribution", "retrograde", "return", "returned", "returning", "returns", "rev", "reveal", "revealed", "revealing", "reveals", "revelations", "reveling", "revelries", "revelry", "revels", "revenge", "revenues", "reverend", "reverse", "reversed", "review", "reviewed", "reviewing", "reviews", "revill", "revision", "revival", "revive", "revived", "revolution", "revolutionary", "revolutionize", "revolver", "reward", "rewarded", "reynolds", "rheumatism", "rice", "rich", "richard", "richly", "richmond", "richness", "rick", "rid", "ridden", "ride", "riders", "ridicule", "ridiculous", "riding", "riff", "rifle", "rifled", "rifleman", "rifles", "rifling", "right", "rightful", "rights", "rightwing", "rigid", "rigidly", "ring", "ringleaders", "rings", "riot", "rioters", "rioting", "riotous", "riots", "ripping", "rise", "risen", "rises", "rising", "risk", "risks", "river", "rivers", "road", "roar", "roared", "roaring", "roast", "rob", "robarts", "robbed", "robberies", "robbery", "robert", "roberts", "robson", "robsons", "rochester", "rod", "rode", "rods", "roe", "roehampton", "rogues", "roistering", "role", "roles", "roll", "rolled", "rolling", "rolls", "rolt", "roman", "romanes", "rome", "romeos", "romilly", "ronald", "roof", "roofs", "room", "roominghouse", "rooms", "roosevelt", "root", "rooted", "roots", "rope", "ropes", "rose", "rosy", "rotten", "rouge", "rough", "roughly", "round", "rounded", "rounder", "rounds", "roupell", "roupells", "rouse", "roused", "route", "routes", "routine", "roux", "row", "rowley", "rows", "roy", "royal", "royally", "rubeus", "rude", "rudely", "rudiment", "rudimentary", "rudiments", "rufus", "rug", "rugeley", "rugs", "ruin", "ruined", "ruins", "rule", "rulers", "rules", "ruling", "run", "running", "runs", "rush", "rushed", "russell", "russells", "russia", "russian", "russians", "rustling", "ruth", "s", "saccharine", "sack", "sacred", "sacrifice", "sacrificing", "sacrilege", "safe", "safeguard", "safeguards", "safely", "safer", "safes", "safest", "safety", "sagacity", "said", "sailing", "sailor", "sailors", "sake", "salaries", "salary", "sale", "sales", "salt", "salts", "salutary", "salute", "sam", "same", "sample", "samples", "samuel", "san", "sanctioned", "sanctions", "sand", "sandwiches", "sane", "sank", "santa", "santos", "sap", "sarah", "sarcasms", "sash", "sat", "satisfaction", "satisfactorily", "satisfactory", "satisfied", "satisfy", "satisfying", "sattler", "saturated", "saturday", "saturnalia", "saucepan", "savage", "savagely", "save", "saved", "saving", "savings", "saw", "saward", "sawards", "sawyer", "say", "saying", "says", "scaffold", "scale", "scan", "scandal", "scanned", "scanning", "scanty", "scarcely", "scarcity", 
"scarlett", "scattered", "scavenger", "scene", "scenes", "schedule", "scheduled", "scheme", "scheming", "schoeffer", "school", "schoolmaster", "schoolmasters", "schools", "schussler", "science", "scientific", "scoggins", "scope", "scorched", "score", "scored", "scores", "scorn", "scorned", "scornful", "scotched", "scotia", "scotland", "scottish", "scrape", "scraped", "scream", "screamed", "screen", "screened", "screws", "scroll", "scruple", "scruples", "scrupulously", "scuffle", "sea", "sealed", "seaport", "search", "searched", "searches", "searching", "seas", "season", "seat", "seated", "seats", "sebastian", "seclusion", "second", "secondary", "secondly", "seconds", "secrecy", "secret", "secretaries", "secretary", "secrete", "secreted", "secretly", "section", "sections", "secure", "secured", "securely", "securing", "securities", "security", "sedition", "seduced", "seducer", "seduction", "see", "seeds", "seedy", "seeing", "seek", "seeking", "seeks", "seem", "seemed", "seeming", "seemingly", "seems", "seen", "sees", "seized", "seizing", "seldom", "select", "selected", "selection", "selective", "seleucia", "self", "sell", "seller", "selling", "semblance", "semi", "senate", "senator", "send", "sending", "sensation", "sensational", "sense", "senseless", "sensibly", "sensitive", "sent", "sentence", "sentenced", "sentences", "sentiment", "sentry", "separate", "separated", "separately", "separating", "separation", "september", "sepulchre", "sepulchres", "sergeant", "sergeants", "serial", "series", "serious", "seriously", "seriousness", "sermon", "sermons", "serpent", "serpents", "servant", "servants", "serve", "served", "service", "services", "serving", "servitude", "session", "sessions", "set", "sets", "setting", "settled", "settlement", "settles", "settling", "seven", "seventeen", "seventeenth", "seventh", "seventy", "several", "severe", "severely", "severity", "sewers", "sexes", "sexual", "sgt", "shackles", "shades", "shadow", "shadows", "shaft", "shaken", "shakes", "shaking", "shall", "sham", "shame", "shameful", "shaneyfelt", "shanklin", "shape", "shaped", "shapeliness", "shaping", "share", "shared", "shares", "sharings", "sharp", "sharpened", "sharply", "sharpshooter", "shattered", "shaw", "shawl", "she", "shed", "sheep", "sheet", "sheets", "shell", "shelley", "shells", "sheriff", "sheriffs", "shield", "shift", "shifts", "shilling", "shillings", "ship", "shipped", "shipping", "ships", "shipwrecks", "shirt", "shirts", "shock", "shocked", "shocking", "shockingly", "shoe", "shoemaker", "shoemaking", "shoes", "shoestore", "shook", "shoot", "shooting", "shop", "shops", "shore", "shores", "short", "shortcomings", "shortened", "shorter", "shortly", "shot", "shotgun", "shots", "should", "shoulder", "shoulders", "shouldnt", "shout", "shouted", "shouting", "shouts", "shovel", "shoving", "show", "showed", "showing", "shown", "shows", "showy", "shrewd", "shrewsbury", "shrigley", "shrink", "shriveled", "shropshire", "shrubbery", "shrunk", "shudder", "shut", "shutters", "shutting", "sick", "sickening", "sickness", "side", "sided", "sides", "sidewalk", "sideways", "sidmouth", "siege", "siegel", "sift", "sifted", "sight", "sighted", "sighting", "sights", "sign", "signal", "signature", "signatures", "signed", "significance", "significant", "signs", "silence", "silent", "silhouette", "silk", "silly", "silver", "silversmiths", "similar", "similarities", "similarity", "similarly", "simmons", "simple", "simpler", "simplest", "simpleton", "simply", "simulated", "simultaneously", "sinacherib", "since", "sincere", 
"sincerely", "sine", "sing", "singing", "single", "singleness", "sink", "sinking", "sinks", "sinner", "sins", "sir", "sirens", "sister", "sit", "site", "sitting", "sittings", "situated", "situation", "situations", "six", "sixpence", "sixpenny", "sixteen", "sixteenth", "sixth", "sixty", "sizable", "size", "sized", "sizes", "skeletal", "skeleton", "sketch", "skilful", "skill", "skilled", "skillful", "skin", "skirt", "skittles", "skull", "sky", "slack", "slacken", "slackness", "slacks", "slain", "slanting", "slaughterhouses", "slavery", "slaves", "slayer", "sleep", "sleeper", "sleepers", "sleeping", "sleeve", "sleeves", "slender", "slept", "slew", "slide", "slight", "slightest", "slightly", "sligo", "slip", "slipped", "slipping", "slipshod", "slop", "sloping", "slovenly", "slow", "slowed", "slowly", "small", "smaller", "smallest", "smart", "smartly", "smell", "smethurst", "smethursts", "smiled", "smiles", "smirking", "smith", "smithfield", "smiths", "smoke", "smoked", "smoking", "smoldering", "smooth", "smug", "smuggled", "smugglers", "snail", "snakes", "snapping", "snatched", "snatcher", "snatchers", "snatches", "sneered", "snow", "snuff", "so", "soames", "soap", "sobbing", "sober", "sobriety", "social", "socialist", "society", "societys", "soda", "soft", "soil", "sokolow", "sold", "soldier", "soldiers", "sole", "solely", "solemn", "solemnity", "soles", "solicited", "solicitor", "solid", "solidity", "solitary", "solitude", "solomons", "solution", "some", "somebody", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "son", "song", "songs", "sons", "soon", "sooner", "sophisticated", "sorrels", "sorrow", "sorrowful", "sorry", "sort", "sorts", "sought", "soul", "sound", "sounded", "soundly", "sounds", "soup", "source", "sources", "south", "southampton", "southeast", "southern", "southwark", "southwest", "southwesterly", "sovereign", "sovereigns", "soviet", "soviets", "space", "spaced", "spaces", "spacing", "spacious", "span", "spaniards", "spanish", "spare", "spared", "sparing", "sparrow", "spasmodic", "speak", "speaker", "speakers", "speaking", "speaks", "special", "specialist", "specialization", "specially", "specie", "species", "specific", "specifically", "specified", "specify", "specimen", "spectacle", "spectators", "speculation", "speculations", "speculator", "speech", "speed", "speedily", "speeding", "speeds", "speedy", "spelling", "spencer", "spend", "spent", "spikes", "spiky", "spilled", "spine", "spinning", "spires", "spirit", "spirits", "spiritual", "spirituous", "spit", "spite", "splendid", "spoil", "spoke", "spoken", "spokes", "sponge", "sponsor", "spontaneity", "spoon", "spoons", "sport", "sporting", "spot", "spread", "spring", "springs", "spurious", "spy", "squalid", "squalor", "square", "squarely", "squatting", "squeeze", "st", "stab", "stabbed", "stable", "stables", "stack", "stacked", "stadium", "staff", "staffordshire", "stage", "stages", "staggers", "stagnant", "stain", "stained", "staircase", "staircases", "stairs", "stairway", "stairwell", "stake", "stalls", "stamp", "stamped", "stamps", "stand", "standard", "standards", "standing", "standpoint", "stands", "star", "starch", "starchy", "stare", "stared", "stars", "start", "started", "starting", "startled", "starvation", "starve", "starved", "starving", "state", "stated", "statement", "statements", "states", "stating", "station", "stationed", "stationers", "statistics", "stature", "status", "statute", "statutory", "stauntons", "staves", "stay", "stayed", "staying", "steadfastly", "steadily", "steady", 
"steal", "stealer", "stealing", "steam", "steamer", "steele", "stem", "stemmons", "stench", "step", "stepped", "stepping", "steps", "sternly", "stevenson", "steward", "stick", "sticking", "stiff", "stiffly", "stigmatized", "still", "stillness", "stimulants", "stimulated", "stimulating", "stimuli", "stint", "stir", "stirred", "stock", "stockdale", "stockings", "stocks", "stole", "stolen", "stomach", "stomata", "stombaugh", "stombaughs", "stone", "stones", "stood", "stool", "stooped", "stop", "stopped", "stopping", "stops", "stopwatch", "storage", "store", "stored", "stores", "stories", "storm", "stormy", "stortford", "story", "stout", "stoutly", "strahan", "straight", "strain", "strained", "strand", "strange", "strangely", "stranger", "strangers", "strangled", "strangulation", "strap", "straps", "strasburg", "stratagem", "strategy", "straw", "streak", "streaks", "stream", "street", "streets", "strength", "strengthen", "strengthened", "strengthening", "strenuously", "stressed", "stretch", "stretcher", "stretchers", "stretches", "stretching", "strict", "strictly", "strictures", "strike", "striking", "string", "stringent", "strings", "strip", "striped", "stripped", "stroke", "strong", "stronger", "strongest", "strongly", "strove", "struck", "structural", "structure", "structures", "struggle", "struggled", "struggles", "struggling", "strutton", "strychnia", "stuckey", "studded", "students", "studied", "studies", "study", "studying", "stuff", "stuffs", "stumbled", "stumbles", "stupidity", "sturges", "style", "styled", "sub", "subdue", "subdued", "subiaco", "subject", "subjected", "subjects", "submission", "submit", "submitted", "subordinate", "subordinates", "subscribe", "subscribed", "subscription", "subscriptions", "subsequent", "subsequently", "subsided", "substance", "substances", "substantial", "substantially", "substantiated", "substitute", "substituted", "suburban", "suburbs", "subversive", "succeed", "succeeded", "succeeding", "success", "successful", "successfully", "succession", "successive", "successively", "successor", "successors", "succumbed", "such", "sucked", "sudden", "suddenly", "sue", "sued", "suffer", "suffered", "sufferer", "suffering", "sufferings", "suffice", "sufficient", "sufficiently", "suffocated", "suffocation", "suffolk", "sugar", "suggest", "suggested", "suggesting", "suggestion", "suggestions", "suggestive", "suggests", "suicide", "suicides", "suit", "suitable", "suited", "sullivan", "sullivans", "sum", "summarized", "summary", "summed", "summer", "summing", "summit", "summoned", "sums", "sun", "sunday", "sundays", "sung", "sunk", "sunlight", "sunshine", "superficial", "superfluous", "superimposed", "superintendence", "superintendent", "superior", "superiors", "superseded", "supervised", "supervising", "supervision", "supervisor", "supped", "supplement", "supplied", "supplies", "supply", "supplying", "support", "supported", "supporters", "supporting", "supports", "suppose", "supposed", "supposing", "supposition", "suppress", "suppression", "suppuration", "supreme", "sure", "surely", "surface", "surgeon", "surgeons", "surgery", "surgical", "surmounted", "surpassed", "surplus", "surprise", "surprised", "surprising", "surrender", "surrendered", "surrounded", "surrounding", "surroundings", "surveillance", "survey", "surveying", "surveyor", "surveys", "survival", "survive", "survived", "susannah", "suspect", "suspects", "suspended", "suspense", "suspension", "suspicion", "suspicions", "suspicious", "sustained", "sutherland", "sutton", "suturing", "swab", "swandown", 
"swear", "swearing", "sweep", "sweeper", "sweeping", "sweet", "sweethearts", "swelling", "swellings", "swept", "sweynheim", "swimming", "swindler", "swindlers", "swing", "swinging", "sword", "swords", "swore", "sworn", "sydney", "sympathies", "sympathy", "symptoms", "system", "systematic", "systematically", "systems", "t", "table", "tables", "tablespoonful", "tablespoonfuls", "tactics", "tadpole", "taffir", "taft", "tagus", "tail", "tailor", "tailoring", "tailors", "take", "taken", "taker", "takers", "takes", "taking", "tale", "talent", "tales", "talk", "talked", "talking", "talks", "tall", "tampa", "tangible", "tap", "tape", "tapster", "target", "targets", "tarpey", "tarpeys", "task", "tasks", "taste", "tastes", "tattered", "taught", "tavern", "taverns", "tax", "taxed", "taxes", "taxi", "taxicab", "taylor", "tea", "teach", "teaching", "team", "tear", "tearing", "teased", "teaspoonful", "teaspoonfuls", "technical", "techniques", "technological", "technology", "ted", "tedious", "teenagers", "teeth", "teleological", "telephone", "telephoned", "telescopic", "television", "tell", "telling", "tells", "temper", "temperature", "tempered", "temple", "temples", "temporarily", "temporary", "temptation", "temptations", "tempted", "ten", "tenants", "tend", "tended", "tendencies", "tendency", "tender", "tenderness", "tending", "tennis", "tensions", "tent", "tentative", "tentatively", "tenth", "tenths", "term", "terminal", "terminated", "termination", "terms", "terrace", "terrible", "terrified", "territory", "terror", "terrors", "test", "tested", "tester", "testified", "testify", "testifying", "testimony", "tests", "tetanic", "tetanus", "texas", "text", "texture", "thain", "thames", "than", "thank", "thankfulness", "thanks", "thanksgiving", "thanksgivings", "that", "the", "theatre", "theatres", "theatrical", "thee", "theft", "thefts", "their", "theirs", "them", "theme", "themselves", "then", "thence", "theodore", "theological", "theory", "there", "thereabout", "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", "these", "they", "thick", "thickening", "thickness", "thief", "thievery", "thieves", "thieving", "thigh", "thin", "thine", "thing", "things", "think", "thinking", "thinness", "thinning", "third", "thirstiness", "thirteen", "thirteenpence", "thirties", "thirtieth", "thirty", "this", "thistlewood", "thistlewoods", "thither", "thomas", "thornberry", "thornley", "thorough", "thoroughly", "those", "thou", "though", "thought", "thoughtful", "thoughts", "thousand", "thousands", "thread", "threading", "threat", "threatened", "threatening", "threats", "three", "threw", "thrice", "thrilling", "throat", "throes", "throne", "throng", "thronged", "through", "throughout", "throw", "throwing", "thrown", "throws", "thrust", "thursday", "thurtell", "thus", "thwart", "thy", "ticket", "tickets", "tidd", "tidy", "tie", "tied", "tiers", "ties", "tight", "tightly", "tigris", "till", "tilling", "timber", "time", "timed", "timely", "times", "tiny", "tippit", "tippits", "tired", "tissue", "tissues", "title", "to", "tobacco", "tobacconist", "tocqueville", "today", "todays", "toe", "toed", "toes", "together", "toil", "tokens", "told", "tolerable", "tolerate", "toll", "tolls", "tom", "tomorrow", "tone", "tones", "tonight", "too", "took", "tool", "tools", "top", "topics", "tops", "tore", "tormented", "tormentor", "tormentors", "torn", "tossing", "total", "totally", "touch", "touched", "touching", "tough", "toughness", "tour", "touts", "toward", "towards", "towel", "towels", "tower", 
"towers", "town", "towns", "township", "trace", "traced", "traces", "tracheotomy", "track", "tracked", "tracks", "tracts", "trade", "trader", "traders", "trades", "trading", "tradition", "traditional", "traditions", "traffic", "tragedian", "tragedy", "tragic", "train", "trained", "training", "trait", "traitors", "tramped", "trampled", "transaction", "transactions", "transcript", "transfer", "transferred", "transferring", "transfers", "transient", "transit", "transition", "transitional", "translated", "transmitted", "transmitting", "transparent", "transport", "transportation", "transported", "transports", "trap", "trauma", "travel", "traveled", "travelers", "traveling", "travels", "traversed", "traversing", "tray", "tread", "treading", "treason", "treasonable", "treasury", "treat", "treated", "treating", "treatment", "treble", "trebled", "trees", "tremble", "trembled", "trembles", "tremendous", "trenchant", "trenched", "trial", "trials", "triangle", "tribunals", "tribute", "trick", "trickled", "trickling", "tried", "tries", "trifle", "trifling", "trigger", "trip", "triple", "tripped", "trips", "trivial", "trodden", "troops", "trot", "trouble", "trousers", "truancy", "truck", "truculent", "true", "truly", "trump", "trunk", "truss", "trust", "trusted", "trustworthy", "trusty", "truth", "try", "trying", "tub", "tube", "tubes", "tubs", "tucked", "tuft", "tumult", "tumultuous", "turf", "turk", "turkey", "turn", "turned", "turner", "turners", "turning", "turnkey", "turnkeys", "turnpike", "turns", "turtle", "tweed", "twelfth", "twelve", "twenties", "twenty", "twice", "twist", "two", "tyburn", "tying", "tylers", "type", "types", "typhus", "typical", "typography", "tyrannies", "u", "udalric", "ugliness", "ugly", "ulm", "ultimate", "ultimately", "ultra", "unable", "unaccompanied", "unaffected", "unaffectedly", "unanimous", "unanswerable", "unanswered", "unattainable", "unauthorized", "unavailing", "unbecoming", "unbroken", "unceasing", "uncertain", "uncertainty", "unchanged", "unchecked", "unclaimed", "uncle", "uncleanliness", "uncleanly", "uncleanness", "unclothed", "uncomfortable", "uncommon", "uncompromising", "unconcern", "unconscious", "unconsciously", "unconstitutional", "uncontaminated", "uncontrolled", "unconvicted", "uncovered", "undecided", "undefended", "undeniable", "under", "underfoot", "undergo", "undergoing", "undergone", "underground", "underlying", "undermine", "underpass", "underside", "understand", "understanding", "understood", "undertake", "undertaken", "undertaker", "undertaking", "underway", "underwriters", "undesirable", "undeterred", "undisturbed", "undone", "undoubtedly", "undue", "undulate", "unduly", "uneasiness", "uneasy", "unemployed", "unemployment", "unequal", "unequivocally", "uneventful", "unexamined", "unexpected", "unexpectedly", "unfair", "unfairly", "unfavorable", "unfeeling", "unfit", "unflagging", "unflinching", "unfortunate", "unfortunately", "unfrequent", "ungovernable", "unhappily", "unhappy", "unhealthy", "uniform", "uniformity", "uninteresting", "union", "unique", "unison", "unit", "united", "units", "unity", "universal", "universally", "universe", "university", "unjust", "unknown", "unlawful", "unless", "unlike", "unlikely", "unlimited", "unloaded", "unlocked", "unlocking", "unmanly", "unmarked", "unmindful", "unmistakably", "unmixed", "unmoved", "unnatural", "unnecessarily", "unnecessary", "unnerve", "unnoticed", "unoccupied", "unparalleled", "unpleasant", "unpopular", "unpracticed", "unprotected", "unprovided", "unquote", "unrelenting", "unremedied", 
"unremitting", "unreservedly", "unrestrained", "unrestricted", "unsafe", "unsatisfactory", "unscrupulous", "unsearched", "unseemly", "unsexed", "unskilful", "unsound", "unsoundness", "unstable", "unsubstantiated", "unsuccessful", "unsuccessfully", "unsuited", "unsupervised", "until", "unto", "untouched", "untoward", "untried", "unusual", "unusually", "unventilated", "unwarrantable", "unwary", "unwholesome", "unwilling", "unworthy", "up", "upon", "upper", "uppermost", "upright", "uprights", "uproar", "uproarious", "upset", "upside", "upstairs", "upward", "upwards", "urge", "urged", "urgent", "urgently", "urges", "urging", "us", "usa", "usage", "use", "used", "useful", "useless", "uses", "using", "ussr", "usual", "usually", "usurped", "utilitarian", "utility", "utilize", "utilized", "utilizing", "utmost", "utter", "uttered", "uttering", "utterly", "utters", "v", "vacated", "vagrants", "vague", "vain", "valet", "valets", "valid", "validity", "valley", "valuable", "valuables", "value", "valued", "van", "vanished", "vanishes", "vans", "vantage", "variation", "variations", "varied", "variety", "various", "vartos", "vary", "varying", "vast", "vastly", "vault", "vaulted", "vaunting", "vegetable", "vegetables", "vehicle", "vehicles", "veil", "vein", "vended", "vendor", "vendors", "venetian", "venice", "ventilating", "ventilation", "ventilators", "verbal", "verdict", "verified", "verify", "versed", "vertebral", "vertebrate", "vertebrates", "vertically", "very", "vessel", "vessels", "vested", "vestibule", "vestiges", "vexed", "via", "viaduct", "vice", "vicinity", "vicious", "vicissitudes", "victim", "victims", "victoria", "victuals", "view", "viewed", "viewpoint", "views", "vigilance", "vigorous", "vigorously", "vile", "vileness", "vilest", "vilification", "village", "villain", "villains", "vindelin", "vines", "violated", "violation", "violations", "violence", "violent", "violently", "virginia", "virtually", "virtue", "virtuous", "visa", "visas", "viscose", "visible", "vision", "visit", "visitation", "visitations", "visited", "visiting", "visitor", "visitors", "visits", "visual", "vital", "vitiate", "vitriol", "viz", "vociferously", "voebel", "voice", "voices", "volume", "voluminous", "voluntarily", "voluntary", "volunteer", "vomiting", "vote", "vouchsafed", "vowed", "voyage", "vulgar", "w", "wage", "wages", "wagner", "wagon", "wainwright", "wainwrights", "waist", "waistcoat", "wait", "waited", "waiters", "waiting", "wakefield", "wakefields", "waking", "wales", "walk", "walked", "walker", "walkers", "walking", "wall", "wallace", "wallaces", "walled", "wallet", "walls", "walsall", "walter", "walthers", "wandering", "wanders", "wandsworth", "want", "wanted", "wanting", "wantonness", "war", "ward", "warden", "warder", "warders", "wards", "wardsman", "wardsmen", "warehouse", "warehouses", "wares", "warm", "warmed", "warmly", "warned", "warning", "warnings", "warrant", "warrants", "warren", "warwick", "was", "wash", "washed", "washing", "washington", "wasnt", "waste", "wasted", "watch", "watched", "watches", "watching", "watchman", "water", "watercourse", "waterloo", "waters", "watery", "watkin", "watkins", "watson", "watts", "wave", "wax", "way", "ways", "wdsu", "we", "weak", "weakened", "weaker", "weakest", "weakly", "wealth", "wealthy", "weapon", "weapons", "wear", "weare", "wearer", "wearing", "weary", "weather", "weavers", "weaving", "webster", "wedded", "wedding", "wednesday", "wednesdays", "weed", "week", "weekend", "weekly", "weeks", "weigh", "weighed", "weight", "weighted", "weights", "welcome", 
"welcomed", "welfare", "well", "welshman", "went", "were", "wesley", "wesson", "west", "westbrook", "western", "westminster", "whale", "whaley", "whaleys", "wharf", "what", "whatever", "wheat", "wheel", "wheeled", "wheeler", "wheels", "when", "whence", "whenever", "where", "whereabouts", "whereas", "whereof", "whereupon", "wherever", "whether", "which", "while", "whiling", "whilst", "whisper", "whispered", "whistles", "whistling", "whitchurch", "white", "whitecross", "whitehall", "whites", "whitewashed", "whitewashing", "whither", "whitworth", "who", "whoever", "whole", "wholesale", "wholesome", "wholly", "whom", "whomsoever", "whose", "why", "wickedness", "wide", "widely", "wider", "widespread", "widow", "widower", "widows", "width", "wife", "wifes", "wig", "wilberforce", "wild", "wildest", "will", "willful", "william", "williams", "willing", "willingly", "willingness", "willoughby", "wills", "wilson", "winchester", "wind", "winded", "winding", "windlass", "window", "windows", "windowsill", "winds", "windsor", "wine", "wing", "wings", "winked", "winston", "winter", "wiped", "wire", "wiry", "wisdom", "wise", "wiser", "wish", "wished", "wishes", "wishing", "with", "withdrawal", "withdrawing", "withdrawn", "withdraws", "withdrew", "within", "without", "witness", "witnessed", "witnesses", "wittmus", "wives", "woebegone", "woman", "womans", "women", "womens", "won", "wonder", "wonders", "wont", "wood", "woodcuts", "woodcutters", "wooden", "woodstock", "woody", "wool", "woolen", "woolwich", "word", "worde", "words", "wore", "work", "worked", "worker", "workers", "workhouse", "working", "workman", "workmen", "works", "workshops", "world", "worn", "worry", "worse", "worship", "worst", "worth", "worthy", "would", "wouldnt", "wound", "wounded", "wounds", "wrapped", "wrapper", "wrapping", "wrestling", "wretch", "wretched", "wretchedness", "wretches", "wrist", "wrists", "writ", "write", "writer", "writers", "writes", "writing", "writings", "writs", "written", "wrong", "wronged", "wrongfully", "wrongs", "wrote", "wrought", "wynkyn", "wynn", "x", "yarborough", "yard", "yards", "yarmouth", "ye", "year", "years", "yeast", "yell", "yelled", "yellow", "yells", "yeoman", "yes", "yesterday", "yet", "yield", "yoke", "yolk", "york", "yorks", "yorkshire", "you", "youll", "young", "youngblood", "youngbloods", "younger", "youngest", "youngster", "your", "yourself", "youth", "youths", "zahm", "zapruder", "zeal", "zeiner", "zeiners", "zero", "zipper", "zoology", "zopyrus", "zulueta"] \ No newline at end of file diff --git a/egs/datasets/audio/biaobei/__pycache__/preprocess.cpython-36.pyc b/egs/datasets/audio/biaobei/__pycache__/preprocess.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7eba272e9b3ed5fdc75d2a93284afcf372c24e25 Binary files /dev/null and b/egs/datasets/audio/biaobei/__pycache__/preprocess.cpython-36.pyc differ diff --git a/egs/datasets/audio/biaobei/base_text2mel.yaml b/egs/datasets/audio/biaobei/base_text2mel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2445c07e2aeaa355fc74fe85a09804464aa19dc --- /dev/null +++ b/egs/datasets/audio/biaobei/base_text2mel.yaml @@ -0,0 +1,18 @@ +base_config: egs/egs_bases/tts/base_zh.yaml +raw_data_dir: 'data/raw/biaobei' +processed_data_dir: 'data/processed/biaobei' +binary_data_dir: 'data/binary/biaobei' +preprocess_cls: egs.datasets.audio.biaobei.preprocess.BiaobeiPreprocess + +ds_name: biaobei +binarization_args: + train_range: [ 871, -1 ] + test_range: [ 0, 523 ] + valid_range: [ 523, 871 ] 
+test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+f0_min: 80
+f0_max: 600
+vocoder_ckpt: checkpoints/hifi_biaobei
\ No newline at end of file
diff --git a/egs/datasets/audio/biaobei/preprocess.py b/egs/datasets/audio/biaobei/preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..44f48e22675a02e3a4b91a69caf7344f5a2982ef
--- /dev/null
+++ b/egs/datasets/audio/biaobei/preprocess.py
@@ -0,0 +1,16 @@
+from data_gen.tts.base_preprocess import BasePreprocessor
+import re
+
+
+class BiaobeiPreprocess(BasePreprocessor):
+    def meta_data(self):
+        input_dir = self.raw_data_dir
+        with open(f"{input_dir}/ProsodyLabeling/000001-010000.txt", encoding='utf-8') as f:
+            bb_lines = f.readlines()[::2]
+        for l_idx, l in (enumerate([re.sub("\#\d+", "", l.split('\t')[1].strip()) for l in bb_lines])):
+            item_name = f'{l_idx + 1:06d}'
+            wav_fn = f"{input_dir}/wav/{l_idx + 1:06d}.wav"
+            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': l}
+
+if __name__ == "__main__":
+    BiaobeiPreprocess().process()
diff --git a/egs/datasets/audio/biaobei/ps_flow.yaml b/egs/datasets/audio/biaobei/ps_flow.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9e78e54f845104351a1bc7486d881856f243307b
--- /dev/null
+++ b/egs/datasets/audio/biaobei/ps_flow.yaml
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow.yaml
+  - ./base_text2mel.yaml
\ No newline at end of file
diff --git a/egs/datasets/audio/biaobei/synta.yaml b/egs/datasets/audio/biaobei/synta.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e65bcbc7e584ee35bce17ae7337ceff945ab0df1
--- /dev/null
+++ b/egs/datasets/audio/biaobei/synta.yaml
@@ -0,0 +1,19 @@
+base_config:
+  - egs/egs_bases/tts/synta.yaml
+  - ./base_text2mel.yaml
+
+lambda_mel_adv: 0.05
+
+disc_win_num: 3
+mel_disc_hidden_size: 128
+disc_norm: in
+disc_reduction: stack
+disc_interval: 1
+disc_lr: 0.0001
+disc_start_steps: 0
+discriminator_scheduler_params:
+  gamma: 0.5
+  step_size: 40000
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  weight_decay: 0.0
\ No newline at end of file
diff --git a/egs/datasets/audio/libritts/__pycache__/preprocess.cpython-36.pyc b/egs/datasets/audio/libritts/__pycache__/preprocess.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..728e70ba5c7fdfd36a88ab8adfb707979e38a1ee
Binary files /dev/null and b/egs/datasets/audio/libritts/__pycache__/preprocess.cpython-36.pyc differ
diff --git a/egs/datasets/audio/libritts/base_text2mel.yaml b/egs/datasets/audio/libritts/base_text2mel.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..433c15936eb9ca6a3b972f7b9415785b83687dda
--- /dev/null
+++ b/egs/datasets/audio/libritts/base_text2mel.yaml
@@ -0,0 +1,22 @@
+ds_name: libritts
+base_config: egs/egs_bases/tts/base.yaml
+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/libritts'
+binary_data_dir: 'data/binary/libritts'
+preprocess_cls: egs.datasets.audio.libritts.preprocess.LibriTTSPreprocess
+binarization_args:
+  train_range: [ 871, -1 ]
+  test_range: [ 0, 523 ]
+  valid_range: [ 523, 871 ]
+  shuffle: false
+  with_spk_id: true
+  with_spk_embed: false
+test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+f0_min: 80
+f0_max: 600
+vocoder: PWG
+vocoder_ckpt: checkpoints/pwg_libritts
+num_spk: 2000
\ No newline at end of file
diff --git a/egs/datasets/audio/libritts/preprocess.py b/egs/datasets/audio/libritts/preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdb6c7322de4a62e23dd586bee3ea145d2bc5f58
--- /dev/null
+++ b/egs/datasets/audio/libritts/preprocess.py
@@ -0,0 +1,13 @@
+from data_gen.tts.base_preprocess import BasePreprocessor
+import glob, os
+
+class LibriTTSPreprocess(BasePreprocessor):
+    def meta_data(self):
+        wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*/*.wav'))
+        for wav_fn in wav_fns:
+            item_name = os.path.basename(wav_fn)[:-4]
+            txt_fn = f'{wav_fn[:-4]}.normalized.txt'
+            with open(txt_fn, 'r') as f:
+                txt = f.read()
+            spk_name = item_name.split("_")[0]
+            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt, 'spk_name': spk_name}
\ No newline at end of file
diff --git a/egs/datasets/audio/libritts/ps_flow.yaml b/egs/datasets/audio/libritts/ps_flow.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9e78e54f845104351a1bc7486d881856f243307b
--- /dev/null
+++ b/egs/datasets/audio/libritts/ps_flow.yaml
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow.yaml
+  - ./base_text2mel.yaml
\ No newline at end of file
diff --git a/egs/datasets/audio/libritts/synta.yaml b/egs/datasets/audio/libritts/synta.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e65bcbc7e584ee35bce17ae7337ceff945ab0df1
--- /dev/null
+++ b/egs/datasets/audio/libritts/synta.yaml
@@ -0,0 +1,19 @@
+base_config:
+  - egs/egs_bases/tts/synta.yaml
+  - ./base_text2mel.yaml
+
+lambda_mel_adv: 0.05
+
+disc_win_num: 3
+mel_disc_hidden_size: 128
+disc_norm: in
+disc_reduction: stack
+disc_interval: 1
+disc_lr: 0.0001
+disc_start_steps: 0
+discriminator_scheduler_params:
+  gamma: 0.5
+  step_size: 40000
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  weight_decay: 0.0
\ No newline at end of file
diff --git a/egs/datasets/audio/lj/__pycache__/preprocess.cpython-36.pyc b/egs/datasets/audio/lj/__pycache__/preprocess.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9b9212ea01ef62d9cad2d4ef54f6897474951c2
Binary files /dev/null and b/egs/datasets/audio/lj/__pycache__/preprocess.cpython-36.pyc differ
diff --git a/egs/datasets/audio/lj/base_mel2wav.yaml b/egs/datasets/audio/lj/base_mel2wav.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..626745c25ef7fc2254a778dc7297a33a11142694
--- /dev/null
+++ b/egs/datasets/audio/lj/base_mel2wav.yaml
@@ -0,0 +1,4 @@
+base_config: egs/egs_bases/tts/vocoder/base.yaml
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech_wav'
diff --git a/egs/datasets/audio/lj/base_text2mel.yaml b/egs/datasets/audio/lj/base_text2mel.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..52a91fe37c1867817ab820459721491291c045b7
--- /dev/null
+++ b/egs/datasets/audio/lj/base_text2mel.yaml
@@ -0,0 +1,17 @@
+ds_name: ljspeech
+base_config: egs/egs_bases/tts/base.yaml
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech'
+preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+binarization_args:
+  train_range: [ 871, -1 ]
+  test_range: [ 0, 523 ]
+  valid_range: [ 523, 871 ]
+test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508,
509, 519 ] +f0_min: 80 +f0_max: 600 +vocoder_ckpt: checkpoints/hifi_lj \ No newline at end of file diff --git a/egs/datasets/audio/lj/ds.yaml b/egs/datasets/audio/lj/ds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aec608475ef34d8e6dfea8fd91e612e624ea7548 --- /dev/null +++ b/egs/datasets/audio/lj/ds.yaml @@ -0,0 +1,29 @@ +base_config: + - egs/egs_bases/tts/ds.yaml + - ./fs2_orig.yaml + +fs2_ckpt: checkpoints/aux_exp/model_ckpt_steps_100000.ckpt + +# spec_min and spec_max are calculated on the training set. +spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672, + -4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759, + -4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733, + -5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510, + -5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916, + -4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875, + -5.0483, -5.0848, -5.0655, -5.0279, -5.0015, -5.0792, -5.0636, -5.2413, + -5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173, + -5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757, + -5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ] +spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5830, 0.7093, + 0.6461, 0.6101, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591, + 0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492, + 0.6909, 0.6106, 0.5761, 0.5236, 0.5638, 0.4054, 0.4545, 0.3407, + 0.3037, 0.3380, 0.1599, 0.1603, 0.2741, 0.2130, 0.1569, 0.1911, + 0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933, + -0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405, + -0.1244, -0.2582, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000, + 0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.1389, 0.1382, + 0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2280, 0.2932, 0.3047 ] + +max_tokens: 30000 \ No newline at end of file diff --git a/egs/datasets/audio/lj/fs.yaml b/egs/datasets/audio/lj/fs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22bc563d16aa3c6b2c03a7559afc9c800aa85f27 --- /dev/null +++ b/egs/datasets/audio/lj/fs.yaml @@ -0,0 +1,3 @@ +base_config: + - egs/egs_bases/tts/fs.yaml + - ./base_text2mel.yaml diff --git a/egs/datasets/audio/lj/fs2_orig.yaml b/egs/datasets/audio/lj/fs2_orig.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f872891409c6c3b745f17909e3ed4d2540a0e2f3 --- /dev/null +++ b/egs/datasets/audio/lj/fs2_orig.yaml @@ -0,0 +1,4 @@ +base_config: + - egs/egs_bases/tts/fs2_orig.yaml + - ./base_text2mel.yaml +binary_data_dir: 'data/binary/ljspeech_cwt' \ No newline at end of file diff --git a/egs/datasets/audio/lj/hifigan.yaml b/egs/datasets/audio/lj/hifigan.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e840cd2bc5b1c3860f55bb1e36b3be1b41cc9e2 --- /dev/null +++ b/egs/datasets/audio/lj/hifigan.yaml @@ -0,0 +1,3 @@ +base_config: + - egs/egs_bases/tts/vocoder/hifigan.yaml + - ./base_mel2wav.yaml \ No newline at end of file diff --git a/egs/datasets/audio/lj/preprocess.py b/egs/datasets/audio/lj/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..a3d45c9aa855bb7ce40b5e8374547014350fa92b --- /dev/null +++ b/egs/datasets/audio/lj/preprocess.py @@ -0,0 +1,9 @@ +from data_gen.tts.base_preprocess import BasePreprocessor + + +class LJPreprocess(BasePreprocessor): + def meta_data(self): + for l in 
open(f'{self.raw_data_dir}/metadata.csv').readlines(): + item_name, _, txt = l.strip().split("|") + wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav" + yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt} diff --git a/egs/datasets/audio/lj/ps_flow.yaml b/egs/datasets/audio/lj/ps_flow.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e78e54f845104351a1bc7486d881856f243307b --- /dev/null +++ b/egs/datasets/audio/lj/ps_flow.yaml @@ -0,0 +1,3 @@ +base_config: + - egs/egs_bases/tts/ps_flow.yaml + - ./base_text2mel.yaml \ No newline at end of file diff --git a/egs/datasets/audio/lj/ps_flow_nips2021.yaml b/egs/datasets/audio/lj/ps_flow_nips2021.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63497f42ee07a0a84e7818e55005d6b3648fe9ed --- /dev/null +++ b/egs/datasets/audio/lj/ps_flow_nips2021.yaml @@ -0,0 +1,11 @@ +base_config: + - ./ps_flow.yaml +max_sentences: 64 +dur_level: word +use_word_encoder: false +enc_prenet: true +enc_pre_ln: false +fvae_encoder_type: wn +fvae_decoder_type: wn +text_encoder_postnet: false +warmup_updates: 8000 \ No newline at end of file diff --git a/egs/datasets/audio/lj/ps_flow_small.yaml b/egs/datasets/audio/lj/ps_flow_small.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e5a662838a4ff84bd8693a3b5443b9f577e9d87 --- /dev/null +++ b/egs/datasets/audio/lj/ps_flow_small.yaml @@ -0,0 +1,3 @@ +base_config: + - egs/egs_bases/tts/ps_flow_small.yaml + - ./base_text2mel.yaml \ No newline at end of file diff --git a/egs/datasets/audio/lj/ps_flow_small_nips2021.yaml b/egs/datasets/audio/lj/ps_flow_small_nips2021.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97f61858b6e4f19e21550ac8eabbf8d7d3586e0c --- /dev/null +++ b/egs/datasets/audio/lj/ps_flow_small_nips2021.yaml @@ -0,0 +1,11 @@ +base_config: + - ./ps_flow_small.yaml +max_sentences: 128 +dur_level: word +use_word_encoder: false +enc_prenet: true +enc_pre_ln: false +fvae_encoder_type: wn +fvae_decoder_type: wn +text_encoder_postnet: false +warmup_updates: 8000 \ No newline at end of file diff --git a/egs/datasets/audio/lj/synta.yaml b/egs/datasets/audio/lj/synta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e65bcbc7e584ee35bce17ae7337ceff945ab0df1 --- /dev/null +++ b/egs/datasets/audio/lj/synta.yaml @@ -0,0 +1,19 @@ +base_config: + - egs/egs_bases/tts/synta.yaml + - ./base_text2mel.yaml + +lambda_mel_adv: 0.05 + +disc_win_num: 3 +mel_disc_hidden_size: 128 +disc_norm: in +disc_reduction: stack +disc_interval: 1 +disc_lr: 0.0001 +disc_start_steps: 0 +discriminator_scheduler_params: + gamma: 0.5 + step_size: 40000 +discriminator_optimizer_params: + eps: 1.0e-06 + weight_decay: 0.0 \ No newline at end of file diff --git a/egs/egs_bases/config_base.yaml b/egs/egs_bases/config_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be08547583ac3e109ece79c99856d92c43ef097a --- /dev/null +++ b/egs/egs_bases/config_base.yaml @@ -0,0 +1,41 @@ +# task +binary_data_dir: '' +work_dir: '' # experiment directory. 
+infer: false # infer +amp: false +seed: 1234 +debug: false +save_codes: ['tasks', 'modules', 'egs'] + +############# +# dataset +############# +ds_workers: 1 +test_num: 100 +endless_ds: true +sort_by_len: true + +######### +# train and eval +######### +print_nan_grads: false +load_ckpt: '' +save_best: false +num_ckpt_keep: 3 +clip_grad_norm: 0 +accumulate_grad_batches: 1 +tb_log_interval: 100 +num_sanity_val_steps: 5 # steps of validation at the beginning +check_val_every_n_epoch: 10 +val_check_interval: 2000 +valid_monitor_key: 'val_loss' +valid_monitor_mode: 'min' +max_epochs: 1000 +max_updates: 1000000 +max_tokens: 40000 +max_sentences: 100000 +max_valid_tokens: -1 +max_valid_sentences: -1 +eval_max_batches: -1 +resume_from_checkpoint: 0 +rename_tmux: true \ No newline at end of file diff --git a/egs/egs_bases/tts/base.yaml b/egs/egs_bases/tts/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..578cad7dc8820e398db7f0bba3b7bebd2558f17e --- /dev/null +++ b/egs/egs_bases/tts/base.yaml @@ -0,0 +1,56 @@ +# task +base_config: + - ../config_base.yaml + - ./dataset_params.yaml + +############# +# dataset in training +############# +endless_ds: true +min_frames: 0 +max_frames: 1548 +frames_multiple: 1 +max_input_tokens: 1550 +ds_workers: 1 + +######### +# model +######### +use_spk_id: false +use_spk_embed: false +mel_losses: "ssim:0.5|l1:0.5" + +########### +# optimization +########### +lr: 0.0005 +scheduler: warmup # rsqrt|warmup|none +warmup_updates: 4000 +optimizer_adam_beta1: 0.9 +optimizer_adam_beta2: 0.98 +weight_decay: 0 +clip_grad_norm: 1 +clip_grad_value: 0 + + +########### +# train and eval +########### +use_word_input: false +max_valid_sentences: 1 +max_valid_tokens: 60000 +valid_infer_interval: 10000 +train_set_name: 'train' +train_sets: '' +valid_set_name: 'valid' +test_set_name: 'test' +num_valid_plots: 10 +test_ids: [ ] +test_input_yaml: '' +vocoder: HifiGAN +vocoder_ckpt: '' +profile_infer: false +out_wav_norm: false +save_gt: true +save_f0: false +gen_dir_name: '' \ No newline at end of file diff --git a/egs/egs_bases/tts/base_zh.yaml b/egs/egs_bases/tts/base_zh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aaadcfa490c6ba5958e50d10fb9e0693536d343f --- /dev/null +++ b/egs/egs_bases/tts/base_zh.yaml @@ -0,0 +1,5 @@ +base_config: ./base.yaml +preprocess_args: + txt_processor: zh + +word_size: 3000 \ No newline at end of file diff --git a/egs/egs_bases/tts/dataset_params.yaml b/egs/egs_bases/tts/dataset_params.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4d98d3e83d93eb5532f93a7a0ba35c3f81ee301a --- /dev/null +++ b/egs/egs_bases/tts/dataset_params.yaml @@ -0,0 +1,52 @@ +audio_num_mel_bins: 80 +audio_sample_rate: 22050 +hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate) +win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) +fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter +fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) +fmax: 7600 # To be increased/reduced depending on data. 
+f0_min: 80 +f0_max: 800 +griffin_lim_iters: 30 +pitch_extractor: parselmouth +num_spk: 1 +mel_vmin: -6 +mel_vmax: 1.5 +loud_norm: false + +raw_data_dir: '' +processed_data_dir: '' +binary_data_dir: '' +preprocess_cls: '' +binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer +preprocess_args: + nsample_per_mfa_group: 1000 + # text process + txt_processor: en + use_mfa: true + with_phsep: true + reset_phone_dict: true + reset_word_dict: true + add_eos_bos: true + # mfa + mfa_group_shuffle: false + mfa_offset: 0.02 + # wav processors + wav_processors: [ ] + save_sil_mask: true + vad_max_silence_length: 12 +binarization_args: + shuffle: false + with_wav: false + with_align: true + with_spk_embed: false + with_f0: true + with_f0cwt: false + with_linear: false + trim_eos_bos: false + min_sil_duration: 0.1 + train_range: [ 200, -1 ] + test_range: [ 0, 100 ] + valid_range: [ 100, 200 ] +word_dict_size: 10000 +pitch_key: pitch \ No newline at end of file diff --git a/egs/egs_bases/tts/ds.yaml b/egs/egs_bases/tts/ds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85f606a96c9efec2634542313ca7eea6b16ad239 --- /dev/null +++ b/egs/egs_bases/tts/ds.yaml @@ -0,0 +1,33 @@ +base_config: ./fs2_orig.yaml + +# special configs for diffspeech +task_cls: tasks.tts.diffspeech.DiffSpeechTask +lr: 0.001 +timesteps: 100 +K_step: 71 +diff_loss_type: l1 +diff_decoder_type: 'wavenet' +schedule_type: 'linear' +max_beta: 0.06 + +## model configs for diffspeech +dilation_cycle_length: 1 +residual_layers: 20 +residual_channels: 256 +decay_steps: 50000 +keep_bins: 80 +#content_cond_steps: [ ] # [ 0, 10000 ] +#spk_cond_steps: [ ] # [ 0, 10000 ] +#gen_tgt_spk_id: -1 + + + +# training configs for diffspeech +#max_sentences: 48 +#num_sanity_val_steps: 1 +num_valid_plots: 10 +use_gt_dur: false +use_gt_f0: false +use_energy_embed: false +#pitch_type: cwt +max_updates: 160000 \ No newline at end of file diff --git a/egs/egs_bases/tts/fs.yaml b/egs/egs_bases/tts/fs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20592ffcdd69eb73d449088ed4b01700c9c9abc0 --- /dev/null +++ b/egs/egs_bases/tts/fs.yaml @@ -0,0 +1,75 @@ +base_config: ./base.yaml +task_cls: tasks.tts.fs.FastSpeechTask + +# model +hidden_size: 256 +dropout: 0.0 +encoder_type: rel_fft # rel_fft|fft|tacotron|tacotron2|conformer +decoder_type: conv # fft|rnn|conv|conformer|wn + +# rnn enc/dec +encoder_K: 8 +decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2 + +# fft enc/dec +enc_layers: 4 +enc_ffn_kernel_size: 9 +enc_prenet: true +enc_pre_ln: true +dec_layers: 4 +dec_ffn_kernel_size: 9 +num_heads: 2 +ffn_act: gelu +ffn_hidden_size: 1024 +use_pos_embed: true + +# conv enc/dec +enc_dec_norm: ln +conv_use_pos: false +layers_in_block: 2 +enc_dilations: [ 1, 1, 1, 1 ] +enc_kernel_size: 5 +enc_post_net_kernel: 3 +dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder +dec_kernel_size: 5 +dec_post_net_kernel: 3 + +# duration +predictor_hidden: -1 +dur_predictor_kernel: 3 +dur_predictor_layers: 2 +predictor_kernel: 5 +predictor_layers: 5 +predictor_dropout: 0.5 + +# pitch and energy +use_pitch_embed: false +pitch_type: frame # frame|ph|cwt +use_uv: true + +# reference encoder and speaker embedding +lambda_commit: 0.25 +ref_norm_layer: bn +dec_inp_add_noise: false + +# mel +mel_losses: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5 + +# loss lambda +lambda_f0: 1.0 +lambda_uv: 1.0 +lambda_energy: 0.1 +lambda_ph_dur: 0.1 +lambda_sent_dur: 1.0 +lambda_word_dur: 1.0 +predictor_grad: 0.1 + +# train and eval +warmup_updates: 
4000 +max_tokens: 40000 +max_sentences: 128 +max_valid_sentences: 1 +max_updates: 160000 +use_gt_dur: false +use_gt_f0: false +ds_workers: 2 diff --git a/egs/egs_bases/tts/fs2_orig.yaml b/egs/egs_bases/tts/fs2_orig.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a003331f1e45a342ac16175b7dde97bb1d6db03 --- /dev/null +++ b/egs/egs_bases/tts/fs2_orig.yaml @@ -0,0 +1,13 @@ +base_config: ./fs.yaml +task_cls: tasks.tts.fs2_orig.FastSpeech2OrigTask +encoder_type: fft +decoder_type: fft +use_energy_embed: false +use_pitch_embed: true +pitch_type: cwt # frame|ph|cwt +binarization_args: + with_f0cwt: true +use_gt_energy: false +cwt_std_scale: 0.8 +dropout: 0.1 +mel_losses: l1 \ No newline at end of file diff --git a/egs/egs_bases/tts/ps.yaml b/egs/egs_bases/tts/ps.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e749faf5d159f44c4a0061db075361f704cc5a4b --- /dev/null +++ b/egs/egs_bases/tts/ps.yaml @@ -0,0 +1,63 @@ +base_config: ./fs.yaml + +########################### +# models +########################### +# encoders +hidden_size: 192 +ffn_hidden_size: 768 +enc_ffn_kernel_size: 5 +enc_layers: 4 +dur_level: word +encoder_type: rel_fft +use_word_encoder: true + +# mix ling encoder +word_enc_layers: 4 +word_encoder_type: rel_fft +use_pitch_embed: false +enc_prenet: true +enc_pre_ln: true +text_encoder_postnet: true +dropout: 0.0 +add_word_pos: true + +# dur predictor +dur_predictor_layers: 3 +dur_predictor_kernel: 5 +predictor_dropout: 0.2 + +## fvae +use_fvae: true +latent_size: 16 +fvae_encoder_type: conv +fvae_decoder_type: conv +fvae_enc_dec_hidden: 192 +fvae_kernel_size: 5 +fvae_enc_n_layers: 8 +fvae_dec_n_layers: 4 +fvae_strides: 4 +fvae_noise_scale: 1.0 + +# prior flow +use_prior_flow: true +prior_flow_hidden: 64 +prior_flow_kernel_size: 3 +prior_flow_n_blocks: 4 + +########################### +# training and inference +########################### +lambda_kl: 1.0 +kl_min: 0.0 +lambda_sent_dur: 0.0 +kl_start_steps: 10000 +posterior_start_steps: 0 +frames_multiple: 4 +num_valid_plots: 10 +lr: 0.0002 +warmup_updates: 8000 +max_tokens: 40000 +valid_infer_interval: 10000 +max_sentences: 80 +max_updates: 480000 \ No newline at end of file diff --git a/egs/egs_bases/tts/ps_flow.yaml b/egs/egs_bases/tts/ps_flow.yaml new file mode 100644 index 0000000000000000000000000000000000000000..56a4326101aa0d89046687cc8cbdcb2f06c6d2d8 --- /dev/null +++ b/egs/egs_bases/tts/ps_flow.yaml @@ -0,0 +1,20 @@ +base_config: ./ps.yaml +task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask + +use_post_flow: true +detach_postflow_input: true +post_flow_lr: 0.001 +post_glow_hidden: 192 +post_glow_kernel_size: 3 +post_glow_n_blocks: 12 +post_glow_n_block_layers: 3 +post_share_cond_layers: false +share_wn_layers: 4 +use_cond_proj: false +use_latent_cond: false +use_txt_cond: true +sigmoid_scale: false +post_glow_training_start: 160000 +noise_scale: 0.8 +infer_post_glow: true +two_stage: true \ No newline at end of file diff --git a/egs/egs_bases/tts/ps_flow_small.yaml b/egs/egs_bases/tts/ps_flow_small.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e95a86492ee9c6516a05201bf4d0a2de6e55e947 --- /dev/null +++ b/egs/egs_bases/tts/ps_flow_small.yaml @@ -0,0 +1,42 @@ +base_config: ./ps_flow.yaml + +########################### +# models +########################### +# encoders +hidden_size: 128 +ffn_hidden_size: 512 +enc_ffn_kernel_size: 3 +enc_layers: 3 +word_enc_layers: 3 + +# dur predictor +dur_predictor_layers: 3 +dur_predictor_kernel: 5 +predictor_dropout: 
0.2 + +## fvae +use_fvae: true +latent_size: 16 +fvae_encoder_type: wn +fvae_decoder_type: wn +fvae_enc_dec_hidden: 128 +fvae_kernel_size: 3 +fvae_enc_n_layers: 8 +fvae_dec_n_layers: 3 +fvae_strides: 4 +fvae_noise_scale: 1.0 + + +# prior flow +use_prior_flow: true +prior_flow_hidden: 32 +prior_flow_kernel_size: 3 +prior_flow_n_blocks: 3 +# post flow +post_glow_hidden: 128 +post_glow_kernel_size: 3 +post_glow_n_blocks: 8 +post_glow_n_block_layers: 3 +share_wn_layers: 4 +noise_scale: 0.6 \ No newline at end of file diff --git a/egs/egs_bases/tts/synta.yaml b/egs/egs_bases/tts/synta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45c8e4ddeb80d69dc807d7869235e56fa61a6e9f --- /dev/null +++ b/egs/egs_bases/tts/synta.yaml @@ -0,0 +1,20 @@ +base_config: ./ps.yaml +task_cls: tasks.tts.synta.SyntaSpeechTask + +use_post_flow: true +detach_postflow_input: true +post_flow_lr: 0.001 +post_glow_hidden: 192 +post_glow_kernel_size: 3 +post_glow_n_blocks: 12 +post_glow_n_block_layers: 3 +post_share_cond_layers: false +share_wn_layers: 4 +use_cond_proj: false +use_latent_cond: false +use_txt_cond: true +sigmoid_scale: false +post_glow_training_start: 160000 +noise_scale: 0.8 +infer_post_glow: true +two_stage: true \ No newline at end of file diff --git a/egs/egs_bases/tts/vocoder/base.yaml b/egs/egs_bases/tts/vocoder/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7adee14286b0c2377ce88670b1dab787ea8a2c9 --- /dev/null +++ b/egs/egs_bases/tts/vocoder/base.yaml @@ -0,0 +1,20 @@ +base_config: + - egs/egs_bases/config_base.yaml + - ../dataset_params.yaml +binarization_args: + with_wav: true + with_spk_embed: false + with_align: false + +generator_grad_norm: 10.0 # Generator's gradient norm. +discriminator_grad_norm: 1.0 # Discriminator's gradient norm. + +########### +# train and eval +########### +max_samples: 20480 +max_sentences: 8 +max_valid_sentences: 1 +max_updates: 2000000 +val_check_interval: 5000 +valid_infer_interval: 50000 diff --git a/egs/egs_bases/tts/vocoder/hifigan.yaml b/egs/egs_bases/tts/vocoder/hifigan.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb77ba5104bf9d5b7ce9282f626bf56e957fbfe8 --- /dev/null +++ b/egs/egs_bases/tts/vocoder/hifigan.yaml @@ -0,0 +1,28 @@ +base_config: ./base.yaml +task_cls: tasks.vocoder.hifigan.HifiGanTask +resblock: "1" +adam_b1: 0.8 +adam_b2: 0.99 +upsample_rates: [ 8,8,2,2 ] +upsample_kernel_sizes: [ 16,16,4,4 ] +upsample_initial_channel: 512 +resblock_kernel_sizes: [ 3,7,11 ] +resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ] + +use_pitch_embed: false +use_fm_loss: false +use_ms_stft: false + +lambda_mel: 5.0 +lambda_mel_adv: 1.0 +lambda_cdisc: 4.0 +lambda_adv: 1.0 + +lr: 0.0002 # Generator's learning rate. 
+generator_scheduler_params: + step_size: 600 + gamma: 0.999 +discriminator_scheduler_params: + step_size: 600 + gamma: 0.999 +max_updates: 3000000 \ No newline at end of file diff --git a/inference/tts/__pycache__/base_tts_infer.cpython-36.pyc b/inference/tts/__pycache__/base_tts_infer.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07e0149e2d7517ea65fa03ad0df3c205640068dd Binary files /dev/null and b/inference/tts/__pycache__/base_tts_infer.cpython-36.pyc differ diff --git a/inference/tts/base_tts_infer.py b/inference/tts/base_tts_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..c11388e15010d836ff125c262c35d85ea4024d4f --- /dev/null +++ b/inference/tts/base_tts_infer.py @@ -0,0 +1,120 @@ +import os + +import torch + +from modules.vocoder.hifigan.hifigan import HifiGanGenerator +from tasks.tts.dataset_utils import FastSpeechWordDataset +from tasks.tts.tts_utils import load_data_preprocessor +from utils.commons.ckpt_utils import load_ckpt +from utils.commons.hparams import set_hparams + + +class BaseTTSInfer: + def __init__(self, hparams, device=None): + if device is None: + device = 'cuda' if torch.cuda.is_available() else 'cpu' + self.hparams = hparams + self.device = device + self.data_dir = hparams['binary_data_dir'] + self.preprocessor, self.preprocess_args = load_data_preprocessor() + self.ph_encoder, self.word_encoder = self.preprocessor.load_dict(self.data_dir) + self.spk_map = self.preprocessor.load_spk_map(self.data_dir) + self.ds_cls = FastSpeechWordDataset + self.model = self.build_model() + self.model.eval() + self.model.to(self.device) + self.vocoder = self.build_vocoder() + self.vocoder.eval() + self.vocoder.to(self.device) + + def build_model(self): + raise NotImplementedError + + def forward_model(self, inp): + raise NotImplementedError + + def build_vocoder(self): + base_dir = self.hparams['vocoder_ckpt'] + config_path = f'{base_dir}/config.yaml' + config = set_hparams(config_path, global_hparams=False) + vocoder = HifiGanGenerator(config) + load_ckpt(vocoder, base_dir, 'model_gen') + return vocoder + + def run_vocoder(self, c): + c = c.transpose(2, 1) + y = self.vocoder(c)[:, 0] + return y + + def preprocess_input(self, inp): + """ + + :param inp: {'text': str, 'item_name': (str, optional), 'spk_name': (str, optional)} + :return: + """ + preprocessor, preprocess_args = self.preprocessor, self.preprocess_args + text_raw = inp['text'] + item_name = inp.get('item_name', '') + spk_name = inp.get('spk_name', '') + ph, txt, word, ph2word, ph_gb_word = preprocessor.txt_to_ph( + preprocessor.txt_processor, text_raw, preprocess_args) + word_token = self.word_encoder.encode(word) + ph_token = self.ph_encoder.encode(ph) + spk_id = self.spk_map[spk_name] + item = {'item_name': item_name, 'text': txt, 'ph': ph, 'spk_id': spk_id, + 'ph_token': ph_token, 'word_token': word_token, 'ph2word': ph2word, + 'ph_words':ph_gb_word, 'words': word} + item['ph_len'] = len(item['ph_token']) + return item + + def input_to_batch(self, item): + item_names = [item['item_name']] + text = [item['text']] + ph = [item['ph']] + txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device) + txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device) + word_tokens = torch.LongTensor(item['word_token'])[None, :].to(self.device) + word_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device) + ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device) + spk_ids = torch.LongTensor(item['spk_id'])[None, 
:].to(self.device) + batch = { + 'item_name': item_names, + 'text': text, + 'ph': ph, + 'txt_tokens': txt_tokens, + 'txt_lengths': txt_lengths, + 'word_tokens': word_tokens, + 'word_lengths': word_lengths, + 'ph2word': ph2word, + 'spk_ids': spk_ids, + } + return batch + + def postprocess_output(self, output): + return output + + def infer_once(self, inp): + inp = self.preprocess_input(inp) + output = self.forward_model(inp) + output = self.postprocess_output(output) + return output + + @classmethod + def example_run(cls): + from utils.commons.hparams import set_hparams + from utils.commons.hparams import hparams as hp + from utils.audio.io import save_wav + + set_hparams() + if hp['ds_name'] in ['ljspeech', 'libritts']: + inp = { + 'text': 'the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.' + } + elif hp['ds_name'] in ['biaobei']: + inp = { + 'text': '如果我想你三遍,天上乌云就散一片。' + } + infer_ins = cls(hp) + out = infer_ins.infer_once(inp) + os.makedirs('infer_out', exist_ok=True) + save_wav(out, f'infer_out/example_out.wav', hp['audio_sample_rate']) diff --git a/inference/tts/ds.py b/inference/tts/ds.py new file mode 100644 index 0000000000000000000000000000000000000000..04b5b4925bfcbfc0e05732054fd3746f1e89bf02 --- /dev/null +++ b/inference/tts/ds.py @@ -0,0 +1,30 @@ +import torch +# from inference.tts.fs import FastSpeechInfer +# from modules.tts.fs2_orig import FastSpeech2Orig +from inference.tts.base_tts_infer import BaseTTSInfer +from modules.tts.diffspeech.shallow_diffusion_tts import GaussianDiffusion +from utils.commons.ckpt_utils import load_ckpt +from utils.commons.hparams import hparams + + +class DiffSpeechInfer(BaseTTSInfer): + def build_model(self): + dict_size = len(self.ph_encoder) + model = GaussianDiffusion(dict_size, self.hparams) + model.eval() + load_ckpt(model, hparams['work_dir'], 'model') + return model + + def forward_model(self, inp): + sample = self.input_to_batch(inp) + txt_tokens = sample['txt_tokens'] # [B, T_t] + spk_id = sample.get('spk_ids') + with torch.no_grad(): + output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True) + mel_out = output['mel_out'] + wav_out = self.run_vocoder(mel_out) + wav_out = wav_out.cpu().numpy() + return wav_out[0] + +if __name__ == '__main__': + DiffSpeechInfer.example_run() diff --git a/inference/tts/fs.py b/inference/tts/fs.py new file mode 100644 index 0000000000000000000000000000000000000000..ee7beb321b699e92e3ad72e9959a093ce65deb12 --- /dev/null +++ b/inference/tts/fs.py @@ -0,0 +1,29 @@ +import torch +from inference.tts.base_tts_infer import BaseTTSInfer +from modules.tts.fs import FastSpeech +from utils.commons.ckpt_utils import load_ckpt +from utils.commons.hparams import hparams + + +class FastSpeechInfer(BaseTTSInfer): + def build_model(self): + dict_size = len(self.ph_encoder) + model = FastSpeech(dict_size, self.hparams) + model.eval() + load_ckpt(model, hparams['work_dir'], 'model') + return model + + def forward_model(self, inp): + sample = self.input_to_batch(inp) + txt_tokens = sample['txt_tokens'] # [B, T_t] + spk_id = sample.get('spk_ids') + with torch.no_grad(): + output = self.model(txt_tokens, spk_id=spk_id, infer=True) + mel_out = output['mel_out'] + wav_out = self.run_vocoder(mel_out) + wav_out = wav_out.cpu().numpy() + return wav_out[0] + + +if __name__ == '__main__': + FastSpeechInfer.example_run() diff --git a/inference/tts/fs2_orig.py b/inference/tts/fs2_orig.py new file mode 100644 index
0000000000000000000000000000000000000000..fe2665d451d5a36c47ffbf815b3d19876882bd91 --- /dev/null +++ b/inference/tts/fs2_orig.py @@ -0,0 +1,17 @@ +from inference.tts.fs import FastSpeechInfer +from modules.tts.fs2_orig import FastSpeech2Orig +from utils.commons.ckpt_utils import load_ckpt +from utils.commons.hparams import hparams + + +class FastSpeech2OrigInfer(FastSpeechInfer): + def build_model(self): + dict_size = len(self.ph_encoder) + model = FastSpeech2Orig(dict_size, self.hparams) + model.eval() + load_ckpt(model, hparams['work_dir'], 'model') + return model + + +if __name__ == '__main__': + FastSpeech2OrigInfer.example_run() diff --git a/inference/tts/gradio/gradio_settings.yaml b/inference/tts/gradio/gradio_settings.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5edb18bfe896a0149b6adb0d2f2dc7ebe9df4ee2 --- /dev/null +++ b/inference/tts/gradio/gradio_settings.yaml @@ -0,0 +1,12 @@ +title: 'yerfor/SyntaSpeech' +description: | + Gradio demo for yerfor/SyntaSpeech. To use it, simply enter your text, or click one of the examples to load it. Note: This space is running on CPU, so inference times will be higher. +article: | + Link to Github REPO +example_inputs: + - |- + the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing. + - |- + produced the block books, which were the immediate predecessors of the true printed book, +inference_cls: inference.tts.synta.SyntaSpeechInfer +exp_name: lj_synta \ No newline at end of file diff --git a/inference/tts/gradio/infer.py b/inference/tts/gradio/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..7a48747aecbeed2b426ad03e3294454682fd1d06 --- /dev/null +++ b/inference/tts/gradio/infer.py @@ -0,0 +1,69 @@ +import importlib +import re + +import gradio as gr +import yaml +from gradio.inputs import Textbox + +from inference.tts.base_tts_infer import BaseTTSInfer +from utils.commons.hparams import set_hparams +from utils.commons.hparams import hparams as hp +import numpy as np + +from utils.text.text_encoder import PUNCS + + +class GradioInfer: + def __init__(self, exp_name, inference_cls, title, description, article, example_inputs): + self.exp_name = exp_name + self.title = title + self.description = description + self.article = article + self.example_inputs = example_inputs + pkg = ".".join(inference_cls.split(".")[:-1]) + cls_name = inference_cls.split(".")[-1] + self.inference_cls = getattr(importlib.import_module(pkg), cls_name) + + def greet(self, text): + sents = re.split(rf'([{PUNCS}])', text.replace('\n', ',')) + if sents[-1] not in list(PUNCS): + sents = sents + ['.'] + audio_outs = [] + s = "" + for i in range(0, len(sents), 2): + if len(sents[i]) > 0: + s += sents[i] + sents[i + 1] + if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0): + audio_out = self.infer_ins.infer_once({ + 'text': s + }) + audio_out = audio_out * 32767 + audio_out = audio_out.astype(np.int16) + audio_outs.append(audio_out) + audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16)) + s = "" + audio_outs = np.concatenate(audio_outs) + return hp['audio_sample_rate'], audio_outs + + def run(self): + set_hparams(exp_name=self.exp_name) + infer_cls = self.inference_cls + self.infer_ins: BaseTTSInfer = infer_cls(hp) + example_inputs = self.example_inputs + iface = gr.Interface(fn=self.greet, + inputs=Textbox( + lines=10, placeholder=None, default=example_inputs[0], label="input text"), + outputs="audio", +
allow_flagging="never", + title=self.title, + description=self.description, + article=self.article, + examples=example_inputs, + enable_queue=True) + iface.launch(share=True,cache_examples=True) + + +if __name__ == '__main__': + gradio_config = yaml.safe_load(open('inference/tts/gradio/gradio_settings.yaml')) + g = GradioInfer(**gradio_config) + g.run() diff --git a/inference/tts/ps_flow.py b/inference/tts/ps_flow.py new file mode 100644 index 0000000000000000000000000000000000000000..ea5804ad0ff75e62e0c21f649f3dc51d1270a1e3 --- /dev/null +++ b/inference/tts/ps_flow.py @@ -0,0 +1,39 @@ +import torch +from inference.tts.base_tts_infer import BaseTTSInfer +from modules.tts.portaspeech.portaspeech_flow import PortaSpeechFlow +from utils.commons.ckpt_utils import load_ckpt +from utils.commons.hparams import hparams + + +class PortaSpeechFlowInfer(BaseTTSInfer): + def build_model(self): + ph_dict_size = len(self.ph_encoder) + word_dict_size = len(self.word_encoder) + model = PortaSpeechFlow(ph_dict_size, word_dict_size, self.hparams) + load_ckpt(model, hparams['work_dir'], 'model') + model.to(self.device) + with torch.no_grad(): + model.store_inverse_all() + model.eval() + return model + + def forward_model(self, inp): + sample = self.input_to_batch(inp) + with torch.no_grad(): + output = self.model( + sample['txt_tokens'], + sample['word_tokens'], + ph2word=sample['ph2word'], + word_len=sample['word_lengths'].max(), + infer=True, + forward_post_glow=True, + spk_id=sample.get('spk_ids') + ) + mel_out = output['mel_out'] + wav_out = self.run_vocoder(mel_out) + wav_out = wav_out.cpu().numpy() + return wav_out[0] + + +if __name__ == '__main__': + PortaSpeechFlowInfer.example_run() diff --git a/inference/tts/synta.py b/inference/tts/synta.py new file mode 100644 index 0000000000000000000000000000000000000000..009ef36d4bfd72ccd6c43bb9a47a350708503722 --- /dev/null +++ b/inference/tts/synta.py @@ -0,0 +1,76 @@ +import torch +from inference.tts.base_tts_infer import BaseTTSInfer +from modules.tts.syntaspeech.syntaspeech import SyntaSpeech +from utils.commons.ckpt_utils import load_ckpt +from utils.commons.hparams import hparams + +from modules.tts.syntaspeech.syntactic_graph_buider import Sentence2GraphParser + +class SyntaSpeechInfer(BaseTTSInfer): + def __init__(self, hparams, device=None): + super().__init__(hparams, device) + if hparams['ds_name'] in ['biaobei']: + self.syntactic_graph_builder = Sentence2GraphParser(language='zh') + elif hparams['ds_name'] in ['ljspeech', 'libritts']: + self.syntactic_graph_builder = Sentence2GraphParser(language='en') + + def build_model(self): + ph_dict_size = len(self.ph_encoder) + word_dict_size = len(self.word_encoder) + model = SyntaSpeech(ph_dict_size, word_dict_size, self.hparams) + load_ckpt(model, hparams['work_dir'], 'model') + model.to(self.device) + with torch.no_grad(): + model.store_inverse_all() + model.eval() + return model + + def input_to_batch(self, item): + item_names = [item['item_name']] + text = [item['text']] + ph = [item['ph']] + txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device) + txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device) + word_tokens = torch.LongTensor(item['word_token'])[None, :].to(self.device) + word_lengths = torch.LongTensor([word_tokens.shape[1]]).to(self.device) + ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device) + spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device) + dgl_graph, etypes = self.syntactic_graph_builder.parse(item['text'], 
words=item['words'].split(" "), ph_words=item['ph_words'].split(" ")) + dgl_graph = dgl_graph.to(self.device) + etypes = etypes.to(self.device) + batch = { + 'item_name': item_names, + 'text': text, + 'ph': ph, + 'txt_tokens': txt_tokens, + 'txt_lengths': txt_lengths, + 'word_tokens': word_tokens, + 'word_lengths': word_lengths, + 'ph2word': ph2word, + 'spk_ids': spk_ids, + 'graph_lst': [dgl_graph], + 'etypes_lst': [etypes] + } + return batch + def forward_model(self, inp): + sample = self.input_to_batch(inp) + with torch.no_grad(): + output = self.model( + sample['txt_tokens'], + sample['word_tokens'], + ph2word=sample['ph2word'], + word_len=sample['word_lengths'].max(), + infer=True, + forward_post_glow=True, + spk_id=sample.get('spk_ids'), + graph_lst=sample['graph_lst'], + etypes_lst=sample['etypes_lst'] + ) + mel_out = output['mel_out'] + wav_out = self.run_vocoder(mel_out) + wav_out = wav_out.cpu().numpy() + return wav_out[0] + + +if __name__ == '__main__': + SyntaSpeechInfer.example_run() diff --git a/modules/commons/__pycache__/conv.cpython-36.pyc b/modules/commons/__pycache__/conv.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47e91c2ee5b3c0dd215d980d55a835c7964d22ac Binary files /dev/null and b/modules/commons/__pycache__/conv.cpython-36.pyc differ diff --git a/modules/commons/__pycache__/conv.cpython-37.pyc b/modules/commons/__pycache__/conv.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e47f66ce62d79ee0c2cf68b18a11117406b144e0 Binary files /dev/null and b/modules/commons/__pycache__/conv.cpython-37.pyc differ diff --git a/modules/commons/__pycache__/layers.cpython-36.pyc b/modules/commons/__pycache__/layers.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47b6a87ab2547cd707a9dd92439feb9c4c4db308 Binary files /dev/null and b/modules/commons/__pycache__/layers.cpython-36.pyc differ diff --git a/modules/commons/__pycache__/layers.cpython-37.pyc b/modules/commons/__pycache__/layers.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b17eddaa645fa6c29e25a9b0eb54414396b5a23d Binary files /dev/null and b/modules/commons/__pycache__/layers.cpython-37.pyc differ diff --git a/modules/commons/__pycache__/nar_tts_modules.cpython-36.pyc b/modules/commons/__pycache__/nar_tts_modules.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d170f292575bc3fc71503be98d9591b20b481fa8 Binary files /dev/null and b/modules/commons/__pycache__/nar_tts_modules.cpython-36.pyc differ diff --git a/modules/commons/__pycache__/nar_tts_modules.cpython-37.pyc b/modules/commons/__pycache__/nar_tts_modules.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..872ccadb79e692ed4fba1c189f80e7dd122cc8f5 Binary files /dev/null and b/modules/commons/__pycache__/nar_tts_modules.cpython-37.pyc differ diff --git a/modules/commons/__pycache__/rel_transformer.cpython-36.pyc b/modules/commons/__pycache__/rel_transformer.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6aec50c275ad7f9bf7bd85dabdd5b92a3a0af954 Binary files /dev/null and b/modules/commons/__pycache__/rel_transformer.cpython-36.pyc differ diff --git a/modules/commons/__pycache__/rel_transformer.cpython-37.pyc b/modules/commons/__pycache__/rel_transformer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe8b47ae228b784f7fea651ae7d32effcd0105c3 Binary files /dev/null and 
b/modules/commons/__pycache__/rel_transformer.cpython-37.pyc differ diff --git a/modules/commons/__pycache__/rnn.cpython-36.pyc b/modules/commons/__pycache__/rnn.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69ab44637e5e68747762e6ce6618cf4cec49e96c Binary files /dev/null and b/modules/commons/__pycache__/rnn.cpython-36.pyc differ diff --git a/modules/commons/__pycache__/rnn.cpython-37.pyc b/modules/commons/__pycache__/rnn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a65b4ae7599969124a4da50d053934efe45e7b2 Binary files /dev/null and b/modules/commons/__pycache__/rnn.cpython-37.pyc differ diff --git a/modules/commons/__pycache__/transformer.cpython-36.pyc b/modules/commons/__pycache__/transformer.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4aaede9972ce7f7cb8cfd7dc4c8d834a9d9450d8 Binary files /dev/null and b/modules/commons/__pycache__/transformer.cpython-36.pyc differ diff --git a/modules/commons/__pycache__/transformer.cpython-37.pyc b/modules/commons/__pycache__/transformer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..830363326d20edcfd27e3eb42a830d180d29bb20 Binary files /dev/null and b/modules/commons/__pycache__/transformer.cpython-37.pyc differ diff --git a/modules/commons/__pycache__/wavenet.cpython-36.pyc b/modules/commons/__pycache__/wavenet.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bce88086195964f7a5534440e03c77c68c346b4b Binary files /dev/null and b/modules/commons/__pycache__/wavenet.cpython-36.pyc differ diff --git a/modules/commons/__pycache__/wavenet.cpython-37.pyc b/modules/commons/__pycache__/wavenet.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e1f39c5aae04c5ef7b1d6a5ceaca618a75eb5e1 Binary files /dev/null and b/modules/commons/__pycache__/wavenet.cpython-37.pyc differ diff --git a/modules/commons/conformer/conformer.py b/modules/commons/conformer/conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..21e1ecdda7ec069864d3904abb4360ec5aee637e --- /dev/null +++ b/modules/commons/conformer/conformer.py @@ -0,0 +1,72 @@ +from torch import nn +from .espnet_positional_embedding import RelPositionalEncoding +from .espnet_transformer_attn import RelPositionMultiHeadedAttention +from .layers import Swish, ConvolutionModule, EncoderLayer, MultiLayeredConv1d +from ..layers import Embedding + + +class ConformerLayers(nn.Module): + def __init__(self, hidden_size, num_layers, kernel_size=9, dropout=0.0, num_heads=4, + use_last_norm=True, save_hidden=False): + super().__init__() + self.use_last_norm = use_last_norm + self.layers = nn.ModuleList() + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (hidden_size, hidden_size * 4, 1, dropout) + self.pos_embed = RelPositionalEncoding(hidden_size, dropout) + self.encoder_layers = nn.ModuleList([EncoderLayer( + hidden_size, + RelPositionMultiHeadedAttention(num_heads, hidden_size, 0.0), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args), + ConvolutionModule(hidden_size, kernel_size, Swish()), + dropout, + ) for _ in range(num_layers)]) + if self.use_last_norm: + self.layer_norm = nn.LayerNorm(hidden_size) + else: + self.layer_norm = nn.Linear(hidden_size, hidden_size) + self.save_hidden = save_hidden + if save_hidden: + self.hiddens = [] + + def forward(self, x, padding_mask=None): + """ + + :param x: [B, T, H] + :param 
padding_mask: [B, T] + :return: [B, T, H] + """ + self.hiddens = [] + nonpadding_mask = x.abs().sum(-1) > 0 + x = self.pos_embed(x) + for l in self.encoder_layers: + x, mask = l(x, nonpadding_mask[:, None, :]) + if self.save_hidden: + self.hiddens.append(x[0]) + x = x[0] + x = self.layer_norm(x) * nonpadding_mask.float()[:, :, None] + return x + + +class ConformerEncoder(ConformerLayers): + def __init__(self, hidden_size, dict_size, num_layers=None): + conformer_enc_kernel_size = 9 + super().__init__(hidden_size, num_layers, conformer_enc_kernel_size) + self.embed = Embedding(dict_size, hidden_size, padding_idx=0) + + def forward(self, x): + """ + + :param src_tokens: [B, T] + :return: [B x T x C] + """ + x = self.embed(x) # [B, T, H] + x = super(ConformerEncoder, self).forward(x) + return x + + +class ConformerDecoder(ConformerLayers): + def __init__(self, hidden_size, num_layers): + conformer_dec_kernel_size = 9 + super().__init__(hidden_size, num_layers, conformer_dec_kernel_size) diff --git a/modules/commons/conformer/espnet_positional_embedding.py b/modules/commons/conformer/espnet_positional_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..89b9b5549cc779d1ea67f052b1c99cad92365503 --- /dev/null +++ b/modules/commons/conformer/espnet_positional_embedding.py @@ -0,0 +1,113 @@ +import math +import torch + + +class PositionalEncoding(torch.nn.Module): + """Positional encoding. + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + reverse (bool): Whether to reverse the input position. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False): + """Construct an PositionalEncoding object.""" + super(PositionalEncoding, self).__init__() + self.d_model = d_model + self.reverse = reverse + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + if self.reverse: + position = torch.arange( + x.size(1) - 1, -1, -1.0, dtype=torch.float32 + ).unsqueeze(1) + else: + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) + * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor): + """Add positional encoding. + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). + Returns: + torch.Tensor: Encoded tensor (batch, time, `*`). + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class ScaledPositionalEncoding(PositionalEncoding): + """Scaled positional encoding module. + See Sec. 3.2 https://arxiv.org/abs/1809.08895 + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. 
+ """ + + def __init__(self, d_model, dropout_rate, max_len=5000): + """Initialize class.""" + super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len) + self.alpha = torch.nn.Parameter(torch.tensor(1.0)) + + def reset_parameters(self): + """Reset parameters.""" + self.alpha.data = torch.tensor(1.0) + + def forward(self, x): + """Add positional encoding. + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). + Returns: + torch.Tensor: Encoded tensor (batch, time, `*`). + """ + self.extend_pe(x) + x = x + self.alpha * self.pe[:, : x.size(1)] + return self.dropout(x) + + +class RelPositionalEncoding(PositionalEncoding): + """Relative positional encoding module. + See : Appendix B in https://arxiv.org/abs/1901.02860 + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000): + """Initialize class.""" + super().__init__(d_model, dropout_rate, max_len, reverse=True) + + def forward(self, x): + """Compute positional encoding. + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). + Returns: + torch.Tensor: Encoded tensor (batch, time, `*`). + torch.Tensor: Positional embedding tensor (1, time, `*`). + """ + self.extend_pe(x) + x = x * self.xscale + pos_emb = self.pe[:, : x.size(1)] + return self.dropout(x), self.dropout(pos_emb) \ No newline at end of file diff --git a/modules/commons/conformer/espnet_transformer_attn.py b/modules/commons/conformer/espnet_transformer_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..a479a27ea6fd4202359da435234408ba074f7577 --- /dev/null +++ b/modules/commons/conformer/espnet_transformer_attn.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 Shigeki Karita +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Multi-Head Attention layer definition.""" + +import math + +import numpy +import torch +from torch import nn + + +class MultiHeadedAttention(nn.Module): + """Multi-Head Attention layer. + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + + def __init__(self, n_head, n_feat, dropout_rate): + """Construct an MultiHeadedAttention object.""" + super(MultiHeadedAttention, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, query, key, value): + """Transform query, key and value. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). 
+ """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + return q, k, v + + def forward_attention(self, value, scores, mask): + """Compute attention context vector. + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + """ + n_batch = value.size(0) + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + min_value = float( + numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min + ) + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, query, key, value, mask): + """Compute scaled dot product attention. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + + def __init__(self, n_head, n_feat, dropout_rate): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + # linear transformation for positional ecoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def rel_shift(self, x, zero_triu=False): + """Compute relative positinal encoding. + Args: + x (torch.Tensor): Input tensor (batch, time, size). + zero_triu (bool): If true, return the lower triangular part of the matrix. + Returns: + torch.Tensor: Output tensor. 
+ """ + zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(2), x.size(3))) + x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + pos_emb (torch.Tensor): Positional embedding tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, time1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, time2) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k + ) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask) diff --git a/modules/commons/conformer/layers.py b/modules/commons/conformer/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..cd7f501667e0b8aa816373d843adc816748e73a8 --- /dev/null +++ b/modules/commons/conformer/layers.py @@ -0,0 +1,260 @@ +from torch import nn +import torch + +from modules.commons.layers import LayerNorm + + +class ConvolutionModule(nn.Module): + """ConvolutionModule in Conformer model. + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernerl size of conv layers. + """ + + def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): + """Construct an ConvolutionModule object.""" + super(ConvolutionModule, self).__init__() + # kernerl_size should be a odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + self.pointwise_conv1 = nn.Conv1d( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.depthwise_conv = nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + groups=channels, + bias=bias, + ) + self.norm = nn.BatchNorm1d(channels) + self.pointwise_conv2 = nn.Conv1d( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.activation = activation + + def forward(self, x): + """Compute convolution module. + Args: + x (torch.Tensor): Input tensor (#batch, time, channels). + Returns: + torch.Tensor: Output tensor (#batch, time, channels). 
+ """ + # exchange the temporal dimension and the feature dimension + x = x.transpose(1, 2) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, dim=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + + x = self.pointwise_conv2(x) + + return x.transpose(1, 2) + + +class MultiLayeredConv1d(torch.nn.Module): + """Multi-layered conv1d for Transformer block. + This is a module of multi-leyered conv1d designed + to replace positionwise feed-forward network + in Transforner block, which is introduced in + `FastSpeech: Fast, Robust and Controllable Text to Speech`_. + .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + """ + + def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): + """Initialize MultiLayeredConv1d module. + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. + """ + super(MultiLayeredConv1d, self).__init__() + self.w_1 = torch.nn.Conv1d( + in_chans, + hidden_chans, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + ) + self.w_2 = torch.nn.Conv1d( + hidden_chans, + in_chans, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + ) + self.dropout = torch.nn.Dropout(dropout_rate) + + def forward(self, x): + """Calculate forward propagation. + Args: + x (torch.Tensor): Batch of input tensors (B, T, in_chans). + Returns: + torch.Tensor: Batch of output tensors (B, T, hidden_chans). + """ + x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) + return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1) + + +class Swish(torch.nn.Module): + """Construct an Swish object.""" + + def forward(self, x): + """Return Swich activation function.""" + return x * torch.sigmoid(x) + + +class EncoderLayer(nn.Module): + """Encoder layer module. + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module (torch.nn.Module): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. 
x -> x + att(x) + """ + + def __init__( + self, + size, + self_attn, + feed_forward, + feed_forward_macaron, + conv_module, + dropout_rate, + normalize_before=True, + concat_after=False, + ): + """Construct an EncoderLayer object.""" + super(EncoderLayer, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.feed_forward_macaron = feed_forward_macaron + self.conv_module = conv_module + self.norm_ff = LayerNorm(size) # for the FNN module + self.norm_mha = LayerNorm(size) # for the MHA module + if feed_forward_macaron is not None: + self.norm_ff_macaron = LayerNorm(size) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + if self.conv_module is not None: + self.norm_conv = LayerNorm(size) # for the CNN module + self.norm_final = LayerNorm(size) # for the final output of the block + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + + def forward(self, x_input, mask, cache=None): + """Compute encoded features. + Args: + x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). + """ + if isinstance(x_input, tuple): + x, pos_emb = x_input[0], x_input[1] + else: + x, pos_emb = x_input, None + + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + + if cache is None: + x_q = x + else: + assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask = None if mask is None else mask[:, -1:, :] + + if pos_emb is not None: + x_att = self.self_attn(x_q, x, x, pos_emb, mask) + else: + x_att = self.self_attn(x_q, x, x, mask) + + if self.concat_after: + x_concat = torch.cat((x, x_att), dim=-1) + x = residual + self.concat_linear(x_concat) + else: + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + x = residual + self.dropout(self.conv_module(x)) + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + if pos_emb is not None: + return (x, pos_emb), mask + + return x, mask diff --git a/modules/commons/conv.py b/modules/commons/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..c67d90ebf971e54ae57d08750041a698268042db --- /dev/null +++ b/modules/commons/conv.py @@ -0,0 +1,167 @@ +import math +import torch +import 
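Before the next file, a small hedged sketch of the macaron-style residual pattern `EncoderLayer` implements above: two feed-forward modules sandwich the attention and convolution blocks, each contributing half of its output to the residual stream (`ff_scale = 0.5`) under pre-norm. The plain linear layers here are stand-ins for the actual feed-forward modules.

```python
import torch
import torch.nn as nn

size = 16
ffn_macaron, ffn = nn.Linear(size, size), nn.Linear(size, size)
norm_macaron, norm_ff = nn.LayerNorm(size), nn.LayerNorm(size)

x = torch.randn(2, 10, size)
x = x + 0.5 * ffn_macaron(norm_macaron(x))   # half-step FFN before attention
# ... self-attention and convolution blocks would go here ...
x = x + 0.5 * ffn(norm_ff(x))                # half-step FFN after them
print(x.shape)                               # torch.Size([2, 10, 16])
```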
torch.nn as nn +import torch.nn.functional as F + +from modules.commons.layers import LayerNorm, Embedding + + +class LambdaLayer(nn.Module): + def __init__(self, lambd): + super(LambdaLayer, self).__init__() + self.lambd = lambd + + def forward(self, x): + return self.lambd(x) + + +def init_weights_func(m): + classname = m.__class__.__name__ + if classname.find("Conv1d") != -1: + torch.nn.init.xavier_uniform_(m.weight) + + +class ResidualBlock(nn.Module): + """Implements conv->PReLU->norm n-times""" + + def __init__(self, channels, kernel_size, dilation, n=2, norm_type='bn', dropout=0.0, + c_multiple=2, ln_eps=1e-12): + super(ResidualBlock, self).__init__() + + if norm_type == 'bn': + norm_builder = lambda: nn.BatchNorm1d(channels) + elif norm_type == 'in': + norm_builder = lambda: nn.InstanceNorm1d(channels, affine=True) + elif norm_type == 'gn': + norm_builder = lambda: nn.GroupNorm(8, channels) + elif norm_type == 'ln': + norm_builder = lambda: LayerNorm(channels, dim=1, eps=ln_eps) + else: + norm_builder = lambda: nn.Identity() + + self.blocks = [ + nn.Sequential( + norm_builder(), + nn.Conv1d(channels, c_multiple * channels, kernel_size, dilation=dilation, + padding=(dilation * (kernel_size - 1)) // 2), + LambdaLayer(lambda x: x * kernel_size ** -0.5), + nn.GELU(), + nn.Conv1d(c_multiple * channels, channels, 1, dilation=dilation), + ) + for i in range(n) + ] + + self.blocks = nn.ModuleList(self.blocks) + self.dropout = dropout + + def forward(self, x): + nonpadding = (x.abs().sum(1) > 0).float()[:, None, :] + for b in self.blocks: + x_ = b(x) + if self.dropout > 0 and self.training: + x_ = F.dropout(x_, self.dropout, training=self.training) + x = x + x_ + x = x * nonpadding + return x + + +class ConvBlocks(nn.Module): + """Decodes the expanded phoneme encoding into spectrograms""" + + def __init__(self, hidden_size, out_dims, dilations, kernel_size, + norm_type='ln', layers_in_block=2, c_multiple=2, + dropout=0.0, ln_eps=1e-5, + init_weights=True, is_BTC=True, num_layers=None, post_net_kernel=3): + super(ConvBlocks, self).__init__() + self.is_BTC = is_BTC + if num_layers is not None: + dilations = [1] * num_layers + self.res_blocks = nn.Sequential( + *[ResidualBlock(hidden_size, kernel_size, d, + n=layers_in_block, norm_type=norm_type, c_multiple=c_multiple, + dropout=dropout, ln_eps=ln_eps) + for d in dilations], + ) + if norm_type == 'bn': + norm = nn.BatchNorm1d(hidden_size) + elif norm_type == 'in': + norm = nn.InstanceNorm1d(hidden_size, affine=True) + elif norm_type == 'gn': + norm = nn.GroupNorm(8, hidden_size) + elif norm_type == 'ln': + norm = LayerNorm(hidden_size, dim=1, eps=ln_eps) + self.last_norm = norm + self.post_net1 = nn.Conv1d(hidden_size, out_dims, kernel_size=post_net_kernel, + padding=post_net_kernel // 2) + if init_weights: + self.apply(init_weights_func) + + def forward(self, x, nonpadding=None): + """ + + :param x: [B, T, H] + :return: [B, T, H] + """ + if self.is_BTC: + x = x.transpose(1, 2) + if nonpadding is None: + nonpadding = (x.abs().sum(1) > 0).float()[:, None, :] + elif self.is_BTC: + nonpadding = nonpadding.transpose(1, 2) + x = self.res_blocks(x) * nonpadding + x = self.last_norm(x) * nonpadding + x = self.post_net1(x) * nonpadding + if self.is_BTC: + x = x.transpose(1, 2) + return x + + +class TextConvEncoder(ConvBlocks): + def __init__(self, dict_size, hidden_size, out_dims, dilations, kernel_size, + norm_type='ln', layers_in_block=2, c_multiple=2, + dropout=0.0, ln_eps=1e-5, init_weights=True, num_layers=None, post_net_kernel=3): + 
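A hedged sketch of the implicit padding mask used by `ResidualBlock`/`ConvBlocks` above: time steps whose feature vector is all zeros are treated as padding, and the derived mask is re-applied after every residual update so the convolutions cannot write into padded frames. Shapes below are arbitrary.

```python
import torch

x = torch.randn(2, 8, 6)                                  # (batch, channels, time)
x[:, :, 4:] = 0.0                                         # last two frames are padding

nonpadding = (x.abs().sum(1) > 0).float()[:, None, :]     # (batch, 1, time)
print(nonpadding[0, 0])                                   # tensor([1., 1., 1., 1., 0., 0.])

x = (x + torch.randn_like(x)) * nonpadding                # a residual update stays masked
print(x[:, :, 4:].abs().sum())                            # tensor(0.)
```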
super().__init__(hidden_size, out_dims, dilations, kernel_size, + norm_type, layers_in_block, c_multiple, + dropout, ln_eps, init_weights, num_layers=num_layers, + post_net_kernel=post_net_kernel) + self.embed_tokens = Embedding(dict_size, hidden_size, 0) + self.embed_scale = math.sqrt(hidden_size) + + def forward(self, txt_tokens): + """ + + :param txt_tokens: [B, T] + :return: { + 'encoder_out': [B x T x C] + } + """ + x = self.embed_scale * self.embed_tokens(txt_tokens) + return super().forward(x) + + +class ConditionalConvBlocks(ConvBlocks): + def __init__(self, hidden_size, c_cond, c_out, dilations, kernel_size, + norm_type='ln', layers_in_block=2, c_multiple=2, + dropout=0.0, ln_eps=1e-5, init_weights=True, is_BTC=True, num_layers=None): + super().__init__(hidden_size, c_out, dilations, kernel_size, + norm_type, layers_in_block, c_multiple, + dropout, ln_eps, init_weights, is_BTC=False, num_layers=num_layers) + self.g_prenet = nn.Conv1d(c_cond, hidden_size, 3, padding=1) + self.is_BTC_ = is_BTC + if init_weights: + self.g_prenet.apply(init_weights_func) + + def forward(self, x, cond, nonpadding=None): + if self.is_BTC_: + x = x.transpose(1, 2) + cond = cond.transpose(1, 2) + if nonpadding is not None: + nonpadding = nonpadding.transpose(1, 2) + if nonpadding is None: + nonpadding = x.abs().sum(1)[:, None] + x = x + self.g_prenet(cond) + x = x * nonpadding + x = super(ConditionalConvBlocks, self).forward(x) # input needs to be BTC + if self.is_BTC_: + x = x.transpose(1, 2) + return x diff --git a/modules/commons/layers.py b/modules/commons/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..88e1c75876050fa05a768a5ae0467fdfc05bb006 --- /dev/null +++ b/modules/commons/layers.py @@ -0,0 +1,50 @@ +import torch +from torch import nn + + +class LayerNorm(torch.nn.LayerNorm): + """Layer normalization module. + :param int nout: output dim size + :param int dim: dimension to be normalized + """ + + def __init__(self, nout, dim=-1, eps=1e-5): + """Construct an LayerNorm object.""" + super(LayerNorm, self).__init__(nout, eps=eps) + self.dim = dim + + def forward(self, x): + """Apply layer normalization. 
+ :param torch.Tensor x: input tensor + :return: layer normalized tensor + :rtype torch.Tensor + """ + if self.dim == -1: + return super(LayerNorm, self).forward(x) + return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1) + + +class Reshape(nn.Module): + def __init__(self, *args): + super(Reshape, self).__init__() + self.shape = args + + def forward(self, x): + return x.view(self.shape) + + +class Permute(nn.Module): + def __init__(self, *args): + super(Permute, self).__init__() + self.args = args + + def forward(self, x): + return x.permute(self.args) + + +def Embedding(num_embeddings, embedding_dim, padding_idx=None): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) + if padding_idx is not None: + nn.init.constant_(m.weight[padding_idx], 0) + return m diff --git a/modules/commons/nar_tts_modules.py b/modules/commons/nar_tts_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..d3c4b8c4a54572779629833e862811f2630a307a --- /dev/null +++ b/modules/commons/nar_tts_modules.py @@ -0,0 +1,138 @@ +import torch +from torch import nn + +from modules.commons.layers import LayerNorm +import torch.nn.functional as F + +class DurationPredictor(torch.nn.Module): + def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0): + super(DurationPredictor, self).__init__() + self.offset = offset + self.conv = torch.nn.ModuleList() + self.kernel_size = kernel_size + for idx in range(n_layers): + in_chans = idim if idx == 0 else n_chans + self.conv += [torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), + torch.nn.ReLU(), + LayerNorm(n_chans, dim=1), + torch.nn.Dropout(dropout_rate) + )] + self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus()) + + def forward(self, x, x_padding=None): + x = x.transpose(1, -1) # (B, idim, Tmax) + for f in self.conv: + x = f(x) # (B, C, Tmax) + if x_padding is not None: + x = x * (1 - x_padding.float())[:, None, :] + + x = self.linear(x.transpose(1, -1)) # [B, T, C] + x = x * (1 - x_padding.float())[:, :, None] # (B, T, C) + x = x[..., 0] # (B, Tmax) + return x + + +class SyntaDurationPredictor(torch.nn.Module): + def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0): + super(SyntaDurationPredictor, self).__init__() + from modules.tts.syntaspeech.syntactic_graph_encoder import GraphAuxEnc + self.graph_encoder = GraphAuxEnc(in_dim=idim, hid_dim=idim, out_dim=idim) + self.offset = offset + self.conv = torch.nn.ModuleList() + self.kernel_size = kernel_size + for idx in range(n_layers): + in_chans = idim if idx == 0 else n_chans + self.conv += [torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), + torch.nn.ReLU(), + LayerNorm(n_chans, dim=1), + torch.nn.Dropout(dropout_rate) + )] + self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus()) + + def forward(self, x, x_padding=None, ph2word=None, graph_lst=None, etypes_lst=None): + x = x.transpose(1, -1) # (B, idim, Tmax) + assert ph2word is not None and graph_lst is not None and etypes_lst is not None + x_graph = self.graph_encoder(graph_lst, x, ph2word, etypes_lst) + x = x + x_graph * 1. 
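A hedged usage sketch of `DurationPredictor` above; it assumes the repository root is on `PYTHONPATH`, and the sizes are arbitrary. The Softplus head keeps predicted durations nonnegative, and padded tokens are zeroed out.

```python
import torch
from modules.commons.nar_tts_modules import DurationPredictor  # path taken from this diff

dp = DurationPredictor(idim=192, n_chans=256)     # sizes here are arbitrary
x = torch.randn(4, 20, 192)                       # (B, T_txt, idim)
x_padding = torch.zeros(4, 20, dtype=torch.bool)
x_padding[:, 15:] = True                          # last 5 tokens are padding

dur = dp(x, x_padding)                            # (B, T_txt), nonnegative
print(dur.shape, bool((dur >= 0).all()), float(dur[:, 15:].sum()))
```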
+ + for f in self.conv: + x = f(x) # (B, C, Tmax) + if x_padding is not None: + x = x * (1 - x_padding.float())[:, None, :] + + x = self.linear(x.transpose(1, -1)) # [B, T, C] + x = x * (1 - x_padding.float())[:, :, None] # (B, T, C) + x = x[..., 0] # (B, Tmax) + return x + + +class LengthRegulator(torch.nn.Module): + def __init__(self, pad_value=0.0): + super(LengthRegulator, self).__init__() + self.pad_value = pad_value + + def forward(self, dur, dur_padding=None, alpha=1.0): + """ + Example (no batch dim version): + 1. dur = [2,2,3] + 2. token_idx = [[1],[2],[3]], dur_cumsum = [2,4,7], dur_cumsum_prev = [0,2,4] + 3. token_mask = [[1,1,0,0,0,0,0], + [0,0,1,1,0,0,0], + [0,0,0,0,1,1,1]] + 4. token_idx * token_mask = [[1,1,0,0,0,0,0], + [0,0,2,2,0,0,0], + [0,0,0,0,3,3,3]] + 5. (token_idx * token_mask).sum(0) = [1,1,2,2,3,3,3] + + :param dur: Batch of durations of each frame (B, T_txt) + :param dur_padding: Batch of padding of each frame (B, T_txt) + :param alpha: duration rescale coefficient + :return: + mel2ph (B, T_speech) + assert alpha > 0 + """ + dur = torch.round(dur.float() * alpha).long() + if dur_padding is not None: + dur = dur * (1 - dur_padding.long()) + token_idx = torch.arange(1, dur.shape[1] + 1)[None, :, None].to(dur.device) + dur_cumsum = torch.cumsum(dur, 1) + dur_cumsum_prev = F.pad(dur_cumsum, [1, -1], mode='constant', value=0) + + pos_idx = torch.arange(dur.sum(-1).max())[None, None].to(dur.device) + token_mask = (pos_idx >= dur_cumsum_prev[:, :, None]) & (pos_idx < dur_cumsum[:, :, None]) + mel2token = (token_idx * token_mask.long()).sum(1) + return mel2token + + +class PitchPredictor(torch.nn.Module): + def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5, dropout_rate=0.1): + super(PitchPredictor, self).__init__() + self.conv = torch.nn.ModuleList() + self.kernel_size = kernel_size + for idx in range(n_layers): + in_chans = idim if idx == 0 else n_chans + self.conv += [torch.nn.Sequential( + torch.nn.Conv1d(in_chans, n_chans, kernel_size, padding=kernel_size // 2), + torch.nn.ReLU(), + LayerNorm(n_chans, dim=1), + torch.nn.Dropout(dropout_rate) + )] + self.linear = torch.nn.Linear(n_chans, odim) + + def forward(self, x): + """ + + :param x: [B, T, H] + :return: [B, T, H] + """ + x = x.transpose(1, -1) # (B, idim, Tmax) + for f in self.conv: + x = f(x) # (B, C, Tmax) + x = self.linear(x.transpose(1, -1)) # (B, Tmax, H) + return x + + +class EnergyPredictor(PitchPredictor): + pass diff --git a/modules/commons/normalizing_flow/__pycache__/glow_modules.cpython-36.pyc b/modules/commons/normalizing_flow/__pycache__/glow_modules.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15258af683f2bfd466534f50bd07f8a39c7746b8 Binary files /dev/null and b/modules/commons/normalizing_flow/__pycache__/glow_modules.cpython-36.pyc differ diff --git a/modules/commons/normalizing_flow/__pycache__/res_flow.cpython-36.pyc b/modules/commons/normalizing_flow/__pycache__/res_flow.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48a15ac1dababbbffae93bc98e38b0877ab227cf Binary files /dev/null and b/modules/commons/normalizing_flow/__pycache__/res_flow.cpython-36.pyc differ diff --git a/modules/commons/normalizing_flow/__pycache__/res_flow.cpython-37.pyc b/modules/commons/normalizing_flow/__pycache__/res_flow.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48943de6311c55a044997d2c82879de15bdc4a70 Binary files /dev/null and 
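The worked example in the `LengthRegulator` docstring above can be checked directly; this sketch reproduces the same cumulative-sum construction for durations [2, 2, 3].

```python
import torch
import torch.nn.functional as F

dur = torch.tensor([[2, 2, 3]])                       # (B, T_txt)
token_idx = torch.arange(1, dur.shape[1] + 1)[None, :, None]
dur_cumsum = torch.cumsum(dur, 1)                     # [[2, 4, 7]]
dur_cumsum_prev = F.pad(dur_cumsum, [1, -1], value=0) # [[0, 2, 4]]

pos_idx = torch.arange(dur.sum(-1).max())[None, None]
token_mask = (pos_idx >= dur_cumsum_prev[:, :, None]) & (pos_idx < dur_cumsum[:, :, None])
mel2token = (token_idx * token_mask.long()).sum(1)
print(mel2token)                                      # tensor([[1, 1, 2, 2, 3, 3, 3]])
```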
b/modules/commons/normalizing_flow/__pycache__/res_flow.cpython-37.pyc differ diff --git a/modules/commons/normalizing_flow/glow_modules.py b/modules/commons/normalizing_flow/glow_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..c589af0f2eba2b154317912f9ad01a4163b3fd6a --- /dev/null +++ b/modules/commons/normalizing_flow/glow_modules.py @@ -0,0 +1,362 @@ +import scipy +from torch.nn import functional as F +import torch +from torch import nn +import numpy as np +from modules.commons.wavenet import WN +from modules.tts.glow import utils + + +class ActNorm(nn.Module): + def __init__(self, channels, ddi=False, **kwargs): + super().__init__() + self.channels = channels + self.initialized = not ddi + + self.logs = nn.Parameter(torch.zeros(1, channels, 1)) + self.bias = nn.Parameter(torch.zeros(1, channels, 1)) + + def forward(self, x, x_mask=None, reverse=False, **kwargs): + if x_mask is None: + x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype) + x_len = torch.sum(x_mask, [1, 2]) + if not self.initialized: + self.initialize(x, x_mask) + self.initialized = True + + if reverse: + z = (x - self.bias) * torch.exp(-self.logs) * x_mask + logdet = torch.sum(-self.logs) * x_len + else: + z = (self.bias + torch.exp(self.logs) * x) * x_mask + logdet = torch.sum(self.logs) * x_len # [b] + return z, logdet + + def store_inverse(self): + pass + + def set_ddi(self, ddi): + self.initialized = not ddi + + def initialize(self, x, x_mask): + with torch.no_grad(): + denom = torch.sum(x_mask, [0, 2]) + m = torch.sum(x * x_mask, [0, 2]) / denom + m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom + v = m_sq - (m ** 2) + logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) + + bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) + logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype) + + self.bias.data.copy_(bias_init) + self.logs.data.copy_(logs_init) + + +class InvConvNear(nn.Module): + def __init__(self, channels, n_split=4, no_jacobian=False, lu=True, n_sqz=2, **kwargs): + super().__init__() + assert (n_split % 2 == 0) + self.channels = channels + self.n_split = n_split + self.n_sqz = n_sqz + self.no_jacobian = no_jacobian + + w_init = torch.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0] + if torch.det(w_init) < 0: + w_init[:, 0] = -1 * w_init[:, 0] + self.lu = lu + if lu: + # LU decomposition can slightly speed up the inverse + np_p, np_l, np_u = scipy.linalg.lu(w_init) + np_s = np.diag(np_u) + np_sign_s = np.sign(np_s) + np_log_s = np.log(np.abs(np_s)) + np_u = np.triu(np_u, k=1) + l_mask = np.tril(np.ones(w_init.shape, dtype=float), -1) + eye = np.eye(*w_init.shape, dtype=float) + + self.register_buffer('p', torch.Tensor(np_p.astype(float))) + self.register_buffer('sign_s', torch.Tensor(np_sign_s.astype(float))) + self.l = nn.Parameter(torch.Tensor(np_l.astype(float)), requires_grad=True) + self.log_s = nn.Parameter(torch.Tensor(np_log_s.astype(float)), requires_grad=True) + self.u = nn.Parameter(torch.Tensor(np_u.astype(float)), requires_grad=True) + self.register_buffer('l_mask', torch.Tensor(l_mask)) + self.register_buffer('eye', torch.Tensor(eye)) + else: + self.weight = nn.Parameter(w_init) + + def forward(self, x, x_mask=None, reverse=False, **kwargs): + b, c, t = x.size() + assert (c % self.n_split == 0) + if x_mask is None: + x_mask = 1 + x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t + else: + x_len = torch.sum(x_mask, [1, 2]) + + x = x.view(b, self.n_sqz, c // 
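A hedged sketch of `ActNorm`'s data-dependent initialization above: the bias and log-scale are computed from the first (masked) batch so that the forward output starts out roughly zero-mean and unit-variance per channel. The data below is arbitrary.

```python
import torch

x = torch.randn(4, 8, 30) * 3.0 + 1.5            # (batch, channels, time), arbitrary stats
x_mask = torch.ones(4, 1, 30)

denom = x_mask.sum([0, 2])
m = (x * x_mask).sum([0, 2]) / denom                       # per-channel mean
v = (x * x * x_mask).sum([0, 2]) / denom - m ** 2          # per-channel variance
logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6))

bias = -m * torch.exp(-logs)                               # the initialized parameters
z = bias[None, :, None] + torch.exp(-logs)[None, :, None] * x   # forward pass after init
print(round(z.mean().item(), 4), round(z.std().item(), 4))      # ~0.0, ~1.0
```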
self.n_split, self.n_split // self.n_sqz, t) + x = x.permute(0, 1, 3, 2, 4).contiguous().view(b, self.n_split, c // self.n_split, t) + + if self.lu: + self.weight, log_s = self._get_weight() + logdet = log_s.sum() + logdet = logdet * (c / self.n_split) * x_len + else: + logdet = torch.logdet(self.weight) * (c / self.n_split) * x_len # [b] + + if reverse: + if hasattr(self, "weight_inv"): + weight = self.weight_inv + else: + weight = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) + logdet = -logdet + else: + weight = self.weight + if self.no_jacobian: + logdet = 0 + + weight = weight.view(self.n_split, self.n_split, 1, 1) + z = F.conv2d(x, weight) + + z = z.view(b, self.n_sqz, self.n_split // self.n_sqz, c // self.n_split, t) + z = z.permute(0, 1, 3, 2, 4).contiguous().view(b, c, t) * x_mask + return z, logdet + + def _get_weight(self): + l, log_s, u = self.l, self.log_s, self.u + l = l * self.l_mask + self.eye + u = u * self.l_mask.transpose(0, 1).contiguous() + torch.diag(self.sign_s * torch.exp(log_s)) + weight = torch.matmul(self.p, torch.matmul(l, u)) + return weight, log_s + + def store_inverse(self): + weight, _ = self._get_weight() + self.weight_inv = torch.inverse(weight.float()).to(next(self.parameters()).device) + + +class InvConv(nn.Module): + def __init__(self, channels, no_jacobian=False, lu=True, **kwargs): + super().__init__() + w_shape = [channels, channels] + w_init = np.linalg.qr(np.random.randn(*w_shape))[0].astype(float) + LU_decomposed = lu + if not LU_decomposed: + # Sample a random orthogonal matrix: + self.register_parameter("weight", nn.Parameter(torch.Tensor(w_init))) + else: + np_p, np_l, np_u = scipy.linalg.lu(w_init) + np_s = np.diag(np_u) + np_sign_s = np.sign(np_s) + np_log_s = np.log(np.abs(np_s)) + np_u = np.triu(np_u, k=1) + l_mask = np.tril(np.ones(w_shape, dtype=float), -1) + eye = np.eye(*w_shape, dtype=float) + + self.register_buffer('p', torch.Tensor(np_p.astype(float))) + self.register_buffer('sign_s', torch.Tensor(np_sign_s.astype(float))) + self.l = nn.Parameter(torch.Tensor(np_l.astype(float))) + self.log_s = nn.Parameter(torch.Tensor(np_log_s.astype(float))) + self.u = nn.Parameter(torch.Tensor(np_u.astype(float))) + self.l_mask = torch.Tensor(l_mask) + self.eye = torch.Tensor(eye) + self.w_shape = w_shape + self.LU = LU_decomposed + self.weight = None + + def get_weight(self, device, reverse): + w_shape = self.w_shape + self.p = self.p.to(device) + self.sign_s = self.sign_s.to(device) + self.l_mask = self.l_mask.to(device) + self.eye = self.eye.to(device) + l = self.l * self.l_mask + self.eye + u = self.u * self.l_mask.transpose(0, 1).contiguous() + torch.diag(self.sign_s * torch.exp(self.log_s)) + dlogdet = self.log_s.sum() + if not reverse: + w = torch.matmul(self.p, torch.matmul(l, u)) + else: + l = torch.inverse(l.double()).float() + u = torch.inverse(u.double()).float() + w = torch.matmul(u, torch.matmul(l, self.p.inverse())) + return w.view(w_shape[0], w_shape[1], 1), dlogdet + + def forward(self, x, x_mask=None, reverse=False, **kwargs): + """ + log-det = log|abs(|W|)| * pixels + """ + b, c, t = x.size() + if x_mask is None: + x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t + else: + x_len = torch.sum(x_mask, [1, 2]) + logdet = 0 + if not reverse: + weight, dlogdet = self.get_weight(x.device, reverse) + z = F.conv1d(x, weight) + if logdet is not None: + logdet = logdet + dlogdet * x_len + return z, logdet + else: + if self.weight is None: + weight, dlogdet = self.get_weight(x.device, reverse) + else: + weight, 
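A hedged sketch of the LU parameterization used by `InvConvNear`/`InvConv` above: the 1x1-conv weight is rebuilt as P · L · U with a strictly-lower L, strictly-upper U, and a parameterized diagonal, so log|det| is simply `log_s.sum()` and no determinant is needed at run time. The matrix size and data are arbitrary.

```python
import numpy as np
import scipy.linalg
import torch

n = 4
w_init = torch.randn(n, n)                       # arbitrary invertible matrix for the demo
np_p, np_l, np_u = scipy.linalg.lu(w_init.numpy())
np_s = np.diag(np_u)
sign_s = torch.tensor(np.sign(np_s), dtype=torch.float32)
log_s = torch.tensor(np.log(np.abs(np_s)), dtype=torch.float32)
p = torch.tensor(np_p, dtype=torch.float32)
l = torch.tensor(np_l, dtype=torch.float32)
u = torch.tensor(np.triu(np_u, k=1), dtype=torch.float32)

l_mask = torch.tril(torch.ones(n, n), -1)
weight = p @ (l * l_mask + torch.eye(n)) @ (u * l_mask.t() + torch.diag(sign_s * torch.exp(log_s)))

_, logabsdet = torch.linalg.slogdet(weight)
print(torch.allclose(logabsdet, log_s.sum(), atol=1e-4))   # True: log|det| == log_s.sum()
```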
dlogdet = self.weight, self.dlogdet + z = F.conv1d(x, weight) + if logdet is not None: + logdet = logdet - dlogdet * x_len + return z, logdet + + def store_inverse(self): + self.weight, self.dlogdet = self.get_weight('cuda', reverse=True) + + +class CouplingBlock(nn.Module): + def __init__(self, in_channels, hidden_channels, kernel_size, dilation_rate, n_layers, + gin_channels=0, p_dropout=0, sigmoid_scale=False, wn=None): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + self.sigmoid_scale = sigmoid_scale + + start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1) + start = torch.nn.utils.weight_norm(start) + self.start = start + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. This helps with training stability + end = torch.nn.Conv1d(hidden_channels, in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels, p_dropout) + if wn is not None: + self.wn.in_layers = wn.in_layers + self.wn.res_skip_layers = wn.res_skip_layers + + def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): + if x_mask is None: + x_mask = 1 + x_0, x_1 = x[:, :self.in_channels // 2], x[:, self.in_channels // 2:] + + x = self.start(x_0) * x_mask + x = self.wn(x, x_mask, g) + out = self.end(x) + + z_0 = x_0 + m = out[:, :self.in_channels // 2, :] + logs = out[:, self.in_channels // 2:, :] + if self.sigmoid_scale: + logs = torch.log(1e-6 + torch.sigmoid(logs + 2)) + if reverse: + z_1 = (x_1 - m) * torch.exp(-logs) * x_mask + logdet = torch.sum(-logs * x_mask, [1, 2]) + else: + z_1 = (m + torch.exp(logs) * x_1) * x_mask + logdet = torch.sum(logs * x_mask, [1, 2]) + z = torch.cat([z_0, z_1], 1) + return z, logdet + + def store_inverse(self): + self.wn.remove_weight_norm() + + +class Glow(nn.Module): + def __init__(self, + in_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_blocks, + n_layers, + p_dropout=0., + n_split=4, + n_sqz=2, + sigmoid_scale=False, + gin_channels=0, + inv_conv_type='near', + share_cond_layers=False, + share_wn_layers=0, + ): + super().__init__() + + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_blocks = n_blocks + self.n_layers = n_layers + self.p_dropout = p_dropout + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.gin_channels = gin_channels + self.share_cond_layers = share_cond_layers + if gin_channels != 0 and share_cond_layers: + cond_layer = torch.nn.Conv1d(gin_channels * n_sqz, 2 * hidden_channels * n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + wn = None + self.flows = nn.ModuleList() + for b in range(n_blocks): + self.flows.append(ActNorm(channels=in_channels * n_sqz)) + if inv_conv_type == 'near': + self.flows.append(InvConvNear(channels=in_channels * n_sqz, n_split=n_split, n_sqz=n_sqz)) + if inv_conv_type == 'invconv': + self.flows.append(InvConv(channels=in_channels * n_sqz)) + if share_wn_layers > 0: + if b % share_wn_layers == 0: + wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels * n_sqz, + p_dropout, share_cond_layers) + self.flows.append( + CouplingBlock( + in_channels * n_sqz, + 
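A hedged sketch of the affine coupling in `CouplingBlock` above, with a single Conv1d standing in for the start → WN → end stack: half of the channels pass through unchanged and parameterize a shift and log-scale for the other half, so the transform inverts exactly and its log-determinant is the sum of the log-scales.

```python
import torch
import torch.nn as nn

c = 8
net = nn.Conv1d(c // 2, c, 3, padding=1)     # stand-in for the start -> WN -> end stack

x = torch.randn(2, c, 10)
x0, x1 = x[:, :c // 2], x[:, c // 2:]
out = net(x0)
m, logs = out[:, :c // 2], out[:, c // 2:]

z1 = m + torch.exp(logs) * x1                # forward direction
logdet = logs.sum([1, 2])                    # per-sample log-determinant
x1_rec = (z1 - m) * torch.exp(-logs)         # reverse direction
print(torch.allclose(x1, x1_rec, atol=1e-4), logdet.shape)
```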
hidden_channels, + kernel_size=kernel_size, + dilation_rate=dilation_rate, + n_layers=n_layers, + gin_channels=gin_channels * n_sqz, + p_dropout=p_dropout, + sigmoid_scale=sigmoid_scale, + wn=wn + )) + + def forward(self, x, x_mask=None, g=None, reverse=False, return_hiddens=False): + logdet_tot = 0 + if not reverse: + flows = self.flows + else: + flows = reversed(self.flows) + if return_hiddens: + hs = [] + if self.n_sqz > 1: + x, x_mask_ = utils.squeeze(x, x_mask, self.n_sqz) + if g is not None: + g, _ = utils.squeeze(g, x_mask, self.n_sqz) + x_mask = x_mask_ + if self.share_cond_layers and g is not None: + g = self.cond_layer(g) + for f in flows: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + if return_hiddens: + hs.append(x) + logdet_tot += logdet + if self.n_sqz > 1: + x, x_mask = utils.unsqueeze(x, x_mask, self.n_sqz) + if return_hiddens: + return x, logdet_tot, hs + return x, logdet_tot + + def store_inverse(self): + def remove_weight_norm(m): + try: + nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(remove_weight_norm) + for f in self.flows: + f.store_inverse() diff --git a/modules/commons/normalizing_flow/res_flow.py b/modules/commons/normalizing_flow/res_flow.py new file mode 100644 index 0000000000000000000000000000000000000000..d0d13285704543ec28fe37d82346011240bdcaf8 --- /dev/null +++ b/modules/commons/normalizing_flow/res_flow.py @@ -0,0 +1,61 @@ +import torch +from torch import nn +from modules.commons.conv import ConditionalConvBlocks +from modules.commons.wavenet import WN + + +class FlipLayer(nn.Module): + def forward(self, x, nonpadding, cond=None, reverse=False): + x = torch.flip(x, [1]) + return x + + +class CouplingLayer(nn.Module): + def __init__(self, c_in, hidden_size, kernel_size, n_layers, p_dropout=0, c_in_g=0, nn_type='wn'): + super().__init__() + self.channels = c_in + self.hidden_size = hidden_size + self.kernel_size = kernel_size + self.n_layers = n_layers + self.c_half = c_in // 2 + + self.pre = nn.Conv1d(self.c_half, hidden_size, 1) + if nn_type == 'wn': + self.enc = WN(hidden_size, kernel_size, 1, n_layers, p_dropout=p_dropout, + c_cond=c_in_g) + elif nn_type == 'conv': + self.enc = ConditionalConvBlocks( + hidden_size, c_in_g, hidden_size, None, kernel_size, + layers_in_block=1, is_BTC=False, num_layers=n_layers) + self.post = nn.Conv1d(hidden_size, self.c_half, 1) + + def forward(self, x, nonpadding, cond=None, reverse=False): + x0, x1 = x[:, :self.c_half], x[:, self.c_half:] + x_ = self.pre(x0) * nonpadding + x_ = self.enc(x_, nonpadding=nonpadding, cond=cond) + m = self.post(x_) + x1 = m + x1 if not reverse else x1 - m + x = torch.cat([x0, x1], 1) + return x * nonpadding + + +class ResFlow(nn.Module): + def __init__(self, + c_in, + hidden_size, + kernel_size, + n_flow_layers, + n_flow_steps=4, + c_cond=0, + nn_type='wn'): + super().__init__() + self.flows = nn.ModuleList() + for i in range(n_flow_steps): + self.flows.append( + CouplingLayer(c_in, hidden_size, kernel_size, n_flow_layers, c_in_g=c_cond, nn_type=nn_type)) + self.flows.append(FlipLayer()) + + def forward(self, x, nonpadding, cond=None, reverse=False): + for flow in (self.flows if not reverse else reversed(self.flows)): + x = flow(x, nonpadding, cond=cond, reverse=reverse) + return x diff --git a/modules/commons/normalizing_flow/utils.py b/modules/commons/normalizing_flow/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7eb56ec514bff822ba1a19a6474207ed82492410 --- /dev/null +++ 
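A hedged sketch of the `ResFlow` pattern above: alternating additive coupling steps and channel flips, where reverse mode runs the same steps backwards and subtracts the predicted shift instead of adding it. The small conv is a stand-in for the WN/conv encoder.

```python
import torch
import torch.nn as nn

c = 8
shift_net = nn.Conv1d(c // 2, c // 2, 3, padding=1)    # stand-in for the WN / conv encoder

def coupling(x, reverse=False):
    x0, x1 = x[:, :c // 2], x[:, c // 2:]
    m = shift_net(x0)                                  # shift predicted from the fixed half
    x1 = x1 - m if reverse else x1 + m
    return torch.cat([x0, x1], 1)

def flip(x):
    return torch.flip(x, [1])                          # FlipLayer: swap which half is fixed

x = torch.randn(2, c, 12)
z = flip(coupling(x))                                  # one flow step forward
x_rec = coupling(flip(z), reverse=True)                # the exact inverse
print(torch.allclose(x, x_rec, atol=1e-5))             # True
```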
b/modules/commons/normalizing_flow/utils.py @@ -0,0 +1,29 @@ +import torch + + +def squeeze(x, x_mask=None, n_sqz=2): + b, c, t = x.size() + + t = (t // n_sqz) * n_sqz + x = x[:, :, :t] + x_sqz = x.view(b, c, t // n_sqz, n_sqz) + x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) + + if x_mask is not None: + x_mask = x_mask[:, :, n_sqz - 1::n_sqz] + else: + x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) + return x_sqz * x_mask, x_mask + + +def unsqueeze(x, x_mask=None, n_sqz=2): + b, c, t = x.size() + + x_unsqz = x.view(b, n_sqz, c // n_sqz, t) + x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) + + if x_mask is not None: + x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) + else: + x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) + return x_unsqz * x_mask, x_mask diff --git a/modules/commons/rel_transformer.py b/modules/commons/rel_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..3d83cb1c9bd8df6f9ddc6f373e299336ec55f08c --- /dev/null +++ b/modules/commons/rel_transformer.py @@ -0,0 +1,378 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +from modules.commons.layers import Embedding + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +class Encoder(nn.Module): + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., + window_size=None, block_length=None, pre_ln=False, **kwargs): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + self.pre_ln = pre_ln + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention(hidden_channels, hidden_channels, n_heads, window_size=window_size, + p_dropout=p_dropout, block_length=block_length)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + if pre_ln: + self.last_ln = LayerNorm(hidden_channels) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + for i in range(self.n_layers): + x = x * x_mask + x_ = x + if self.pre_ln: + x = self.norm_layers_1[i](x) + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = x_ + y + if not self.pre_ln: + x = self.norm_layers_1[i](x) + + x_ = x + if self.pre_ln: + x = self.norm_layers_2[i](x) + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = x_ + y + if not self.pre_ln: + x = self.norm_layers_2[i](x) + if self.pre_ln: + x = self.last_ln(x) + x = x * x_mask + return x + + +class 
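The `sequence_mask` helper above converts per-utterance lengths into a boolean (batch, max_len) mask, later unsqueezed to (batch, 1, T) for the encoder; a quick sketch of its output:

```python
import torch

def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)

lengths = torch.tensor([5, 3, 1])
print(sequence_mask(lengths).long())
# tensor([[1, 1, 1, 1, 1],
#         [1, 1, 1, 0, 0],
#         [1, 0, 0, 0, 0]])
```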
MultiHeadAttention(nn.Module): + def __init__(self, channels, out_channels, n_heads, window_size=None, heads_share=True, p_dropout=0., + block_length=None, proximal_bias=False, proximal_init=False): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position(rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." 
+ scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) + scores = scores * block_mask + -1e4 * (1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) + output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])) + x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(x * x_mask) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + return x * x_mask + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-4): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + n_dims = len(x.shape) + mean = torch.mean(x, 1, keepdim=True) + variance = torch.mean((x - mean) ** 2, 1, keepdim=True) + + x = (x - mean) * torch.rsqrt(variance + self.eps) + + shape = [1, -1] + [1] * (n_dims - 2) + x = x * self.gamma.view(*shape) + self.beta.view(*shape) + return x + + +class ConvReluNorm(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
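The `FFN` above uses the sigmoid-based GELU approximation `x * sigmoid(1.702 * x)`; a quick sketch comparing it against the exact GELU (the gap stays on the order of 1e-2):

```python
import torch
import torch.nn.functional as F

x = torch.linspace(-4, 4, steps=9)
approx = x * torch.sigmoid(1.702 * x)     # the "gelu" branch in FFN.forward
exact = F.gelu(x)
print((approx - exact).abs().max())       # roughly 0.02 at worst
```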
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential( + nn.ReLU(), + nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class RelTransformerEncoder(nn.Module): + def __init__(self, + n_vocab, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout=0.0, + window_size=4, + block_length=None, + prenet=True, + pre_ln=True, + ): + + super().__init__() + + self.n_vocab = n_vocab + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + self.prenet = prenet + if n_vocab > 0: + self.emb = Embedding(n_vocab, hidden_channels, padding_idx=0) + + if prenet: + self.pre = ConvReluNorm(hidden_channels, hidden_channels, hidden_channels, + kernel_size=5, n_layers=3, p_dropout=0) + self.encoder = Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + window_size=window_size, + block_length=block_length, + pre_ln=pre_ln, + ) + + def forward(self, x, x_mask=None): + if self.n_vocab > 0: + x_lengths = (x > 0).long().sum(-1) + x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] + else: + x_lengths = (x.abs().sum(-1) > 0).long().sum(-1) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + + if self.prenet: + x = self.pre(x, x_mask) + x = self.encoder(x, x_mask) + return x.transpose(1, 2) diff --git a/modules/commons/rnn.py b/modules/commons/rnn.py new file mode 100755 index 0000000000000000000000000000000000000000..205c2c76b8fda2de920bc59228a5eec0a20119a9 --- /dev/null +++ b/modules/commons/rnn.py @@ -0,0 +1,261 @@ +import torch +from torch import nn +import torch.nn.functional as F + + +class PreNet(nn.Module): + def __init__(self, in_dims, fc1_dims=256, fc2_dims=128, dropout=0.5): + super().__init__() + self.fc1 = nn.Linear(in_dims, fc1_dims) + self.fc2 = nn.Linear(fc1_dims, fc2_dims) + self.p = dropout + + def forward(self, x): + x = self.fc1(x) + x = F.relu(x) + x = F.dropout(x, self.p, training=self.training) + x = self.fc2(x) + x = F.relu(x) + x = F.dropout(x, self.p, training=self.training) + return x + + +class HighwayNetwork(nn.Module): + def __init__(self, size): + super().__init__() + self.W1 = nn.Linear(size, size) + self.W2 = nn.Linear(size, size) + self.W1.bias.data.fill_(0.) + + def forward(self, x): + x1 = self.W1(x) + x2 = self.W2(x) + g = torch.sigmoid(x2) + y = g * F.relu(x1) + (1. 
- g) * x + return y + + +class BatchNormConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel, relu=True): + super().__init__() + self.conv = nn.Conv1d(in_channels, out_channels, kernel, stride=1, padding=kernel // 2, bias=False) + self.bnorm = nn.BatchNorm1d(out_channels) + self.relu = relu + + def forward(self, x): + x = self.conv(x) + x = F.relu(x) if self.relu is True else x + return self.bnorm(x) + + +class ConvNorm(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, + padding=None, dilation=1, bias=True, w_init_gain='linear'): + super(ConvNorm, self).__init__() + if padding is None: + assert (kernel_size % 2 == 1) + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, + bias=bias) + + torch.nn.init.xavier_uniform_( + self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, signal): + conv_signal = self.conv(signal) + return conv_signal + + +class CBHG(nn.Module): + def __init__(self, K, in_channels, channels, proj_channels, num_highways): + super().__init__() + + # List of all rnns to call `flatten_parameters()` on + self._to_flatten = [] + + self.bank_kernels = [i for i in range(1, K + 1)] + self.conv1d_bank = nn.ModuleList() + for k in self.bank_kernels: + conv = BatchNormConv(in_channels, channels, k) + self.conv1d_bank.append(conv) + + self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1) + + self.conv_project1 = BatchNormConv(len(self.bank_kernels) * channels, proj_channels[0], 3) + self.conv_project2 = BatchNormConv(proj_channels[0], proj_channels[1], 3, relu=False) + + # Fix the highway input if necessary + if proj_channels[-1] != channels: + self.highway_mismatch = True + self.pre_highway = nn.Linear(proj_channels[-1], channels, bias=False) + else: + self.highway_mismatch = False + + self.highways = nn.ModuleList() + for i in range(num_highways): + hn = HighwayNetwork(channels) + self.highways.append(hn) + + self.rnn = nn.GRU(channels, channels, batch_first=True, bidirectional=True) + self._to_flatten.append(self.rnn) + + # Avoid fragmentation of RNN parameters and associated warning + self._flatten_parameters() + + def forward(self, x): + # Although we `_flatten_parameters()` on init, when using DataParallel + # the model gets replicated, making it no longer guaranteed that the + # weights are contiguous in GPU memory. Hence, we must call it again + self._flatten_parameters() + + # Save these for later + residual = x + seq_len = x.size(-1) + conv_bank = [] + + # Convolution Bank + for conv in self.conv1d_bank: + c = conv(x) # Convolution + conv_bank.append(c[:, :, :seq_len]) + + # Stack along the channel axis + conv_bank = torch.cat(conv_bank, dim=1) + + # dump the last padding to fit residual + x = self.maxpool(conv_bank)[:, :, :seq_len] + + # Conv1d projections + x = self.conv_project1(x) + x = self.conv_project2(x) + + # Residual Connect + x = x + residual + + # Through the highways + x = x.transpose(1, 2) + if self.highway_mismatch is True: + x = self.pre_highway(x) + for h in self.highways: + x = h(x) + + # And then the RNN + x, _ = self.rnn(x) + return x + + def _flatten_parameters(self): + """Calls `flatten_parameters` on all the rnns used by the WaveRNN. 
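A hedged sketch of the CBHG convolution bank above (arbitrary sizes): K parallel Conv1d layers with kernel sizes 1..K, each output trimmed back to the input length and stacked along the channel axis before the projection convs.

```python
import torch
import torch.nn as nn

K, channels, T = 4, 8, 20
x = torch.randn(2, channels, T)

bank = nn.ModuleList(nn.Conv1d(channels, channels, k, padding=k // 2) for k in range(1, K + 1))
feats = torch.cat([conv(x)[:, :, :T] for conv in bank], dim=1)   # trim, then stack on channels
print(feats.shape)   # torch.Size([2, 32, 20])
```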
Used + to improve efficiency and avoid PyTorch yelling at us.""" + [m.flatten_parameters() for m in self._to_flatten] + + +class TacotronEncoder(nn.Module): + def __init__(self, embed_dims, num_chars, cbhg_channels, K, num_highways, dropout): + super().__init__() + self.embedding = nn.Embedding(num_chars, embed_dims) + self.pre_net = PreNet(embed_dims, embed_dims, embed_dims, dropout=dropout) + self.cbhg = CBHG(K=K, in_channels=cbhg_channels, channels=cbhg_channels, + proj_channels=[cbhg_channels, cbhg_channels], + num_highways=num_highways) + self.proj_out = nn.Linear(cbhg_channels * 2, cbhg_channels) + + def forward(self, x): + x = self.embedding(x) + x = self.pre_net(x) + x.transpose_(1, 2) + x = self.cbhg(x) + x = self.proj_out(x) + return x + + +class RNNEncoder(nn.Module): + def __init__(self, num_chars, embedding_dim, n_convolutions=3, kernel_size=5): + super(RNNEncoder, self).__init__() + self.embedding = nn.Embedding(num_chars, embedding_dim, padding_idx=0) + convolutions = [] + for _ in range(n_convolutions): + conv_layer = nn.Sequential( + ConvNorm(embedding_dim, + embedding_dim, + kernel_size=kernel_size, stride=1, + padding=int((kernel_size - 1) / 2), + dilation=1, w_init_gain='relu'), + nn.BatchNorm1d(embedding_dim)) + convolutions.append(conv_layer) + self.convolutions = nn.ModuleList(convolutions) + + self.lstm = nn.LSTM(embedding_dim, int(embedding_dim / 2), 1, + batch_first=True, bidirectional=True) + + def forward(self, x): + input_lengths = (x > 0).sum(-1) + input_lengths = input_lengths.cpu().numpy() + + x = self.embedding(x) + x = x.transpose(1, 2) # [B, H, T] + for conv in self.convolutions: + x = F.dropout(F.relu(conv(x)), 0.5, self.training) + x + x = x.transpose(1, 2) # [B, T, H] + + # pytorch tensor are not reversible, hence the conversion + x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False) + + self.lstm.flatten_parameters() + outputs, _ = self.lstm(x) + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) + + return outputs + + +class DecoderRNN(torch.nn.Module): + def __init__(self, hidden_size, decoder_rnn_dim, dropout): + super(DecoderRNN, self).__init__() + self.in_conv1d = nn.Sequential( + torch.nn.Conv1d( + in_channels=hidden_size, + out_channels=hidden_size, + kernel_size=9, padding=4, + ), + torch.nn.ReLU(), + torch.nn.Conv1d( + in_channels=hidden_size, + out_channels=hidden_size, + kernel_size=9, padding=4, + ), + ) + self.ln = nn.LayerNorm(hidden_size) + if decoder_rnn_dim == 0: + decoder_rnn_dim = hidden_size * 2 + self.rnn = torch.nn.LSTM( + input_size=hidden_size, + hidden_size=decoder_rnn_dim, + num_layers=1, + batch_first=True, + bidirectional=True, + dropout=dropout + ) + self.rnn.flatten_parameters() + self.conv1d = torch.nn.Conv1d( + in_channels=decoder_rnn_dim * 2, + out_channels=hidden_size, + kernel_size=3, + padding=1, + ) + + def forward(self, x): + input_masks = x.abs().sum(-1).ne(0).data[:, :, None] + input_lengths = input_masks.sum([-1, -2]) + input_lengths = input_lengths.cpu().numpy() + + x = self.in_conv1d(x.transpose(1, 2)).transpose(1, 2) + x = self.ln(x) + x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False) + self.rnn.flatten_parameters() + x, _ = self.rnn(x) # [B, T, C] + x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True) + x = x * input_masks + pre_mel = self.conv1d(x.transpose(1, 2)).transpose(1, 2) # [B, T, C] + pre_mel = pre_mel * input_masks + return pre_mel diff --git a/modules/commons/transformer.py 
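A hedged sketch of the pack/pad round trip that `RNNEncoder` and `DecoderRNN` above rely on so the bidirectional RNN skips padded frames (arbitrary sizes):

```python
import torch
import torch.nn as nn

rnn = nn.LSTM(8, 4, batch_first=True, bidirectional=True)
x = torch.randn(3, 10, 8)
lengths = torch.tensor([10, 7, 4])                     # valid frames per utterance

packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
out, _ = rnn(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
print(out.shape, out_lengths)        # torch.Size([3, 10, 8]) tensor([10, 7, 4])
```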
b/modules/commons/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e79847edbdbcc10ef24602c8316e5826238d9256 --- /dev/null +++ b/modules/commons/transformer.py @@ -0,0 +1,747 @@ +import math +import torch +from torch import nn +from torch.nn import Parameter, Linear +from modules.commons.layers import LayerNorm, Embedding +from utils.nn.seq_utils import get_incremental_state, set_incremental_state, softmax, make_positions +import torch.nn.functional as F + +DEFAULT_MAX_SOURCE_POSITIONS = 2000 +DEFAULT_MAX_TARGET_POSITIONS = 2000 + + +class SinusoidalPositionalEmbedding(nn.Module): + """This module produces sinusoidal positional embeddings of any length. + + Padding symbols are ignored. + """ + + def __init__(self, embedding_dim, padding_idx, init_size=1024): + super().__init__() + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.weights = SinusoidalPositionalEmbedding.get_embedding( + init_size, + embedding_dim, + padding_idx, + ) + self.register_buffer('_float_tensor', torch.FloatTensor(1)) + + @staticmethod + def get_embedding(num_embeddings, embedding_dim, padding_idx=None): + """Build sinusoidal embeddings. + + This matches the implementation in tensor2tensor, but differs slightly + from the description in Section 3.5 of "Attention Is All You Need". + """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) + emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) + if embedding_dim % 2 == 1: + # zero pad + emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + return emb + + def forward(self, input, incremental_state=None, timestep=None, positions=None, **kwargs): + """Input is expected to be of size [bsz x seqlen].""" + bsz, seq_len = input.shape[:2] + max_pos = self.padding_idx + 1 + seq_len + if self.weights is None or max_pos > self.weights.size(0): + # recompute/expand embeddings if needed + self.weights = SinusoidalPositionalEmbedding.get_embedding( + max_pos, + self.embedding_dim, + self.padding_idx, + ) + self.weights = self.weights.to(self._float_tensor) + + if incremental_state is not None: + # positions is the same for every token when decoding a single step + pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len + return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1) + + positions = make_positions(input, self.padding_idx) if positions is None else positions + return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach() + + def max_positions(self): + """Maximum number of supported positions.""" + return int(1e5) # an arbitrary large number + + +class TransformerFFNLayer(nn.Module): + def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'): + super().__init__() + self.kernel_size = kernel_size + self.dropout = dropout + self.act = act + if padding == 'SAME': + self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2) + elif padding == 'LEFT': + self.ffn_1 = nn.Sequential( + nn.ConstantPad1d((kernel_size - 1, 0), 0.0), + nn.Conv1d(hidden_size, filter_size, kernel_size) + ) + self.ffn_2 = Linear(filter_size, hidden_size) + + def forward(self, x, incremental_state=None): + # x: T x B x C + if incremental_state is not None: + 
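A hedged sketch mirroring `SinusoidalPositionalEmbedding.get_embedding` above for a small, even embedding size: concatenated sin/cos halves per position, with the `padding_idx` row forced to zero.

```python
import math
import torch

num_embeddings, embedding_dim, padding_idx = 6, 8, 0
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)       # per-dim frequencies
emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)                # (positions, dim)
emb[padding_idx, :] = 0

print(emb.shape, emb[0].abs().sum().item())   # torch.Size([6, 8]) 0.0
```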
saved_state = self._get_input_buffer(incremental_state) + if 'prev_input' in saved_state: + prev_input = saved_state['prev_input'] + x = torch.cat((prev_input, x), dim=0) + x = x[-self.kernel_size:] + saved_state['prev_input'] = x + self._set_input_buffer(incremental_state, saved_state) + + x = self.ffn_1(x.permute(1, 2, 0)).permute(2, 0, 1) + x = x * self.kernel_size ** -0.5 + + if incremental_state is not None: + x = x[-1:] + if self.act == 'gelu': + x = F.gelu(x) + if self.act == 'relu': + x = F.relu(x) + x = F.dropout(x, self.dropout, training=self.training) + x = self.ffn_2(x) + return x + + def _get_input_buffer(self, incremental_state): + return get_incremental_state( + self, + incremental_state, + 'f', + ) or {} + + def _set_input_buffer(self, incremental_state, buffer): + set_incremental_state( + self, + incremental_state, + 'f', + buffer, + ) + + def clear_buffer(self, incremental_state): + if incremental_state is not None: + saved_state = self._get_input_buffer(incremental_state) + if 'prev_input' in saved_state: + del saved_state['prev_input'] + self._set_input_buffer(incremental_state, saved_state) + + +class MultiheadAttention(nn.Module): + def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True, + add_bias_kv=False, add_zero_attn=False, self_attention=False, + encoder_decoder_attention=False): + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim ** -0.5 + + self.self_attention = self_attention + self.encoder_decoder_attention = encoder_decoder_attention + + assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \ + 'value to be of the same size' + + if self.qkv_same_dim: + self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim)) + else: + self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) + self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) + self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) + + if bias: + self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim)) + else: + self.register_parameter('in_proj_bias', None) + + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + if add_bias_kv: + self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim)) + self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim)) + else: + self.bias_k = self.bias_v = None + + self.add_zero_attn = add_zero_attn + + self.reset_parameters() + + self.enable_torch_version = False + if hasattr(F, "multi_head_attention_forward"): + self.enable_torch_version = True + else: + self.enable_torch_version = False + self.last_attn_probs = None + + def reset_parameters(self): + if self.qkv_same_dim: + nn.init.xavier_uniform_(self.in_proj_weight) + else: + nn.init.xavier_uniform_(self.k_proj_weight) + nn.init.xavier_uniform_(self.v_proj_weight) + nn.init.xavier_uniform_(self.q_proj_weight) + + nn.init.xavier_uniform_(self.out_proj.weight) + if self.in_proj_bias is not None: + nn.init.constant_(self.in_proj_bias, 0.) + nn.init.constant_(self.out_proj.bias, 0.) 
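[Editor's note] A minimal sketch, not part of this diff, of how the fused in_proj_weight initialized above is sliced into Q/K/V projections by in_proj_qkv/_in_proj later in this class; the shapes and values are illustrative placeholders only.

import torch
import torch.nn.functional as F

embed_dim = 8
w = torch.randn(3 * embed_dim, embed_dim)      # stands in for self.in_proj_weight
x = torch.randn(5, 2, embed_dim)               # [T, B, C], the layout this module expects
# _in_proj(query) followed by .chunk(3, dim=-1) is equivalent to three row slices of w:
q = F.linear(x, w[:embed_dim])                 # what in_proj_q computes when qkv_same_dim
k = F.linear(x, w[embed_dim:2 * embed_dim])    # in_proj_k
v = F.linear(x, w[2 * embed_dim:])             # in_proj_v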
+ if self.bias_k is not None: + nn.init.xavier_normal_(self.bias_k) + if self.bias_v is not None: + nn.init.xavier_normal_(self.bias_v) + + def forward( + self, + query, key, value, + key_padding_mask=None, + incremental_state=None, + need_weights=True, + static_kv=False, + attn_mask=None, + before_softmax=False, + need_head_weights=False, + enc_dec_attn_constraint_mask=None, + reset_attn_weight=None + ): + """Input shape: Time x Batch x Channel + + Args: + key_padding_mask (ByteTensor, optional): mask to exclude + keys that are pads, of shape `(batch, src_len)`, where + padding elements are indicated by 1s. + need_weights (bool, optional): return the attention weights, + averaged over heads (default: False). + attn_mask (ByteTensor, optional): typically used to + implement causal attention, where the mask prevents the + attention from looking forward in time (default: None). + before_softmax (bool, optional): return the raw attention + weights and values before the attention softmax. + need_head_weights (bool, optional): return the attention + weights for each head. Implies *need_weights*. Default: + return the average attention weights over all heads. + """ + if need_head_weights: + need_weights = True + + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == self.embed_dim + assert list(query.size()) == [tgt_len, bsz, embed_dim] + if self.enable_torch_version and incremental_state is None and not static_kv and reset_attn_weight is None: + if self.qkv_same_dim: + return F.multi_head_attention_forward(query, key, value, + self.embed_dim, self.num_heads, + self.in_proj_weight, + self.in_proj_bias, self.bias_k, self.bias_v, + self.add_zero_attn, self.dropout, + self.out_proj.weight, self.out_proj.bias, + self.training, key_padding_mask, need_weights, + attn_mask) + else: + return F.multi_head_attention_forward(query, key, value, + self.embed_dim, self.num_heads, + torch.empty([0]), + self.in_proj_bias, self.bias_k, self.bias_v, + self.add_zero_attn, self.dropout, + self.out_proj.weight, self.out_proj.bias, + self.training, key_padding_mask, need_weights, + attn_mask, use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, + k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight) + + if incremental_state is not None: + saved_state = self._get_input_buffer(incremental_state) + if 'prev_key' in saved_state: + # previous time steps are cached - no need to recompute + # key and value if they are static + if static_kv: + assert self.encoder_decoder_attention and not self.self_attention + key = value = None + else: + saved_state = None + + if self.self_attention: + # self-attention + q, k, v = self.in_proj_qkv(query) + elif self.encoder_decoder_attention: + # encoder-decoder attention + q = self.in_proj_q(query) + if key is None: + assert value is None + k = v = None + else: + k = self.in_proj_k(key) + v = self.in_proj_v(key) + + else: + q = self.in_proj_q(query) + k = self.in_proj_k(key) + v = self.in_proj_v(value) + q *= self.scaling + + if self.bias_k is not None: + assert self.bias_v is not None + k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1) + + q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1) + if k is not 
None: + k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) + + if saved_state is not None: + # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) + if 'prev_key' in saved_state: + prev_key = saved_state['prev_key'].view(bsz * self.num_heads, -1, self.head_dim) + if static_kv: + k = prev_key + else: + k = torch.cat((prev_key, k), dim=1) + if 'prev_value' in saved_state: + prev_value = saved_state['prev_value'].view(bsz * self.num_heads, -1, self.head_dim) + if static_kv: + v = prev_value + else: + v = torch.cat((prev_value, v), dim=1) + if 'prev_key_padding_mask' in saved_state and saved_state['prev_key_padding_mask'] is not None: + prev_key_padding_mask = saved_state['prev_key_padding_mask'] + if static_kv: + key_padding_mask = prev_key_padding_mask + else: + key_padding_mask = torch.cat((prev_key_padding_mask, key_padding_mask), dim=1) + + saved_state['prev_key'] = k.view(bsz, self.num_heads, -1, self.head_dim) + saved_state['prev_value'] = v.view(bsz, self.num_heads, -1, self.head_dim) + saved_state['prev_key_padding_mask'] = key_padding_mask + + self._set_input_buffer(incremental_state, saved_state) + + src_len = k.size(1) + + # This is part of a workaround to get around fork/join parallelism + # not supporting Optional types. + if key_padding_mask is not None and key_padding_mask.shape == torch.Size([]): + key_padding_mask = None + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if self.add_zero_attn: + src_len += 1 + k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) + v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) + if attn_mask is not None: + attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1) + + attn_weights = torch.bmm(q, k.transpose(1, 2)) + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + + assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] + + if attn_mask is not None: + if len(attn_mask.shape) == 2: + attn_mask = attn_mask.unsqueeze(0) + elif len(attn_mask.shape) == 3: + attn_mask = attn_mask[:, None].repeat([1, self.num_heads, 1, 1]).reshape( + bsz * self.num_heads, tgt_len, src_len) + attn_weights = attn_weights + attn_mask + + if enc_dec_attn_constraint_mask is not None: # bs x head x L_kv + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.masked_fill( + enc_dec_attn_constraint_mask.unsqueeze(2).bool(), + -1e8, + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if key_padding_mask is not None: + # don't attend to padding symbols + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + -1e8, + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_logits = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + if before_softmax: + return attn_weights, v + + attn_weights_float = softmax(attn_weights, dim=-1) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), 
p=self.dropout, training=self.training) + + if reset_attn_weight is not None: + if reset_attn_weight: + self.last_attn_probs = attn_probs.detach() + else: + assert self.last_attn_probs is not None + attn_probs = self.last_attn_probs + attn = torch.bmm(attn_probs, v) + assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn = self.out_proj(attn) + + if need_weights: + attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) + if not need_head_weights: + # average attention weights over heads + attn_weights = attn_weights.mean(dim=0) + else: + attn_weights = None + + return attn, (attn_weights, attn_logits) + + def in_proj_qkv(self, query): + return self._in_proj(query).chunk(3, dim=-1) + + def in_proj_q(self, query): + if self.qkv_same_dim: + return self._in_proj(query, end=self.embed_dim) + else: + bias = self.in_proj_bias + if bias is not None: + bias = bias[:self.embed_dim] + return F.linear(query, self.q_proj_weight, bias) + + def in_proj_k(self, key): + if self.qkv_same_dim: + return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim) + else: + weight = self.k_proj_weight + bias = self.in_proj_bias + if bias is not None: + bias = bias[self.embed_dim:2 * self.embed_dim] + return F.linear(key, weight, bias) + + def in_proj_v(self, value): + if self.qkv_same_dim: + return self._in_proj(value, start=2 * self.embed_dim) + else: + weight = self.v_proj_weight + bias = self.in_proj_bias + if bias is not None: + bias = bias[2 * self.embed_dim:] + return F.linear(value, weight, bias) + + def _in_proj(self, input, start=0, end=None): + weight = self.in_proj_weight + bias = self.in_proj_bias + weight = weight[start:end, :] + if bias is not None: + bias = bias[start:end] + return F.linear(input, weight, bias) + + def _get_input_buffer(self, incremental_state): + return get_incremental_state( + self, + incremental_state, + 'attn_state', + ) or {} + + def _set_input_buffer(self, incremental_state, buffer): + set_incremental_state( + self, + incremental_state, + 'attn_state', + buffer, + ) + + def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz): + return attn_weights + + def clear_buffer(self, incremental_state=None): + if incremental_state is not None: + saved_state = self._get_input_buffer(incremental_state) + if 'prev_key' in saved_state: + del saved_state['prev_key'] + if 'prev_value' in saved_state: + del saved_state['prev_value'] + self._set_input_buffer(incremental_state, saved_state) + + +class EncSALayer(nn.Module): + def __init__(self, c, num_heads, dropout, attention_dropout=0.1, + relu_dropout=0.1, kernel_size=9, padding='SAME', act='gelu'): + super().__init__() + self.c = c + self.dropout = dropout + self.num_heads = num_heads + if num_heads > 0: + self.layer_norm1 = LayerNorm(c) + self.self_attn = MultiheadAttention( + self.c, num_heads, self_attention=True, dropout=attention_dropout, bias=False) + self.layer_norm2 = LayerNorm(c) + self.ffn = TransformerFFNLayer( + c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act) + + def forward(self, x, encoder_padding_mask=None, **kwargs): + layer_norm_training = kwargs.get('layer_norm_training', None) + if layer_norm_training is not None: + self.layer_norm1.training = layer_norm_training + self.layer_norm2.training = layer_norm_training + if self.num_heads > 0: + residual = x + x = self.layer_norm1(x) + x, _, = self.self_attn( + query=x, + key=x, + value=x, + 
key_padding_mask=encoder_padding_mask + ) + x = F.dropout(x, self.dropout, training=self.training) + x = residual + x + x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + + residual = x + x = self.layer_norm2(x) + x = self.ffn(x) + x = F.dropout(x, self.dropout, training=self.training) + x = residual + x + x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None] + return x + + +class DecSALayer(nn.Module): + def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_dropout=0.1, + kernel_size=9, act='gelu'): + super().__init__() + self.c = c + self.dropout = dropout + self.layer_norm1 = LayerNorm(c) + self.self_attn = MultiheadAttention( + c, num_heads, self_attention=True, dropout=attention_dropout, bias=False + ) + self.layer_norm2 = LayerNorm(c) + self.encoder_attn = MultiheadAttention( + c, num_heads, encoder_decoder_attention=True, dropout=attention_dropout, bias=False, + ) + self.layer_norm3 = LayerNorm(c) + self.ffn = TransformerFFNLayer( + c, 4 * c, padding='LEFT', kernel_size=kernel_size, dropout=relu_dropout, act=act) + + def forward( + self, + x, + encoder_out=None, + encoder_padding_mask=None, + incremental_state=None, + self_attn_mask=None, + self_attn_padding_mask=None, + attn_out=None, + reset_attn_weight=None, + **kwargs, + ): + layer_norm_training = kwargs.get('layer_norm_training', None) + if layer_norm_training is not None: + self.layer_norm1.training = layer_norm_training + self.layer_norm2.training = layer_norm_training + self.layer_norm3.training = layer_norm_training + residual = x + x = self.layer_norm1(x) + x, _ = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state, + attn_mask=self_attn_mask + ) + x = F.dropout(x, self.dropout, training=self.training) + x = residual + x + + attn_logits = None + if encoder_out is not None or attn_out is not None: + residual = x + x = self.layer_norm2(x) + if encoder_out is not None: + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + static_kv=True, + enc_dec_attn_constraint_mask=get_incremental_state(self, incremental_state, + 'enc_dec_attn_constraint_mask'), + reset_attn_weight=reset_attn_weight + ) + attn_logits = attn[1] + elif attn_out is not None: + x = self.encoder_attn.in_proj_v(attn_out) + if encoder_out is not None or attn_out is not None: + x = F.dropout(x, self.dropout, training=self.training) + x = residual + x + + residual = x + x = self.layer_norm3(x) + x = self.ffn(x, incremental_state=incremental_state) + x = F.dropout(x, self.dropout, training=self.training) + x = residual + x + return x, attn_logits + + def clear_buffer(self, input, encoder_out=None, encoder_padding_mask=None, incremental_state=None): + self.encoder_attn.clear_buffer(incremental_state) + self.ffn.clear_buffer(incremental_state) + + def set_buffer(self, name, tensor, incremental_state): + return set_incremental_state(self, incremental_state, name, tensor) + + +class TransformerEncoderLayer(nn.Module): + def __init__(self, hidden_size, dropout, kernel_size=9, num_heads=2): + super().__init__() + self.hidden_size = hidden_size + self.dropout = dropout + self.num_heads = num_heads + self.op = EncSALayer( + hidden_size, num_heads, dropout=dropout, + attention_dropout=0.0, relu_dropout=dropout, + kernel_size=kernel_size) + + def forward(self, x, **kwargs): + return self.op(x, **kwargs) + + +class 
TransformerDecoderLayer(nn.Module): + def __init__(self, hidden_size, dropout, kernel_size=9, num_heads=2): + super().__init__() + self.hidden_size = hidden_size + self.dropout = dropout + self.num_heads = num_heads + self.op = DecSALayer( + hidden_size, num_heads, dropout=dropout, + attention_dropout=0.0, relu_dropout=dropout, + kernel_size=kernel_size) + + def forward(self, x, **kwargs): + return self.op(x, **kwargs) + + def clear_buffer(self, *args): + return self.op.clear_buffer(*args) + + def set_buffer(self, *args): + return self.op.set_buffer(*args) + + +class FFTBlocks(nn.Module): + def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, dropout=0.0, + num_heads=2, use_pos_embed=True, use_last_norm=True, + use_pos_embed_alpha=True): + super().__init__() + self.num_layers = num_layers + embed_dim = self.hidden_size = hidden_size + self.dropout = dropout + self.use_pos_embed = use_pos_embed + self.use_last_norm = use_last_norm + if use_pos_embed: + self.max_source_positions = DEFAULT_MAX_TARGET_POSITIONS + self.padding_idx = 0 + self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) if use_pos_embed_alpha else 1 + self.embed_positions = SinusoidalPositionalEmbedding( + embed_dim, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS, + ) + + self.layers = nn.ModuleList([]) + self.layers.extend([ + TransformerEncoderLayer(self.hidden_size, self.dropout, + kernel_size=ffn_kernel_size, num_heads=num_heads) + for _ in range(self.num_layers) + ]) + if self.use_last_norm: + self.layer_norm = nn.LayerNorm(embed_dim) + else: + self.layer_norm = None + + def forward(self, x, padding_mask=None, attn_mask=None, return_hiddens=False): + """ + :param x: [B, T, C] + :param padding_mask: [B, T] + :return: [B, T, C] or [L, B, T, C] + """ + padding_mask = x.abs().sum(-1).eq(0).data if padding_mask is None else padding_mask + nonpadding_mask_TB = 1 - padding_mask.transpose(0, 1).float()[:, :, None] # [T, B, 1] + if self.use_pos_embed: + positions = self.pos_embed_alpha * self.embed_positions(x[..., 0]) + x = x + positions + x = F.dropout(x, p=self.dropout, training=self.training) + # B x T x C -> T x B x C + x = x.transpose(0, 1) * nonpadding_mask_TB + hiddens = [] + for layer in self.layers: + x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB + hiddens.append(x) + if self.use_last_norm: + x = self.layer_norm(x) * nonpadding_mask_TB + if return_hiddens: + x = torch.stack(hiddens, 0) # [L, T, B, C] + x = x.transpose(1, 2) # [L, B, T, C] + else: + x = x.transpose(0, 1) # [B, T, C] + return x + + +class FastSpeechEncoder(FFTBlocks): + def __init__(self, dict_size, hidden_size=256, num_layers=4, kernel_size=9, num_heads=2, + dropout=0.0): + super().__init__(hidden_size, num_layers, kernel_size, num_heads=num_heads, + use_pos_embed=False, dropout=dropout) # use_pos_embed_alpha for compatibility + self.embed_tokens = Embedding(dict_size, hidden_size, 0) + self.embed_scale = math.sqrt(hidden_size) + self.padding_idx = 0 + self.embed_positions = SinusoidalPositionalEmbedding( + hidden_size, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS, + ) + + def forward(self, txt_tokens, attn_mask=None): + """ + + :param txt_tokens: [B, T] + :return: { + 'encoder_out': [B x T x C] + } + """ + encoder_padding_mask = txt_tokens.eq(self.padding_idx).data + x = self.forward_embedding(txt_tokens) # [B, T, H] + if self.num_layers > 0: + x = super(FastSpeechEncoder, self).forward(x, encoder_padding_mask, attn_mask=attn_mask) + return x + + def forward_embedding(self, 
txt_tokens): + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(txt_tokens) + if self.use_pos_embed: + positions = self.embed_positions(txt_tokens) + x = x + positions + x = F.dropout(x, p=self.dropout, training=self.training) + return x + + +class FastSpeechDecoder(FFTBlocks): + def __init__(self, hidden_size=256, num_layers=4, kernel_size=9, num_heads=2): + super().__init__(hidden_size, num_layers, kernel_size, num_heads=num_heads) diff --git a/modules/commons/wavenet.py b/modules/commons/wavenet.py new file mode 100644 index 0000000000000000000000000000000000000000..7809c9b9d3331ba4fd2ffd4caae14e721e4b0732 --- /dev/null +++ b/modules/commons/wavenet.py @@ -0,0 +1,97 @@ +import torch +from torch import nn + + +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class WN(torch.nn.Module): + def __init__(self, hidden_size, kernel_size, dilation_rate, n_layers, c_cond=0, + p_dropout=0, share_cond_layers=False, is_BTC=False): + super(WN, self).__init__() + assert (kernel_size % 2 == 1) + assert (hidden_size % 2 == 0) + self.is_BTC = is_BTC + self.hidden_size = hidden_size + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = c_cond + self.p_dropout = p_dropout + self.share_cond_layers = share_cond_layers + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if c_cond != 0 and not share_cond_layers: + cond_layer = torch.nn.Conv1d(c_cond, 2 * hidden_size * n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_size, 2 * hidden_size, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_size + else: + res_skip_channels = hidden_size + + res_skip_layer = torch.nn.Conv1d(hidden_size, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, nonpadding=None, cond=None): + if self.is_BTC: + x = x.transpose(1, 2) + cond = cond.transpose(1, 2) if cond is not None else None + nonpadding = nonpadding.transpose(1, 2) if nonpadding is not None else None + if nonpadding is None: + nonpadding = 1 + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_size]) + + if cond is not None and not self.share_cond_layers: + cond = self.cond_layer(cond) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + x_in = self.drop(x_in) + if cond is not None: + cond_offset = i * 2 * self.hidden_size + cond_l = cond[:, cond_offset:cond_offset + 2 * self.hidden_size, :] + else: + cond_l = torch.zeros_like(x_in) + + acts = fused_add_tanh_sigmoid_multiply(x_in, cond_l, n_channels_tensor) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + x = (x + res_skip_acts[:, :self.hidden_size, :]) * nonpadding + output = output + res_skip_acts[:, self.hidden_size:, :] + else: + output = output + res_skip_acts + 
output = output * nonpadding + if self.is_BTC: + output = output.transpose(1, 2) + return output + + def remove_weight_norm(self): + def remove_weight_norm(m): + try: + nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(remove_weight_norm) diff --git a/modules/tts/__pycache__/fs.cpython-36.pyc b/modules/tts/__pycache__/fs.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0a15eca9cf634159d51d3d8abb8c3ca7bde61d4 Binary files /dev/null and b/modules/tts/__pycache__/fs.cpython-36.pyc differ diff --git a/modules/tts/__pycache__/fs.cpython-37.pyc b/modules/tts/__pycache__/fs.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49e93d389a938a11ccfe807286a4193bf9a20b7d Binary files /dev/null and b/modules/tts/__pycache__/fs.cpython-37.pyc differ diff --git a/modules/tts/commons/__pycache__/align_ops.cpython-36.pyc b/modules/tts/commons/__pycache__/align_ops.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11d285b300df7a954f9c57bceaf80331972dd88a Binary files /dev/null and b/modules/tts/commons/__pycache__/align_ops.cpython-36.pyc differ diff --git a/modules/tts/commons/__pycache__/align_ops.cpython-37.pyc b/modules/tts/commons/__pycache__/align_ops.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fde10363f76667ebeb5ee78541e8ef803d4fb3a9 Binary files /dev/null and b/modules/tts/commons/__pycache__/align_ops.cpython-37.pyc differ diff --git a/modules/tts/commons/align_ops.py b/modules/tts/commons/align_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a190d63a3f3ba31f41754975569336a87c63089d --- /dev/null +++ b/modules/tts/commons/align_ops.py @@ -0,0 +1,25 @@ +import torch +import torch.nn.functional as F + + +def build_word_mask(x2word, y2word): + return (x2word[:, :, None] == y2word[:, None, :]).long() + + +def mel2ph_to_mel2word(mel2ph, ph2word): + mel2word = (ph2word - 1).gather(1, (mel2ph - 1).clamp(min=0)) + 1 + mel2word = mel2word * (mel2ph > 0).long() + return mel2word + + +def clip_mel2token_to_multiple(mel2token, frames_multiple): + max_frames = mel2token.shape[1] // frames_multiple * frames_multiple + mel2token = mel2token[:, :max_frames] + return mel2token + + +def expand_states(h, mel2token): + h = F.pad(h, [0, 0, 1, 0]) + mel2token_ = mel2token[..., None].repeat([1, 1, h.shape[-1]]) + h = torch.gather(h, 1, mel2token_) # [B, T, H] + return h diff --git a/modules/tts/diffspeech/net.py b/modules/tts/diffspeech/net.py new file mode 100644 index 0000000000000000000000000000000000000000..764020f28add5e4ee387a9d081ab6d548fc0f201 --- /dev/null +++ b/modules/tts/diffspeech/net.py @@ -0,0 +1,110 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from math import sqrt + +Linear = nn.Linear +ConvTranspose2d = nn.ConvTranspose2d + + +class Mish(nn.Module): + def forward(self, x): + return x * torch.tanh(F.softplus(x)) + + +class SinusoidalPosEmb(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + device = x.device + half_dim = self.dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, device=device) * -emb) + emb = x[:, None] * emb[None, :] + emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb + + +def Conv1d(*args, **kwargs): + layer = nn.Conv1d(*args, **kwargs) + nn.init.kaiming_normal_(layer.weight) + return layer + + +class 
ResidualBlock(nn.Module): + def __init__(self, encoder_hidden, residual_channels, dilation): + super().__init__() + self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation) + self.diffusion_projection = Linear(residual_channels, residual_channels) + self.conditioner_projection = Conv1d(encoder_hidden, 2 * residual_channels, 1) + self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1) + + def forward(self, x, conditioner, diffusion_step): + diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1) + conditioner = self.conditioner_projection(conditioner) + y = x + diffusion_step + + y = self.dilated_conv(y) + conditioner + + gate, filter = torch.chunk(y, 2, dim=1) + y = torch.sigmoid(gate) * torch.tanh(filter) + + y = self.output_projection(y) + residual, skip = torch.chunk(y, 2, dim=1) + return (x + residual) / sqrt(2.0), skip + + +class DiffNet(nn.Module): + def __init__(self, hparams): + super().__init__() + in_dims = hparams['audio_num_mel_bins'] + self.encoder_hidden = hparams['hidden_size'] + self.residual_layers = hparams['residual_layers'] + self.residual_channels = hparams['residual_channels'] + self.dilation_cycle_length = hparams['dilation_cycle_length'] + + self.input_projection = Conv1d(in_dims, self.residual_channels, 1) + self.diffusion_embedding = SinusoidalPosEmb(self.residual_channels) + dim = self.residual_channels + self.mlp = nn.Sequential( + nn.Linear(dim, dim * 4), + Mish(), + nn.Linear(dim * 4, dim) + ) + self.residual_layers = nn.ModuleList([ + ResidualBlock(self.encoder_hidden, self.residual_channels, 2 ** (i % self.dilation_cycle_length)) + for i in range(self.residual_layers) + ]) + self.skip_projection = Conv1d(self.residual_channels, self.residual_channels, 1) + self.output_projection = Conv1d(self.residual_channels, in_dims, 1) + nn.init.zeros_(self.output_projection.weight) + + def forward(self, spec, diffusion_step, cond): + """ + + :param spec: [B, 1, M, T] + :param diffusion_step: [B, 1] + :param cond: [B, M, T] + :return: + """ + x = spec[:, 0] + x = self.input_projection(x) # x [B, residual_channel, T] + + x = F.relu(x) + diffusion_step = self.diffusion_embedding(diffusion_step) + diffusion_step = self.mlp(diffusion_step) + skip = [] + for layer_id, layer in enumerate(self.residual_layers): + x, skip_connection = layer(x, cond, diffusion_step) + skip.append(skip_connection) + + x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers)) + x = self.skip_projection(x) + x = F.relu(x) + x = self.output_projection(x) # [B, 80, T] + return x[:, None, :, :] diff --git a/modules/tts/diffspeech/shallow_diffusion_tts.py b/modules/tts/diffspeech/shallow_diffusion_tts.py new file mode 100644 index 0000000000000000000000000000000000000000..e3c3a6d891a7721949e05f6065c194aaae8ea9e8 --- /dev/null +++ b/modules/tts/diffspeech/shallow_diffusion_tts.py @@ -0,0 +1,279 @@ +import math +import random +from functools import partial +from inspect import isfunction +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from tqdm import tqdm + +from modules.tts.fs2_orig import FastSpeech2Orig +from modules.tts.diffspeech.net import DiffNet +from modules.tts.commons.align_ops import expand_states + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +# gaussian diffusion trainer class + +def extract(a, t, x_shape): + b, *_ = t.shape + out = a.gather(-1, t) + 
return out.reshape(b, *((1,) * (len(x_shape) - 1))) + + +def noise_like(shape, device, repeat=False): + repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1))) + noise = lambda: torch.randn(shape, device=device) + return repeat_noise() if repeat else noise() + + +def linear_beta_schedule(timesteps, max_beta=0.01): + """ + linear schedule + """ + betas = np.linspace(1e-4, max_beta, timesteps) + return betas + + +def cosine_beta_schedule(timesteps, s=0.008): + """ + cosine schedule + as proposed in https://openreview.net/forum?id=-NEXDKk8gZ + """ + steps = timesteps + 1 + x = np.linspace(0, steps, steps) + alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2 + alphas_cumprod = alphas_cumprod / alphas_cumprod[0] + betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1]) + return np.clip(betas, a_min=0, a_max=0.999) + + +beta_schedule = { + "cosine": cosine_beta_schedule, + "linear": linear_beta_schedule, +} + + +DIFF_DECODERS = { + 'wavenet': lambda hp: DiffNet(hp), +} + + +class AuxModel(FastSpeech2Orig): + def forward(self, txt_tokens, mel2ph=None, spk_embed=None, spk_id=None, + f0=None, uv=None, energy=None, infer=False, **kwargs): + ret = {} + encoder_out = self.encoder(txt_tokens) # [B, T, C] + src_nonpadding = (txt_tokens > 0).float()[:, :, None] + style_embed = self.forward_style_embed(spk_embed, spk_id) + + # add dur + dur_inp = (encoder_out + style_embed) * src_nonpadding + mel2ph = self.forward_dur(dur_inp, mel2ph, txt_tokens, ret) + tgt_nonpadding = (mel2ph > 0).float()[:, :, None] + decoder_inp = decoder_inp_ = expand_states(encoder_out, mel2ph) + + # add pitch and energy embed + if self.hparams['use_pitch_embed']: + pitch_inp = (decoder_inp_ + style_embed) * tgt_nonpadding + decoder_inp = decoder_inp + self.forward_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out) + + # add pitch and energy embed + if self.hparams['use_energy_embed']: + energy_inp = (decoder_inp_ + style_embed) * tgt_nonpadding + decoder_inp = decoder_inp + self.forward_energy(energy_inp, energy, ret) + + # decoder input + ret['decoder_inp'] = decoder_inp = (decoder_inp + style_embed) * tgt_nonpadding + if self.hparams['dec_inp_add_noise']: + B, T, _ = decoder_inp.shape + z = kwargs.get('adv_z', torch.randn([B, T, self.z_channels])).to(decoder_inp.device) + ret['adv_z'] = z + decoder_inp = torch.cat([decoder_inp, z], -1) + decoder_inp = self.dec_inp_noise_proj(decoder_inp) * tgt_nonpadding + if kwargs['skip_decoder']: + return ret + ret['mel_out'] = self.forward_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs) + return ret + + +class GaussianDiffusion(nn.Module): + def __init__(self, dict_size, hparams, out_dims=None): + super().__init__() + self.hparams = hparams + out_dims = hparams['audio_num_mel_bins'] + denoise_fn = DIFF_DECODERS[hparams['diff_decoder_type']](hparams) + timesteps = hparams['timesteps'] + K_step = hparams['K_step'] + loss_type = hparams['diff_loss_type'] + spec_min = hparams['spec_min'] + spec_max = hparams['spec_max'] + + self.denoise_fn = denoise_fn + self.fs2 = AuxModel(dict_size, hparams) + self.mel_bins = out_dims + + if hparams['schedule_type'] == 'linear': + betas = linear_beta_schedule(timesteps, hparams['max_beta']) + else: + betas = cosine_beta_schedule(timesteps) + + alphas = 1. 
- betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) + + timesteps, = betas.shape + self.num_timesteps = int(timesteps) + self.K_step = K_step + self.loss_type = loss_type + + to_torch = partial(torch.tensor, dtype=torch.float32) + + self.register_buffer('betas', to_torch(betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) + self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod))) + self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1))) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod) + # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t) + self.register_buffer('posterior_variance', to_torch(posterior_variance)) + # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain + self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20)))) + self.register_buffer('posterior_mean_coef1', to_torch( + betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))) + self.register_buffer('posterior_mean_coef2', to_torch( + (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod))) + + self.register_buffer('spec_min', torch.FloatTensor(spec_min)[None, None, :hparams['keep_bins']]) + self.register_buffer('spec_max', torch.FloatTensor(spec_max)[None, None, :hparams['keep_bins']]) + + def q_mean_variance(self, x_start, t): + mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + variance = extract(1. - self.alphas_cumprod, t, x_start.shape) + log_variance = extract(self.log_one_minus_alphas_cumprod, t, x_start.shape) + return mean, variance, log_variance + + def predict_start_from_noise(self, x_t, t, noise): + return ( + extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - + extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise + ) + + def q_posterior(self, x_start, x_t, t): + posterior_mean = ( + extract(self.posterior_mean_coef1, t, x_t.shape) * x_start + + extract(self.posterior_mean_coef2, t, x_t.shape) * x_t + ) + posterior_variance = extract(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance(self, x, t, cond, clip_denoised: bool): + noise_pred = self.denoise_fn(x, t, cond=cond) + x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred) + + if clip_denoised: + x_recon.clamp_(-1., 1.) 
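[Editor's note] An illustrative consistency check, not part of this diff, showing that predict_start_from_noise above is the algebraic inverse of q_sample defined below; the alpha-bar value and tensor shapes are made up for the example.

import torch
a_bar = torch.tensor(0.7)                            # stands in for alphas_cumprod[t]
x0 = torch.randn(2, 1, 80, 100)                      # [B, 1, M, T], like the normalized mel
eps = torch.randn_like(x0)
x_t = a_bar.sqrt() * x0 + (1. - a_bar).sqrt() * eps  # q_sample: forward diffusion step
x0_rec = (1. / a_bar).sqrt() * x_t - (1. / a_bar - 1.).sqrt() * eps  # predict_start_from_noise
assert torch.allclose(x0_rec, x0, atol=1e-5)         # x_0 is recovered exactly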
+ + model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t) + return model_mean, posterior_variance, posterior_log_variance + + @torch.no_grad() + def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False): + b, *_, device = *x.shape, x.device + model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond, clip_denoised=clip_denoised) + noise = noise_like(x.shape, device, repeat_noise) + # no noise when t == 0 + nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1))) + return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return ( + extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + + extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise + ) + + def p_losses(self, x_start, t, cond, noise=None, nonpadding=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + x_recon = self.denoise_fn(x_noisy, t, cond) + + if self.loss_type == 'l1': + if nonpadding is not None: + loss = ((noise - x_recon).abs() * nonpadding.unsqueeze(1)).mean() + else: + # print('are you sure w/o nonpadding?') + loss = (noise - x_recon).abs().mean() + + elif self.loss_type == 'l2': + loss = F.mse_loss(noise, x_recon) + else: + raise NotImplementedError() + + return loss + + def forward(self, txt_tokens, mel2ph=None, spk_embed=None, spk_id=None, + ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs): + b, *_, device = *txt_tokens.shape, txt_tokens.device + ret = self.fs2(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id, + f0=f0, uv=uv, energy=energy, infer=infer, skip_decoder=(not infer), **kwargs) + cond = ret['decoder_inp'].transpose(1, 2) + + if not infer: + t = torch.randint(0, self.K_step, (b,), device=device).long() + x = ref_mels + x = self.norm_spec(x) + x = x.transpose(1, 2)[:, None, :, :] # [B, 1, M, T] + ret['diff_loss'] = self.p_losses(x, t, cond) + # nonpadding = (mel2ph != 0).float() + # ret['diff_loss'] = self.p_losses(x, t, cond, nonpadding=nonpadding) + ret['mel_out'] = None + else: + ret['fs2_mel'] = ret['mel_out'] + fs2_mels = ret['mel_out'] + t = self.K_step + fs2_mels = self.norm_spec(fs2_mels) + fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :] + + x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long()) + if self.hparams.get('gaussian_start') is not None and self.hparams['gaussian_start']: + print('===> gaussian start.') + shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2]) + x = torch.randn(shape, device=device) + for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t): + x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond) + x = x[:, 0].transpose(1, 2) + ret['mel_out'] = self.denorm_spec(x) + + return ret + + def norm_spec(self, x): + return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1 + + def denorm_spec(self, x): + return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min + + def cwt2f0_norm(self, cwt_spec, mean, std, mel2ph): + return self.fs2.cwt2f0_norm(cwt_spec, mean, std, mel2ph) + + def out2mel(self, x): + return x \ No newline at end of file diff --git a/modules/tts/fs.py b/modules/tts/fs.py new file mode 100755 index 0000000000000000000000000000000000000000..b15b4348c1abf58a476c12115b5b088dc7b46979 --- /dev/null +++ 
b/modules/tts/fs.py @@ -0,0 +1,172 @@ +from copy import deepcopy + +import torch +from torch import nn +import torch.nn.functional as F +from modules.commons.conv import TextConvEncoder, ConvBlocks +from modules.commons.layers import Embedding +from modules.commons.nar_tts_modules import PitchPredictor, DurationPredictor, LengthRegulator +from modules.commons.rel_transformer import RelTransformerEncoder +from modules.commons.rnn import TacotronEncoder, RNNEncoder, DecoderRNN +from modules.commons.transformer import FastSpeechEncoder, FastSpeechDecoder +from modules.commons.wavenet import WN +from modules.tts.commons.align_ops import clip_mel2token_to_multiple, expand_states +from utils.audio.pitch.utils import denorm_f0, f0_to_coarse + +FS_ENCODERS = { + 'fft': lambda hp, dict_size: FastSpeechEncoder( + dict_size, hp['hidden_size'], hp['enc_layers'], hp['enc_ffn_kernel_size'], + num_heads=hp['num_heads']), + 'tacotron': lambda hp, dict_size: TacotronEncoder( + hp['hidden_size'], dict_size, hp['hidden_size'], + K=hp['encoder_K'], num_highways=4, dropout=hp['dropout']), + 'tacotron2': lambda hp, dict_size: RNNEncoder(dict_size, hp['hidden_size']), + 'conv': lambda hp, dict_size: TextConvEncoder(dict_size, hp['hidden_size'], hp['hidden_size'], + hp['enc_dilations'], hp['enc_kernel_size'], + layers_in_block=hp['layers_in_block'], + norm_type=hp['enc_dec_norm'], + post_net_kernel=hp.get('enc_post_net_kernel', 3)), + 'rel_fft': lambda hp, dict_size: RelTransformerEncoder( + dict_size, hp['hidden_size'], hp['hidden_size'], + hp['ffn_hidden_size'], hp['num_heads'], hp['enc_layers'], + hp['enc_ffn_kernel_size'], hp['dropout'], prenet=hp['enc_prenet'], pre_ln=hp['enc_pre_ln']), +} + +FS_DECODERS = { + 'fft': lambda hp: FastSpeechDecoder( + hp['hidden_size'], hp['dec_layers'], hp['dec_ffn_kernel_size'], hp['num_heads']), + 'rnn': lambda hp: DecoderRNN(hp['hidden_size'], hp['decoder_rnn_dim'], hp['dropout']), + 'conv': lambda hp: ConvBlocks(hp['hidden_size'], hp['hidden_size'], hp['dec_dilations'], + hp['dec_kernel_size'], layers_in_block=hp['layers_in_block'], + norm_type=hp['enc_dec_norm'], dropout=hp['dropout'], + post_net_kernel=hp.get('dec_post_net_kernel', 3)), + 'wn': lambda hp: WN(hp['hidden_size'], kernel_size=5, dilation_rate=1, n_layers=hp['dec_layers'], + is_BTC=True), +} + + +class FastSpeech(nn.Module): + def __init__(self, dict_size, hparams, out_dims=None): + super().__init__() + self.hparams = deepcopy(hparams) + self.enc_layers = hparams['enc_layers'] + self.dec_layers = hparams['dec_layers'] + self.hidden_size = hparams['hidden_size'] + self.encoder = FS_ENCODERS[hparams['encoder_type']](hparams, dict_size) + self.decoder = FS_DECODERS[hparams['decoder_type']](hparams) + self.out_dims = hparams['audio_num_mel_bins'] if out_dims is None else out_dims + self.mel_out = nn.Linear(self.hidden_size, self.out_dims, bias=True) + if hparams['use_spk_id']: + self.spk_id_proj = Embedding(hparams['num_spk'], self.hidden_size) + if hparams['use_spk_embed']: + self.spk_embed_proj = nn.Linear(256, self.hidden_size, bias=True) + predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size + self.dur_predictor = DurationPredictor( + self.hidden_size, + n_chans=predictor_hidden, + n_layers=hparams['dur_predictor_layers'], + dropout_rate=hparams['predictor_dropout'], + kernel_size=hparams['dur_predictor_kernel']) + self.length_regulator = LengthRegulator() + if hparams['use_pitch_embed']: + self.pitch_embed = Embedding(300, self.hidden_size, 0) + 
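[Editor's note] A hedged usage sketch, not part of this diff, of the FS_ENCODERS registry above; only the hparams keys are taken from the lookup, the concrete values and dict_size are placeholders.

from modules.tts.fs import FS_ENCODERS

hp = {'encoder_type': 'fft', 'hidden_size': 256, 'enc_layers': 4,
      'enc_ffn_kernel_size': 9, 'num_heads': 2}
encoder = FS_ENCODERS[hp['encoder_type']](hp, dict_size=100)  # builds a FastSpeechEncoder
# txt_tokens: [B, T] integer phoneme ids with 0 as padding -> encoder(txt_tokens): [B, T, 256]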
self.pitch_predictor = PitchPredictor( + self.hidden_size, n_chans=predictor_hidden, + n_layers=5, dropout_rate=0.1, odim=2, + kernel_size=hparams['predictor_kernel']) + if hparams['dec_inp_add_noise']: + self.z_channels = hparams['z_channels'] + self.dec_inp_noise_proj = nn.Linear(self.hidden_size + self.z_channels, self.hidden_size) + + def forward(self, txt_tokens, mel2ph=None, spk_embed=None, spk_id=None, + f0=None, uv=None, infer=False, **kwargs): + ret = {} + encoder_out = self.encoder(txt_tokens) # [B, T, C] + src_nonpadding = (txt_tokens > 0).float()[:, :, None] + style_embed = self.forward_style_embed(spk_embed, spk_id) + + # add dur + dur_inp = (encoder_out + style_embed) * src_nonpadding + mel2ph = self.forward_dur(dur_inp, mel2ph, txt_tokens, ret) + tgt_nonpadding = (mel2ph > 0).float()[:, :, None] + decoder_inp = expand_states(encoder_out, mel2ph) + + # add pitch embed + if self.hparams['use_pitch_embed']: + pitch_inp = (decoder_inp + style_embed) * tgt_nonpadding + decoder_inp = decoder_inp + self.forward_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out) + + # decoder input + ret['decoder_inp'] = decoder_inp = (decoder_inp + style_embed) * tgt_nonpadding + if self.hparams['dec_inp_add_noise']: + B, T, _ = decoder_inp.shape + z = kwargs.get('adv_z', torch.randn([B, T, self.z_channels])).to(decoder_inp.device) + ret['adv_z'] = z + decoder_inp = torch.cat([decoder_inp, z], -1) + decoder_inp = self.dec_inp_noise_proj(decoder_inp) * tgt_nonpadding + ret['mel_out'] = self.forward_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs) + return ret + + def forward_style_embed(self, spk_embed=None, spk_id=None): + # add spk embed + style_embed = 0 + if self.hparams['use_spk_embed']: + style_embed = style_embed + self.spk_embed_proj(spk_embed)[:, None, :] + if self.hparams['use_spk_id']: + style_embed = style_embed + self.spk_id_proj(spk_id)[:, None, :] + return style_embed + + def forward_dur(self, dur_input, mel2ph, txt_tokens, ret): + """ + + :param dur_input: [B, T_txt, H] + :param mel2ph: [B, T_mel] + :param txt_tokens: [B, T_txt] + :param ret: + :return: + """ + src_padding = txt_tokens == 0 + if self.hparams['predictor_grad'] != 1: + dur_input = dur_input.detach() + self.hparams['predictor_grad'] * (dur_input - dur_input.detach()) + dur = self.dur_predictor(dur_input, src_padding) + ret['dur'] = dur + if mel2ph is None: + mel2ph = self.length_regulator(dur, src_padding).detach() + ret['mel2ph'] = mel2ph = clip_mel2token_to_multiple(mel2ph, self.hparams['frames_multiple']) + return mel2ph + + def forward_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None): + if self.hparams['pitch_type'] == 'frame': + pitch_pred_inp = decoder_inp + pitch_padding = mel2ph == 0 + else: + pitch_pred_inp = encoder_out + pitch_padding = encoder_out.abs().sum(-1) == 0 + uv = None + if self.hparams['predictor_grad'] != 1: + pitch_pred_inp = pitch_pred_inp.detach() + \ + self.hparams['predictor_grad'] * (pitch_pred_inp - pitch_pred_inp.detach()) + ret['pitch_pred'] = pitch_pred = self.pitch_predictor(pitch_pred_inp) + use_uv = self.hparams['pitch_type'] == 'frame' and self.hparams['use_uv'] + if f0 is None: + f0 = pitch_pred[:, :, 0] + if use_uv: + uv = pitch_pred[:, :, 1] > 0 + f0_denorm = denorm_f0(f0, uv if use_uv else None, pitch_padding=pitch_padding) + pitch = f0_to_coarse(f0_denorm) # start from 0 [B, T_txt] + ret['f0_denorm'] = f0_denorm + ret['f0_denorm_pred'] = denorm_f0( + pitch_pred[:, :, 0], (pitch_pred[:, :, 1] > 0) if use_uv else None, + pitch_padding=pitch_padding) 
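[Editor's note] An illustrative sketch, not part of this diff, of what expand_states from modules/tts/commons/align_ops.py does with the mel2ph alignment used in forward above; the toy tensors are made up for the example.

import torch
import torch.nn.functional as F
h = torch.arange(1., 4.).view(1, 3, 1)           # [B=1, T_txt=3, H=1] phoneme-level states
mel2ph = torch.tensor([[1, 1, 2, 3, 3, 0]])      # 1-based phoneme index per mel frame, 0 = padding
h_pad = F.pad(h, [0, 0, 1, 0])                   # prepend an all-zero state for index 0
out = torch.gather(h_pad, 1, mel2ph[..., None])  # [B, T_mel, H]; matches expand_states when H=1
print(out.squeeze(-1))                           # tensor([[1., 1., 2., 3., 3., 0.]])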
+ if self.hparams['pitch_type'] == 'ph': + pitch = torch.gather(F.pad(pitch, [1, 0]), 1, mel2ph) + ret['f0_denorm'] = torch.gather(F.pad(ret['f0_denorm'], [1, 0]), 1, mel2ph) + ret['f0_denorm_pred'] = torch.gather(F.pad(ret['f0_denorm_pred'], [1, 0]), 1, mel2ph) + pitch_embed = self.pitch_embed(pitch) + return pitch_embed + + def forward_decoder(self, decoder_inp, tgt_nonpadding, ret, infer, **kwargs): + x = decoder_inp # [B, T, H] + x = self.decoder(x) + x = self.mel_out(x) + return x * tgt_nonpadding diff --git a/modules/tts/fs2_orig.py b/modules/tts/fs2_orig.py new file mode 100755 index 0000000000000000000000000000000000000000..4bc8db59c3004731ae4c4feca3e61969b98e45cc --- /dev/null +++ b/modules/tts/fs2_orig.py @@ -0,0 +1,102 @@ +import torch +from torch import nn +from modules.commons.layers import Embedding +from modules.commons.nar_tts_modules import EnergyPredictor, PitchPredictor +from modules.tts.commons.align_ops import expand_states +from modules.tts.fs import FastSpeech +from utils.audio.cwt import cwt2f0, get_lf0_cwt +from utils.audio.pitch.utils import denorm_f0, f0_to_coarse, norm_f0 +import numpy as np + + +class FastSpeech2Orig(FastSpeech): + def __init__(self, dict_size, hparams, out_dims=None): + super().__init__(dict_size, hparams, out_dims) + predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size + if hparams['use_energy_embed']: + self.energy_embed = Embedding(300, self.hidden_size, 0) + self.energy_predictor = EnergyPredictor( + self.hidden_size, n_chans=predictor_hidden, + n_layers=hparams['predictor_layers'], dropout_rate=hparams['predictor_dropout'], odim=2, + kernel_size=hparams['predictor_kernel']) + if hparams['pitch_type'] == 'cwt' and hparams['use_pitch_embed']: + self.pitch_predictor = PitchPredictor( + self.hidden_size, n_chans=predictor_hidden, + n_layers=hparams['predictor_layers'], dropout_rate=hparams['predictor_dropout'], odim=11, + kernel_size=hparams['predictor_kernel']) + self.cwt_stats_layers = nn.Sequential( + nn.Linear(self.hidden_size, self.hidden_size), nn.ReLU(), + nn.Linear(self.hidden_size, self.hidden_size), nn.ReLU(), nn.Linear(self.hidden_size, 2)) + + def forward(self, txt_tokens, mel2ph=None, spk_embed=None, spk_id=None, + f0=None, uv=None, energy=None, infer=False, **kwargs): + ret = {} + encoder_out = self.encoder(txt_tokens) # [B, T, C] + src_nonpadding = (txt_tokens > 0).float()[:, :, None] + style_embed = self.forward_style_embed(spk_embed, spk_id) + + # add dur + dur_inp = (encoder_out + style_embed) * src_nonpadding + mel2ph = self.forward_dur(dur_inp, mel2ph, txt_tokens, ret) + tgt_nonpadding = (mel2ph > 0).float()[:, :, None] + decoder_inp = decoder_inp_ = expand_states(encoder_out, mel2ph) + + # add pitch and energy embed + if self.hparams['use_pitch_embed']: + pitch_inp = (decoder_inp_ + style_embed) * tgt_nonpadding + decoder_inp = decoder_inp + self.forward_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out) + + # add pitch and energy embed + if self.hparams['use_energy_embed']: + energy_inp = (decoder_inp_ + style_embed) * tgt_nonpadding + decoder_inp = decoder_inp + self.forward_energy(energy_inp, energy, ret) + + # decoder input + ret['decoder_inp'] = decoder_inp = (decoder_inp + style_embed) * tgt_nonpadding + if self.hparams['dec_inp_add_noise']: + B, T, _ = decoder_inp.shape + z = kwargs.get('adv_z', torch.randn([B, T, self.z_channels])).to(decoder_inp.device) + ret['adv_z'] = z + decoder_inp = torch.cat([decoder_inp, z], -1) + decoder_inp = 
self.dec_inp_noise_proj(decoder_inp) * tgt_nonpadding + ret['mel_out'] = self.forward_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs) + return ret + + def forward_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None): + if self.hparams['pitch_type'] == 'cwt': + decoder_inp = decoder_inp.detach() + self.hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach()) + pitch_padding = mel2ph == 0 + ret['cwt'] = cwt_out = self.pitch_predictor(decoder_inp) + stats_out = self.cwt_stats_layers(decoder_inp.mean(1)) # [B, 2] + mean = ret['f0_mean'] = stats_out[:, 0] + std = ret['f0_std'] = stats_out[:, 1] + cwt_spec = cwt_out[:, :, :10] + if f0 is None: + std = std * self.hparams['cwt_std_scale'] + f0 = self.cwt2f0_norm(cwt_spec, mean, std, mel2ph) + if self.hparams['use_uv']: + assert cwt_out.shape[-1] == 11 + uv = cwt_out[:, :, -1] > 0 + ret['f0_denorm'] = f0_denorm = denorm_f0(f0, uv if self.hparams['use_uv'] else None, + pitch_padding=pitch_padding) + pitch = f0_to_coarse(f0_denorm) # start from 0 + pitch_embed = self.pitch_embed(pitch) + return pitch_embed + else: + return super(FastSpeech2Orig, self).forward_pitch(decoder_inp, f0, uv, mel2ph, ret, encoder_out) + + def forward_energy(self, decoder_inp, energy, ret): + decoder_inp = decoder_inp.detach() + self.hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach()) + ret['energy_pred'] = energy_pred = self.energy_predictor(decoder_inp)[:, :, 0] + energy_embed_inp = energy_pred if energy is None else energy + energy_embed_inp = torch.clamp(energy_embed_inp * 256 // 4, min=0, max=255).long() + energy_embed = self.energy_embed(energy_embed_inp) + return energy_embed + + def cwt2f0_norm(self, cwt_spec, mean, std, mel2ph): + _, cwt_scales = get_lf0_cwt(np.ones(10)) + f0 = cwt2f0(cwt_spec, mean, std, cwt_scales) + f0 = torch.cat( + [f0] + [f0[:, -1:]] * (mel2ph.shape[1] - f0.shape[1]), 1) + f0_norm = norm_f0(f0, None) + return f0_norm diff --git a/modules/tts/glow/__pycache__/utils.cpython-36.pyc b/modules/tts/glow/__pycache__/utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9516a86cb8114dadd126b4c5caaefddc15b6a9ea Binary files /dev/null and b/modules/tts/glow/__pycache__/utils.cpython-36.pyc differ diff --git a/modules/tts/glow/utils.py b/modules/tts/glow/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..214853251e5f00a939c5fe1a348ad9f7dc1d4c5e --- /dev/null +++ b/modules/tts/glow/utils.py @@ -0,0 +1,29 @@ +import torch + + +def squeeze(x, nonpadding=None, n_sqz=2): + b, c, t = x.size() + + t = (t // n_sqz) * n_sqz + x = x[:, :, :t] + x_sqz = x.view(b, c, t // n_sqz, n_sqz) + x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) + + if nonpadding is not None: + nonpadding = nonpadding[:, :, n_sqz - 1::n_sqz] + else: + nonpadding = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) + return x_sqz * nonpadding, nonpadding + + +def unsqueeze(x, nonpadding=None, n_sqz=2): + b, c, t = x.size() + + x_unsqz = x.view(b, n_sqz, c // n_sqz, t) + x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) + + if nonpadding is not None: + nonpadding = nonpadding.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) + else: + nonpadding = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) + return x_unsqz * nonpadding, nonpadding diff --git a/modules/tts/portaspeech/__pycache__/fvae.cpython-36.pyc b/modules/tts/portaspeech/__pycache__/fvae.cpython-36.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..9c8df107009171ba680b04bef68e63cf14373da9 Binary files /dev/null and b/modules/tts/portaspeech/__pycache__/fvae.cpython-36.pyc differ diff --git a/modules/tts/portaspeech/__pycache__/fvae.cpython-37.pyc b/modules/tts/portaspeech/__pycache__/fvae.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1b06869689d0c5896d9e33b939db282e6c94f86 Binary files /dev/null and b/modules/tts/portaspeech/__pycache__/fvae.cpython-37.pyc differ diff --git a/modules/tts/portaspeech/__pycache__/portaspeech.cpython-36.pyc b/modules/tts/portaspeech/__pycache__/portaspeech.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1fb91c00ce5014b7cb68e29f8011e9e679ef078 Binary files /dev/null and b/modules/tts/portaspeech/__pycache__/portaspeech.cpython-36.pyc differ diff --git a/modules/tts/portaspeech/__pycache__/portaspeech_flow.cpython-36.pyc b/modules/tts/portaspeech/__pycache__/portaspeech_flow.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ede7b0d452e1da568acc5d6b85b028746ba78916 Binary files /dev/null and b/modules/tts/portaspeech/__pycache__/portaspeech_flow.cpython-36.pyc differ diff --git a/modules/tts/portaspeech/fvae.py b/modules/tts/portaspeech/fvae.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3588a4b9b0ab65184930bc3de94fdec9eeb20b --- /dev/null +++ b/modules/tts/portaspeech/fvae.py @@ -0,0 +1,203 @@ +import numpy as np +import torch +import torch.distributions as dist +from torch import nn + +from modules.commons.conv import ConditionalConvBlocks +from modules.commons.normalizing_flow.res_flow import ResFlow +from modules.commons.wavenet import WN +from modules.tts.syntaspeech.syntactic_graph_encoder import GraphAuxEnc + + +class FVAEEncoder(nn.Module): + def __init__(self, c_in, hidden_size, c_latent, kernel_size, + n_layers, c_cond=0, p_dropout=0, strides=[4], nn_type='wn'): + super().__init__() + self.strides = strides + self.hidden_size = hidden_size + if np.prod(strides) == 1: + self.pre_net = nn.Conv1d(c_in, hidden_size, kernel_size=1) + else: + self.pre_net = nn.Sequential(*[ + nn.Conv1d(c_in, hidden_size, kernel_size=s * 2, stride=s, padding=s // 2) + if i == 0 else + nn.Conv1d(hidden_size, hidden_size, kernel_size=s * 2, stride=s, padding=s // 2) + for i, s in enumerate(strides) + ]) + if nn_type == 'wn': + self.nn = WN(hidden_size, kernel_size, 1, n_layers, c_cond, p_dropout) + elif nn_type == 'conv': + self.nn = ConditionalConvBlocks( + hidden_size, c_cond, hidden_size, None, kernel_size, + layers_in_block=2, is_BTC=False, num_layers=n_layers) + + self.out_proj = nn.Conv1d(hidden_size, c_latent * 2, 1) + self.latent_channels = c_latent + + def forward(self, x, nonpadding, cond): + x = self.pre_net(x) + nonpadding = nonpadding[:, :, ::np.prod(self.strides)][:, :, :x.shape[-1]] + x = x * nonpadding + x = self.nn(x, nonpadding=nonpadding, cond=cond) * nonpadding + x = self.out_proj(x) + m, logs = torch.split(x, self.latent_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) + return z, m, logs, nonpadding + + +class FVAEDecoder(nn.Module): + def __init__(self, c_latent, hidden_size, out_channels, kernel_size, + n_layers, c_cond=0, p_dropout=0, strides=[4], nn_type='wn'): + super().__init__() + self.strides = strides + self.hidden_size = hidden_size + self.pre_net = nn.Sequential(*[ + nn.ConvTranspose1d(c_latent, hidden_size, kernel_size=s, stride=s) + if i == 0 else + nn.ConvTranspose1d(hidden_size, hidden_size, 
kernel_size=s, stride=s) + for i, s in enumerate(strides) + ]) + if nn_type == 'wn': + self.nn = WN(hidden_size, kernel_size, 1, n_layers, c_cond, p_dropout) + elif nn_type == 'conv': + self.nn = ConditionalConvBlocks( + hidden_size, c_cond, hidden_size, [1] * n_layers, kernel_size, + layers_in_block=2, is_BTC=False) + self.out_proj = nn.Conv1d(hidden_size, out_channels, 1) + + def forward(self, x, nonpadding, cond): + x = self.pre_net(x) + x = x * nonpadding + x = self.nn(x, nonpadding=nonpadding, cond=cond) * nonpadding + x = self.out_proj(x) + return x + + +class FVAE(nn.Module): + def __init__(self, + c_in_out, hidden_size, c_latent, + kernel_size, enc_n_layers, dec_n_layers, c_cond, strides, + use_prior_flow, flow_hidden=None, flow_kernel_size=None, flow_n_steps=None, + encoder_type='wn', decoder_type='wn'): + super(FVAE, self).__init__() + self.strides = strides + self.hidden_size = hidden_size + self.latent_size = c_latent + self.use_prior_flow = use_prior_flow + if np.prod(strides) == 1: + self.g_pre_net = nn.Conv1d(c_cond, c_cond, kernel_size=1) + else: + self.g_pre_net = nn.Sequential(*[ + nn.Conv1d(c_cond, c_cond, kernel_size=s * 2, stride=s, padding=s // 2) + for i, s in enumerate(strides) + ]) + self.encoder = FVAEEncoder(c_in_out, hidden_size, c_latent, kernel_size, + enc_n_layers, c_cond, strides=strides, nn_type=encoder_type) + if use_prior_flow: + self.prior_flow = ResFlow( + c_latent, flow_hidden, flow_kernel_size, flow_n_steps, 4, c_cond=c_cond) + self.decoder = FVAEDecoder(c_latent, hidden_size, c_in_out, kernel_size, + dec_n_layers, c_cond, strides=strides, nn_type=decoder_type) + self.prior_dist = dist.Normal(0, 1) + + def forward(self, x=None, nonpadding=None, cond=None, infer=False, noise_scale=1.0): + """ + + :param x: [B, C_in_out, T] + :param nonpadding: [B, 1, T] + :param cond: [B, C_g, T] + :return: + """ + if nonpadding is None: + nonpadding = 1 + cond_sqz = self.g_pre_net(cond) + if not infer: + z_q, m_q, logs_q, nonpadding_sqz = self.encoder(x, nonpadding, cond_sqz) + q_dist = dist.Normal(m_q, logs_q.exp()) + if self.use_prior_flow: + logqx = q_dist.log_prob(z_q) + z_p = self.prior_flow(z_q, nonpadding_sqz, cond_sqz) + logpx = self.prior_dist.log_prob(z_p) + loss_kl = ((logqx - logpx) * nonpadding_sqz).sum() / nonpadding_sqz.sum() / logqx.shape[1] + else: + loss_kl = torch.distributions.kl_divergence(q_dist, self.prior_dist) + loss_kl = (loss_kl * nonpadding_sqz).sum() / nonpadding_sqz.sum() / z_q.shape[1] + z_p = None + return z_q, loss_kl, z_p, m_q, logs_q + else: + latent_shape = [cond_sqz.shape[0], self.latent_size, cond_sqz.shape[2]] + z_p = torch.randn(latent_shape).to(cond.device) * noise_scale + if self.use_prior_flow: + z_p = self.prior_flow(z_p, 1, cond_sqz, reverse=True) + return z_p + + +class SyntaFVAE(nn.Module): + def __init__(self, + c_in_out, hidden_size, c_latent, + kernel_size, enc_n_layers, dec_n_layers, c_cond, strides, + use_prior_flow, flow_hidden=None, flow_kernel_size=None, flow_n_steps=None, + encoder_type='wn', decoder_type='wn'): + super(SyntaFVAE, self).__init__() + self.strides = strides + self.hidden_size = hidden_size + self.latent_size = c_latent + self.use_prior_flow = use_prior_flow + if np.prod(strides) == 1: + self.g_pre_net = nn.Conv1d(c_cond, c_cond, kernel_size=1) + else: + self.g_pre_net = nn.Sequential(*[ + nn.Conv1d(c_cond, c_cond, kernel_size=s * 2, stride=s, padding=s // 2) + for i, s in enumerate(strides) + ]) + self.encoder = FVAEEncoder(c_in_out, hidden_size, c_latent, kernel_size, + enc_n_layers, c_cond, 
strides=strides, nn_type=encoder_type) + if use_prior_flow: + self.prior_flow = ResFlow( + c_latent, flow_hidden, flow_kernel_size, flow_n_steps, 4, c_cond=c_cond) + self.decoder = FVAEDecoder(c_latent, hidden_size, c_in_out, kernel_size, + dec_n_layers, c_cond, strides=strides, nn_type=decoder_type) + self.prior_dist = dist.Normal(0, 1) + self.graph_encoder = GraphAuxEnc(in_dim=hidden_size, hid_dim=hidden_size,out_dim=hidden_size) + + def forward(self, x=None, nonpadding=None, cond=None, infer=False, noise_scale=1.0, + mel2word=None, ph2word=None, graph_lst=None, etypes_lst=None): + """ + + :param x: target mel, [B, C_in_out, T] + :param nonpadding: [B, 1, T] + :param cond: phoneme encoding, [B, C_g, T] + :return: + """ + word_len = ph2word.max(dim=1)[0] + ph_encoding_for_graph = cond.detach() + 0.1 * (cond - cond.detach()) # only 0.1x grad can pass through + _, ph_out_word_encoding_for_graph = GraphAuxEnc.ph_encoding_to_word_encoding(ph_encoding_for_graph.transpose(1,2), mel2word, word_len) + t_m = mel2word.shape[-1] + g_graph = self.graph_encoder.word_forward(graph_lst=graph_lst, word_encoding=ph_out_word_encoding_for_graph, etypes_lst=etypes_lst) + g_graph = g_graph.transpose(1,2) + g_graph = GraphAuxEnc._postprocess_word2ph(g_graph,mel2word,t_m) + g_graph = g_graph.transpose(1,2) + cond = cond + g_graph * 1. + + if nonpadding is None: + nonpadding = 1 + cond_sqz = self.g_pre_net(cond) + if not infer: + z_q, m_q, logs_q, nonpadding_sqz = self.encoder(x, nonpadding, cond_sqz) + q_dist = dist.Normal(m_q, logs_q.exp()) + if self.use_prior_flow: + logqx = q_dist.log_prob(z_q) + z_p = self.prior_flow(z_q, nonpadding_sqz, cond_sqz) + logpx = self.prior_dist.log_prob(z_p) + loss_kl = ((logqx - logpx) * nonpadding_sqz).sum() / nonpadding_sqz.sum() / logqx.shape[1] + else: + loss_kl = torch.distributions.kl_divergence(q_dist, self.prior_dist) + loss_kl = (loss_kl * nonpadding_sqz).sum() / nonpadding_sqz.sum() / z_q.shape[1] + z_p = None + return z_q, loss_kl, z_p, m_q, logs_q + else: + latent_shape = [cond_sqz.shape[0], self.latent_size, cond_sqz.shape[2]] + z_p = torch.randn(latent_shape).to(cond.device) * noise_scale + if self.use_prior_flow: + z_p = self.prior_flow(z_p, 1, cond_sqz, reverse=True) + return z_p \ No newline at end of file diff --git a/modules/tts/portaspeech/portaspeech.py b/modules/tts/portaspeech/portaspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..7f862f5ec7f210efc3faa14172151ec685fe00a9 --- /dev/null +++ b/modules/tts/portaspeech/portaspeech.py @@ -0,0 +1,225 @@ +import math +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import Linear + +from modules.commons.conv import ConvBlocks, ConditionalConvBlocks +from modules.commons.layers import Embedding +from modules.commons.rel_transformer import RelTransformerEncoder +from modules.commons.transformer import MultiheadAttention, FFTBlocks +from modules.tts.commons.align_ops import clip_mel2token_to_multiple, build_word_mask, expand_states, mel2ph_to_mel2word +from modules.tts.fs import FS_DECODERS, FastSpeech +from modules.tts.portaspeech.fvae import FVAE +from utils.commons.meters import Timer +from utils.nn.seq_utils import group_hidden_by_segs + + +class SinusoidalPosEmb(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + """ + + :param x: [B, T] + :return: [B, T, H] + """ + device = x.device + half_dim = self.dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, 
device=device) * -emb) + emb = x[:, :, None] * emb[None, :] + emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb + + +class PortaSpeech(FastSpeech): + def __init__(self, ph_dict_size, word_dict_size, hparams, out_dims=None): + super().__init__(ph_dict_size, hparams, out_dims) + # build linguistic encoder + if hparams['use_word_encoder']: + self.word_encoder = RelTransformerEncoder( + word_dict_size, self.hidden_size, self.hidden_size, self.hidden_size, 2, + hparams['word_enc_layers'], hparams['enc_ffn_kernel_size']) + if hparams['dur_level'] == 'word': + if hparams['word_encoder_type'] == 'rel_fft': + self.ph2word_encoder = RelTransformerEncoder( + 0, self.hidden_size, self.hidden_size, self.hidden_size, 2, + hparams['word_enc_layers'], hparams['enc_ffn_kernel_size']) + if hparams['word_encoder_type'] == 'fft': + self.ph2word_encoder = FFTBlocks( + self.hidden_size, hparams['word_enc_layers'], 1, num_heads=hparams['num_heads']) + self.sin_pos = SinusoidalPosEmb(self.hidden_size) + self.enc_pos_proj = nn.Linear(2 * self.hidden_size, self.hidden_size) + self.dec_query_proj = nn.Linear(2 * self.hidden_size, self.hidden_size) + self.dec_res_proj = nn.Linear(2 * self.hidden_size, self.hidden_size) + self.attn = MultiheadAttention(self.hidden_size, 1, encoder_decoder_attention=True, bias=False) + self.attn.enable_torch_version = False + if hparams['text_encoder_postnet']: + self.text_encoder_postnet = ConvBlocks( + self.hidden_size, self.hidden_size, [1] * 3, 5, layers_in_block=2) + else: + self.sin_pos = SinusoidalPosEmb(self.hidden_size) + # build VAE decoder + if hparams['use_fvae']: + del self.decoder + del self.mel_out + self.fvae = FVAE( + c_in_out=self.out_dims, + hidden_size=hparams['fvae_enc_dec_hidden'], c_latent=hparams['latent_size'], + kernel_size=hparams['fvae_kernel_size'], + enc_n_layers=hparams['fvae_enc_n_layers'], + dec_n_layers=hparams['fvae_dec_n_layers'], + c_cond=self.hidden_size, + use_prior_flow=hparams['use_prior_flow'], + flow_hidden=hparams['prior_flow_hidden'], + flow_kernel_size=hparams['prior_flow_kernel_size'], + flow_n_steps=hparams['prior_flow_n_blocks'], + strides=[hparams['fvae_strides']], + encoder_type=hparams['fvae_encoder_type'], + decoder_type=hparams['fvae_decoder_type'], + ) + else: + self.decoder = FS_DECODERS[hparams['decoder_type']](hparams) + self.mel_out = Linear(self.hidden_size, self.out_dims, bias=True) + if hparams['use_pitch_embed']: + self.pitch_embed = Embedding(300, self.hidden_size, 0) + if self.hparams['add_word_pos']: + self.word_pos_proj = Linear(self.hidden_size, self.hidden_size) + + def build_embedding(self, dictionary, embed_dim): + num_embeddings = len(dictionary) + emb = Embedding(num_embeddings, embed_dim, self.padding_idx) + return emb + + def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel2ph=None, + spk_embed=None, spk_id=None, pitch=None, infer=False, tgt_mels=None, + global_step=None, *args, **kwargs): + ret = {} + style_embed = self.forward_style_embed(spk_embed, spk_id) + x, tgt_nonpadding = self.run_text_encoder( + txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret) + x = x * tgt_nonpadding + ret['nonpadding'] = tgt_nonpadding + if self.hparams['use_pitch_embed']: + x = x + self.pitch_embed(pitch) + ret['decoder_inp'] = x + ret['mel_out_fvae'] = ret['mel_out'] = self.run_decoder(x, tgt_nonpadding, ret, infer, tgt_mels, global_step) + return ret + + def run_text_encoder(self, txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret): + 
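# Builds the frame-level decoder input: encodes phonemes (plus an optional word encoder), predicts word- or phoneme-level durations, expands to mel-frame length, and returns the result together with its non-padding mask. +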
word2word = torch.arange(word_len)[None, :].to(ph2word.device) + 1 # [B, T_mel, T_word] + src_nonpadding = (txt_tokens > 0).float()[:, :, None] + ph_encoder_out = self.encoder(txt_tokens) * src_nonpadding + style_embed + if self.hparams['use_word_encoder']: + word_encoder_out = self.word_encoder(word_tokens) + style_embed + ph_encoder_out = ph_encoder_out + expand_states(word_encoder_out, ph2word) + if self.hparams['dur_level'] == 'word': + word_encoder_out = 0 + h_ph_gb_word = group_hidden_by_segs(ph_encoder_out, ph2word, word_len)[0] + word_encoder_out = word_encoder_out + self.ph2word_encoder(h_ph_gb_word) + if self.hparams['use_word_encoder']: + word_encoder_out = word_encoder_out + self.word_encoder(word_tokens) + mel2word = self.forward_dur(ph_encoder_out, mel2word, ret, ph2word=ph2word, word_len=word_len) + mel2word = clip_mel2token_to_multiple(mel2word, self.hparams['frames_multiple']) + tgt_nonpadding = (mel2word > 0).float()[:, :, None] + enc_pos = self.get_pos_embed(word2word, ph2word) # [B, T_ph, H] + dec_pos = self.get_pos_embed(word2word, mel2word) # [B, T_mel, H] + dec_word_mask = build_word_mask(mel2word, ph2word) # [B, T_mel, T_ph] + x, weight = self.attention(ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask) + if self.hparams['add_word_pos']: + x = x + self.word_pos_proj(dec_pos) + ret['attn'] = weight + else: + mel2ph = self.forward_dur(ph_encoder_out, mel2ph, ret) + mel2ph = clip_mel2token_to_multiple(mel2ph, self.hparams['frames_multiple']) + mel2word = mel2ph_to_mel2word(mel2ph, ph2word) + x = expand_states(ph_encoder_out, mel2ph) + if self.hparams['add_word_pos']: + dec_pos = self.get_pos_embed(word2word, mel2word) # [B, T_mel, H] + x = x + self.word_pos_proj(dec_pos) + tgt_nonpadding = (mel2ph > 0).float()[:, :, None] + if self.hparams['use_word_encoder']: + x = x + expand_states(word_encoder_out, mel2word) + return x, tgt_nonpadding + + def attention(self, ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask): + ph_kv = self.enc_pos_proj(torch.cat([ph_encoder_out, enc_pos], -1)) + word_enc_out_expend = expand_states(word_encoder_out, mel2word) + word_enc_out_expend = torch.cat([word_enc_out_expend, dec_pos], -1) + if self.hparams['text_encoder_postnet']: + word_enc_out_expend = self.dec_res_proj(word_enc_out_expend) + word_enc_out_expend = self.text_encoder_postnet(word_enc_out_expend) + dec_q = x_res = word_enc_out_expend + else: + dec_q = self.dec_query_proj(word_enc_out_expend) + x_res = self.dec_res_proj(word_enc_out_expend) + ph_kv, dec_q = ph_kv.transpose(0, 1), dec_q.transpose(0, 1) + x, (weight, _) = self.attn(dec_q, ph_kv, ph_kv, attn_mask=(1 - dec_word_mask) * -1e9) + x = x.transpose(0, 1) + x = x + x_res + return x, weight + + def run_decoder(self, x, tgt_nonpadding, ret, infer, tgt_mels=None, global_step=0): + if not self.hparams['use_fvae']: + x = self.decoder(x) + x = self.mel_out(x) + ret['kl'] = 0 + return x * tgt_nonpadding + else: + decoder_inp = x + x = x.transpose(1, 2) # [B, H, T] + tgt_nonpadding_BHT = tgt_nonpadding.transpose(1, 2) # [B, H, T] + if infer: + z = self.fvae(cond=x, infer=True) + else: + tgt_mels = tgt_mels.transpose(1, 2) # [B, 80, T] + z, ret['kl'], ret['z_p'], ret['m_q'], ret['logs_q'] = self.fvae( + tgt_mels, tgt_nonpadding_BHT, cond=x) + if global_step < self.hparams['posterior_start_steps']: + z = torch.randn_like(z) + x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2) + ret['pre_mel_out'] = x_recon + return x_recon + + def 
forward_dur(self, dur_input, mel2word, ret, **kwargs): + """ + + :param dur_input: [B, T_txt, H] + :param mel2ph: [B, T_mel] + :param txt_tokens: [B, T_txt] + :param ret: + :return: + """ + src_padding = dur_input.data.abs().sum(-1) == 0 + dur_input = dur_input.detach() + self.hparams['predictor_grad'] * (dur_input - dur_input.detach()) + dur = self.dur_predictor(dur_input, src_padding) + if self.hparams['dur_level'] == 'word': + word_len = kwargs['word_len'] + ph2word = kwargs['ph2word'] + B, T_ph = ph2word.shape + dur = torch.zeros([B, word_len.max() + 1]).to(ph2word.device).scatter_add(1, ph2word, dur) + dur = dur[:, 1:] + ret['dur'] = dur + if mel2word is None: + mel2word = self.length_regulator(dur).detach() + return mel2word + + def get_pos_embed(self, word2word, x2word): + x_pos = build_word_mask(word2word, x2word).float() # [B, T_word, T_ph] + x_pos = (x_pos.cumsum(-1) / x_pos.sum(-1).clamp(min=1)[..., None] * x_pos).sum(1) + x_pos = self.sin_pos(x_pos.float()) # [B, T_ph, H] + return x_pos + + def store_inverse_all(self): + def remove_weight_norm(m): + try: + if hasattr(m, 'store_inverse'): + m.store_inverse() + nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(remove_weight_norm) diff --git a/modules/tts/portaspeech/portaspeech_flow.py b/modules/tts/portaspeech/portaspeech_flow.py new file mode 100644 index 0000000000000000000000000000000000000000..256887dd8b365e38ac6c1973f4ec376e93029652 --- /dev/null +++ b/modules/tts/portaspeech/portaspeech_flow.py @@ -0,0 +1,75 @@ +import torch +import torch.distributions as dist +from torch import nn +from modules.commons.normalizing_flow.glow_modules import Glow +from modules.tts.portaspeech.portaspeech import PortaSpeech + + +class PortaSpeechFlow(PortaSpeech): + def __init__(self, ph_dict_size, word_dict_size, hparams, out_dims=None): + super().__init__(ph_dict_size, word_dict_size, hparams, out_dims) + cond_hs = 80 + if hparams.get('use_txt_cond', True): + cond_hs = cond_hs + hparams['hidden_size'] + if hparams.get('use_latent_cond', False): + cond_hs = cond_hs + hparams['latent_size'] + if hparams['use_cond_proj']: + self.g_proj = nn.Conv1d(cond_hs, 160, 5, padding=2) + cond_hs = 160 + self.post_flow = Glow( + 80, hparams['post_glow_hidden'], hparams['post_glow_kernel_size'], 1, + hparams['post_glow_n_blocks'], hparams['post_glow_n_block_layers'], + n_split=4, n_sqz=2, + gin_channels=cond_hs, + share_cond_layers=hparams['post_share_cond_layers'], + share_wn_layers=hparams['share_wn_layers'], + sigmoid_scale=hparams['sigmoid_scale'] + ) + self.prior_dist = dist.Normal(0, 1) + + def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel2ph=None, + spk_embed=None, spk_id=None, pitch=None, infer=False, tgt_mels=None, + forward_post_glow=True, two_stage=True, global_step=None): + is_training = self.training + train_fvae = not (forward_post_glow and two_stage) + if not train_fvae: + self.eval() + with torch.set_grad_enabled(mode=train_fvae): + ret = super(PortaSpeechFlow, self).forward( + txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, + spk_embed, spk_id, pitch, infer, tgt_mels, global_step) + if (forward_post_glow or not two_stage) and self.hparams['use_post_flow']: + self.run_post_glow(tgt_mels, infer, is_training, ret) + return ret + + def run_post_glow(self, tgt_mels, infer, is_training, ret): + x_recon = ret['mel_out'].transpose(1, 2) + g = x_recon + B, _, T = g.shape + if self.hparams.get('use_txt_cond', True): + g = torch.cat([g, 
ret['decoder_inp'].transpose(1, 2)], 1) + if self.hparams.get('use_latent_cond', False): + g_z = ret['z_p'][:, :, :, None].repeat(1, 1, 1, 4).reshape(B, -1, T) + g = torch.cat([g, g_z], 1) + if self.hparams['use_cond_proj']: + g = self.g_proj(g) + prior_dist = self.prior_dist + if not infer: + if is_training: + self.post_flow.train() + nonpadding = ret['nonpadding'].transpose(1, 2) + y_lengths = nonpadding.sum(-1) + if self.hparams['detach_postflow_input']: + g = g.detach() + tgt_mels = tgt_mels.transpose(1, 2) + z_postflow, ldj = self.post_flow(tgt_mels, nonpadding, g=g) + ldj = ldj / y_lengths / 80 + ret['z_pf'], ret['ldj_pf'] = z_postflow, ldj + ret['postflow'] = -prior_dist.log_prob(z_postflow).mean() - ldj.mean() + if torch.isnan(ret['postflow']): + ret['postflow'] = None + else: + nonpadding = torch.ones_like(x_recon[:, :1, :]) + z_post = torch.randn(x_recon.shape).to(g.device) * self.hparams['noise_scale'] + x_recon, _ = self.post_flow(z_post, nonpadding, g, reverse=True) + ret['mel_out'] = x_recon.transpose(1, 2) diff --git a/modules/tts/syntaspeech/__pycache__/multi_window_disc.cpython-36.pyc b/modules/tts/syntaspeech/__pycache__/multi_window_disc.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b90f1718adb6d3a1f70de4caabf69137d4c6865f Binary files /dev/null and b/modules/tts/syntaspeech/__pycache__/multi_window_disc.cpython-36.pyc differ diff --git a/modules/tts/syntaspeech/__pycache__/multi_window_disc.cpython-37.pyc b/modules/tts/syntaspeech/__pycache__/multi_window_disc.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fac53bf338b819d52c7e874e3ed29ea78231d23b Binary files /dev/null and b/modules/tts/syntaspeech/__pycache__/multi_window_disc.cpython-37.pyc differ diff --git a/modules/tts/syntaspeech/__pycache__/syntactic_graph_buider.cpython-36.pyc b/modules/tts/syntaspeech/__pycache__/syntactic_graph_buider.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c65d51fd3b5f42b0bf991fff23aa0b64873665b0 Binary files /dev/null and b/modules/tts/syntaspeech/__pycache__/syntactic_graph_buider.cpython-36.pyc differ diff --git a/modules/tts/syntaspeech/__pycache__/syntactic_graph_buider.cpython-37.pyc b/modules/tts/syntaspeech/__pycache__/syntactic_graph_buider.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8dcf68a4e255e274fbe3fc8d8d3a2eb91567f75b Binary files /dev/null and b/modules/tts/syntaspeech/__pycache__/syntactic_graph_buider.cpython-37.pyc differ diff --git a/modules/tts/syntaspeech/__pycache__/syntactic_graph_encoder.cpython-36.pyc b/modules/tts/syntaspeech/__pycache__/syntactic_graph_encoder.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b24c52014b4a2076c23c5a46e7ca191938851a3b Binary files /dev/null and b/modules/tts/syntaspeech/__pycache__/syntactic_graph_encoder.cpython-36.pyc differ diff --git a/modules/tts/syntaspeech/__pycache__/syntactic_graph_encoder.cpython-37.pyc b/modules/tts/syntaspeech/__pycache__/syntactic_graph_encoder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fac7f009c7b4567908a80111ed2a8b0e9160bf0c Binary files /dev/null and b/modules/tts/syntaspeech/__pycache__/syntactic_graph_encoder.cpython-37.pyc differ diff --git a/modules/tts/syntaspeech/__pycache__/syntaspeech.cpython-36.pyc b/modules/tts/syntaspeech/__pycache__/syntaspeech.cpython-36.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ee722d1e719f3a13f47d70ab34710aef1bdc1c47 Binary files /dev/null and b/modules/tts/syntaspeech/__pycache__/syntaspeech.cpython-36.pyc differ diff --git a/modules/tts/syntaspeech/__pycache__/syntaspeech.cpython-37.pyc b/modules/tts/syntaspeech/__pycache__/syntaspeech.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ac4d613be5d21aa9fa762eed9171a94f2e41cda Binary files /dev/null and b/modules/tts/syntaspeech/__pycache__/syntaspeech.cpython-37.pyc differ diff --git a/modules/tts/syntaspeech/multi_window_disc.py b/modules/tts/syntaspeech/multi_window_disc.py new file mode 100644 index 0000000000000000000000000000000000000000..a8166ac5b514e501043b9fed13aab01421a6c10e --- /dev/null +++ b/modules/tts/syntaspeech/multi_window_disc.py @@ -0,0 +1,136 @@ +import numpy as np +import torch +import torch.nn as nn + + +class SingleWindowDisc(nn.Module): + def __init__(self, time_length, freq_length=80, kernel=(3, 3), c_in=1, hidden_size=128): + super().__init__() + padding = (kernel[0] // 2, kernel[1] // 2) + self.model = nn.ModuleList([ + nn.Sequential(*[ + nn.Conv2d(c_in, hidden_size, kernel, (2, 2), padding), + nn.LeakyReLU(0.2, inplace=True), + nn.Dropout2d(0.25), + nn.BatchNorm2d(hidden_size, 0.8) + ]), + nn.Sequential(*[ + nn.Conv2d(hidden_size, hidden_size, kernel, (2, 2), padding), + nn.LeakyReLU(0.2, inplace=True), + nn.Dropout2d(0.25), + nn.BatchNorm2d(hidden_size, 0.8) + ]), + nn.Sequential(*[ + nn.Conv2d(hidden_size, hidden_size, kernel, (2, 2), padding), + nn.LeakyReLU(0.2, inplace=True), + nn.Dropout2d(0.25), + ]), + ]) + ds_size = (time_length // 2 ** 3, (freq_length + 7) // 2 ** 3) + self.adv_layer = nn.Linear(hidden_size * ds_size[0] * ds_size[1], 1) + + def forward(self, x): + """ + :param x: [B, C, T, n_bins] + :return: validity: [B, 1], h: List of hiddens + """ + h = [] + for l in self.model: + x = l(x) + h.append(x) + x = x.view(x.shape[0], -1) + validity = self.adv_layer(x) # [B, 1] + return validity, h + + +class MultiWindowDiscriminator(nn.Module): + def __init__(self, time_lengths, freq_length=80, kernel=(3, 3), c_in=1, hidden_size=128): + super(MultiWindowDiscriminator, self).__init__() + self.win_lengths = time_lengths + self.discriminators = nn.ModuleList() + + for time_length in time_lengths: + self.discriminators += [SingleWindowDisc(time_length, freq_length, kernel, c_in=c_in, hidden_size=hidden_size)] + + def forward(self, x, x_len, start_frames_wins=None): + ''' + Args: + x (tensor): input mel, (B, c_in, T, n_bins). + x_length (tensor): len of per mel. (B,). + + Returns: + tensor : (B). + ''' + validity = [] + if start_frames_wins is None: + start_frames_wins = [None] * len(self.discriminators) + h = [] + for i, start_frames in zip(range(len(self.discriminators)), start_frames_wins): + x_clip, start_frames = self.clip(x, x_len, self.win_lengths[i], start_frames) # (B, win_length, C) + start_frames_wins[i] = start_frames + if x_clip is None: + continue + x_clip, h_ = self.discriminators[i](x_clip) + h += h_ + validity.append(x_clip) + if len(validity) != len(self.discriminators): + return None, start_frames_wins, h + validity = sum(validity) # [B] + return validity, start_frames_wins, h + + def clip(self, x, x_len, win_length, start_frames=None): + '''Ramdom clip x to win_length. + Args: + x (tensor) : (B, c_in, T, n_bins). + cond (tensor) : (B, T, H). + x_len (tensor) : (B,). + win_length (int): target clip length + + Returns: + (tensor) : (B, c_in, win_length, n_bins). 
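+ (list) : per-sample start frame indices used for the clip.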
+ + ''' + T_start = 0 + T_end = x_len.max() - win_length + if T_end < 0: + return None, None, start_frames + T_end = T_end.item() + if start_frames is None: + start_frame = np.random.randint(low=T_start, high=T_end + 1) + start_frames = [start_frame] * x.size(0) + else: + start_frame = start_frames[0] + x_batch = x[:, :, start_frame: start_frame + win_length] + return x_batch, start_frames + + +class Discriminator(nn.Module): + def __init__(self, time_lengths=[32, 64, 128], freq_length=80, kernel=(3, 3), c_in=1, + hidden_size=128): + super(Discriminator, self).__init__() + self.time_lengths = time_lengths + self.discriminator = MultiWindowDiscriminator( + freq_length=freq_length, + time_lengths=time_lengths, + kernel=kernel, + c_in=c_in, hidden_size=hidden_size + ) + + + def forward(self, x, start_frames_wins=None): + """ + + :param x: [B, T, 80] + :param return_y_only: + :return: + """ + if len(x.shape) == 3: + x = x[:, None, :, :] # [B,1,T,80] + x_len = x.sum([1, -1]).ne(0).int().sum([-1]) + ret = {'y_c': None, 'y': None} + ret['y'], start_frames_wins, ret['h'] = self.discriminator( + x, x_len, start_frames_wins=start_frames_wins) + + ret['start_frames_wins'] = start_frames_wins + return ret + diff --git a/modules/tts/syntaspeech/syntactic_graph_buider.py b/modules/tts/syntaspeech/syntactic_graph_buider.py new file mode 100644 index 0000000000000000000000000000000000000000..4bafdd0bc2c8dbe33e0b5e4dfce939b9eb7df8ee --- /dev/null +++ b/modules/tts/syntaspeech/syntactic_graph_buider.py @@ -0,0 +1,291 @@ +from copy import deepcopy +import torch +import dgl +import stanza +import networkx as nx + +class Sentence2GraphParser: + def __init__(self, language='zh', use_gpu=False): + self.language = language + self.stanza_parser = stanza.Pipeline(lang=language, use_gpu=use_gpu) + + def parse(self, clean_sentence=None, words=None, ph_words=None): + if self.language == 'zh': + assert words is not None and ph_words is not None + ret = self._parse_zh(words, ph_words) + elif self.language == 'en': + assert clean_sentence is not None + ret = self._parse_en(clean_sentence) + else: + raise NotImplementedError + return ret + + def _parse_zh(self, words, ph_words, enable_backward_edge=True, enable_recur_edge=True, + enable_inter_sentence_edge=True, sequential_edge=False): + """ + words: , each character in chinese is one item + ph_words: , each character in chinese is one item, represented by the phoneme + Example: + text1 = '宝马配挂跛骡鞍,貂蝉怨枕董翁榻.' 
+ words = ['', '宝', '马', '配', '挂', '跛', '骡', '鞍', ',' + , '貂', '蝉', '怨', '枕', '董', '翁', '榻', ''] + ph_words = ['', 'b_ao3_|', 'm_a3_#', 'p_ei4_|', 'g_ua4_#', + 'b_o3_#', 'l_uo2_|', 'an1', ',', 'd_iao1_|', + 'ch_an2_#', 'van4_#', 'zh_en3_#', 'd_ong3_|', 'ueng1_#', 't_a4', ''] + """ + words, ph_words = words[1:-1], ph_words[1:-1] # delete and + for i, p_w in enumerate(ph_words): + if p_w == ',': + # change english ',' into chinese + # we found it necessary in stanza's dependency parsing + words[i], ph_words[i] = ',', ',' + tmp_words = deepcopy(words) + num_added_space = 0 + for i, p_w in enumerate(ph_words): + if p_w.endswith("#"): + # add a blank after the p_w with '#', to separate words + tmp_words.insert(num_added_space + i + 1, " ") + num_added_space += 1 + if p_w in [',', ',']: + # add one blank before and after ', ', respectively + tmp_words.insert(num_added_space + i + 1, " ") # insert behind ',' first + tmp_words.insert(num_added_space + i, " ") # insert before + num_added_space += 2 + clean_text = ''.join(tmp_words).strip() + parser_out = self.stanza_parser(clean_text) + + idx_to_word = {i + 1: w for i, w in enumerate(words)} + + vocab_nodes = {} + vocab_idx_offset = 0 + for sentence in parser_out.sentences: + num_nodes_in_current_sentence = 0 + for vocab_node in sentence.words: + num_nodes_in_current_sentence += 1 + vocab_idx = vocab_node.id + vocab_idx_offset + vocab_text = vocab_node.text.replace(" ", "") # delete blank in vocab + vocab_nodes[vocab_idx] = vocab_text + vocab_idx_offset += num_nodes_in_current_sentence + + # start vocab-to-word alignment + vocab_to_word = {} + current_word_idx = 1 + for vocab_i in vocab_nodes.keys(): + vocab_to_word[vocab_i] = [] + for w_in_vocab_i in vocab_nodes[vocab_i]: + if w_in_vocab_i != idx_to_word[current_word_idx]: + raise ValueError("Word Mismatch!") + vocab_to_word[vocab_i].append(current_word_idx) # add a path (vocab_node_idx, word_global_idx) + current_word_idx += 1 + + # then we compute the vocab-level edges + if len(parser_out.sentences) > 5: + print("Detect more than 5 input sentence! 
pls check whether the sentence is too long!") + vocab_level_source_id, vocab_level_dest_id = [], [] + vocab_level_edge_types = [] + sentences_heads = [] + vocab_id_offset = 0 + # get forward edges + for s in parser_out.sentences: + for w in s.words: + w_idx = w.id + vocab_id_offset # it starts from 1, just same as binarizer + w_dest_idx = w.head + vocab_id_offset + if w.head == 0: + sentences_heads.append(w_idx) + continue + vocab_level_source_id.append(w_idx) + vocab_level_dest_id.append(w_dest_idx) + vocab_id_offset += len(s.words) + vocab_level_edge_types += [0] * len(vocab_level_source_id) + num_vocab = vocab_id_offset + + # optional: get backward edges + if enable_backward_edge: + back_source, back_dest = deepcopy(vocab_level_dest_id), deepcopy(vocab_level_source_id) + vocab_level_source_id += back_source + vocab_level_dest_id += back_dest + vocab_level_edge_types += [1] * len(back_source) + + # optional: get inter-sentence edges if num_sentences > 1 + inter_sentence_source, inter_sentence_dest = [], [] + if enable_inter_sentence_edge and len(sentences_heads) > 1: + def get_full_graph_edges(nodes): + tmp_edges = [] + for i, node_i in enumerate(nodes): + for j, node_j in enumerate(nodes): + if i == j: + continue + tmp_edges.append((node_i, node_j)) + return tmp_edges + + tmp_edges = get_full_graph_edges(sentences_heads) + for (source, dest) in tmp_edges: + inter_sentence_source.append(source) + inter_sentence_dest.append(dest) + vocab_level_source_id += inter_sentence_source + vocab_level_dest_id += inter_sentence_dest + vocab_level_edge_types += [3] * len(inter_sentence_source) + + if sequential_edge: + seq_source, seq_dest = list(range(1, num_vocab)) + list(range(num_vocab, 0, -1)), \ + list(range(2, num_vocab + 1)) + list(range(num_vocab - 1, -1, -1)) + vocab_level_source_id += seq_source + vocab_level_dest_id += seq_dest + vocab_level_edge_types += [4] * (num_vocab - 1) + [5] * (num_vocab - 1) + + # Then, we use the vocab-level edges and the vocab-to-word path, to construct the word-level graph + num_word = len(words) + source_id, dest_id, edge_types = [], [], [] + for (vocab_start, vocab_end, vocab_edge_type) in zip(vocab_level_source_id, vocab_level_dest_id, + vocab_level_edge_types): + # connect the first word in the vocab + word_start = min(vocab_to_word[vocab_start]) + word_end = min(vocab_to_word[vocab_end]) + source_id.append(word_start) + dest_id.append(word_end) + edge_types.append(vocab_edge_type) + + # sequential connection in words + for word_indices_in_v in vocab_to_word.values(): + for i, word_idx in enumerate(word_indices_in_v): + if i + 1 < len(word_indices_in_v): + source_id.append(word_idx) + dest_id.append(word_idx + 1) + edge_types.append(4) + if i - 1 >= 0: + source_id.append(word_idx) + dest_id.append(word_idx - 1) + edge_types.append(5) + + # optional: get recurrent edges + if enable_recur_edge: + recur_source, recur_dest = list(range(1, num_word + 1)), list(range(1, num_word + 1)) + source_id += recur_source + dest_id += recur_dest + edge_types += [2] * len(recur_source) + + # add and + source_id += [0, num_word + 1, 1, num_word] + dest_id += [1, num_word, 0, num_word + 1] + edge_types += [4, 4, 5, 5] # 4 represents sequentially forward, 5 is sequential backward + + edges = (torch.LongTensor(source_id), torch.LongTensor(dest_id)) + dgl_graph = dgl.graph(edges) + assert dgl_graph.num_edges() == len(edge_types) + return dgl_graph, torch.LongTensor(edge_types) + + def _parse_en(self, clean_sentence, enable_backward_edge=True, enable_recur_edge=True, + 
enable_inter_sentence_edge=True, sequential_edge=False, consider_bos_for_index=True): + """ + clean_sentence: , each word or punctuation should be separated by one blank. + """ + edge_types = [] # required for gated graph neural network + clean_sentence = clean_sentence.strip() + if clean_sentence.endswith((" .", " ,", " ;", " :", " ?", " !")): + clean_sentence = clean_sentence[:-2] + if clean_sentence.startswith(". "): + clean_sentence = clean_sentence[2:] + parser_out = self.stanza_parser(clean_sentence) + if len(parser_out.sentences) > 5: + print("Detect more than 5 input sentence! pls check whether the sentence is too long!") + print(clean_sentence) + source_id, dest_id = [], [] + sentences_heads = [] + word_id_offset = 0 + # get forward edges + for s in parser_out.sentences: + for w in s.words: + w_idx = w.id + word_id_offset # it starts from 1, just same as binarizer + w_dest_idx = w.head + word_id_offset + if w.head == 0: + sentences_heads.append(w_idx) + continue + source_id.append(w_idx) + dest_id.append(w_dest_idx) + word_id_offset += len(s.words) + num_word = word_id_offset + edge_types += [0] * len(source_id) + + # optional: get backward edges + if enable_backward_edge: + back_source, back_dest = deepcopy(dest_id), deepcopy(source_id) + source_id += back_source + dest_id += back_dest + edge_types += [1] * len(back_source) + + # optional: get recurrent edges + if enable_recur_edge: + recur_source, recur_dest = list(range(1, num_word + 1)), list(range(1, num_word + 1)) + source_id += recur_source + dest_id += recur_dest + edge_types += [2] * len(recur_source) + + # optional: get inter-sentence edges if num_sentences > 1 + inter_sentence_source, inter_sentence_dest = [], [] + if enable_inter_sentence_edge and len(sentences_heads) > 1: + def get_full_graph_edges(nodes): + tmp_edges = [] + for i, node_i in enumerate(nodes): + for j, node_j in enumerate(nodes): + if i == j: + continue + tmp_edges.append((node_i, node_j)) + return tmp_edges + + tmp_edges = get_full_graph_edges(sentences_heads) + for (source, dest) in tmp_edges: + inter_sentence_source.append(source) + inter_sentence_dest.append(dest) + source_id += inter_sentence_source + dest_id += inter_sentence_dest + edge_types += [3] * len(inter_sentence_source) + + # add and + source_id += [0, num_word + 1, 1, num_word] + dest_id += [1, num_word, 0, num_word + 1] + edge_types += [4, 4, 5, 5] # 4 represents sequentially forward, 5 is sequential backward + + # optional: sequential edge + if sequential_edge: + seq_source, seq_dest = list(range(1, num_word)) + list(range(num_word, 0, -1)), \ + list(range(2, num_word + 1)) + list(range(num_word - 1, -1, -1)) + source_id += seq_source + dest_id += seq_dest + edge_types += [4] * (num_word - 1) + [5] * (num_word - 1) + if consider_bos_for_index: + edges = (torch.LongTensor(source_id), torch.LongTensor(dest_id)) + else: + edges = (torch.LongTensor(source_id) - 1, torch.LongTensor(dest_id) - 1) + dgl_graph = dgl.graph(edges) + assert dgl_graph.num_edges() == len(edge_types) + return dgl_graph, torch.LongTensor(edge_types) + + +def plot_dgl_sentence_graph(dgl_graph, labels): + """ + labels = {idx: word for idx,word in enumerate(sentence.split(" ")) } + """ + import matplotlib.pyplot as plt + nx_graph = dgl_graph.to_networkx() + pos = nx.random_layout(nx_graph) + nx.draw(nx_graph, pos, with_labels=False) + nx.draw_networkx_labels(nx_graph, pos, labels) + plt.show() + +if __name__ == '__main__': + + # Unit Test for Chinese Graph Builder + parser = Sentence2GraphParser("zh") + text1 = 
'宝马配挂跛骡鞍,貂蝉怨枕董翁榻.' + words = ['', '宝', '马', '配', '挂', '跛', '骡', '鞍', ',', '貂', '蝉', '怨', '枕', '董', '翁', '榻', ''] + ph_words = ['', 'b_ao3_|', 'm_a3_#', 'p_ei4_|', 'g_ua4_#', 'b_o3_#', 'l_uo2_|', 'an1', ',', 'd_iao1_|', + 'ch_an2_#', 'van4_#', 'zh_en3_#', 'd_ong3_|', 'ueng1_#', 't_a4', ''] + graph1, etypes1 = parser.parse(text1, words, ph_words) + plot_dgl_sentence_graph(graph1, {i: w for i, w in enumerate(ph_words)}) + + # Unit Test for English Graph Builder + parser = Sentence2GraphParser("en") + text2 = "I love you . You love me . Mixue ice-scream and tea ." + graph2, etypes2 = parser.parse(text2) + plot_dgl_sentence_graph(graph2, {i: w for i, w in enumerate((" " + text2 + " ").split(" "))}) + diff --git a/modules/tts/syntaspeech/syntactic_graph_encoder.py b/modules/tts/syntaspeech/syntactic_graph_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..0260b3100e6636f9684fc8ddff1775cafd33eba4 --- /dev/null +++ b/modules/tts/syntaspeech/syntactic_graph_encoder.py @@ -0,0 +1,193 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import dgl +from dgl.nn.pytorch import GatedGraphConv + +def sequence_mask(lengths, maxlen, dtype=torch.bool): + if maxlen is None: + maxlen = lengths.max() + mask = ~(torch.ones((len(lengths), maxlen)).to(lengths.device).cumsum(dim=1).t() > lengths).t() + mask.type(dtype) + return mask + + +def group_hidden_by_segs(h, seg_ids, max_len): + """ + :param h: [B, T, H] + :param seg_ids: [B, T] + :return: h_ph: [B, T_ph, H] + """ + B, T, H = h.shape + h_gby_segs = h.new_zeros([B, max_len + 1, H]).scatter_add_(1, seg_ids[:, :, None].repeat([1, 1, H]), h) + all_ones = h.new_ones(h.shape[:2]) + cnt_gby_segs = h.new_zeros([B, max_len + 1]).scatter_add_(1, seg_ids, all_ones).contiguous() + h_gby_segs = h_gby_segs[:, 1:] + cnt_gby_segs = cnt_gby_segs[:, 1:] + h_gby_segs = h_gby_segs / torch.clamp(cnt_gby_segs[:, :, None], min=1) + # assert h_gby_segs.shape[-1] == 192 + return h_gby_segs + +class GraphAuxEnc(nn.Module): + def __init__(self, in_dim, hid_dim, out_dim, n_iterations=5, n_edge_types=6): + super(GraphAuxEnc, self).__init__() + self.in_dim = in_dim + self.hid_dim = hid_dim + self.out_dim = out_dim + self.skip_connect = True + self.dropout_after_gae = False + + self.ggc_1 = GatedGraphConv(in_feats=in_dim, out_feats=hid_dim + , n_steps=n_iterations, n_etypes=n_edge_types) + self.ggc_2 = GatedGraphConv(in_feats=hid_dim, out_feats=out_dim + , n_steps=n_iterations, n_etypes=n_edge_types) + self.dropout = nn.Dropout(p=0.5) + + @staticmethod + def ph_encoding_to_word_encoding(ph_encoding, ph2word, word_len): + """ + ph_encoding: [batch, t_p, hid] + ph2word: tensor [batch, t_w] + word_len: tensor [batch] + """ + word_encoding_for_graph, batch_word_encoding, has_word_row_idx = GraphAuxEnc._process_ph_to_word_encoding( + ph_encoding, + ph2word, + word_len) + # [batch, t_w, hid] + return batch_word_encoding, word_encoding_for_graph + + def pad_word_encoding_to_phoneme(self, word_encoding, ph2word, t_p): + return self._postprocess_word2ph(word_encoding, ph2word, t_p) + + @staticmethod + def _process_ph_to_word_encoding(ph_encoding, ph2word, word_len=None): + """ + ph_encoding: [batch, t_p, hid] + ph2word: tensor [batch, t_w] + word_len: tensor [batch] + """ + word_len = word_len.reshape([-1,]) + max_len = max(word_len) + num_nodes = sum(word_len) + + batch_word_encoding = group_hidden_by_segs(ph_encoding, ph2word, max_len) + bs, t_p, hid = batch_word_encoding.shape + has_word_mask = sequence_mask(word_len, max_len) # [batch, t_p, 
1] + word_encoding = batch_word_encoding.reshape([bs * t_p, hid]) + has_word_row_idx = has_word_mask.reshape([-1]) + word_encoding = word_encoding[has_word_row_idx] + assert word_encoding.shape[0] == num_nodes + return word_encoding, batch_word_encoding, has_word_row_idx + + @staticmethod + def _postprocess_word2ph(word_encoding, ph2word, t_p): + word_encoding = F.pad(word_encoding,[0,0,1,0]) + ph2word_ = ph2word[:, :, None].repeat([1, 1, word_encoding.shape[-1]]) + out = torch.gather(word_encoding, 1, ph2word_) # [B, T, H] + return out + + @staticmethod + def _repeat_one_sequence(x, d, T): + """Repeat each frame according to duration.""" + if d.sum() == 0: + d = d.fill_(1) + hid = x.shape[-1] + expanded_lst = [x_.repeat(int(d_), 1) for x_, d_ in zip(x, d) if d_ != 0] + expanded = torch.cat(expanded_lst, dim=0) + if T > expanded.shape[0]: + expanded = torch.cat([expanded, torch.zeros([T - expanded.shape[0], hid]).to(expanded.device)], dim=0) + return expanded + + def word_forward(self, graph_lst, word_encoding, etypes_lst): + """ + word encoding in, word encoding out. + """ + batched_graph = dgl.batch(graph_lst) + inp = word_encoding + batched_etypes = torch.cat(etypes_lst) # [num_edges_in_batch, 1] + assert batched_graph.num_nodes() == inp.shape[0] + + gcc1_out = self.ggc_1(batched_graph, inp, batched_etypes) + if self.dropout_after_gae: + gcc1_out = self.dropout(gcc1_out) + gcc2_out = self.ggc_2(batched_graph, gcc1_out, batched_etypes) # [num_nodes_in_batch, hin] + if self.dropout_after_gae: + gcc2_out = self.ggc_2(batched_graph, gcc2_out, batched_etypes) + if self.skip_connect: + assert self.in_dim == self.hid_dim and self.hid_dim == self.out_dim + gcc2_out = inp + gcc1_out + gcc1_out + + word_len = torch.tensor([g.num_nodes() for g in graph_lst]).reshape([-1]) + max_len = max(word_len) + has_word_mask = sequence_mask(word_len, max_len) # [batch, t_p, 1] + has_word_row_idx = has_word_mask.reshape([-1]) + bs = len(graph_lst) + t_w = max([g.num_nodes() for g in graph_lst]) + hid = word_encoding.shape[-1] + output = torch.zeros([bs * t_w, hid]).to(gcc2_out.device) + output[has_word_row_idx] = gcc2_out + output = output.reshape([bs, t_w, hid]) + word_level_output = output + return torch.transpose(word_level_output, 1, 2) + + def forward(self, graph_lst, ph_encoding, ph2word, etypes_lst, return_word_encoding=False): + """ + graph_lst: [list of dgl_graph] + ph_encoding: [batch, hid, t_p] + ph2word: [list of list[1,2,2,2,3,3,3]] + etypes_lst: [list of etypes]; etypes: torch.LongTensor + """ + t_p = ph_encoding.shape[-1] + ph_encoding = ph_encoding.transpose(1,2) # [batch, t_p, hid] + word_len = torch.tensor([g.num_nodes() for g in graph_lst]).reshape([-1]) + batched_graph = dgl.batch(graph_lst) + inp, batched_word_encoding, has_word_row_idx = self._process_ph_to_word_encoding(ph_encoding, ph2word, + word_len=word_len) # [num_nodes_in_batch, in_dim] + bs, t_w, hid = batched_word_encoding.shape + batched_etypes = torch.cat(etypes_lst) # [num_edges_in_batch, 1] + gcc1_out = self.ggc_1(batched_graph, inp, batched_etypes) + gcc2_out = self.ggc_2(batched_graph, gcc1_out, batched_etypes) # [num_nodes_in_batch, hin] + # skip connection + gcc2_out = inp + gcc1_out + gcc1_out # [n_nodes, hid] + + output = torch.zeros([bs * t_w, hid]).to(gcc2_out.device) + output[has_word_row_idx] = gcc2_out + output = output.reshape([bs, t_w, hid]) + word_level_output = output + output = self._postprocess_word2ph(word_level_output, ph2word, t_p) # [batch, t_p, hid] + output = torch.transpose(output, 1, 2) + + if 
return_word_encoding: + return output, torch.transpose(word_level_output, 1, 2) + else: + return output + +if __name__ == '__main__': + # Unit Test for batching graphs + from modules.tts.syntaspeech.syntactic_graph_buider import Sentence2GraphParser, plot_dgl_sentence_graph + parser = Sentence2GraphParser("en") + + # Unit Test for English Graph Builder + text1 = "To be or not to be , that 's a question ." + text2 = "I love you . You love me . Mixue ice-scream and tea ." + graph1, etypes1 = parser.parse(text1) + graph2, etypes2 = parser.parse(text2) + batched_text = " " + text1 + " " + " " + " " + text2 + " " + batched_nodes = [graph1.num_nodes(), graph2.num_nodes()] + plot_dgl_sentence_graph(dgl.batch([graph1, graph2]), {i: w for i, w in enumerate(batched_text.split(" "))}) + etypes_lst = [etypes1, etypes2] + + # Unit Test for Graph Encoder forward + in_feats = 4 + out_feats = 4 + enc = GraphAuxEnc(in_dim=in_feats, hid_dim=in_feats, out_dim=out_feats) + ph2word = torch.tensor([ + [1, 2, 3, 3, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0], + [1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + ]) + inp = torch.randn([2, in_feats, 17]) # [N_sentence, feat, ph_length] + graph_lst = [graph1, graph2] + out = enc(graph_lst, inp, ph2word, etypes_lst) + print(out.shape) # [N_sentence, feat, ph_length] diff --git a/modules/tts/syntaspeech/syntaspeech.py b/modules/tts/syntaspeech/syntaspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..d6652a419babd7e064b5b387ed85fa08b6eec07b --- /dev/null +++ b/modules/tts/syntaspeech/syntaspeech.py @@ -0,0 +1,251 @@ +import math +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import Linear + +from modules.commons.conv import ConvBlocks, ConditionalConvBlocks +from modules.commons.layers import Embedding +from modules.commons.rel_transformer import RelTransformerEncoder +from modules.commons.transformer import MultiheadAttention, FFTBlocks +from modules.tts.commons.align_ops import clip_mel2token_to_multiple, build_word_mask, expand_states, mel2ph_to_mel2word +from modules.tts.fs import FS_DECODERS, FastSpeech +from modules.tts.portaspeech.fvae import SyntaFVAE +from utils.commons.meters import Timer +from utils.nn.seq_utils import group_hidden_by_segs +from modules.commons.nar_tts_modules import SyntaDurationPredictor + + +class SinusoidalPosEmb(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + """ + + :param x: [B, T] + :return: [B, T, H] + """ + device = x.device + half_dim = self.dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, device=device) * -emb) + emb = x[:, :, None] * emb[None, :] + emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb + + +class SyntaSpeech(FastSpeech): + def __init__(self, ph_dict_size, word_dict_size, hparams, out_dims=None): + super().__init__(ph_dict_size, hparams, out_dims) + # build linguistic encoder + if hparams['num_spk'] > 1: + self.spk_embed_proj = Embedding(hparams['num_spk'], self.hidden_size) + if hparams['use_word_encoder']: + self.word_encoder = RelTransformerEncoder( + word_dict_size, self.hidden_size, self.hidden_size, self.hidden_size, 2, + hparams['word_enc_layers'], hparams['enc_ffn_kernel_size']) + if hparams['dur_level'] == 'word': + if hparams['word_encoder_type'] == 'rel_fft': + self.ph2word_encoder = RelTransformerEncoder( + 0, self.hidden_size, self.hidden_size, self.hidden_size, 2, + hparams['word_enc_layers'], 
hparams['enc_ffn_kernel_size']) + if hparams['word_encoder_type'] == 'fft': + self.ph2word_encoder = FFTBlocks( + self.hidden_size, hparams['word_enc_layers'], 1, num_heads=hparams['num_heads']) + self.sin_pos = SinusoidalPosEmb(self.hidden_size) + self.enc_pos_proj = nn.Linear(2 * self.hidden_size, self.hidden_size) + self.dec_query_proj = nn.Linear(2 * self.hidden_size, self.hidden_size) + self.dec_res_proj = nn.Linear(2 * self.hidden_size, self.hidden_size) + self.attn = MultiheadAttention(self.hidden_size, 1, encoder_decoder_attention=True, bias=False) + self.attn.enable_torch_version = False + if hparams['text_encoder_postnet']: + self.text_encoder_postnet = ConvBlocks( + self.hidden_size, self.hidden_size, [1] * 3, 5, layers_in_block=2) + else: + self.sin_pos = SinusoidalPosEmb(self.hidden_size) + + predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size + self.dur_predictor = SyntaDurationPredictor( + self.hidden_size, + n_chans=predictor_hidden, + n_layers=hparams['dur_predictor_layers'], + dropout_rate=hparams['predictor_dropout'], + kernel_size=hparams['dur_predictor_kernel']) + # build VAE decoder + if hparams['use_fvae']: + del self.decoder + del self.mel_out + self.fvae = SyntaFVAE( + c_in_out=self.out_dims, + hidden_size=hparams['fvae_enc_dec_hidden'], c_latent=hparams['latent_size'], + kernel_size=hparams['fvae_kernel_size'], + enc_n_layers=hparams['fvae_enc_n_layers'], + dec_n_layers=hparams['fvae_dec_n_layers'], + c_cond=self.hidden_size, + use_prior_flow=hparams['use_prior_flow'], + flow_hidden=hparams['prior_flow_hidden'], + flow_kernel_size=hparams['prior_flow_kernel_size'], + flow_n_steps=hparams['prior_flow_n_blocks'], + strides=[hparams['fvae_strides']], + encoder_type=hparams['fvae_encoder_type'], + decoder_type=hparams['fvae_decoder_type'], + ) + else: + self.decoder = FS_DECODERS[hparams['decoder_type']](hparams) + self.mel_out = Linear(self.hidden_size, self.out_dims, bias=True) + if hparams['use_pitch_embed']: + self.pitch_embed = Embedding(300, self.hidden_size, 0) + if self.hparams['add_word_pos']: + self.word_pos_proj = Linear(self.hidden_size, self.hidden_size) + + def build_embedding(self, dictionary, embed_dim): + num_embeddings = len(dictionary) + emb = Embedding(num_embeddings, embed_dim, self.padding_idx) + return emb + + def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel2ph=None, + spk_embed=None, spk_id=None, pitch=None, infer=False, tgt_mels=None, + global_step=None, graph_lst=None, etypes_lst=None, *args, **kwargs): + + if self.hparams['use_spk_embed'] or self.hparams['use_spk_id']: + spk_embed = self.spk_embed_proj(spk_embed)[:, None, :] + else: + spk_embed = 0 + + ret = {} + style_embed = self.forward_style_embed(spk_embed, spk_id) # speaker embedding, [B, 1, C] + x, tgt_nonpadding = self.run_text_encoder( + txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, graph_lst=graph_lst, etypes_lst=etypes_lst) + x = x * tgt_nonpadding + ret['nonpadding'] = tgt_nonpadding + if self.hparams['use_pitch_embed']: + x = x + self.pitch_embed(pitch) + ret['decoder_inp'] = x + if infer and (mel2ph is None or mel2word is None): + mel2word = ret['mel2word'] + ret['mel_out_fvae'] = ret['mel_out'] = self.run_decoder(x, tgt_nonpadding, ret, infer, tgt_mels, global_step, + mel2word=mel2word, ph2word=ph2word, graph_lst=graph_lst, etypes_lst=etypes_lst) + return ret + + def run_text_encoder(self, txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, 
style_embed, ret, graph_lst, etypes_lst): + word2word = torch.arange(word_len)[None, :].to(ph2word.device) + 1 # [B, T_mel, T_word] + src_nonpadding = (txt_tokens > 0).float()[:, :, None] + ph_encoder_out = self.encoder(txt_tokens) * src_nonpadding + style_embed + if self.hparams['use_word_encoder']: + word_encoder_out = self.word_encoder(word_tokens) + style_embed + ph_encoder_out = ph_encoder_out + expand_states(word_encoder_out, ph2word) + + dur_input = ph_encoder_out * src_nonpadding + if self.hparams['dur_level'] == 'word': + word_encoder_out = 0 + h_ph_gb_word = group_hidden_by_segs(ph_encoder_out, ph2word, word_len)[0] + word_encoder_out = word_encoder_out + self.ph2word_encoder(h_ph_gb_word) + if self.hparams['use_word_encoder']: + word_encoder_out = word_encoder_out + self.word_encoder(word_tokens) + mel2word = self.forward_dur(dur_input, mel2word, ret, ph2word=ph2word, word_len=word_len, graph_lst=graph_lst, etypes_lst=etypes_lst) + mel2word = clip_mel2token_to_multiple(mel2word, self.hparams['frames_multiple']) + ret['mel2word'] = mel2word + tgt_nonpadding = (mel2word > 0).float()[:, :, None] + enc_pos = self.get_pos_embed(word2word, ph2word) # [B, T_ph, H] + dec_pos = self.get_pos_embed(word2word, mel2word) # [B, T_mel, H] + dec_word_mask = build_word_mask(mel2word, ph2word) # [B, T_mel, T_ph] + x, weight = self.attention(ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask) + if self.hparams['add_word_pos']: + x = x + self.word_pos_proj(dec_pos) + ret['attn'] = weight + else: + mel2ph = self.forward_dur(dur_input, mel2ph, ret) + mel2ph = clip_mel2token_to_multiple(mel2ph, self.hparams['frames_multiple']) + mel2word = mel2ph_to_mel2word(mel2ph, ph2word) + x = expand_states(ph_encoder_out, mel2ph) + if self.hparams['add_word_pos']: + dec_pos = self.get_pos_embed(word2word, mel2word) # [B, T_mel, H] + x = x + self.word_pos_proj(dec_pos) + tgt_nonpadding = (mel2ph > 0).float()[:, :, None] + if self.hparams['use_word_encoder']: + x = x + expand_states(word_encoder_out, mel2word) + return x, tgt_nonpadding + + def attention(self, ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask): + ph_kv = self.enc_pos_proj(torch.cat([ph_encoder_out, enc_pos], -1)) + word_enc_out_expend = expand_states(word_encoder_out, mel2word) + word_enc_out_expend = torch.cat([word_enc_out_expend, dec_pos], -1) + if self.hparams['text_encoder_postnet']: + word_enc_out_expend = self.dec_res_proj(word_enc_out_expend) + word_enc_out_expend = self.text_encoder_postnet(word_enc_out_expend) + dec_q = x_res = word_enc_out_expend + else: + dec_q = self.dec_query_proj(word_enc_out_expend) + x_res = self.dec_res_proj(word_enc_out_expend) + ph_kv, dec_q = ph_kv.transpose(0, 1), dec_q.transpose(0, 1) + x, (weight, _) = self.attn(dec_q, ph_kv, ph_kv, attn_mask=(1 - dec_word_mask) * -1e9) + x = x.transpose(0, 1) + x = x + x_res + return x, weight + + def run_decoder(self, x, tgt_nonpadding, ret, infer, tgt_mels=None, global_step=0, + mel2word=None, ph2word=None, graph_lst=None, etypes_lst=None): + if not self.hparams['use_fvae']: + x = self.decoder(x) + x = self.mel_out(x) + ret['kl'] = 0 + return x * tgt_nonpadding + else: + # x is the phoneme encoding + x = x.transpose(1, 2) # [B, H, T] + tgt_nonpadding_BHT = tgt_nonpadding.transpose(1, 2) # [B, H, T] + if infer: + z = self.fvae(cond=x, infer=True, mel2word=mel2word, ph2word=ph2word, graph_lst=graph_lst, etypes_lst=etypes_lst) + else: + tgt_mels = tgt_mels.transpose(1, 2) # [B, 80, T] + z, ret['kl'], ret['z_p'], ret['m_q'], 
ret['logs_q'] = self.fvae( + tgt_mels, tgt_nonpadding_BHT, cond=x, mel2word=mel2word, ph2word=ph2word, graph_lst=graph_lst, etypes_lst=etypes_lst) + if global_step < self.hparams['posterior_start_steps']: + z = torch.randn_like(z) + x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2) + ret['pre_mel_out'] = x_recon + return x_recon + + def forward_dur(self, dur_input, mel2word, ret, **kwargs): + """ + + :param dur_input: [B, T_txt, H] + :param mel2ph: [B, T_mel] + :param txt_tokens: [B, T_txt] + :param ret: + :return: + """ + word_len = kwargs['word_len'] + ph2word = kwargs['ph2word'] + graph_lst = kwargs['graph_lst'] + etypes_lst = kwargs['etypes_lst'] + src_padding = dur_input.data.abs().sum(-1) == 0 + dur_input = dur_input.detach() + self.hparams['predictor_grad'] * (dur_input - dur_input.detach()) + dur = self.dur_predictor(dur_input, src_padding, ph2word, graph_lst, etypes_lst) + + B, T_ph = ph2word.shape + dur = torch.zeros([B, word_len.max() + 1]).to(ph2word.device).scatter_add(1, ph2word, dur) + dur = dur[:, 1:] + ret['dur'] = dur + if mel2word is None: + mel2word = self.length_regulator(dur).detach() + return mel2word + + def get_pos_embed(self, word2word, x2word): + x_pos = build_word_mask(word2word, x2word).float() # [B, T_word, T_ph] + x_pos = (x_pos.cumsum(-1) / x_pos.sum(-1).clamp(min=1)[..., None] * x_pos).sum(1) + x_pos = self.sin_pos(x_pos.float()) # [B, T_ph, H] + return x_pos + + def store_inverse_all(self): + def remove_weight_norm(m): + try: + if hasattr(m, 'store_inverse'): + m.store_inverse() + nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(remove_weight_norm) diff --git a/modules/vocoder/hifigan/__pycache__/hifigan.cpython-36.pyc b/modules/vocoder/hifigan/__pycache__/hifigan.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15330a931e74d26dd71ebfb155b962708db6a858 Binary files /dev/null and b/modules/vocoder/hifigan/__pycache__/hifigan.cpython-36.pyc differ diff --git a/modules/vocoder/hifigan/__pycache__/hifigan.cpython-37.pyc b/modules/vocoder/hifigan/__pycache__/hifigan.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..645b3458d41cb9fcaad55d4a31d26db27283d659 Binary files /dev/null and b/modules/vocoder/hifigan/__pycache__/hifigan.cpython-37.pyc differ diff --git a/modules/vocoder/hifigan/hifigan.py b/modules/vocoder/hifigan/hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..fddd5278760427d5d93b9b38240319ba5bdb0bdf --- /dev/null +++ b/modules/vocoder/hifigan/hifigan.py @@ -0,0 +1,338 @@ +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +import numpy as np + +LRELU_SLOPE = 0.1 + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + 
padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.h = h + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Conv1d1x1(Conv1d): + """1x1 Conv1d with customized initialization.""" + + def __init__(self, in_channels, out_channels, bias): + """Initialize 1x1 Conv1d module.""" + super(Conv1d1x1, self).__init__(in_channels, out_channels, + kernel_size=1, padding=0, + dilation=1, bias=bias) + + +class HifiGanGenerator(torch.nn.Module): + def __init__(self, h, c_out=1): + super(HifiGanGenerator, self).__init__() + self.h = h + self.num_kernels = len(h['resblock_kernel_sizes']) + self.num_upsamples = len(h['upsample_rates']) + + self.conv_pre = weight_norm(Conv1d(80, h['upsample_initial_channel'], 7, 1, padding=3)) + resblock = ResBlock1 if h['resblock'] == '1' else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h['upsample_rates'], h['upsample_kernel_sizes'])): + c_cur = h['upsample_initial_channel'] // (2 ** (i + 1)) + self.ups.append(weight_norm( + ConvTranspose1d(c_cur * 2, c_cur, k, u, padding=(k - u) // 2))) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h['upsample_initial_channel'] // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(h['resblock_kernel_sizes'], h['resblock_dilation_sizes'])): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, c_out, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x, f0=None): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + 
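+ # Note: each upsampling stage halves the channel width (upsample_initial_channel // 2**(i+1))
+ # and averages the outputs of its num_kernels residual blocks; conv_post followed by tanh
+ # then maps the final feature map to a c_out-channel waveform in [-1, 1].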
def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False, use_cond=False, c_in=1): + super(DiscriminatorP, self).__init__() + self.use_cond = use_cond + if use_cond: + from utils.commons.hparams import hparams + t = hparams['hop_size'] + self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2) + c_in = 2 + + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(c_in, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x, mel): + fmap = [] + if self.use_cond: + x_mel = self.cond_net(mel) + x = torch.cat([x_mel, x], 1) + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_cond=False, c_in=1): + super(MultiPeriodDiscriminator, self).__init__() + self.discriminators = nn.ModuleList([ + DiscriminatorP(2, use_cond=use_cond, c_in=c_in), + DiscriminatorP(3, use_cond=use_cond, c_in=c_in), + DiscriminatorP(5, use_cond=use_cond, c_in=c_in), + DiscriminatorP(7, use_cond=use_cond, c_in=c_in), + DiscriminatorP(11, use_cond=use_cond, c_in=c_in), + ]) + + def forward(self, y, y_hat, mel=None): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y, mel) + y_d_g, fmap_g = d(y_hat, mel) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False, use_cond=False, upsample_rates=None, c_in=1): + super(DiscriminatorS, self).__init__() + self.use_cond = use_cond + if use_cond: + t = np.prod(upsample_rates) + self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2) + c_in = 2 + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(c_in, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), + norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x, mel): + if self.use_cond: + x_mel = 
self.cond_net(mel) + x = torch.cat([x_mel, x], 1) + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self, use_cond=False, c_in=1): + super(MultiScaleDiscriminator, self).__init__() + from utils.commons.hparams import hparams + self.discriminators = nn.ModuleList([ + DiscriminatorS(use_spectral_norm=True, use_cond=use_cond, + upsample_rates=[4, 4, hparams['hop_size'] // 16], + c_in=c_in), + DiscriminatorS(use_cond=use_cond, + upsample_rates=[4, 4, hparams['hop_size'] // 32], + c_in=c_in), + DiscriminatorS(use_cond=use_cond, + upsample_rates=[4, 4, hparams['hop_size'] // 64], + c_in=c_in), + ]) + self.meanpools = nn.ModuleList([ + AvgPool1d(4, 2, padding=1), + AvgPool1d(4, 2, padding=1) + ]) + + def forward(self, y, y_hat, mel=None): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i - 1](y) + y_hat = self.meanpools[i - 1](y_hat) + y_d_r, fmap_r = d(y, mel) + y_d_g, fmap_g = d(y_hat, mel) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + r_losses = 0 + g_losses = 0 + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg ** 2) + r_losses += r_loss + g_losses += g_loss + r_losses = r_losses / len(disc_real_outputs) + g_losses = g_losses / len(disc_real_outputs) + return r_losses, g_losses + + +def cond_discriminator_loss(outputs): + loss = 0 + for dg in outputs: + g_loss = torch.mean(dg ** 2) + loss += g_loss + loss = loss / len(outputs) + return loss + + +def generator_loss(disc_outputs): + loss = 0 + for dg in disc_outputs: + l = torch.mean((1 - dg) ** 2) + loss += l + loss = loss / len(disc_outputs) + return loss diff --git a/modules/vocoder/hifigan/mel_utils.py b/modules/vocoder/hifigan/mel_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a75fce72db54812320bc60aedfdd378ccecb3374 --- /dev/null +++ b/modules/vocoder/hifigan/mel_utils.py @@ -0,0 +1,80 @@ +import numpy as np +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn +from scipy.io.wavfile import read + +MAX_WAV_VALUE = 32768.0 + + +def load_wav(full_path): + sampling_rate, data = read(full_path) + return data, sampling_rate + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram(y, hparams, center=False, complex=False): + # hop_size: 512 # For 22050Hz, 275 
~= 12.5 ms (0.0125 * sample_rate) + # win_size: 2048 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate) + # fmin: 55 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525]) + # fmax: 10000 # To be increased/reduced depending on data. + # fft_size: 2048 # Extra window size is filled with 0 paddings to match this parameter + # n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, + n_fft = hparams['fft_size'] + num_mels = hparams['audio_num_mel_bins'] + sampling_rate = hparams['audio_sample_rate'] + hop_size = hparams['hop_size'] + win_size = hparams['win_size'] + fmin = hparams['fmin'] + fmax = hparams['fmax'] + y = y.clamp(min=-1., max=1.) + global mel_basis, hann_window + if fmax not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device) + hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), [int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)], + mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], + center=center, pad_mode='reflect', normalized=False, onesided=True) + + if not complex: + spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) + spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec) + spec = spectral_normalize_torch(spec) + else: + B, C, T, _ = spec.shape + spec = spec.transpose(1, 2) # [B, T, n_fft, 2] + return spec diff --git a/modules/vocoder/hifigan/stft_loss.py b/modules/vocoder/hifigan/stft_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..e47447455341e5725d6f82ded66dc08b5d2b1cc5 --- /dev/null +++ b/modules/vocoder/hifigan/stft_loss.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""STFT-based Loss modules.""" + +import torch +import torch.nn.functional as F + + +def stft(x, fft_size, hop_size, win_length, window): + """Perform STFT and convert to magnitude spectrogram. + Args: + x (Tensor): Input signal tensor (B, T). + fft_size (int): FFT size. + hop_size (int): Hop size. + win_length (int): Window length. + window (str): Window function type. + Returns: + Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). + """ + x_stft = torch.stft(x, fft_size, hop_size, win_length, window) + real = x_stft[..., 0] + imag = x_stft[..., 1] + + # NOTE(kan-bayashi): clamp is needed to avoid nan or inf + return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) + + +class SpectralConvergengeLoss(torch.nn.Module): + """Spectral convergence loss module.""" + + def __init__(self): + """Initilize spectral convergence loss module.""" + super(SpectralConvergengeLoss, self).__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Spectral convergence loss value. 
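+ Note: computed as ||y_mag - x_mag||_F / ||y_mag||_F (Frobenius norms).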
+ """ + return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") + + +class LogSTFTMagnitudeLoss(torch.nn.Module): + """Log STFT magnitude loss module.""" + + def __init__(self): + """Initilize los STFT magnitude loss module.""" + super(LogSTFTMagnitudeLoss, self).__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Log STFT magnitude loss value. + """ + return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) + + +class STFTLoss(torch.nn.Module): + """STFT loss module.""" + + def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): + """Initialize STFT loss module.""" + super(STFTLoss, self).__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + self.window = getattr(torch, window)(win_length) + self.spectral_convergenge_loss = SpectralConvergengeLoss() + self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() + + def forward(self, x, y): + """Calculate forward propagation. + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. + """ + x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window.to(x.get_device())) + y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window.to(x.get_device())) + sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) + mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) + + return sc_loss, mag_loss + + +class MultiResolutionSTFTLoss(torch.nn.Module): + """Multi resolution STFT loss module.""" + + def __init__(self, + fft_sizes=[1024, 2048, 512], + hop_sizes=[120, 240, 50], + win_lengths=[600, 1200, 240], + window="hann_window"): + """Initialize Multi resolution STFT loss module. + Args: + fft_sizes (list): List of FFT sizes. + hop_sizes (list): List of hop sizes. + win_lengths (list): List of window lengths. + window (str): Window function type. + """ + super(MultiResolutionSTFTLoss, self).__init__() + assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) + self.stft_losses = torch.nn.ModuleList() + for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): + self.stft_losses += [STFTLoss(fs, ss, wl, window)] + + def forward(self, x, y): + """Calculate forward propagation. + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. 
+ """ + sc_loss = 0.0 + mag_loss = 0.0 + for f in self.stft_losses: + sc_l, mag_l = f(x, y) + sc_loss += sc_l + mag_loss += mag_l + sc_loss /= len(self.stft_losses) + mag_loss /= len(self.stft_losses) + + return sc_loss, mag_loss \ No newline at end of file diff --git a/modules/vocoder/parallel_wavegan/__init__.py b/modules/vocoder/parallel_wavegan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/modules/vocoder/parallel_wavegan/__pycache__/__init__.cpython-36.pyc b/modules/vocoder/parallel_wavegan/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f88053316f3a5d045086037a364d2b680ec8e65 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/__pycache__/__init__.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/__pycache__/__init__.cpython-37.pyc b/modules/vocoder/parallel_wavegan/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0677eb7b6e5f38de64de8af30b9268afda9f40c4 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/__pycache__/__init__.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__init__.py b/modules/vocoder/parallel_wavegan/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..02e0ac58838f16c95f047d26c34a7ef86d473f07 --- /dev/null +++ b/modules/vocoder/parallel_wavegan/layers/__init__.py @@ -0,0 +1,5 @@ +from .causal_conv import * # NOQA +from .pqmf import * # NOQA +from .residual_block import * # NOQA +from modules.vocoder.parallel_wavegan.layers.residual_stack import * # NOQA +from .upsample import * # NOQA diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/__init__.cpython-36.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e0691a710563a2b93bb752c8f1c6ffd0e2fc7b2 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/__init__.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/__init__.cpython-37.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3bfe6a673eb3dccfba3c344c6170dbcd1a67463 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/__init__.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/causal_conv.cpython-36.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/causal_conv.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15333660484c1ebd676732bb98a0f93f891061b5 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/causal_conv.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/causal_conv.cpython-37.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/causal_conv.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a79d0d35a561410f88df071c19ca8d7df492085 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/causal_conv.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/pqmf.cpython-36.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/pqmf.cpython-36.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..b575055480eca37c384e1a328e0b0e091000f08d Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/pqmf.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/pqmf.cpython-37.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/pqmf.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cdcf8d9a1e4cbfcbcd507a146879841011c0223a Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/pqmf.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_block.cpython-36.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_block.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3a0ef740d9e761f69dd958bd05a74736cec484a Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_block.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_block.cpython-37.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_block.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6e9d59592c00e096666ca84895f5d32882b5a8d Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_block.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_stack.cpython-36.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_stack.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4773f474ced4b2456b553d1a1c3352a66c6a7058 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_stack.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_stack.cpython-37.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_stack.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..045eaf8ab7bf2d6ba796ae022cf9e1547988f079 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/residual_stack.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/upsample.cpython-36.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/upsample.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebe915f15531bb9c491aa2b973a2c2c42a92ded1 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/upsample.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/__pycache__/upsample.cpython-37.pyc b/modules/vocoder/parallel_wavegan/layers/__pycache__/upsample.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b10fafc8203a26ccdf1368c5a5fb08265c403d7c Binary files /dev/null and b/modules/vocoder/parallel_wavegan/layers/__pycache__/upsample.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/layers/causal_conv.py b/modules/vocoder/parallel_wavegan/layers/causal_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..fca77daf65f234e6fbe355ed148fc8f0ee85038a --- /dev/null +++ b/modules/vocoder/parallel_wavegan/layers/causal_conv.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""Causal convolusion layer modules.""" + + +import torch + + +class CausalConv1d(torch.nn.Module): + """CausalConv1d module with 
customized initialization.""" + + def __init__(self, in_channels, out_channels, kernel_size, + dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}): + """Initialize CausalConv1d module.""" + super(CausalConv1d, self).__init__() + self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params) + self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, + dilation=dilation, bias=bias) + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input tensor (B, in_channels, T). + + Returns: + Tensor: Output tensor (B, out_channels, T). + + """ + return self.conv(self.pad(x))[:, :, :x.size(2)] + + +class CausalConvTranspose1d(torch.nn.Module): + """CausalConvTranspose1d module with customized initialization.""" + + def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): + """Initialize CausalConvTranspose1d module.""" + super(CausalConvTranspose1d, self).__init__() + self.deconv = torch.nn.ConvTranspose1d( + in_channels, out_channels, kernel_size, stride, bias=bias) + self.stride = stride + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input tensor (B, in_channels, T_in). + + Returns: + Tensor: Output tensor (B, out_channels, T_out). + + """ + return self.deconv(x)[:, :, :-self.stride] diff --git a/modules/vocoder/parallel_wavegan/layers/pqmf.py b/modules/vocoder/parallel_wavegan/layers/pqmf.py new file mode 100644 index 0000000000000000000000000000000000000000..bb31c430d2abe0219f58f153f69d836383e095ef --- /dev/null +++ b/modules/vocoder/parallel_wavegan/layers/pqmf.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""Pseudo QMF modules.""" + +import numpy as np +import torch +import torch.nn.functional as F + +from scipy.signal import kaiser + + +def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0): + """Design prototype filter for PQMF. + + This method is based on `A Kaiser window approach for the design of prototype + filters of cosine modulated filterbanks`_. + + Args: + taps (int): The number of filter taps. + cutoff_ratio (float): Cut-off frequency ratio. + beta (float): Beta coefficient for kaiser window. + + Returns: + ndarray: Impluse response of prototype filter (taps + 1,). + + .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: + https://ieeexplore.ieee.org/abstract/document/681427 + + """ + # check the arguments are valid + assert taps % 2 == 0, "The number of taps mush be even number." + assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." + + # make initial filter + omega_c = np.pi * cutoff_ratio + with np.errstate(invalid='ignore'): + h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \ + / (np.pi * (np.arange(taps + 1) - 0.5 * taps)) + h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form + + # apply kaiser window + w = kaiser(taps + 1, beta) + h = h_i * w + + return h + + +class PQMF(torch.nn.Module): + """PQMF module. + + This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. + + .. _`Near-perfect-reconstruction pseudo-QMF banks`: + https://ieeexplore.ieee.org/document/258122 + + """ + + def __init__(self, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0): + """Initilize PQMF module. + + Args: + subbands (int): The number of subbands. + taps (int): The number of filter taps. + cutoff_ratio (float): Cut-off frequency ratio. 
+ beta (float): Beta coefficient for kaiser window. + + """ + super(PQMF, self).__init__() + + # define filter coefficient + h_proto = design_prototype_filter(taps, cutoff_ratio, beta) + h_analysis = np.zeros((subbands, len(h_proto))) + h_synthesis = np.zeros((subbands, len(h_proto))) + for k in range(subbands): + h_analysis[k] = 2 * h_proto * np.cos( + (2 * k + 1) * (np.pi / (2 * subbands)) * + (np.arange(taps + 1) - ((taps - 1) / 2)) + + (-1) ** k * np.pi / 4) + h_synthesis[k] = 2 * h_proto * np.cos( + (2 * k + 1) * (np.pi / (2 * subbands)) * + (np.arange(taps + 1) - ((taps - 1) / 2)) - + (-1) ** k * np.pi / 4) + + # convert to tensor + analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1) + synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0) + + # register coefficients as beffer + self.register_buffer("analysis_filter", analysis_filter) + self.register_buffer("synthesis_filter", synthesis_filter) + + # filter for downsampling & upsampling + updown_filter = torch.zeros((subbands, subbands, subbands)).float() + for k in range(subbands): + updown_filter[k, k, 0] = 1.0 + self.register_buffer("updown_filter", updown_filter) + self.subbands = subbands + + # keep padding info + self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) + + def analysis(self, x): + """Analysis with PQMF. + + Args: + x (Tensor): Input tensor (B, 1, T). + + Returns: + Tensor: Output tensor (B, subbands, T // subbands). + + """ + x = F.conv1d(self.pad_fn(x), self.analysis_filter) + return F.conv1d(x, self.updown_filter, stride=self.subbands) + + def synthesis(self, x): + """Synthesis with PQMF. + + Args: + x (Tensor): Input tensor (B, subbands, T // subbands). + + Returns: + Tensor: Output tensor (B, 1, T). + + """ + # NOTE(kan-bayashi): Power will be dreased so here multipy by # subbands. + # Not sure this is the correct way, it is better to check again. + # TODO(kan-bayashi): Understand the reconstruction procedure + x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands) + return F.conv1d(self.pad_fn(x), self.synthesis_filter) diff --git a/modules/vocoder/parallel_wavegan/layers/residual_block.py b/modules/vocoder/parallel_wavegan/layers/residual_block.py new file mode 100644 index 0000000000000000000000000000000000000000..7a267a86c1fa521c2824addf9dda304c43f1ff1f --- /dev/null +++ b/modules/vocoder/parallel_wavegan/layers/residual_block.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- + +"""Residual block module in WaveNet. + +This code is modified from https://github.com/r9y9/wavenet_vocoder. 
+ +""" + +import math + +import torch +import torch.nn.functional as F + + +class Conv1d(torch.nn.Conv1d): + """Conv1d module with customized initialization.""" + + def __init__(self, *args, **kwargs): + """Initialize Conv1d module.""" + super(Conv1d, self).__init__(*args, **kwargs) + + def reset_parameters(self): + """Reset parameters.""" + torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") + if self.bias is not None: + torch.nn.init.constant_(self.bias, 0.0) + + +class Conv1d1x1(Conv1d): + """1x1 Conv1d with customized initialization.""" + + def __init__(self, in_channels, out_channels, bias): + """Initialize 1x1 Conv1d module.""" + super(Conv1d1x1, self).__init__(in_channels, out_channels, + kernel_size=1, padding=0, + dilation=1, bias=bias) + + +class ResidualBlock(torch.nn.Module): + """Residual block module in WaveNet.""" + + def __init__(self, + kernel_size=3, + residual_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=80, + dropout=0.0, + dilation=1, + bias=True, + use_causal_conv=False + ): + """Initialize ResidualBlock module. + + Args: + kernel_size (int): Kernel size of dilation convolution layer. + residual_channels (int): Number of channels for residual connection. + skip_channels (int): Number of channels for skip connection. + aux_channels (int): Local conditioning channels i.e. auxiliary input dimension. + dropout (float): Dropout probability. + dilation (int): Dilation factor. + bias (bool): Whether to add bias parameter in convolution layers. + use_causal_conv (bool): Whether to use use_causal_conv or non-use_causal_conv convolution. + + """ + super(ResidualBlock, self).__init__() + self.dropout = dropout + # no future time stamps available + if use_causal_conv: + padding = (kernel_size - 1) * dilation + else: + assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." + padding = (kernel_size - 1) // 2 * dilation + self.use_causal_conv = use_causal_conv + + # dilation conv + self.conv = Conv1d(residual_channels, gate_channels, kernel_size, + padding=padding, dilation=dilation, bias=bias) + + # local conditioning + if aux_channels > 0: + self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False) + else: + self.conv1x1_aux = None + + # conv output is split into two groups + gate_out_channels = gate_channels // 2 + self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias) + self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias) + + def forward(self, x, c): + """Calculate forward propagation. + + Args: + x (Tensor): Input tensor (B, residual_channels, T). + c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T). + + Returns: + Tensor: Output tensor for residual connection (B, residual_channels, T). + Tensor: Output tensor for skip connection (B, skip_channels, T). 
+ + """ + residual = x + x = F.dropout(x, p=self.dropout, training=self.training) + x = self.conv(x) + + # remove future time steps if use_causal_conv conv + x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x + + # split into two part for gated activation + splitdim = 1 + xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) + + # local conditioning + if c is not None: + assert self.conv1x1_aux is not None + c = self.conv1x1_aux(c) + ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) + xa, xb = xa + ca, xb + cb + + x = torch.tanh(xa) * torch.sigmoid(xb) + + # for skip connection + s = self.conv1x1_skip(x) + + # for residual connection + x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5) + + return x, s diff --git a/modules/vocoder/parallel_wavegan/layers/residual_stack.py b/modules/vocoder/parallel_wavegan/layers/residual_stack.py new file mode 100644 index 0000000000000000000000000000000000000000..6e07c8803ad348dd923f6b7c0f7aff14aab9cf78 --- /dev/null +++ b/modules/vocoder/parallel_wavegan/layers/residual_stack.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""Residual stack module in MelGAN.""" + +import torch + +from . import CausalConv1d + + +class ResidualStack(torch.nn.Module): + """Residual stack module introduced in MelGAN.""" + + def __init__(self, + kernel_size=3, + channels=32, + dilation=1, + bias=True, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + pad="ReflectionPad1d", + pad_params={}, + use_causal_conv=False, + ): + """Initialize ResidualStack module. + + Args: + kernel_size (int): Kernel size of dilation convolution layer. + channels (int): Number of channels of convolution layers. + dilation (int): Dilation factor. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. + + """ + super(ResidualStack, self).__init__() + + # defile residual stack part + if not use_causal_conv: + assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." + self.stack = torch.nn.Sequential( + getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), + getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), + torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), + getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), + torch.nn.Conv1d(channels, channels, 1, bias=bias), + ) + else: + self.stack = torch.nn.Sequential( + getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), + CausalConv1d(channels, channels, kernel_size, dilation=dilation, + bias=bias, pad=pad, pad_params=pad_params), + getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), + torch.nn.Conv1d(channels, channels, 1, bias=bias), + ) + + # defile extra layer for skip connection + self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) + + def forward(self, c): + """Calculate forward propagation. + + Args: + c (Tensor): Input tensor (B, channels, T). + + Returns: + Tensor: Output tensor (B, chennels, T). 
+ + """ + return self.stack(c) + self.skip_layer(c) diff --git a/modules/vocoder/parallel_wavegan/layers/tf_layers.py b/modules/vocoder/parallel_wavegan/layers/tf_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..c0f46bd755c161cda2ac904fe37f3f3c6357a88d --- /dev/null +++ b/modules/vocoder/parallel_wavegan/layers/tf_layers.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 MINH ANH (@dathudeptrai) +# MIT License (https://opensource.org/licenses/MIT) + +"""Tensorflow Layer modules complatible with pytorch.""" + +import tensorflow as tf + + +class TFReflectionPad1d(tf.keras.layers.Layer): + """Tensorflow ReflectionPad1d module.""" + + def __init__(self, padding_size): + """Initialize TFReflectionPad1d module. + + Args: + padding_size (int): Padding size. + + """ + super(TFReflectionPad1d, self).__init__() + self.padding_size = padding_size + + @tf.function + def call(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input tensor (B, T, 1, C). + + Returns: + Tensor: Padded tensor (B, T + 2 * padding_size, 1, C). + + """ + return tf.pad(x, [[0, 0], [self.padding_size, self.padding_size], [0, 0], [0, 0]], "REFLECT") + + +class TFConvTranspose1d(tf.keras.layers.Layer): + """Tensorflow ConvTranspose1d module.""" + + def __init__(self, channels, kernel_size, stride, padding): + """Initialize TFConvTranspose1d( module. + + Args: + channels (int): Number of channels. + kernel_size (int): kernel size. + strides (int): Stride width. + padding (str): Padding type ("same" or "valid"). + + """ + super(TFConvTranspose1d, self).__init__() + self.conv1d_transpose = tf.keras.layers.Conv2DTranspose( + filters=channels, + kernel_size=(kernel_size, 1), + strides=(stride, 1), + padding=padding, + ) + + @tf.function + def call(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input tensor (B, T, 1, C). + + Returns: + Tensors: Output tensor (B, T', 1, C'). + + """ + x = self.conv1d_transpose(x) + return x + + +class TFResidualStack(tf.keras.layers.Layer): + """Tensorflow ResidualStack module.""" + + def __init__(self, + kernel_size, + channels, + dilation, + bias, + nonlinear_activation, + nonlinear_activation_params, + padding, + ): + """Initialize TFResidualStack module. + + Args: + kernel_size (int): Kernel size. + channles (int): Number of channels. + dilation (int): Dilation ine. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + padding (str): Padding type ("same" or "valid"). + + """ + super(TFResidualStack, self).__init__() + self.block = [ + getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), + TFReflectionPad1d(dilation), + tf.keras.layers.Conv2D( + filters=channels, + kernel_size=(kernel_size, 1), + dilation_rate=(dilation, 1), + use_bias=bias, + padding="valid", + ), + getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params), + tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) + ] + self.shortcut = tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias) + + @tf.function + def call(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input tensor (B, T, 1, C). + + Returns: + Tensor: Output tensor (B, T, 1, C). 
+ + """ + _x = tf.identity(x) + for i, layer in enumerate(self.block): + _x = layer(_x) + shortcut = self.shortcut(x) + return shortcut + _x diff --git a/modules/vocoder/parallel_wavegan/layers/upsample.py b/modules/vocoder/parallel_wavegan/layers/upsample.py new file mode 100644 index 0000000000000000000000000000000000000000..18c6397c420a81fadc5320e3a48f3249534decd8 --- /dev/null +++ b/modules/vocoder/parallel_wavegan/layers/upsample.py @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- + +"""Upsampling module. + +This code is modified from https://github.com/r9y9/wavenet_vocoder. + +""" + +import numpy as np +import torch +import torch.nn.functional as F + +from . import Conv1d + + +class Stretch2d(torch.nn.Module): + """Stretch2d module.""" + + def __init__(self, x_scale, y_scale, mode="nearest"): + """Initialize Stretch2d module. + + Args: + x_scale (int): X scaling factor (Time axis in spectrogram). + y_scale (int): Y scaling factor (Frequency axis in spectrogram). + mode (str): Interpolation mode. + + """ + super(Stretch2d, self).__init__() + self.x_scale = x_scale + self.y_scale = y_scale + self.mode = mode + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input tensor (B, C, F, T). + + Returns: + Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale), + + """ + return F.interpolate( + x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) + + +class Conv2d(torch.nn.Conv2d): + """Conv2d module with customized initialization.""" + + def __init__(self, *args, **kwargs): + """Initialize Conv2d module.""" + super(Conv2d, self).__init__(*args, **kwargs) + + def reset_parameters(self): + """Reset parameters.""" + self.weight.data.fill_(1. / np.prod(self.kernel_size)) + if self.bias is not None: + torch.nn.init.constant_(self.bias, 0.0) + + +class UpsampleNetwork(torch.nn.Module): + """Upsampling network module.""" + + def __init__(self, + upsample_scales, + nonlinear_activation=None, + nonlinear_activation_params={}, + interpolate_mode="nearest", + freq_axis_kernel_size=1, + use_causal_conv=False, + ): + """Initialize upsampling network module. + + Args: + upsample_scales (list): List of upsampling scales. + nonlinear_activation (str): Activation function name. + nonlinear_activation_params (dict): Arguments for specified activation function. + interpolate_mode (str): Interpolation mode. + freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. + + """ + super(UpsampleNetwork, self).__init__() + self.use_causal_conv = use_causal_conv + self.up_layers = torch.nn.ModuleList() + for scale in upsample_scales: + # interpolation layer + stretch = Stretch2d(scale, 1, interpolate_mode) + self.up_layers += [stretch] + + # conv layer + assert (freq_axis_kernel_size - 1) % 2 == 0, "Not support even number freq axis kernel size." + freq_axis_padding = (freq_axis_kernel_size - 1) // 2 + kernel_size = (freq_axis_kernel_size, scale * 2 + 1) + if use_causal_conv: + padding = (freq_axis_padding, scale * 2) + else: + padding = (freq_axis_padding, scale) + conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) + self.up_layers += [conv] + + # nonlinear + if nonlinear_activation is not None: + nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params) + self.up_layers += [nonlinear] + + def forward(self, c): + """Calculate forward propagation. + + Args: + c : Input tensor (B, C, T). + + Returns: + Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales). 
+ + """ + c = c.unsqueeze(1) # (B, 1, C, T) + for f in self.up_layers: + if self.use_causal_conv and isinstance(f, Conv2d): + c = f(c)[..., :c.size(-1)] + else: + c = f(c) + return c.squeeze(1) # (B, C, T') + + +class ConvInUpsampleNetwork(torch.nn.Module): + """Convolution + upsampling network module.""" + + def __init__(self, + upsample_scales, + nonlinear_activation=None, + nonlinear_activation_params={}, + interpolate_mode="nearest", + freq_axis_kernel_size=1, + aux_channels=80, + aux_context_window=0, + use_causal_conv=False + ): + """Initialize convolution + upsampling network module. + + Args: + upsample_scales (list): List of upsampling scales. + nonlinear_activation (str): Activation function name. + nonlinear_activation_params (dict): Arguments for specified activation function. + mode (str): Interpolation mode. + freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. + aux_channels (int): Number of channels of pre-convolutional layer. + aux_context_window (int): Context window size of the pre-convolutional layer. + use_causal_conv (bool): Whether to use causal structure. + + """ + super(ConvInUpsampleNetwork, self).__init__() + self.aux_context_window = aux_context_window + self.use_causal_conv = use_causal_conv and aux_context_window > 0 + # To capture wide-context information in conditional features + kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 + # NOTE(kan-bayashi): Here do not use padding because the input is already padded + self.conv_in = Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False) + self.upsample = UpsampleNetwork( + upsample_scales=upsample_scales, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + interpolate_mode=interpolate_mode, + freq_axis_kernel_size=freq_axis_kernel_size, + use_causal_conv=use_causal_conv, + ) + + def forward(self, c): + """Calculate forward propagation. + + Args: + c : Input tensor (B, C, T'). + + Returns: + Tensor: Upsampled tensor (B, C, T), + where T = (T' - aux_context_window * 2) * prod(upsample_scales). + + Note: + The length of inputs considers the context window size. + + """ + c_ = self.conv_in(c) + c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ + return self.upsample(c) diff --git a/modules/vocoder/parallel_wavegan/losses/__init__.py b/modules/vocoder/parallel_wavegan/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b03080a907cb5cb4b316ceb74866ddbc406b33bf --- /dev/null +++ b/modules/vocoder/parallel_wavegan/losses/__init__.py @@ -0,0 +1 @@ +from .stft_loss import * # NOQA diff --git a/modules/vocoder/parallel_wavegan/losses/stft_loss.py b/modules/vocoder/parallel_wavegan/losses/stft_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..adb5767eb6e48b79c9811139091522cf635b5697 --- /dev/null +++ b/modules/vocoder/parallel_wavegan/losses/stft_loss.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""STFT-based Loss modules.""" + +import torch +import torch.nn.functional as F + + +def stft(x, fft_size, hop_size, win_length, window): + """Perform STFT and convert to magnitude spectrogram. + + Args: + x (Tensor): Input signal tensor (B, T). + fft_size (int): FFT size. + hop_size (int): Hop size. + win_length (int): Window length. + window (str): Window function type. 
+ + Returns: + Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). + + """ + x_stft = torch.stft(x, fft_size, hop_size, win_length, window) + real = x_stft[..., 0] + imag = x_stft[..., 1] + + # NOTE(kan-bayashi): clamp is needed to avoid nan or inf + return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) + + +class SpectralConvergengeLoss(torch.nn.Module): + """Spectral convergence loss module.""" + + def __init__(self): + """Initilize spectral convergence loss module.""" + super(SpectralConvergengeLoss, self).__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + + Returns: + Tensor: Spectral convergence loss value. + + """ + return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") + + +class LogSTFTMagnitudeLoss(torch.nn.Module): + """Log STFT magnitude loss module.""" + + def __init__(self): + """Initilize los STFT magnitude loss module.""" + super(LogSTFTMagnitudeLoss, self).__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + + Returns: + Tensor: Log STFT magnitude loss value. + + """ + return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) + + +class STFTLoss(torch.nn.Module): + """STFT loss module.""" + + def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): + """Initialize STFT loss module.""" + super(STFTLoss, self).__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + self.window = getattr(torch, window)(win_length) + self.spectral_convergenge_loss = SpectralConvergengeLoss() + self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() + + def forward(self, x, y): + """Calculate forward propagation. + + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. + + """ + self.window = self.window.to(x.device) + x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) + y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) + sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) + mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) + + return sc_loss, mag_loss + + +class MultiResolutionSTFTLoss(torch.nn.Module): + """Multi resolution STFT loss module.""" + + def __init__(self, + fft_sizes=[1024, 2048, 512], + hop_sizes=[120, 240, 50], + win_lengths=[600, 1200, 240], + window="hann_window"): + """Initialize Multi resolution STFT loss module. + + Args: + fft_sizes (list): List of FFT sizes. + hop_sizes (list): List of hop sizes. + win_lengths (list): List of window lengths. + window (str): Window function type. + + """ + super(MultiResolutionSTFTLoss, self).__init__() + assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) + self.stft_losses = torch.nn.ModuleList() + for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): + self.stft_losses += [STFTLoss(fs, ss, wl, window)] + + def forward(self, x, y): + """Calculate forward propagation. + + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). 
+ + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. + + """ + sc_loss = 0.0 + mag_loss = 0.0 + for f in self.stft_losses: + sc_l, mag_l = f(x, y) + sc_loss += sc_l + mag_loss += mag_l + sc_loss /= len(self.stft_losses) + mag_loss /= len(self.stft_losses) + + return sc_loss, mag_loss diff --git a/modules/vocoder/parallel_wavegan/models/__init__.py b/modules/vocoder/parallel_wavegan/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4803ba6b2a0afc8022e756ae5b3f4c7403c3c1bd --- /dev/null +++ b/modules/vocoder/parallel_wavegan/models/__init__.py @@ -0,0 +1,2 @@ +from .melgan import * # NOQA +from .parallel_wavegan import * # NOQA diff --git a/modules/vocoder/parallel_wavegan/models/__pycache__/__init__.cpython-36.pyc b/modules/vocoder/parallel_wavegan/models/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12281b434897095ae2329fe464cc85ba3fc38030 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/models/__pycache__/__init__.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/models/__pycache__/__init__.cpython-37.pyc b/modules/vocoder/parallel_wavegan/models/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86b66799f526566234c77f9bbf4196842eaa54dd Binary files /dev/null and b/modules/vocoder/parallel_wavegan/models/__pycache__/__init__.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/models/__pycache__/melgan.cpython-36.pyc b/modules/vocoder/parallel_wavegan/models/__pycache__/melgan.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95c72dc3a9e4afd00a59316a4f756268d138f0d3 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/models/__pycache__/melgan.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/models/__pycache__/melgan.cpython-37.pyc b/modules/vocoder/parallel_wavegan/models/__pycache__/melgan.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf9e4f69390cfd46ba2e6cf2502c1dc86b07ae12 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/models/__pycache__/melgan.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/models/__pycache__/parallel_wavegan.cpython-36.pyc b/modules/vocoder/parallel_wavegan/models/__pycache__/parallel_wavegan.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f89b4bc7fcfa4f02ae0990e9f2f243b73318ea29 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/models/__pycache__/parallel_wavegan.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/models/__pycache__/parallel_wavegan.cpython-37.pyc b/modules/vocoder/parallel_wavegan/models/__pycache__/parallel_wavegan.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbed9b118e1a8f36254234a7467fc1a23a4cffa9 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/models/__pycache__/parallel_wavegan.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/models/__pycache__/source.cpython-36.pyc b/modules/vocoder/parallel_wavegan/models/__pycache__/source.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf1d21b6d53a93e921ce778b6fc7912dc91fe10c Binary files /dev/null and b/modules/vocoder/parallel_wavegan/models/__pycache__/source.cpython-36.pyc differ diff --git 
a/modules/vocoder/parallel_wavegan/models/__pycache__/source.cpython-37.pyc b/modules/vocoder/parallel_wavegan/models/__pycache__/source.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d40a53e0b8884e8eb400445b2955eecf8863c6d4 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/models/__pycache__/source.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/models/freq_discriminator.py b/modules/vocoder/parallel_wavegan/models/freq_discriminator.py new file mode 100644 index 0000000000000000000000000000000000000000..876b66ff931335ae16a5c36f95beb33e789a3f7d --- /dev/null +++ b/modules/vocoder/parallel_wavegan/models/freq_discriminator.py @@ -0,0 +1,149 @@ +import torch +import torch.nn as nn + + +class BasicDiscriminatorBlock(nn.Module): + def __init__(self, in_channel, out_channel): + super(BasicDiscriminatorBlock, self).__init__() + self.block = nn.Sequential( + nn.utils.weight_norm(nn.Conv1d( + in_channel, + out_channel, + kernel_size=3, + stride=2, + padding=1, + )), + nn.LeakyReLU(0.2, True), + + nn.utils.weight_norm(nn.Conv1d( + out_channel, + out_channel, + kernel_size=3, + stride=1, + padding=1, + )), + nn.LeakyReLU(0.2, True), + + nn.utils.weight_norm(nn.Conv1d( + out_channel, + out_channel, + kernel_size=3, + stride=1, + padding=1, + )), + nn.LeakyReLU(0.2, True), + + nn.utils.weight_norm(nn.Conv1d( + out_channel, + out_channel, + kernel_size=3, + stride=1, + padding=1, + )), + + ) + + def forward(self, x): + return self.block(x) + + +class ResDiscriminatorBlock(nn.Module): + def __init__(self, in_channel, out_channel): + super(ResDiscriminatorBlock, self).__init__() + self.block1 = nn.Sequential( + nn.utils.weight_norm(nn.Conv1d( + in_channel, + out_channel, + kernel_size=3, + stride=2, + padding=1, + )), + nn.LeakyReLU(0.2, True), + + nn.utils.weight_norm(nn.Conv1d( + out_channel, + out_channel, + kernel_size=3, + stride=1, + padding=1, + )), + ) + + self.shortcut1 = nn.utils.weight_norm(nn.Conv1d( + in_channel, + out_channel, + kernel_size=1, + stride=2, + )) + + self.block2 = nn.Sequential( + nn.utils.weight_norm(nn.Conv1d( + out_channel, + out_channel, + kernel_size=3, + stride=1, + padding=1, + )), + nn.LeakyReLU(0.2, True), + + nn.utils.weight_norm(nn.Conv1d( + out_channel, + out_channel, + kernel_size=3, + stride=1, + padding=1, + )), + ) + + self.shortcut2 = nn.utils.weight_norm(nn.Conv1d( + out_channel, + out_channel, + kernel_size=1, + stride=1, + )) + + def forward(self, x): + x1 = self.block1(x) + x1 = x1 + self.shortcut1(x) + return self.block2(x1) + self.shortcut2(x1) + + +class ResNet18Discriminator(nn.Module): + def __init__(self, stft_channel, in_channel=64): + super(ResNet18Discriminator, self).__init__() + self.input = nn.Sequential( + nn.utils.weight_norm(nn.Conv1d(stft_channel, in_channel, kernel_size=7, stride=2, padding=1, )), + nn.LeakyReLU(0.2, True), + ) + self.df1 = BasicDiscriminatorBlock(in_channel, in_channel) + self.df2 = ResDiscriminatorBlock(in_channel, in_channel * 2) + self.df3 = ResDiscriminatorBlock(in_channel * 2, in_channel * 4) + self.df4 = ResDiscriminatorBlock(in_channel * 4, in_channel * 8) + + def forward(self, x): + x = self.input(x) + x = self.df1(x) + x = self.df2(x) + x = self.df3(x) + return self.df4(x) + + +class FrequencyDiscriminator(nn.Module): + def __init__(self, in_channel=64, fft_size=1024, hop_length=256, win_length=1024, window="hann_window"): + super(FrequencyDiscriminator, self).__init__() + self.fft_size = fft_size + self.hop_length = hop_length + 
self.win_length = win_length + self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) + self.stft_channel = fft_size // 2 + 1 + self.resnet_disc = ResNet18Discriminator(self.stft_channel, in_channel) + + def forward(self, x): + x_stft = torch.stft(x, self.fft_size, self.hop_length, self.win_length, self.window) + real = x_stft[..., 0] + imag = x_stft[..., 1] + + x_real = self.resnet_disc(real) + x_imag = self.resnet_disc(imag) + + return x_real, x_imag diff --git a/modules/vocoder/parallel_wavegan/models/melgan.py b/modules/vocoder/parallel_wavegan/models/melgan.py new file mode 100644 index 0000000000000000000000000000000000000000..b593bfbf4bb2eeb07bc7a34a82a5d3c7ee379f73 --- /dev/null +++ b/modules/vocoder/parallel_wavegan/models/melgan.py @@ -0,0 +1,458 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""MelGAN Modules.""" + +import logging + +import numpy as np +import torch +from torch import nn + +from modules.vocoder.parallel_wavegan.layers import CausalConv1d +from modules.vocoder.parallel_wavegan.layers import CausalConvTranspose1d +from modules.vocoder.parallel_wavegan.layers import ResidualStack +from modules.vocoder.parallel_wavegan.models.source import SourceModuleCycNoise_v1 + + +class MelGANGenerator(torch.nn.Module): + """MelGAN generator module.""" + + def __init__(self, + in_channels=80, + out_channels=1, + kernel_size=7, + channels=512, + bias=True, + upsample_scales=[8, 8, 2, 2], + stack_kernel_size=3, + stacks=3, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + pad="ReflectionPad1d", + pad_params={}, + use_final_nonlinear_activation=True, + use_weight_norm=True, + use_causal_conv=False, + use_pitch_embed=False, + use_nsf=False, + sample_rate=22050, + **kwargs + ): + """Initialize MelGANGenerator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Kernel size of initial and final conv layer. + channels (int): Initial number of channels for conv layer. + bias (bool): Whether to add bias parameter in convolution layers. + upsample_scales (list): List of upsampling scales. + stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. + stacks (int): Number of stacks in a single residual stack. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_final_nonlinear_activation (torch.nn.Module): Activation function for the final layer. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_causal_conv (bool): Whether to use causal convolution. + + """ + super(MelGANGenerator, self).__init__() + + # check hyper parameters is valid + assert channels >= np.prod(upsample_scales) + assert channels % (2 ** len(upsample_scales)) == 0 + if not use_causal_conv: + assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 
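+        # channels is halved after every upsampling stage, hence the divisibility check on 2 ** len(upsample_scales) above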
+ + # add initial layer + layers = [] + if not use_causal_conv: + layers += [ + getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params), + torch.nn.Conv1d(in_channels, channels, kernel_size, bias=bias), + ] + else: + layers += [ + CausalConv1d(in_channels, channels, kernel_size, + bias=bias, pad=pad, pad_params=pad_params), + ] + + self.use_pitch_embed = use_pitch_embed + if use_pitch_embed: + self.pitch_embed = nn.Embedding(300, in_channels, 0) + self.c_proj = nn.Conv1d(2 * in_channels, in_channels, 1) + + for i, upsample_scale in enumerate(upsample_scales): + # add upsampling layer + layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)] + if not use_causal_conv: + layers += [ + torch.nn.ConvTranspose1d( + channels // (2 ** i), + channels // (2 ** (i + 1)), + upsample_scale * 2, + stride=upsample_scale, + padding=upsample_scale // 2 + upsample_scale % 2, + output_padding=upsample_scale % 2, + bias=bias, + ) + ] + else: + layers += [ + CausalConvTranspose1d( + channels // (2 ** i), + channels // (2 ** (i + 1)), + upsample_scale * 2, + stride=upsample_scale, + bias=bias, + ) + ] + + # add residual stack + for j in range(stacks): + layers += [ + ResidualStack( + kernel_size=stack_kernel_size, + channels=channels // (2 ** (i + 1)), + dilation=stack_kernel_size ** j, + bias=bias, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + pad=pad, + pad_params=pad_params, + use_causal_conv=use_causal_conv, + ) + ] + self.use_nsf = use_nsf + if use_nsf: + self.harmonic_num = 8 + hop_size = np.prod(upsample_scales) + self.f0_upsamp = torch.nn.Upsample(scale_factor=hop_size) + # self.m_source = SourceModuleHnNSF(sampling_rate=sample_rate, harmonic_num=self.harmonic_num) + self.m_source = SourceModuleCycNoise_v1(sample_rate, 0.003) + self.nsf_conv = nn.Sequential(nn.Conv1d(1, channels // (2 ** (i + 1)), 1), torch.nn.Tanh()) + + # define the model as a single function + self.melgan_body = torch.nn.Sequential(*layers) + layers = [] + # add final layer + layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)] + if not use_causal_conv: + layers += [ + getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params), + torch.nn.Conv1d(channels // (2 ** (i + 1)), out_channels, kernel_size, bias=bias), + ] + else: + layers += [ + CausalConv1d(channels // (2 ** (i + 1)), out_channels, kernel_size, + bias=bias, pad=pad, pad_params=pad_params), + ] + if use_final_nonlinear_activation: + layers += [torch.nn.Tanh()] + + # define the model as a single function + self.melgan_final = torch.nn.Sequential(*layers) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + # reset parameters + self.reset_parameters() + + def forward(self, c, f0=None, pitch=None): + """Calculate forward propagation. + + Args: + c (Tensor): Input tensor (B, channels, T). + + Returns: + Tensor: Output tensor (B, 1, T ** prod(upsample_scales)). 
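+            Note: f0 (B, T) and pitch (B, T) are optional conditioning inputs, used only when use_nsf or use_pitch_embed is enabled.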
+ + """ + if self.use_pitch_embed: + c = self.c_proj(torch.cat([c, self.pitch_embed(pitch).transpose(1, 2)], 1)) + x = self.melgan_body(c) + if self.use_nsf: + f0_upsample = self.f0_upsamp(f0[:, None, :]) + f0_upsample = self.nsf_conv(f0_upsample) + x = x + f0_upsample + x = self.melgan_final(x) + return x + + def remove_weight_norm(self): + """Remove weight normalization module from all of the layers.""" + def _remove_weight_norm(m): + try: + logging.debug(f"Weight norm is removed from {m}.") + torch.nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(_remove_weight_norm) + + def apply_weight_norm(self): + """Apply weight normalization module from all of the layers.""" + def _apply_weight_norm(m): + if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): + torch.nn.utils.weight_norm(m) + logging.debug(f"Weight norm is applied to {m}.") + + self.apply(_apply_weight_norm) + + def reset_parameters(self): + """Reset parameters. + + This initialization follows official implementation manner. + https://github.com/descriptinc/melgan-neurips/blob/master/spec2wav/modules.py + + """ + def _reset_parameters(m): + if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): + m.weight.data.normal_(0.0, 0.02) + logging.debug(f"Reset parameters in {m}.") + + self.apply(_reset_parameters) + + +class MelGANDiscriminator(torch.nn.Module): + """MelGAN discriminator module.""" + + def __init__(self, + in_channels=1, + out_channels=1, + kernel_sizes=[5, 3], + channels=16, + max_downsample_channels=1024, + bias=True, + downsample_scales=[4, 4, 4, 4], + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + pad="ReflectionPad1d", + pad_params={}, + ): + """Initilize MelGAN discriminator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (list): List of two kernel sizes. The prod will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, + the last two layers' kernel size will be 5 and 3, respectively. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (list): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. 
+ + """ + super(MelGANDiscriminator, self).__init__() + self.layers = torch.nn.ModuleList() + + # check kernel size is valid + assert len(kernel_sizes) == 2 + assert kernel_sizes[0] % 2 == 1 + assert kernel_sizes[1] % 2 == 1 + + # add first layer + self.layers += [ + torch.nn.Sequential( + getattr(torch.nn, pad)((np.prod(kernel_sizes) - 1) // 2, **pad_params), + torch.nn.Conv1d(in_channels, channels, np.prod(kernel_sizes), bias=bias), + getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), + ) + ] + + # add downsample layers + in_chs = channels + for downsample_scale in downsample_scales: + out_chs = min(in_chs * downsample_scale, max_downsample_channels) + self.layers += [ + torch.nn.Sequential( + torch.nn.Conv1d( + in_chs, out_chs, + kernel_size=downsample_scale * 10 + 1, + stride=downsample_scale, + padding=downsample_scale * 5, + groups=in_chs // 4, + bias=bias, + ), + getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), + ) + ] + in_chs = out_chs + + # add final layers + out_chs = min(in_chs * 2, max_downsample_channels) + self.layers += [ + torch.nn.Sequential( + torch.nn.Conv1d( + in_chs, out_chs, kernel_sizes[0], + padding=(kernel_sizes[0] - 1) // 2, + bias=bias, + ), + getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), + ) + ] + self.layers += [ + torch.nn.Conv1d( + out_chs, out_channels, kernel_sizes[1], + padding=(kernel_sizes[1] - 1) // 2, + bias=bias, + ), + ] + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input noise signal (B, 1, T). + + Returns: + List: List of output tensors of each layer. + + """ + outs = [] + for f in self.layers: + x = f(x) + outs += [x] + + return outs + + +class MelGANMultiScaleDiscriminator(torch.nn.Module): + """MelGAN multi-scale discriminator module.""" + + def __init__(self, + in_channels=1, + out_channels=1, + scales=3, + downsample_pooling="AvgPool1d", + # follow the official implementation setting + downsample_pooling_params={ + "kernel_size": 4, + "stride": 2, + "padding": 1, + "count_include_pad": False, + }, + kernel_sizes=[5, 3], + channels=16, + max_downsample_channels=1024, + bias=True, + downsample_scales=[4, 4, 4, 4], + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + pad="ReflectionPad1d", + pad_params={}, + use_weight_norm=True, + **kwargs + ): + """Initilize MelGAN multi-scale discriminator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + downsample_pooling (str): Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): Parameters for the above pooling module. + kernel_sizes (list): List of two kernel sizes. The sum will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (list): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. 
+ + """ + super(MelGANMultiScaleDiscriminator, self).__init__() + self.discriminators = torch.nn.ModuleList() + + # add discriminators + for _ in range(scales): + self.discriminators += [ + MelGANDiscriminator( + in_channels=in_channels, + out_channels=out_channels, + kernel_sizes=kernel_sizes, + channels=channels, + max_downsample_channels=max_downsample_channels, + bias=bias, + downsample_scales=downsample_scales, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + pad=pad, + pad_params=pad_params, + ) + ] + self.pooling = getattr(torch.nn, downsample_pooling)(**downsample_pooling_params) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + # reset parameters + self.reset_parameters() + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input noise signal (B, 1, T). + + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. + + """ + outs = [] + for f in self.discriminators: + outs += [f(x)] + x = self.pooling(x) + + return outs + + def remove_weight_norm(self): + """Remove weight normalization module from all of the layers.""" + def _remove_weight_norm(m): + try: + logging.debug(f"Weight norm is removed from {m}.") + torch.nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(_remove_weight_norm) + + def apply_weight_norm(self): + """Apply weight normalization module from all of the layers.""" + def _apply_weight_norm(m): + if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): + torch.nn.utils.weight_norm(m) + logging.debug(f"Weight norm is applied to {m}.") + + self.apply(_apply_weight_norm) + + def reset_parameters(self): + """Reset parameters. + + This initialization follows official implementation manner. 
+ https://github.com/descriptinc/melgan-neurips/blob/master/spec2wav/modules.py + + """ + def _reset_parameters(m): + if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): + m.weight.data.normal_(0.0, 0.02) + logging.debug(f"Reset parameters in {m}.") + + self.apply(_reset_parameters) diff --git a/modules/vocoder/parallel_wavegan/models/parallel_wavegan.py b/modules/vocoder/parallel_wavegan/models/parallel_wavegan.py new file mode 100644 index 0000000000000000000000000000000000000000..dc828fb2728416288987a6678bd963af80597cdf --- /dev/null +++ b/modules/vocoder/parallel_wavegan/models/parallel_wavegan.py @@ -0,0 +1,461 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""Parallel WaveGAN Modules.""" + +import logging +import math + +import torch +from torch import nn + +from modules.vocoder.parallel_wavegan.layers import Conv1d +from modules.vocoder.parallel_wavegan.layers import Conv1d1x1 +from modules.vocoder.parallel_wavegan.layers import ResidualBlock +from modules.vocoder.parallel_wavegan.layers import upsample +from modules.vocoder.parallel_wavegan import models +from modules.vocoder.parallel_wavegan.models import SourceModuleCycNoise_v1 +from utils.commons.hparams import hparams +import numpy as np + +class ParallelWaveGANGenerator(torch.nn.Module): + """Parallel WaveGAN Generator module.""" + + def __init__(self, + in_channels=1, + out_channels=1, + kernel_size=3, + layers=30, + stacks=3, + residual_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=80, + aux_context_window=2, + dropout=0.0, + bias=True, + use_weight_norm=True, + use_causal_conv=False, + upsample_conditional_features=True, + upsample_net="ConvInUpsampleNetwork", + upsample_params={"upsample_scales": [4, 4, 4, 4]}, + use_pitch_embed=False, + use_nsf=False, + sample_rate=22050, + ): + """Initialize Parallel WaveGAN Generator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Kernel size of dilated convolution. + layers (int): Number of residual block layers. + stacks (int): Number of stacks i.e., dilation cycles. + residual_channels (int): Number of channels in residual conv. + gate_channels (int): Number of channels in gated conv. + skip_channels (int): Number of channels in skip conv. + aux_channels (int): Number of channels for auxiliary feature conv. + aux_context_window (int): Context window size for auxiliary feature. + dropout (float): Dropout rate. 0.0 means no dropout applied. + bias (bool): Whether to use bias parameter in conv layer. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_causal_conv (bool): Whether to use causal structure. + upsample_conditional_features (bool): Whether to use upsampling network. + upsample_net (str): Upsampling network architecture. + upsample_params (dict): Upsampling network parameters. 
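+            use_pitch_embed (bool): Whether to embed pitch tokens and fuse them with the auxiliary features.
+            use_nsf (bool): Whether to add an F0-derived excitation (conv + tanh of the upsampled F0) to the conditioning features.
+            sample_rate (int): Sampling rate passed to the NSF source module.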
+ + """ + super(ParallelWaveGANGenerator, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.aux_channels = aux_channels + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + # check the number of layers and stacks + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + # define first convolution + self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True) + + # define conv + upsampling network + self.aux_context_window = aux_context_window + if upsample_conditional_features: + upsample_params.update({ + "use_causal_conv": use_causal_conv, + }) + if upsample_net == "MelGANGenerator": + assert aux_context_window == 0 + upsample_params.update({ + "use_weight_norm": False, # not to apply twice + "use_final_nonlinear_activation": False, + }) + self.upsample_net = getattr(models, upsample_net)(**upsample_params) + else: + if upsample_net == "ConvInUpsampleNetwork": + upsample_params.update({ + "aux_channels": aux_channels, + "aux_context_window": aux_context_window, + }) + self.upsample_net = getattr(upsample, upsample_net)(**upsample_params) + else: + self.upsample_net = None + + # define residual blocks + self.conv_layers = torch.nn.ModuleList() + for layer in range(layers): + dilation = 2 ** (layer % layers_per_stack) + conv = ResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=aux_channels, + dilation=dilation, + dropout=dropout, + bias=bias, + use_causal_conv=use_causal_conv, + ) + self.conv_layers += [conv] + + # define output layers + self.last_conv_layers = torch.nn.ModuleList([ + torch.nn.ReLU(inplace=True), + Conv1d1x1(skip_channels, skip_channels, bias=True), + torch.nn.ReLU(inplace=True), + Conv1d1x1(skip_channels, out_channels, bias=True), + ]) + + self.use_pitch_embed = use_pitch_embed + if use_pitch_embed: + self.pitch_embed = nn.Embedding(300, aux_channels, 0) + self.c_proj = nn.Linear(2 * aux_channels, aux_channels) + self.use_nsf = use_nsf + if use_nsf: + self.harmonic_num = 8 + hop_size = np.prod(upsample_params['upsample_scales']) + self.f0_upsamp = torch.nn.Upsample(scale_factor=hop_size) + self.m_source = SourceModuleCycNoise_v1(sample_rate, 0.003) + self.nsf_conv = nn.Sequential(nn.Conv1d(1, aux_channels, 1), torch.nn.Tanh()) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x, c=None, pitch=None, f0=None, **kwargs): + """Calculate forward propagation. + + Args: + x (Tensor): Input noise signal (B, C_in, T). + c (Tensor): Local conditioning auxiliary features (B, C ,T'). + pitch (Tensor): Local conditioning pitch (B, T'). 
+ + Returns: + Tensor: Output tensor (B, C_out, T) + + """ + # perform upsampling + if c is not None and self.upsample_net is not None: + if self.use_pitch_embed: + p = self.pitch_embed(pitch) + c = self.c_proj(torch.cat([c.transpose(1, 2), p], -1)).transpose(1, 2) + c = self.upsample_net(c) + if self.use_nsf: + f0_upsample = self.f0_upsamp( + f0[:, None, :][:, :, self.aux_context_window:-self.aux_context_window]) + f0_upsample = self.nsf_conv(f0_upsample) + c = c + f0_upsample + if x is None: + x = torch.randn([c.size(0), 1, c.size(-1)]).to(c.device) + assert c.size(-1) == x.size(-1), (c.size(-1), x.size(-1)) + + # encode to hidden representation + x = self.first_conv(x) + skips = 0 + for f in self.conv_layers: + x, h = f(x, c) + skips += h + skips *= math.sqrt(1.0 / len(self.conv_layers)) + + # apply final layers + x = skips + for f in self.last_conv_layers: + x = f(x) + + return x + + def remove_weight_norm(self): + """Remove weight normalization module from all of the layers.""" + def _remove_weight_norm(m): + try: + logging.debug(f"Weight norm is removed from {m}.") + torch.nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(_remove_weight_norm) + + def apply_weight_norm(self): + """Apply weight normalization module from all of the layers.""" + def _apply_weight_norm(m): + if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): + torch.nn.utils.weight_norm(m) + logging.debug(f"Weight norm is applied to {m}.") + + self.apply(_apply_weight_norm) + + @staticmethod + def _get_receptive_field_size(layers, stacks, kernel_size, + dilation=lambda x: 2 ** x): + assert layers % stacks == 0 + layers_per_cycle = layers // stacks + dilations = [dilation(i % layers_per_cycle) for i in range(layers)] + return (kernel_size - 1) * sum(dilations) + 1 + + @property + def receptive_field_size(self): + """Return receptive field size.""" + return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) + + +class ParallelWaveGANDiscriminator(torch.nn.Module): + """Parallel WaveGAN Discriminator module.""" + + def __init__(self, + in_channels=1, + out_channels=1, + kernel_size=3, + layers=10, + conv_channels=64, + dilation_factor=1, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + bias=True, + use_weight_norm=True, + ): + """Initialize Parallel WaveGAN Discriminator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Number of output channels. + layers (int): Number of conv layers. + conv_channels (int): Number of chnn layers. + dilation_factor (int): Dilation factor. For example, if dilation_factor = 2, + the dilation will be 2, 4, 8, ..., and so on. + nonlinear_activation (str): Nonlinear function after each conv. + nonlinear_activation_params (dict): Nonlinear function parameters + bias (bool): Whether to use bias parameter in conv. + use_weight_norm (bool) Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + + """ + super(ParallelWaveGANDiscriminator, self).__init__() + assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." + assert dilation_factor > 0, "Dilation factor must be > 0." 
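+        # stack of (layers - 1) dilated non-causal convs; dilation grows linearly with depth when dilation_factor == 1, otherwise as dilation_factor ** i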
+ self.conv_layers = torch.nn.ModuleList() + conv_in_channels = in_channels + for i in range(layers - 1): + if i == 0: + dilation = 1 + else: + dilation = i if dilation_factor == 1 else dilation_factor ** i + conv_in_channels = conv_channels + padding = (kernel_size - 1) // 2 * dilation + conv_layer = [ + Conv1d(conv_in_channels, conv_channels, + kernel_size=kernel_size, padding=padding, + dilation=dilation, bias=bias), + getattr(torch.nn, nonlinear_activation)(inplace=True, **nonlinear_activation_params) + ] + self.conv_layers += conv_layer + padding = (kernel_size - 1) // 2 + last_conv_layer = Conv1d( + conv_in_channels, out_channels, + kernel_size=kernel_size, padding=padding, bias=bias) + self.conv_layers += [last_conv_layer] + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x, cond=None): + """Calculate forward propagation. + + Args: + x (Tensor): Input noise signal (B, 1, T). + cond (Tensor): Input noise signal (B, H, T_frame). + + Returns: + Tensor: Output tensor (B, 1, T) + + """ + cond_layer_i = len(self.conv_layers) // 2 + for i, f in enumerate(self.conv_layers): + if i == cond_layer_i and cond is not None: + aux_context_window = hparams['aux_context_window'] + cond = cond[:, :, aux_context_window:-aux_context_window] + cond = cond[:, :, :, None].repeat([1, 1, 1, hparams['hop_size']]).reshape( + cond.shape[0], cond.shape[1], -1) + x = x + cond + x = f(x) + return x + + def apply_weight_norm(self): + """Apply weight normalization module from all of the layers.""" + def _apply_weight_norm(m): + if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): + torch.nn.utils.weight_norm(m) + logging.debug(f"Weight norm is applied to {m}.") + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Remove weight normalization module from all of the layers.""" + def _remove_weight_norm(m): + try: + logging.debug(f"Weight norm is removed from {m}.") + torch.nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(_remove_weight_norm) + + +class ResidualParallelWaveGANDiscriminator(torch.nn.Module): + """Parallel WaveGAN Discriminator module.""" + + def __init__(self, + in_channels=1, + out_channels=1, + kernel_size=3, + layers=30, + stacks=3, + residual_channels=64, + gate_channels=128, + skip_channels=64, + dropout=0.0, + bias=True, + use_weight_norm=True, + use_causal_conv=False, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + ): + """Initialize Parallel WaveGAN Discriminator module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Kernel size of dilated convolution. + layers (int): Number of residual block layers. + stacks (int): Number of stacks i.e., dilation cycles. + residual_channels (int): Number of channels in residual conv. + gate_channels (int): Number of channels in gated conv. + skip_channels (int): Number of channels in skip conv. + dropout (float): Dropout rate. 0.0 means no dropout applied. + bias (bool): Whether to use bias parameter in conv. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_causal_conv (bool): Whether to use causal structure. + nonlinear_activation_params (dict): Nonlinear function parameters + + """ + super(ResidualParallelWaveGANDiscriminator, self).__init__() + assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 
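+        # WaveNet-like stack of residual blocks, reused here without auxiliary conditioning (aux_channels=-1)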
+ + self.in_channels = in_channels + self.out_channels = out_channels + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + # check the number of layers and stacks + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + # define first convolution + self.first_conv = torch.nn.Sequential( + Conv1d1x1(in_channels, residual_channels, bias=True), + getattr(torch.nn, nonlinear_activation)( + inplace=True, **nonlinear_activation_params), + ) + + # define residual blocks + self.conv_layers = torch.nn.ModuleList() + for layer in range(layers): + dilation = 2 ** (layer % layers_per_stack) + conv = ResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=-1, + dilation=dilation, + dropout=dropout, + bias=bias, + use_causal_conv=use_causal_conv, + ) + self.conv_layers += [conv] + + # define output layers + self.last_conv_layers = torch.nn.ModuleList([ + getattr(torch.nn, nonlinear_activation)( + inplace=True, **nonlinear_activation_params), + Conv1d1x1(skip_channels, skip_channels, bias=True), + getattr(torch.nn, nonlinear_activation)( + inplace=True, **nonlinear_activation_params), + Conv1d1x1(skip_channels, out_channels, bias=True), + ]) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x): + """Calculate forward propagation. + + Args: + x (Tensor): Input noise signal (B, 1, T). + + Returns: + Tensor: Output tensor (B, 1, T) + + """ + x = self.first_conv(x) + + skips = 0 + for f in self.conv_layers: + x, h = f(x, None) + skips += h + skips *= math.sqrt(1.0 / len(self.conv_layers)) + + # apply final layers + x = skips + for f in self.last_conv_layers: + x = f(x) + return x + + def apply_weight_norm(self): + """Apply weight normalization module from all of the layers.""" + def _apply_weight_norm(m): + if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): + torch.nn.utils.weight_norm(m) + logging.debug(f"Weight norm is applied to {m}.") + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Remove weight normalization module from all of the layers.""" + def _remove_weight_norm(m): + try: + logging.debug(f"Weight norm is removed from {m}.") + torch.nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(_remove_weight_norm) diff --git a/modules/vocoder/parallel_wavegan/models/source.py b/modules/vocoder/parallel_wavegan/models/source.py new file mode 100644 index 0000000000000000000000000000000000000000..f2a006e53c0e2194036fd08ea9d6ed4d9a10d6cf --- /dev/null +++ b/modules/vocoder/parallel_wavegan/models/source.py @@ -0,0 +1,538 @@ +import torch +import numpy as np +import sys +import torch.nn.functional as torch_nn_func + + +class SineGen(torch.nn.Module): + """ Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__(self, samp_rate, harmonic_num=0, + sine_amp=0.1, 
noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + self.flag_for_pulse = flag_for_pulse + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def _f02sine(self, f0_values): + """ f0_values: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \ + device=f0_values.device) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + if not self.flag_for_pulse: + # for normal case + + # To prevent torch.cumsum numerical overflow, + # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. + # Buffer tmp_over_one_idx indicates the time step to add -1. + # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - + tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) + * 2 * np.pi) + else: + # If necessary, make sure that the first time step of every + # voiced segments is sin(pi) or cos(0) + # This is used for pulse-train generation + + # identify the last time step in unvoiced segments + uv = self._f02uv(f0_values) + uv_1 = torch.roll(uv, shifts=-1, dims=1) + uv_1[:, -1, :] = 1 + u_loc = (uv < 1) * (uv_1 > 0) + + # get the instantanouse phase + tmp_cumsum = torch.cumsum(rad_values, dim=1) + # different batch needs to be processed differently + for idx in range(f0_values.shape[0]): + temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] + temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] + # stores the accumulation of i.phase within + # each voiced segments + tmp_cumsum[idx, :, :] = 0 + tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum + + # rad_values - tmp_cumsum: remove the accumulation of i.phase + # within the previous voiced segment. 
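+            # so the cumulative phase below effectively restarts at the first step of each voiced segment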
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) + + # get the sines + sines = torch.cos(i_phase * 2 * np.pi) + return sines + + def forward(self, f0): + """ sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, + device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) + + # generate sine waveforms + sine_waves = self._f02sine(f0_buf) * self.sine_amp + + # generate uv signal + # uv = torch.ones(f0.shape) + # uv = uv * (f0 > self.voiced_threshold) + uv = self._f02uv(f0) + + # noise: for unvoiced should be similar to sine_amp + # std = self.sine_amp/3 -> max value ~ self.sine_amp + # . for voiced regions is self.noise_std + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + # first: set the unvoiced part to 0 by uv + # then: additive noise + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class PulseGen(torch.nn.Module): + """ Definition of Pulse train generator + + There are many ways to implement pulse generator. + Here, PulseGen is based on SinGen. For a perfect + """ + def __init__(self, samp_rate, pulse_amp = 0.1, + noise_std = 0.003, voiced_threshold = 0): + super(PulseGen, self).__init__() + self.pulse_amp = pulse_amp + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + self.noise_std = noise_std + self.l_sinegen = SineGen(self.sampling_rate, harmonic_num=0, \ + sine_amp=self.pulse_amp, noise_std=0, \ + voiced_threshold=self.voiced_threshold, \ + flag_for_pulse=True) + + def forward(self, f0): + """ Pulse train generator + pulse_train, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output pulse_train: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + + Note: self.l_sine doesn't make sure that the initial phase of + a voiced segment is np.pi, the first pulse in a voiced segment + may not be at the first time step within a voiced segment + """ + with torch.no_grad(): + sine_wav, uv, noise = self.l_sinegen(f0) + + # sine without additive noise + pure_sine = sine_wav - noise + + # step t corresponds to a pulse if + # sine[t] > sine[t+1] & sine[t] > sine[t-1] + # & sine[t-1], sine[t+1], and sine[t] are voiced + # or + # sine[t] is voiced, sine[t-1] is unvoiced + # we use torch.roll to simulate sine[t+1] and sine[t-1] + sine_1 = torch.roll(pure_sine, shifts=1, dims=1) + uv_1 = torch.roll(uv, shifts=1, dims=1) + uv_1[:, 0, :] = 0 + sine_2 = torch.roll(pure_sine, shifts=-1, dims=1) + uv_2 = torch.roll(uv, shifts=-1, dims=1) + uv_2[:, -1, :] = 0 + + loc = (pure_sine > sine_1) * (pure_sine > sine_2) \ + * (uv_1 > 0) * (uv_2 > 0) * (uv > 0) \ + + (uv_1 < 1) * (uv > 0) + + # pulse train without noise + pulse_train = pure_sine * loc + + # additive noise to pulse train + # note that noise from sinegen is zero in voiced regions + pulse_noise = torch.randn_like(pure_sine) * self.noise_std + + # with additive noise on pulse, and unvoiced regions + pulse_train += pulse_noise * loc + pulse_noise * (1 - uv) + return pulse_train, sine_wav, uv, pulse_noise + + +class 
SignalsConv1d(torch.nn.Module): + """ Filtering input signal with time invariant filter + Note: FIRFilter conducted filtering given fixed FIR weight + SignalsConv1d convolves two signals + Note: this is based on torch.nn.functional.conv1d + + """ + + def __init__(self): + super(SignalsConv1d, self).__init__() + + def forward(self, signal, system_ir): + """ output = forward(signal, system_ir) + + signal: (batchsize, length1, dim) + system_ir: (length2, dim) + + output: (batchsize, length1, dim) + """ + if signal.shape[-1] != system_ir.shape[-1]: + print("Error: SignalsConv1d expects shape:") + print("signal (batchsize, length1, dim)") + print("system_id (batchsize, length2, dim)") + print("But received signal: {:s}".format(str(signal.shape))) + print(" system_ir: {:s}".format(str(system_ir.shape))) + sys.exit(1) + padding_length = system_ir.shape[0] - 1 + groups = signal.shape[-1] + + # pad signal on the left + signal_pad = torch_nn_func.pad(signal.permute(0, 2, 1), \ + (padding_length, 0)) + # prepare system impulse response as (dim, 1, length2) + # also flip the impulse response + ir = torch.flip(system_ir.unsqueeze(1).permute(2, 1, 0), \ + dims=[2]) + # convolute + output = torch_nn_func.conv1d(signal_pad, ir, groups=groups) + return output.permute(0, 2, 1) + + +class CyclicNoiseGen_v1(torch.nn.Module): + """ CyclicnoiseGen_v1 + Cyclic noise with a single parameter of beta. + Pytorch v1 implementation assumes f_t is also fixed + """ + + def __init__(self, samp_rate, + noise_std=0.003, voiced_threshold=0): + super(CyclicNoiseGen_v1, self).__init__() + self.samp_rate = samp_rate + self.noise_std = noise_std + self.voiced_threshold = voiced_threshold + + self.l_pulse = PulseGen(samp_rate, pulse_amp=1.0, + noise_std=noise_std, + voiced_threshold=voiced_threshold) + self.l_conv = SignalsConv1d() + + def noise_decay(self, beta, f0mean): + """ decayed_noise = noise_decay(beta, f0mean) + decayed_noise = n[t]exp(-t * f_mean / beta / samp_rate) + + beta: (dim=1) or (batchsize=1, 1, dim=1) + f0mean (batchsize=1, 1, dim=1) + + decayed_noise (batchsize=1, length, dim=1) + """ + with torch.no_grad(): + # exp(-1.0 n / T) < 0.01 => n > -log(0.01)*T = 4.60*T + # truncate the noise when decayed by -40 dB + length = 4.6 * self.samp_rate / f0mean + length = length.int() + time_idx = torch.arange(0, length, device=beta.device) + time_idx = time_idx.unsqueeze(0).unsqueeze(2) + time_idx = time_idx.repeat(beta.shape[0], 1, beta.shape[2]) + + noise = torch.randn(time_idx.shape, device=beta.device) + + # due to Pytorch implementation, use f0_mean as the f0 factor + decay = torch.exp(-time_idx * f0mean / beta / self.samp_rate) + return noise * self.noise_std * decay + + def forward(self, f0s, beta): + """ Producde cyclic-noise + """ + # pulse train + pulse_train, sine_wav, uv, noise = self.l_pulse(f0s) + pure_pulse = pulse_train - noise + + # decayed_noise (length, dim=1) + if (uv < 1).all(): + # all unvoiced + cyc_noise = torch.zeros_like(sine_wav) + else: + f0mean = f0s[uv > 0].mean() + + decayed_noise = self.noise_decay(beta, f0mean)[0, :, :] + # convolute + cyc_noise = self.l_conv(pure_pulse, decayed_noise) + + # add noise in invoiced segments + cyc_noise = cyc_noise + noise * (1.0 - uv) + return cyc_noise, pulse_train, sine_wav, uv, noise + + +class SineGen(torch.nn.Module): + """ Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones 
(default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__(self, samp_rate, harmonic_num=0, + sine_amp=0.1, noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + self.flag_for_pulse = flag_for_pulse + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def _f02sine(self, f0_values): + """ f0_values: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \ + device=f0_values.device) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + if not self.flag_for_pulse: + # for normal case + + # To prevent torch.cumsum numerical overflow, + # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. + # Buffer tmp_over_one_idx indicates the time step to add -1. + # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - + tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) + * 2 * np.pi) + else: + # If necessary, make sure that the first time step of every + # voiced segments is sin(pi) or cos(0) + # This is used for pulse-train generation + + # identify the last time step in unvoiced segments + uv = self._f02uv(f0_values) + uv_1 = torch.roll(uv, shifts=-1, dims=1) + uv_1[:, -1, :] = 1 + u_loc = (uv < 1) * (uv_1 > 0) + + # get the instantanouse phase + tmp_cumsum = torch.cumsum(rad_values, dim=1) + # different batch needs to be processed differently + for idx in range(f0_values.shape[0]): + temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] + temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] + # stores the accumulation of i.phase within + # each voiced segments + tmp_cumsum[idx, :, :] = 0 + tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum + + # rad_values - tmp_cumsum: remove the accumulation of i.phase + # within the previous voiced segment. 
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) + + # get the sines + sines = torch.cos(i_phase * 2 * np.pi) + return sines + + def forward(self, f0): + """ sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, \ + device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) + + # generate sine waveforms + sine_waves = self._f02sine(f0_buf) * self.sine_amp + + # generate uv signal + # uv = torch.ones(f0.shape) + # uv = uv * (f0 > self.voiced_threshold) + uv = self._f02uv(f0) + + # noise: for unvoiced should be similar to sine_amp + # std = self.sine_amp/3 -> max value ~ self.sine_amp + # . for voiced regions is self.noise_std + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + # first: set the unvoiced part to 0 by uv + # then: additive noise + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleCycNoise_v1(torch.nn.Module): + """ SourceModuleCycNoise_v1 + SourceModule(sampling_rate, noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + + noise_std: std of Gaussian noise (default: 0.003) + voiced_threshold: threshold to set U/V given F0 (default: 0) + + cyc, noise, uv = SourceModuleCycNoise_v1(F0_upsampled, beta) + F0_upsampled (batchsize, length, 1) + beta (1) + cyc (batchsize, length, 1) + noise (batchsize, length, 1) + uv (batchsize, length, 1) + """ + + def __init__(self, sampling_rate, noise_std=0.003, voiced_threshod=0): + super(SourceModuleCycNoise_v1, self).__init__() + self.sampling_rate = sampling_rate + self.noise_std = noise_std + self.l_cyc_gen = CyclicNoiseGen_v1(sampling_rate, noise_std, + voiced_threshod) + + def forward(self, f0_upsamped, beta): + """ + cyc, noise, uv = SourceModuleCycNoise_v1(F0, beta) + F0_upsampled (batchsize, length, 1) + beta (1) + cyc (batchsize, length, 1) + noise (batchsize, length, 1) + uv (batchsize, length, 1) + """ + # source for harmonic branch + cyc, pulse, sine, uv, add_noi = self.l_cyc_gen(f0_upsamped, beta) + + # source for noise branch, in the same shape as uv + noise = torch.randn_like(uv) * self.noise_std / 3 + return cyc, noise, uv + + +class SourceModuleHnNSF(torch.nn.Module): + """ SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine 
waveforms + self.l_sin_gen = SineGen(sampling_rate, harmonic_num, + sine_amp, add_noise_std, voiced_threshod) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x): + """ + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + """ + # source for harmonic branch + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + # source for noise branch, in the same shape as uv + noise = torch.randn_like(uv) * self.sine_amp / 3 + return sine_merge, noise, uv + + +if __name__ == '__main__': + source = SourceModuleCycNoise_v1(24000) + x = torch.randn(16, 25600, 1) + + diff --git a/modules/vocoder/parallel_wavegan/optimizers/__init__.py b/modules/vocoder/parallel_wavegan/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a0e0c5932838281e912079e5784d84d43444a61a --- /dev/null +++ b/modules/vocoder/parallel_wavegan/optimizers/__init__.py @@ -0,0 +1,2 @@ +from torch.optim import * # NOQA +from .radam import * # NOQA diff --git a/modules/vocoder/parallel_wavegan/optimizers/radam.py b/modules/vocoder/parallel_wavegan/optimizers/radam.py new file mode 100644 index 0000000000000000000000000000000000000000..e805d7e34921bee436e1e7fd9e1f753c7609186b --- /dev/null +++ b/modules/vocoder/parallel_wavegan/optimizers/radam.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- + +"""RAdam optimizer. + +This code is drived from https://github.com/LiyuanLucasLiu/RAdam. +""" + +import math +import torch + +from torch.optim.optimizer import Optimizer + + +class RAdam(Optimizer): + """Rectified Adam optimizer.""" + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + """Initilize RAdam optimizer.""" + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + self.buffer = [[None, None, None] for ind in range(10)] + super(RAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + """Set state.""" + super(RAdam, self).__setstate__(state) + + def step(self, closure=None): + """Run one step.""" + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + buffered = self.buffer[int(state['step'] % 10)] + if state['step'] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = math.sqrt( + (1 - beta2_t) * (N_sma - 4) / 
(N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) # NOQA + else: + step_size = 1.0 / (1 - beta1 ** state['step']) + buffered[2] = step_size + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + # more conservative since it's an approximated value + if N_sma >= 5: + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) + else: + p_data_fp32.add_(-step_size * group['lr'], exp_avg) + + p.data.copy_(p_data_fp32) + + return loss diff --git a/modules/vocoder/parallel_wavegan/stft_loss.py b/modules/vocoder/parallel_wavegan/stft_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..91b484a8e68e935e2738427746907d83d2c0baa5 --- /dev/null +++ b/modules/vocoder/parallel_wavegan/stft_loss.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""STFT-based Loss modules.""" +import librosa +import torch + +from modules.vocoder.parallel_wavegan.losses import LogSTFTMagnitudeLoss, SpectralConvergengeLoss, stft + + +class STFTLoss(torch.nn.Module): + """STFT loss module.""" + + def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", + use_mel_loss=False): + """Initialize STFT loss module.""" + super(STFTLoss, self).__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + self.window = getattr(torch, window)(win_length) + self.spectral_convergenge_loss = SpectralConvergengeLoss() + self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() + self.use_mel_loss = use_mel_loss + self.mel_basis = None + + def forward(self, x, y): + """Calculate forward propagation. + + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. + + """ + if self.window.device != x.device: + self.window = self.window.to(x.device) + x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) + y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) + if self.use_mel_loss: + if self.mel_basis is None: + self.mel_basis = torch.from_numpy(librosa.filters.mel(22050, self.fft_size, 80)).cuda().T + x_mag = x_mag @ self.mel_basis + y_mag = y_mag @ self.mel_basis + + sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) + mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) + + return sc_loss, mag_loss + + +class MultiResolutionSTFTLoss(torch.nn.Module): + """Multi resolution STFT loss module.""" + + def __init__(self, + fft_sizes=[1024, 2048, 512], + hop_sizes=[120, 240, 50], + win_lengths=[600, 1200, 240], + window="hann_window", + use_mel_loss=False): + """Initialize Multi resolution STFT loss module. + + Args: + fft_sizes (list): List of FFT sizes. + hop_sizes (list): List of hop sizes. + win_lengths (list): List of window lengths. + window (str): Window function type. + + """ + super(MultiResolutionSTFTLoss, self).__init__() + assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) + self.stft_losses = torch.nn.ModuleList() + for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): + self.stft_losses += [STFTLoss(fs, ss, wl, window, use_mel_loss)] + + def forward(self, x, y): + """Calculate forward propagation. + + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). 
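+            Note: both returned losses are averaged over the number of STFT resolutions.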
+ + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. + + """ + sc_loss = 0.0 + mag_loss = 0.0 + for f in self.stft_losses: + sc_l, mag_l = f(x, y) + sc_loss += sc_l + mag_loss += mag_l + sc_loss /= len(self.stft_losses) + mag_loss /= len(self.stft_losses) + + return sc_loss, mag_loss diff --git a/modules/vocoder/parallel_wavegan/utils/__init__.py b/modules/vocoder/parallel_wavegan/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e8fa95a020706b5412c3959fbf6e5980019c0d5f --- /dev/null +++ b/modules/vocoder/parallel_wavegan/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * # NOQA diff --git a/modules/vocoder/parallel_wavegan/utils/__pycache__/__init__.cpython-36.pyc b/modules/vocoder/parallel_wavegan/utils/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e9f83c755607b63ec9bf5b57b865877cfcecb26 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/utils/__pycache__/__init__.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/utils/__pycache__/__init__.cpython-37.pyc b/modules/vocoder/parallel_wavegan/utils/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9966dc335900fb13df28fa72df453b7f45ffdc6 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/utils/__pycache__/__init__.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/utils/__pycache__/utils.cpython-36.pyc b/modules/vocoder/parallel_wavegan/utils/__pycache__/utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1774a45c7ccc60d51f9193b6aca5543c8868c867 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/utils/__pycache__/utils.cpython-36.pyc differ diff --git a/modules/vocoder/parallel_wavegan/utils/__pycache__/utils.cpython-37.pyc b/modules/vocoder/parallel_wavegan/utils/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c412a3de64ded2231bca83c08369d818ce320c9 Binary files /dev/null and b/modules/vocoder/parallel_wavegan/utils/__pycache__/utils.cpython-37.pyc differ diff --git a/modules/vocoder/parallel_wavegan/utils/utils.py b/modules/vocoder/parallel_wavegan/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..69fdc4cdb5d75b907c8b9372a9f2448a9a166730 --- /dev/null +++ b/modules/vocoder/parallel_wavegan/utils/utils.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) + +"""Utility functions.""" + +import fnmatch +import logging +import os +import sys +try: + import h5py +except: + pass +import numpy as np + + +def find_files(root_dir, query="*.wav", include_root_dir=True): + """Find files recursively. + + Args: + root_dir (str): Root root_dir to find. + query (str): Query to find. + include_root_dir (bool): If False, root_dir name is not included. + + Returns: + list: List of found filenames. + + """ + files = [] + for root, dirnames, filenames in os.walk(root_dir, followlinks=True): + for filename in fnmatch.filter(filenames, query): + files.append(os.path.join(root, filename)) + if not include_root_dir: + files = [file_.replace(root_dir + "/", "") for file_ in files] + + return files + + +def read_hdf5(hdf5_name, hdf5_path): + """Read hdf5 dataset. + + Args: + hdf5_name (str): Filename of hdf5 file. + hdf5_path (str): Dataset name in hdf5 file. 
+ + Return: + any: Dataset values. + + """ + if not os.path.exists(hdf5_name): + logging.error(f"There is no such a hdf5 file ({hdf5_name}).") + sys.exit(1) + + hdf5_file = h5py.File(hdf5_name, "r") + + if hdf5_path not in hdf5_file: + logging.error(f"There is no such a data in hdf5 file. ({hdf5_path})") + sys.exit(1) + + hdf5_data = hdf5_file[hdf5_path][()] + hdf5_file.close() + + return hdf5_data + + +def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True): + """Write dataset to hdf5. + + Args: + hdf5_name (str): Hdf5 dataset filename. + hdf5_path (str): Dataset path in hdf5. + write_data (ndarray): Data to write. + is_overwrite (bool): Whether to overwrite dataset. + + """ + # convert to numpy array + write_data = np.array(write_data) + + # check folder existence + folder_name, _ = os.path.split(hdf5_name) + if not os.path.exists(folder_name) and len(folder_name) != 0: + os.makedirs(folder_name) + + # check hdf5 existence + if os.path.exists(hdf5_name): + # if already exists, open with r+ mode + hdf5_file = h5py.File(hdf5_name, "r+") + # check dataset existence + if hdf5_path in hdf5_file: + if is_overwrite: + logging.warning("Dataset in hdf5 file already exists. " + "recreate dataset in hdf5.") + hdf5_file.__delitem__(hdf5_path) + else: + logging.error("Dataset in hdf5 file already exists. " + "if you want to overwrite, please set is_overwrite = True.") + hdf5_file.close() + sys.exit(1) + else: + # if not exists, open with w mode + hdf5_file = h5py.File(hdf5_name, "w") + + # write data to hdf5 + hdf5_file.create_dataset(hdf5_path, data=write_data) + hdf5_file.flush() + hdf5_file.close() + + +class HDF5ScpLoader(object): + """Loader class for a fests.scp file of hdf5 file. + + Examples: + key1 /some/path/a.h5:feats + key2 /some/path/b.h5:feats + key3 /some/path/c.h5:feats + key4 /some/path/d.h5:feats + ... + >>> loader = HDF5ScpLoader("hdf5.scp") + >>> array = loader["key1"] + + key1 /some/path/a.h5 + key2 /some/path/b.h5 + key3 /some/path/c.h5 + key4 /some/path/d.h5 + ... + >>> loader = HDF5ScpLoader("hdf5.scp", "feats") + >>> array = loader["key1"] + + """ + + def __init__(self, feats_scp, default_hdf5_path="feats"): + """Initialize HDF5 scp loader. + + Args: + feats_scp (str): Kaldi-style feats.scp file with hdf5 format. + default_hdf5_path (str): Path in hdf5 file. If the scp contain the info, not used. 
+ + """ + self.default_hdf5_path = default_hdf5_path + with open(feats_scp) as f: + lines = [line.replace("\n", "") for line in f.readlines()] + self.data = {} + for line in lines: + key, value = line.split() + self.data[key] = value + + def get_path(self, key): + """Get hdf5 file path for a given key.""" + return self.data[key] + + def __getitem__(self, key): + """Get ndarray for a given key.""" + p = self.data[key] + if ":" in p: + return read_hdf5(*p.split(":")) + else: + return read_hdf5(p, self.default_hdf5_path) + + def __len__(self): + """Return the length of the scp file.""" + return len(self.data) + + def __iter__(self): + """Return the iterator of the scp file.""" + return iter(self.data) + + def keys(self): + """Return the keys of the scp file.""" + return self.data.keys() diff --git a/requirements.txt b/requirements.txt new file mode 100755 index 0000000000000000000000000000000000000000..93e7f233d63332fe6b7455344d8651e25ee5d81f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,27 @@ +matplotlib +librosa==0.8.0 +tqdm +pandas +numba==0.53.1 +numpy==1.19.2 +scipy==1.3 +PyYAML==5.3.1 +tensorboardX +pyloudnorm +setuptools>=41.0.0 +g2p_en +resemblyzer +webrtcvad +tensorboard==2.6.0 +scikit-learn==0.24.1 +scikit-image==0.16.2 +textgrid +jiwer +pycwt +PyWavelets +praat-parselmouth==0.3.3 +jieba +pypinyin # for chinese graphme-to-phoneme +einops +chardet +stanza # for dependency parsing to build the syntactic graph \ No newline at end of file diff --git a/tasks/run.py b/tasks/run.py new file mode 100644 index 0000000000000000000000000000000000000000..ef2b0a319cb5cd7baf87e5224ab545412715fb69 --- /dev/null +++ b/tasks/run.py @@ -0,0 +1,19 @@ +import os + +os.environ["OMP_NUM_THREADS"] = "1" + +from utils.commons.hparams import hparams, set_hparams +import importlib + + +def run_task(): + assert hparams['task_cls'] != '' + pkg = ".".join(hparams["task_cls"].split(".")[:-1]) + cls_name = hparams["task_cls"].split(".")[-1] + task_cls = getattr(importlib.import_module(pkg), cls_name) + task_cls.start() + + +if __name__ == '__main__': + set_hparams() + run_task() diff --git a/tasks/tts/__pycache__/dataset_utils.cpython-36.pyc b/tasks/tts/__pycache__/dataset_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ae0e37a73f81b506b440d3a5074de539201b5cd Binary files /dev/null and b/tasks/tts/__pycache__/dataset_utils.cpython-36.pyc differ diff --git a/tasks/tts/__pycache__/dataset_utils.cpython-37.pyc b/tasks/tts/__pycache__/dataset_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e80bf5d8ba4a8db77d957d38ebfaa3bf3a853e79 Binary files /dev/null and b/tasks/tts/__pycache__/dataset_utils.cpython-37.pyc differ diff --git a/tasks/tts/__pycache__/fs.cpython-36.pyc b/tasks/tts/__pycache__/fs.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f2aa3abf3e09c7dd45128771b1a51fe266da2a4 Binary files /dev/null and b/tasks/tts/__pycache__/fs.cpython-36.pyc differ diff --git a/tasks/tts/__pycache__/fs.cpython-37.pyc b/tasks/tts/__pycache__/fs.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb5931becb85fa819a76016ad06fde4adf4a278b Binary files /dev/null and b/tasks/tts/__pycache__/fs.cpython-37.pyc differ diff --git a/tasks/tts/__pycache__/ps.cpython-36.pyc b/tasks/tts/__pycache__/ps.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3db4e11e086542adae1cfa671687d44d5b4a7988 Binary files /dev/null and 
b/tasks/tts/__pycache__/ps.cpython-36.pyc differ diff --git a/tasks/tts/__pycache__/ps_flow.cpython-36.pyc b/tasks/tts/__pycache__/ps_flow.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..690f8340e9f67c643ffcee1d603c83c4fe6360d3 Binary files /dev/null and b/tasks/tts/__pycache__/ps_flow.cpython-36.pyc differ diff --git a/tasks/tts/__pycache__/speech_base.cpython-36.pyc b/tasks/tts/__pycache__/speech_base.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e777a11c04f90bde7ef59e5bd78fdb1daa74cac Binary files /dev/null and b/tasks/tts/__pycache__/speech_base.cpython-36.pyc differ diff --git a/tasks/tts/__pycache__/speech_base.cpython-37.pyc b/tasks/tts/__pycache__/speech_base.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bee6be979a9762cf726b56e4bdac7241065fe0b6 Binary files /dev/null and b/tasks/tts/__pycache__/speech_base.cpython-37.pyc differ diff --git a/tasks/tts/__pycache__/synta.cpython-36.pyc b/tasks/tts/__pycache__/synta.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80b20c10705f3e84635244f1165db1f6a8a9a9db Binary files /dev/null and b/tasks/tts/__pycache__/synta.cpython-36.pyc differ diff --git a/tasks/tts/__pycache__/synta.cpython-37.pyc b/tasks/tts/__pycache__/synta.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b159d6dbed952793de1bd1293d1b34f64bf53d3d Binary files /dev/null and b/tasks/tts/__pycache__/synta.cpython-37.pyc differ diff --git a/tasks/tts/__pycache__/synta.cpython-39.pyc b/tasks/tts/__pycache__/synta.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ff6381d1aa1446626d8aa88e8a2ad615b5fdb1f Binary files /dev/null and b/tasks/tts/__pycache__/synta.cpython-39.pyc differ diff --git a/tasks/tts/__pycache__/tts_utils.cpython-36.pyc b/tasks/tts/__pycache__/tts_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00ae5447399712ff987087fed54f7e26406b6f93 Binary files /dev/null and b/tasks/tts/__pycache__/tts_utils.cpython-36.pyc differ diff --git a/tasks/tts/__pycache__/tts_utils.cpython-37.pyc b/tasks/tts/__pycache__/tts_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1aa694fb488cbe9ff71510a59de9cfc534d62e5c Binary files /dev/null and b/tasks/tts/__pycache__/tts_utils.cpython-37.pyc differ diff --git a/tasks/tts/dataset_utils.py b/tasks/tts/dataset_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..08772c5a47378fa7101db71206c99fa641cdd644 --- /dev/null +++ b/tasks/tts/dataset_utils.py @@ -0,0 +1,195 @@ +import torch.optim +import torch.utils.data +import numpy as np +import torch +import torch.optim +import torch.utils.data +import torch.distributions +from utils.audio.pitch.utils import norm_interp_f0, denorm_f0 +from utils.commons.dataset_utils import BaseDataset, collate_1d_or_2d +from utils.commons.indexed_datasets import IndexedDataset + + +class BaseSpeechDataset(BaseDataset): + def __init__(self, prefix, shuffle=False, items=None, data_dir=None): + super().__init__(shuffle) + from utils.commons.hparams import hparams + self.data_dir = hparams['binary_data_dir'] if data_dir is None else data_dir + self.prefix = prefix + self.hparams = hparams + self.indexed_ds = None + if items is not None: + self.indexed_ds = items + self.sizes = [1] * len(items) + self.avail_idxs = list(range(len(self.sizes))) + else: + self.sizes = 
np.load(f'{self.data_dir}/{self.prefix}_lengths.npy') + if prefix == 'test' and len(hparams['test_ids']) > 0: + self.avail_idxs = hparams['test_ids'] + else: + self.avail_idxs = list(range(len(self.sizes))) + if prefix == 'train' and hparams['min_frames'] > 0: + self.avail_idxs = [x for x in self.avail_idxs if self.sizes[x] >= hparams['min_frames']] + self.sizes = [self.sizes[i] for i in self.avail_idxs] + + def _get_item(self, index): + if hasattr(self, 'avail_idxs') and self.avail_idxs is not None: + index = self.avail_idxs[index] + if self.indexed_ds is None: + self.indexed_ds = IndexedDataset(f'{self.data_dir}/{self.prefix}') + return self.indexed_ds[index] + + def __getitem__(self, index): + hparams = self.hparams + item = self._get_item(index) + assert len(item['mel']) == self.sizes[index], (len(item['mel']), self.sizes[index]) + max_frames = hparams['max_frames'] + spec = torch.Tensor(item['mel'])[:max_frames] + max_frames = spec.shape[0] // hparams['frames_multiple'] * hparams['frames_multiple'] + spec = spec[:max_frames] + ph_token = torch.LongTensor(item['ph_token'][:hparams['max_input_tokens']]) + sample = { + "id": index, + "item_name": item['item_name'], + "text": item['txt'], + "txt_token": ph_token, + "mel": spec, + "mel_nonpadding": spec.abs().sum(-1) > 0, + } + if hparams['use_spk_embed']: + sample["spk_embed"] = torch.Tensor(item['spk_embed']) + if hparams['use_spk_id']: + sample["spk_id"] = int(item['spk_id']) + return sample + + def collater(self, samples): + if len(samples) == 0: + return {} + hparams = self.hparams + id = torch.LongTensor([s['id'] for s in samples]) + item_names = [s['item_name'] for s in samples] + text = [s['text'] for s in samples] + txt_tokens = collate_1d_or_2d([s['txt_token'] for s in samples], 0) + mels = collate_1d_or_2d([s['mel'] for s in samples], 0.0) + txt_lengths = torch.LongTensor([s['txt_token'].numel() for s in samples]) + mel_lengths = torch.LongTensor([s['mel'].shape[0] for s in samples]) + + batch = { + 'id': id, + 'item_name': item_names, + 'nsamples': len(samples), + 'text': text, + 'txt_tokens': txt_tokens, + 'txt_lengths': txt_lengths, + 'mels': mels, + 'mel_lengths': mel_lengths, + } + + if hparams['use_spk_embed']: + spk_embed = torch.stack([s['spk_embed'] for s in samples]) + batch['spk_embed'] = spk_embed + if hparams['use_spk_id']: + spk_ids = torch.LongTensor([s['spk_id'] for s in samples]) + batch['spk_ids'] = spk_ids + return batch + + +class FastSpeechDataset(BaseSpeechDataset): + def __getitem__(self, index): + sample = super(FastSpeechDataset, self).__getitem__(index) + item = self._get_item(index) + hparams = self.hparams + mel = sample['mel'] + T = mel.shape[0] + ph_token = sample['txt_token'] + sample['mel2ph'] = mel2ph = torch.LongTensor(item['mel2ph'])[:T] + if hparams['use_pitch_embed']: + assert 'f0' in item + pitch = torch.LongTensor(item.get(hparams.get('pitch_key', 'pitch')))[:T] + f0, uv = norm_interp_f0(item["f0"][:T]) + uv = torch.FloatTensor(uv) + f0 = torch.FloatTensor(f0) + if hparams['pitch_type'] == 'ph': + if "f0_ph" in item: + f0 = torch.FloatTensor(item['f0_ph']) + else: + f0 = denorm_f0(f0, None) + f0_phlevel_sum = torch.zeros_like(ph_token).float().scatter_add(0, mel2ph - 1, f0) + f0_phlevel_num = torch.zeros_like(ph_token).float().scatter_add( + 0, mel2ph - 1, torch.ones_like(f0)).clamp_min(1) + f0_ph = f0_phlevel_sum / f0_phlevel_num + f0, uv = norm_interp_f0(f0_ph) + else: + f0, uv, pitch = None, None, None + sample["f0"], sample["uv"], sample["pitch"] = f0, uv, pitch + return sample + + def 
collater(self, samples): + if len(samples) == 0: + return {} + batch = super(FastSpeechDataset, self).collater(samples) + hparams = self.hparams + if hparams['use_pitch_embed']: + f0 = collate_1d_or_2d([s['f0'] for s in samples], 0.0) + pitch = collate_1d_or_2d([s['pitch'] for s in samples]) + uv = collate_1d_or_2d([s['uv'] for s in samples]) + else: + f0, uv, pitch = None, None, None + mel2ph = collate_1d_or_2d([s['mel2ph'] for s in samples], 0.0) + batch.update({ + 'mel2ph': mel2ph, + 'pitch': pitch, + 'f0': f0, + 'uv': uv, + }) + return batch + +class FastSpeechWordDataset(FastSpeechDataset): + def __getitem__(self, index): + sample = super().__getitem__(index) + item = self._get_item(index) + max_frames = sample['mel'].shape[0] + if 'word' in item: + sample['words'] = item['word'] + sample["ph_words"] = item["ph_gb_word"] + sample["word_tokens"] = torch.LongTensor(item["word_token"]) + else: + sample['words'] = item['words'] + sample["ph_words"] = " ".join(item["ph_words"]) + sample["word_tokens"] = torch.LongTensor(item["word_tokens"]) + sample["mel2word"] = torch.LongTensor(item.get("mel2word"))[:max_frames] + sample["ph2word"] = torch.LongTensor(item['ph2word'][:self.hparams['max_input_tokens']]) + + sample['dgl_graph'] = item['dgl_graph'] + sample['edge_types'] = item['edge_types'] + + return sample + + def collater(self, samples): + samples = [s for s in samples if s is not None] + batch = super().collater(samples) + ph_words = [s['ph_words'] for s in samples] + batch['ph_words'] = ph_words + word_tokens = collate_1d_or_2d([s['word_tokens'] for s in samples], 0) + batch['word_tokens'] = word_tokens + mel2word = collate_1d_or_2d([s['mel2word'] for s in samples], 0) + batch['mel2word'] = mel2word + ph2word = collate_1d_or_2d([s['ph2word'] for s in samples], 0) + batch['ph2word'] = ph2word + batch['words'] = [s['words'] for s in samples] + batch['word_lengths'] = torch.LongTensor([len(s['word_tokens']) for s in samples]) + if self.hparams['use_word_input']: + batch['txt_tokens'] = batch['word_tokens'] + batch['txt_lengths'] = torch.LongTensor([s['word_tokens'].numel() for s in samples]) + batch['mel2ph'] = batch['mel2word'] + + graph_lst, etypes_lst = [], [] # new features for Graph-based SDP + for s in samples: + graph_lst.append(s['dgl_graph']) + etypes_lst.append(s['edge_types']) + batch.update({ + 'graph_lst': graph_lst, + 'etypes_lst': etypes_lst, + }) + + return batch diff --git a/tasks/tts/diffspeech.py b/tasks/tts/diffspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..283bf9b62fed0c5f68a9f82887543b9413dd8955 --- /dev/null +++ b/tasks/tts/diffspeech.py @@ -0,0 +1,111 @@ +import torch + +from modules.tts.diffspeech.shallow_diffusion_tts import GaussianDiffusion +from tasks.tts.fs2_orig import FastSpeech2OrigTask + +import utils +from utils.commons.hparams import hparams +from utils.commons.ckpt_utils import load_ckpt +from utils.audio.pitch.utils import denorm_f0 + + +class DiffSpeechTask(FastSpeech2OrigTask): + def build_tts_model(self): + # get min and max + # import torch + # from tqdm import tqdm + # v_min = torch.ones([80]) * 100 + # v_max = torch.ones([80]) * -100 + # for i, ds in enumerate(tqdm(self.dataset_cls('train'))): + # v_max = torch.max(torch.max(ds['mel'].reshape(-1, 80), 0)[0], v_max) + # v_min = torch.min(torch.min(ds['mel'].reshape(-1, 80), 0)[0], v_min) + # if i % 100 == 0: + # print(i, v_min, v_max) + # print('final', v_min, v_max) + dict_size = len(self.token_encoder) + self.model = GaussianDiffusion(dict_size, hparams) + if 
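# A worked toy example of the scatter_add trick used in FastSpeechDataset.__getitem__
# above to pool frame-level f0 into phoneme-level f0: mel2ph assigns each mel frame a
# 1-based phoneme index, so (mel2ph - 1) serves directly as the scatter index
# (index 0 would mark padding frames; none appear in this toy case).
import torch

mel2ph = torch.LongTensor([1, 1, 1, 2, 2, 3])            # 6 frames mapped to 3 phonemes
f0 = torch.FloatTensor([100., 110., 120., 200., 210., 300.])
n_ph = 3
f0_sum = torch.zeros(n_ph).scatter_add(0, mel2ph - 1, f0)
f0_cnt = torch.zeros(n_ph).scatter_add(0, mel2ph - 1, torch.ones_like(f0)).clamp_min(1)
f0_ph = f0_sum / f0_cnt                                   # tensor([110., 205., 300.])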
hparams['fs2_ckpt'] != '': + load_ckpt(self.model.fs2, hparams['fs2_ckpt'], 'model', strict=True) + # for k, v in self.model.fs2.named_parameters(): + # if 'predictor' not in k: + # v.requires_grad = False + # or + for k, v in self.model.fs2.named_parameters(): + v.requires_grad = False + + def build_optimizer(self, model): + self.optimizer = optimizer = torch.optim.AdamW( + filter(lambda p: p.requires_grad, model.parameters()), + lr=hparams['lr'], + betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), + weight_decay=hparams['weight_decay']) + return optimizer + + def build_scheduler(self, optimizer): + return torch.optim.lr_scheduler.StepLR(optimizer, hparams['decay_steps'], gamma=0.5) + + def run_model(self, sample, infer=False, *args, **kwargs): + txt_tokens = sample['txt_tokens'] # [B, T_t] + spk_embed = sample.get('spk_embed') + spk_id = sample.get('spk_ids') + if not infer: + target = sample['mels'] # [B, T_s, 80] + mel2ph = sample['mel2ph'] # [B, T_s] + f0 = sample.get('f0') + uv = sample.get('uv') + output = self.model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id, + ref_mels=target, f0=f0, uv=uv, infer=False) + losses = {} + if 'diff_loss' in output: + losses['mel'] = output['diff_loss'] + self.add_dur_loss(output['dur'], mel2ph, txt_tokens, losses=losses) + if hparams['use_pitch_embed']: + self.add_pitch_loss(output, sample, losses) + return losses, output + else: + use_gt_dur = kwargs.get('infer_use_gt_dur', hparams['use_gt_dur']) + use_gt_f0 = kwargs.get('infer_use_gt_f0', hparams['use_gt_f0']) + mel2ph, uv, f0 = None, None, None + if use_gt_dur: + mel2ph = sample['mel2ph'] + if use_gt_f0: + f0 = sample['f0'] + uv = sample['uv'] + output = self.model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id, + ref_mels=None, f0=f0, uv=uv, infer=True) + return output + + def save_valid_result(self, sample, batch_idx, model_out): + sr = hparams['audio_sample_rate'] + f0_gt = None + # mel_out = model_out['mel_out'] + if sample.get('f0') is not None: + f0_gt = denorm_f0(sample['f0'][0].cpu(), sample['uv'][0].cpu()) + # self.plot_mel(batch_idx, sample['mels'], mel_out, f0s=f0_gt) + if self.global_step > 0: + # wav_pred = self.vocoder.spec2wav(mel_out[0].cpu(), f0=f0_gt) + # self.logger.add_audio(f'wav_val_{batch_idx}', wav_pred, self.global_step, sr) + # with gt duration + model_out = self.run_model(sample, infer=True, infer_use_gt_dur=True) + dur_info = self.get_plot_dur_info(sample, model_out) + del dur_info['dur_pred'] + wav_pred = self.vocoder.spec2wav(model_out['mel_out'][0].cpu(), f0=f0_gt) + self.logger.add_audio(f'wav_gdur_{batch_idx}', wav_pred, self.global_step, sr) + self.plot_mel(batch_idx, sample['mels'], model_out['mel_out'][0], f'diffmel_gdur_{batch_idx}', + dur_info=dur_info, f0s=f0_gt) + self.plot_mel(batch_idx, sample['mels'], model_out['fs2_mel'][0], f'fs2mel_gdur_{batch_idx}', + dur_info=dur_info, f0s=f0_gt) # gt mel vs. 
fs2 mel + + # with pred duration + if not hparams['use_gt_dur']: + model_out = self.run_model(sample, infer=True, infer_use_gt_dur=False) + dur_info = self.get_plot_dur_info(sample, model_out) + self.plot_mel(batch_idx, sample['mels'], model_out['mel_out'][0], f'mel_pdur_{batch_idx}', + dur_info=dur_info, f0s=f0_gt) + wav_pred = self.vocoder.spec2wav(model_out['mel_out'][0].cpu(), f0=f0_gt) + self.logger.add_audio(f'wav_pdur_{batch_idx}', wav_pred, self.global_step, sr) + # gt wav + if self.global_step <= hparams['valid_infer_interval']: + mel_gt = sample['mels'][0].cpu() + wav_gt = self.vocoder.spec2wav(mel_gt, f0=f0_gt) + self.logger.add_audio(f'wav_gt_{batch_idx}', wav_gt, self.global_step, sr) diff --git a/tasks/tts/fs.py b/tasks/tts/fs.py new file mode 100755 index 0000000000000000000000000000000000000000..f280bb922da6e8e714680700ba8d23114d4951f2 --- /dev/null +++ b/tasks/tts/fs.py @@ -0,0 +1,184 @@ +import torch +import torch.distributions +import torch.nn.functional as F +import torch.optim +import torch.utils.data + +from modules.tts.fs import FastSpeech +from tasks.tts.dataset_utils import FastSpeechWordDataset +from tasks.tts.speech_base import SpeechBaseTask +from utils.audio.align import mel2token_to_dur +from utils.audio.pitch.utils import denorm_f0 +from utils.commons.hparams import hparams + + +class FastSpeechTask(SpeechBaseTask): + def __init__(self): + super().__init__() + self.dataset_cls = FastSpeechWordDataset + self.sil_ph = self.token_encoder.sil_phonemes() + + def build_tts_model(self): + dict_size = len(self.token_encoder) + self.model = FastSpeech(dict_size, hparams) + + def run_model(self, sample, infer=False, *args, **kwargs): + txt_tokens = sample['txt_tokens'] # [B, T_t] + spk_embed = sample.get('spk_embed') + spk_id = sample.get('spk_ids') + if not infer: + target = sample['mels'] # [B, T_s, 80] + mel2ph = sample['mel2ph'] # [B, T_s] + f0 = sample.get('f0') + uv = sample.get('uv') + output = self.model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id, + f0=f0, uv=uv, infer=False) + losses = {} + self.add_mel_loss(output['mel_out'], target, losses) + self.add_dur_loss(output['dur'], mel2ph, txt_tokens, losses=losses) + if hparams['use_pitch_embed']: + self.add_pitch_loss(output, sample, losses) + return losses, output + else: + use_gt_dur = kwargs.get('infer_use_gt_dur', hparams['use_gt_dur']) + use_gt_f0 = kwargs.get('infer_use_gt_f0', hparams['use_gt_f0']) + mel2ph, uv, f0 = None, None, None + if use_gt_dur: + mel2ph = sample['mel2ph'] + if use_gt_f0: + f0 = sample['f0'] + uv = sample['uv'] + output = self.model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id, + f0=f0, uv=uv, infer=True) + return output + + def add_dur_loss(self, dur_pred, mel2ph, txt_tokens, losses=None): + """ + + :param dur_pred: [B, T], float, log scale + :param mel2ph: [B, T] + :param txt_tokens: [B, T] + :param losses: + :return: + """ + B, T = txt_tokens.shape + nonpadding = (txt_tokens != 0).float() + dur_gt = mel2token_to_dur(mel2ph, T).float() * nonpadding + is_sil = torch.zeros_like(txt_tokens).bool() + for p in self.sil_ph: + is_sil = is_sil | (txt_tokens == self.token_encoder.encode(p)[0]) + is_sil = is_sil.float() # [B, T_txt] + losses['pdur'] = F.mse_loss((dur_pred + 1).log(), (dur_gt + 1).log(), reduction='none') + losses['pdur'] = (losses['pdur'] * nonpadding).sum() / nonpadding.sum() + losses['pdur'] = losses['pdur'] * hparams['lambda_ph_dur'] + # use linear scale for sentence and word duration + if hparams['lambda_word_dur'] > 0: + word_id = 
(is_sil.cumsum(-1) * (1 - is_sil)).long() + word_dur_p = dur_pred.new_zeros([B, word_id.max() + 1]).scatter_add(1, word_id, dur_pred)[:, 1:] + word_dur_g = dur_gt.new_zeros([B, word_id.max() + 1]).scatter_add(1, word_id, dur_gt)[:, 1:] + wdur_loss = F.mse_loss((word_dur_p + 1).log(), (word_dur_g + 1).log(), reduction='none') + word_nonpadding = (word_dur_g > 0).float() + wdur_loss = (wdur_loss * word_nonpadding).sum() / word_nonpadding.sum() + losses['wdur'] = wdur_loss * hparams['lambda_word_dur'] + if hparams['lambda_sent_dur'] > 0: + sent_dur_p = dur_pred.sum(-1) + sent_dur_g = dur_gt.sum(-1) + sdur_loss = F.mse_loss((sent_dur_p + 1).log(), (sent_dur_g + 1).log(), reduction='mean') + losses['sdur'] = sdur_loss.mean() * hparams['lambda_sent_dur'] + + def add_pitch_loss(self, output, sample, losses): + mel2ph = sample['mel2ph'] # [B, T_s] + f0 = sample['f0'] + uv = sample['uv'] + nonpadding = (mel2ph != 0).float() if hparams['pitch_type'] == 'frame' \ + else (sample['txt_tokens'] != 0).float() + p_pred = output['pitch_pred'] + assert p_pred[..., 0].shape == f0.shape + if hparams['use_uv'] and hparams['pitch_type'] == 'frame': + assert p_pred[..., 1].shape == uv.shape, (p_pred.shape, uv.shape) + losses['uv'] = (F.binary_cross_entropy_with_logits( + p_pred[:, :, 1], uv, reduction='none') * nonpadding).sum() \ + / nonpadding.sum() * hparams['lambda_uv'] + nonpadding = nonpadding * (uv == 0).float() + f0_pred = p_pred[:, :, 0] + losses['f0'] = (F.l1_loss(f0_pred, f0, reduction='none') * nonpadding).sum() \ + / nonpadding.sum() * hparams['lambda_f0'] + + def save_valid_result(self, sample, batch_idx, model_out): + sr = hparams['audio_sample_rate'] + f0_gt = None + mel_out = model_out['mel_out'] + if sample.get('f0') is not None: + f0_gt = denorm_f0(sample['f0'][0].cpu(), sample['uv'][0].cpu()) + self.plot_mel(batch_idx, sample['mels'], mel_out, f0s=f0_gt) + if self.global_step > 0: + wav_pred = self.vocoder.spec2wav(mel_out[0].cpu(), f0=f0_gt) + self.logger.add_audio(f'wav_val_{batch_idx}', wav_pred, self.global_step, sr) + # with gt duration + model_out = self.run_model(sample, infer=True, infer_use_gt_dur=True) + dur_info = self.get_plot_dur_info(sample, model_out) + del dur_info['dur_pred'] + wav_pred = self.vocoder.spec2wav(model_out['mel_out'][0].cpu(), f0=f0_gt) + self.logger.add_audio(f'wav_gdur_{batch_idx}', wav_pred, self.global_step, sr) + self.plot_mel(batch_idx, sample['mels'], model_out['mel_out'][0], f'mel_gdur_{batch_idx}', + dur_info=dur_info, f0s=f0_gt) + + # with pred duration + if not hparams['use_gt_dur']: + model_out = self.run_model(sample, infer=True, infer_use_gt_dur=False) + dur_info = self.get_plot_dur_info(sample, model_out) + self.plot_mel(batch_idx, sample['mels'], model_out['mel_out'][0], f'mel_pdur_{batch_idx}', + dur_info=dur_info, f0s=f0_gt) + wav_pred = self.vocoder.spec2wav(model_out['mel_out'][0].cpu(), f0=f0_gt) + self.logger.add_audio(f'wav_pdur_{batch_idx}', wav_pred, self.global_step, sr) + # gt wav + if self.global_step <= hparams['valid_infer_interval']: + mel_gt = sample['mels'][0].cpu() + wav_gt = self.vocoder.spec2wav(mel_gt, f0=f0_gt) + self.logger.add_audio(f'wav_gt_{batch_idx}', wav_gt, self.global_step, sr) + + def get_plot_dur_info(self, sample, model_out): + T_txt = sample['txt_tokens'].shape[1] + dur_gt = mel2token_to_dur(sample['mel2ph'], T_txt)[0] + dur_pred = model_out['dur'] if 'dur' in model_out else dur_gt + txt = self.token_encoder.decode(sample['txt_tokens'][0].cpu().numpy()) + txt = txt.split(" ") + return {'dur_gt': dur_gt, 
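# A toy illustration of the word grouping used in FastSpeechTask.add_dur_loss above:
# silence phonemes act as word boundaries, so a cumulative sum over the silence mask
# gives every non-silence phoneme a word id (id 0 collects the silences themselves
# and is dropped by the [:, 1:] slice).
import torch

is_sil = torch.FloatTensor([[1, 0, 0, 1, 0, 1]])        # e.g. "<sil> p1 p2 <sil> p3 <sil>"
word_id = (is_sil.cumsum(-1) * (1 - is_sil)).long()     # -> [[0, 1, 1, 0, 2, 0]]
dur = torch.FloatTensor([[2, 3, 4, 1, 5, 2]])           # per-phoneme durations in frames
word_dur = dur.new_zeros([1, word_id.max() + 1]).scatter_add(1, word_id, dur)[:, 1:]
# word_dur == [[7., 5.]]: word 1 spans 3 + 4 frames, word 2 spans 5 frames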
'dur_pred': dur_pred, 'txt': txt} + + def test_step(self, sample, batch_idx): + """ + + :param sample: + :param batch_idx: + :return: + """ + assert sample['txt_tokens'].shape[0] == 1, 'only support batch_size=1 in inference' + outputs = self.run_model(sample, infer=True) + text = sample['text'][0] + item_name = sample['item_name'][0] + tokens = sample['txt_tokens'][0].cpu().numpy() + mel_gt = sample['mels'][0].cpu().numpy() + mel_pred = outputs['mel_out'][0].cpu().numpy() + mel2ph = sample['mel2ph'][0].cpu().numpy() + mel2ph_pred = outputs['mel2ph'][0].cpu().numpy() + str_phs = self.token_encoder.decode(tokens, strip_padding=True) + base_fn = f'[{batch_idx:06d}][{item_name.replace("%", "_")}][%s]' + if text is not None: + base_fn += text.replace(":", "$3A")[:80] + base_fn = base_fn.replace(' ', '_') + gen_dir = self.gen_dir + wav_pred = self.vocoder.spec2wav(mel_pred) + self.saving_result_pool.add_job(self.save_result, args=[ + wav_pred, mel_pred, base_fn % 'P', gen_dir, str_phs, mel2ph_pred]) + if hparams['save_gt']: + wav_gt = self.vocoder.spec2wav(mel_gt) + self.saving_result_pool.add_job(self.save_result, args=[ + wav_gt, mel_gt, base_fn % 'G', gen_dir, str_phs, mel2ph]) + print(f"Pred_shape: {mel_pred.shape}, gt_shape: {mel_gt.shape}") + return { + 'item_name': item_name, + 'text': text, + 'ph_tokens': self.token_encoder.decode(tokens.tolist()), + 'wav_fn_pred': base_fn % 'P', + 'wav_fn_gt': base_fn % 'G', + } diff --git a/tasks/tts/fs2_orig.py b/tasks/tts/fs2_orig.py new file mode 100755 index 0000000000000000000000000000000000000000..a234df565d3a1679bf8bc5f3c7821256152ed456 --- /dev/null +++ b/tasks/tts/fs2_orig.py @@ -0,0 +1,138 @@ +import torch +import torch.nn.functional as F +from modules.tts.fs2_orig import FastSpeech2Orig +from tasks.tts.dataset_utils import FastSpeechDataset +from tasks.tts.fs import FastSpeechTask +from utils.commons.dataset_utils import collate_1d, collate_2d +from utils.commons.hparams import hparams +from utils.plot.plot import spec_to_figure +import numpy as np + + +class FastSpeech2OrigDataset(FastSpeechDataset): + def __init__(self, prefix, shuffle=False, items=None, data_dir=None): + super().__init__(prefix, shuffle, items, data_dir) + self.pitch_type = hparams.get('pitch_type') + + def __getitem__(self, index): + sample = super().__getitem__(index) + item = self._get_item(index) + hparams = self.hparams + mel = sample['mel'] + T = mel.shape[0] + sample['energy'] = (mel.exp() ** 2).sum(-1).sqrt() + if hparams['use_pitch_embed'] and self.pitch_type == 'cwt': + cwt_spec = torch.Tensor(item['cwt_spec'])[:T] + f0_mean = item.get('f0_mean', item.get('cwt_mean')) + f0_std = item.get('f0_std', item.get('cwt_std')) + sample.update({"cwt_spec": cwt_spec, "f0_mean": f0_mean, "f0_std": f0_std}) + return sample + + def collater(self, samples): + if len(samples) == 0: + return {} + batch = super().collater(samples) + if hparams['use_pitch_embed']: + energy = collate_1d([s['energy'] for s in samples], 0.0) + else: + energy = None + batch.update({'energy': energy}) + if self.pitch_type == 'cwt': + cwt_spec = collate_2d([s['cwt_spec'] for s in samples]) + f0_mean = torch.Tensor([s['f0_mean'] for s in samples]) + f0_std = torch.Tensor([s['f0_std'] for s in samples]) + batch.update({'cwt_spec': cwt_spec, 'f0_mean': f0_mean, 'f0_std': f0_std}) + return batch + + +class FastSpeech2OrigTask(FastSpeechTask): + def __init__(self): + super(FastSpeech2OrigTask, self).__init__() + self.dataset_cls = FastSpeech2OrigDataset + + def build_tts_model(self): + dict_size = 
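# The energy target built in FastSpeech2OrigDataset above undoes the log of the
# mel spectrogram and takes the L2 norm over mel bins, i.e. a per-frame energy;
# a quick equivalence check with a random log-mel as a stand-in:
import torch

mel = torch.randn(120, 80)                       # log-mel, shape (T, n_mel)
energy = (mel.exp() ** 2).sum(-1).sqrt()         # shape (T,)
assert torch.allclose(energy, mel.exp().norm(2, dim=-1))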
len(self.token_encoder) + self.model = FastSpeech2Orig(dict_size, hparams) + + def run_model(self, sample, infer=False, *args, **kwargs): + txt_tokens = sample['txt_tokens'] # [B, T_t] + spk_embed = sample.get('spk_embed') + spk_id = sample.get('spk_ids') + if not infer: + target = sample['mels'] # [B, T_s, 80] + mel2ph = sample['mel2ph'] # [B, T_s] + f0 = sample.get('f0') + uv = sample.get('uv') + energy = sample.get('energy') + output = self.model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id, + f0=f0, uv=uv, energy=energy, infer=False) + losses = {} + self.add_mel_loss(output['mel_out'], target, losses) + self.add_dur_loss(output['dur'], mel2ph, txt_tokens, losses=losses) + if hparams['use_pitch_embed']: + self.add_pitch_loss(output, sample, losses) + if hparams['use_energy_embed']: + self.add_energy_loss(output, sample, losses) + return losses, output + else: + mel2ph, uv, f0, energy = None, None, None, None + use_gt_dur = kwargs.get('infer_use_gt_dur', hparams['use_gt_dur']) + use_gt_f0 = kwargs.get('infer_use_gt_f0', hparams['use_gt_f0']) + use_gt_energy = kwargs.get('infer_use_gt_energy', hparams['use_gt_energy']) + if use_gt_dur: + mel2ph = sample['mel2ph'] + if use_gt_f0: + f0 = sample['f0'] + uv = sample['uv'] + if use_gt_energy: + energy = sample['energy'] + output = self.model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id, + f0=f0, uv=uv, energy=energy, infer=True) + return output + + def save_valid_result(self, sample, batch_idx, model_out): + super(FastSpeech2OrigTask, self).save_valid_result(sample, batch_idx, model_out) + self.plot_cwt(batch_idx, model_out['cwt'], sample['cwt_spec']) + + def plot_cwt(self, batch_idx, cwt_out, cwt_gt=None): + if len(cwt_out.shape) == 3: + cwt_out = cwt_out[0] + if isinstance(cwt_out, torch.Tensor): + cwt_out = cwt_out.cpu().numpy() + if cwt_gt is not None: + if len(cwt_gt.shape) == 3: + cwt_gt = cwt_gt[0] + if isinstance(cwt_gt, torch.Tensor): + cwt_gt = cwt_gt.cpu().numpy() + cwt_out = np.concatenate([cwt_out, cwt_gt], -1) + name = f'cwt_val_{batch_idx}' + self.logger.add_figure(name, spec_to_figure(cwt_out), self.global_step) + + def add_pitch_loss(self, output, sample, losses): + if hparams['pitch_type'] == 'cwt': + cwt_spec = sample[f'cwt_spec'] + f0_mean = sample['f0_mean'] + uv = sample['uv'] + mel2ph = sample['mel2ph'] + f0_std = sample['f0_std'] + cwt_pred = output['cwt'][:, :, :10] + f0_mean_pred = output['f0_mean'] + f0_std_pred = output['f0_std'] + nonpadding = (mel2ph != 0).float() + losses['C'] = F.l1_loss(cwt_pred, cwt_spec) * hparams['lambda_f0'] + if hparams['use_uv']: + assert output['cwt'].shape[-1] == 11 + uv_pred = output['cwt'][:, :, -1] + losses['uv'] = (F.binary_cross_entropy_with_logits(uv_pred, uv, reduction='none') + * nonpadding).sum() / nonpadding.sum() * hparams['lambda_uv'] + losses['f0_mean'] = F.l1_loss(f0_mean_pred, f0_mean) * hparams['lambda_f0'] + losses['f0_std'] = F.l1_loss(f0_std_pred, f0_std) * hparams['lambda_f0'] + else: + super(FastSpeech2OrigTask, self).add_pitch_loss(output, sample, losses) + + def add_energy_loss(self, output, sample, losses): + energy_pred, energy = output['energy_pred'], sample['energy'] + nonpadding = (energy != 0).float() + loss = (F.mse_loss(energy_pred, energy, reduction='none') * nonpadding).sum() / nonpadding.sum() + loss = loss * hparams['lambda_energy'] + losses['e'] = loss diff --git a/tasks/tts/ps.py b/tasks/tts/ps.py new file mode 100644 index 0000000000000000000000000000000000000000..995dec8c7f40c27310a6231b08330e807d02c405 --- 
/dev/null +++ b/tasks/tts/ps.py @@ -0,0 +1,194 @@ +import os +import torch +import torch.nn.functional as F +from torch import nn + +from modules.tts.portaspeech.portaspeech import PortaSpeech +from tasks.tts.fs import FastSpeechTask +from utils.audio.align import mel2token_to_dur +from utils.commons.hparams import hparams +from utils.metrics.diagonal_metrics import get_focus_rate, get_phone_coverage_rate, get_diagonal_focus_rate +from utils.nn.model_utils import num_params +import numpy as np + +from utils.plot.plot import spec_to_figure +from utils.text.text_encoder import build_token_encoder + + +class PortaSpeechTask(FastSpeechTask): + def __init__(self): + super().__init__() + data_dir = hparams['binary_data_dir'] + self.word_encoder = build_token_encoder(f'{data_dir}/word_set.json') + + def build_tts_model(self): + ph_dict_size = len(self.token_encoder) + word_dict_size = len(self.word_encoder) + self.model = PortaSpeech(ph_dict_size, word_dict_size, hparams) + + def on_train_start(self): + super().on_train_start() + for n, m in self.model.named_children(): + num_params(m, model_name=n) + if hasattr(self.model, 'fvae'): + for n, m in self.model.fvae.named_children(): + num_params(m, model_name=f'fvae.{n}') + + def run_model(self, sample, infer=False, *args, **kwargs): + txt_tokens = sample['txt_tokens'] + word_tokens = sample['word_tokens'] + spk_embed = sample.get('spk_embed') + spk_id = sample.get('spk_ids') + if not infer: + output = self.model(txt_tokens, word_tokens, + ph2word=sample['ph2word'], + mel2word=sample['mel2word'], + mel2ph=sample['mel2ph'], + word_len=sample['word_lengths'].max(), + tgt_mels=sample['mels'], + pitch=sample.get('pitch'), + spk_embed=spk_embed, + spk_id=spk_id, + infer=False, + global_step=self.global_step) + losses = {} + losses['kl_v'] = output['kl'].detach() + losses_kl = output['kl'] + losses_kl = torch.clamp(losses_kl, min=hparams['kl_min']) + losses_kl = min(self.global_step / hparams['kl_start_steps'], 1) * losses_kl + losses_kl = losses_kl * hparams['lambda_kl'] + losses['kl'] = losses_kl + self.add_mel_loss(output['mel_out'], sample['mels'], losses) + if hparams['dur_level'] == 'word': + self.add_dur_loss( + output['dur'], sample['mel2word'], sample['word_lengths'], sample['txt_tokens'], losses) + self.get_attn_stats(output['attn'], sample, losses) + else: + super(PortaSpeechTask, self).add_dur_loss(output['dur'], sample['mel2ph'], sample['txt_tokens'], losses) + return losses, output + else: + use_gt_dur = kwargs.get('infer_use_gt_dur', hparams['use_gt_dur']) + output = self.model( + txt_tokens, word_tokens, + ph2word=sample['ph2word'], + word_len=sample['word_lengths'].max(), + pitch=sample.get('pitch'), + mel2ph=sample['mel2ph'] if use_gt_dur else None, + mel2word=sample['mel2word'] if use_gt_dur else None, + tgt_mels=sample['mels'], + infer=True, + spk_embed=spk_embed, + spk_id=spk_id, + ) + return output + + def add_dur_loss(self, dur_pred, mel2token, word_len, txt_tokens, losses=None): + T = word_len.max() + dur_gt = mel2token_to_dur(mel2token, T).float() + nonpadding = (torch.arange(T).to(dur_pred.device)[None, :] < word_len[:, None]).float() + dur_pred = dur_pred * nonpadding + dur_gt = dur_gt * nonpadding + wdur = F.l1_loss((dur_pred + 1).log(), (dur_gt + 1).log(), reduction='none') + wdur = (wdur * nonpadding).sum() / nonpadding.sum() + if hparams['lambda_word_dur'] > 0: + losses['wdur'] = wdur * hparams['lambda_word_dur'] + if hparams['lambda_sent_dur'] > 0: + sent_dur_p = dur_pred.sum(-1) + sent_dur_g = dur_gt.sum(-1) + sdur_loss = 
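# A compact restatement of the KL weighting in PortaSpeechTask.run_model above:
# the raw KL is floored at kl_min (a free-bits-style clamp) and its weight ramps
# linearly from 0 to lambda_kl over the first kl_start_steps updates. The default
# numbers below are placeholders, not the project's configured values.
def weighted_kl(kl, global_step, kl_min=0.0, kl_start_steps=10000, lambda_kl=1.0):
    kl = max(kl, kl_min)                              # clamp from below
    warmup = min(global_step / kl_start_steps, 1.0)   # linear ramp in [0, 1]
    return warmup * lambda_kl * kl

# e.g. weighted_kl(2.0, global_step=5000) == 1.0 with the placeholder defaults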
F.l1_loss(sent_dur_p, sent_dur_g, reduction='mean') + losses['sdur'] = sdur_loss.mean() * hparams['lambda_sent_dur'] + + def validation_step(self, sample, batch_idx): + return super().validation_step(sample, batch_idx) + + def save_valid_result(self, sample, batch_idx, model_out): + super(PortaSpeechTask, self).save_valid_result(sample, batch_idx, model_out) + if self.global_step > 0 and hparams['dur_level'] == 'word': + self.logger.add_figure(f'attn_{batch_idx}', spec_to_figure(model_out['attn'][0]), self.global_step) + + def get_attn_stats(self, attn, sample, logging_outputs, prefix=''): + # diagonal_focus_rate + txt_lengths = sample['txt_lengths'].float() + mel_lengths = sample['mel_lengths'].float() + src_padding_mask = sample['txt_tokens'].eq(0) + target_padding_mask = sample['mels'].abs().sum(-1).eq(0) + src_seg_mask = sample['txt_tokens'].eq(self.seg_idx) + attn_ks = txt_lengths.float() / mel_lengths.float() + + focus_rate = get_focus_rate(attn, src_padding_mask, target_padding_mask).mean().data + phone_coverage_rate = get_phone_coverage_rate( + attn, src_padding_mask, src_seg_mask, target_padding_mask).mean() + diagonal_focus_rate, diag_mask = get_diagonal_focus_rate( + attn, attn_ks, mel_lengths, src_padding_mask, target_padding_mask) + logging_outputs[f'{prefix}fr'] = focus_rate.mean().data + logging_outputs[f'{prefix}pcr'] = phone_coverage_rate.mean().data + logging_outputs[f'{prefix}dfr'] = diagonal_focus_rate.mean().data + + def get_plot_dur_info(self, sample, model_out): + if hparams['dur_level'] == 'word': + T_txt = sample['word_lengths'].max() + dur_gt = mel2token_to_dur(sample['mel2word'], T_txt)[0] + dur_pred = model_out['dur'] if 'dur' in model_out else dur_gt + txt = sample['ph_words'][0].split(" ") + else: + T_txt = sample['txt_tokens'].shape[1] + dur_gt = mel2token_to_dur(sample['mel2ph'], T_txt)[0] + dur_pred = model_out['dur'] if 'dur' in model_out else dur_gt + txt = self.token_encoder.decode(sample['txt_tokens'][0].cpu().numpy()) + txt = txt.split(" ") + return {'dur_gt': dur_gt, 'dur_pred': dur_pred, 'txt': txt} + + def build_optimizer(self, model): + self.optimizer = torch.optim.AdamW( + self.model.parameters(), + lr=hparams['lr'], + betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), + weight_decay=hparams['weight_decay']) + return self.optimizer + + def build_scheduler(self, optimizer): + return FastSpeechTask.build_scheduler(self, optimizer) + + ############ + # infer + ############ + def test_start(self): + super().test_start() + if hparams.get('save_attn', False): + os.makedirs(f'{self.gen_dir}/attn', exist_ok=True) + self.model.store_inverse_all() + + def test_step(self, sample, batch_idx): + assert sample['txt_tokens'].shape[0] == 1, 'only support batch_size=1 in inference' + outputs = self.run_model(sample, infer=True) + text = sample['text'][0] + item_name = sample['item_name'][0] + tokens = sample['txt_tokens'][0].cpu().numpy() + mel_gt = sample['mels'][0].cpu().numpy() + mel_pred = outputs['mel_out'][0].cpu().numpy() + mel2ph = sample['mel2ph'][0].cpu().numpy() + mel2ph_pred = None + str_phs = self.token_encoder.decode(tokens, strip_padding=True) + base_fn = f'[{batch_idx:06d}][{item_name.replace("%", "_")}][%s]' + if text is not None: + base_fn += text.replace(":", "$3A")[:80] + base_fn = base_fn.replace(' ', '_') + gen_dir = self.gen_dir + wav_pred = self.vocoder.spec2wav(mel_pred) + self.saving_result_pool.add_job(self.save_result, args=[ + wav_pred, mel_pred, base_fn % 'P', gen_dir, str_phs, mel2ph_pred]) + if 
hparams['save_gt']: + wav_gt = self.vocoder.spec2wav(mel_gt) + self.saving_result_pool.add_job(self.save_result, args=[ + wav_gt, mel_gt, base_fn % 'G', gen_dir, str_phs, mel2ph]) + if hparams.get('save_attn', False): + attn = outputs['attn'][0].cpu().numpy() + np.save(f'{gen_dir}/attn/{item_name}.npy', attn) + print(f"Pred_shape: {mel_pred.shape}, gt_shape: {mel_gt.shape}") + return { + 'item_name': item_name, + 'text': text, + 'ph_tokens': self.token_encoder.decode(tokens.tolist()), + 'wav_fn_pred': base_fn % 'P', + 'wav_fn_gt': base_fn % 'G', + } diff --git a/tasks/tts/ps_flow.py b/tasks/tts/ps_flow.py new file mode 100644 index 0000000000000000000000000000000000000000..37a2469ed08d382b58bcb6b8b1750986bb3dd345 --- /dev/null +++ b/tasks/tts/ps_flow.py @@ -0,0 +1,134 @@ +import torch +from modules.tts.portaspeech.portaspeech_flow import PortaSpeechFlow +from tasks.tts.fs import FastSpeechTask +from tasks.tts.ps import PortaSpeechTask +from utils.audio.pitch.utils import denorm_f0 +from utils.commons.hparams import hparams + + +class PortaSpeechFlowTask(PortaSpeechTask): + def __init__(self): + super().__init__() + self.training_post_glow = False + + def build_tts_model(self): + ph_dict_size = len(self.token_encoder) + word_dict_size = len(self.word_encoder) + self.model = PortaSpeechFlow(ph_dict_size, word_dict_size, hparams) + + def _training_step(self, sample, batch_idx, opt_idx): + self.training_post_glow = self.global_step >= hparams['post_glow_training_start'] \ + and hparams['use_post_flow'] + if hparams['two_stage'] and \ + ((opt_idx == 0 and self.training_post_glow) or (opt_idx == 1 and not self.training_post_glow)): + return None + loss_output, _ = self.run_model(sample) + total_loss = sum([v for v in loss_output.values() if isinstance(v, torch.Tensor) and v.requires_grad]) + loss_output['batch_size'] = sample['txt_tokens'].size()[0] + if 'postflow' in loss_output and loss_output['postflow'] is None: + return None + return total_loss, loss_output + + def run_model(self, sample, infer=False, *args, **kwargs): + if not infer: + training_post_glow = self.training_post_glow + spk_embed = sample.get('spk_embed') + spk_id = sample.get('spk_ids') + output = self.model(sample['txt_tokens'], + sample['word_tokens'], + ph2word=sample['ph2word'], + mel2word=sample['mel2word'], + mel2ph=sample['mel2ph'], + word_len=sample['word_lengths'].max(), + tgt_mels=sample['mels'], + pitch=sample.get('pitch'), + spk_embed=spk_embed, + spk_id=spk_id, + infer=False, + forward_post_glow=training_post_glow, + two_stage=hparams['two_stage'], + global_step=self.global_step) + losses = {} + self.add_mel_loss(output['mel_out'], sample['mels'], losses) + if (training_post_glow or not hparams['two_stage']) and hparams['use_post_flow']: + losses['postflow'] = output['postflow'] + losses['l1'] = losses['l1'].detach() + losses['ssim'] = losses['ssim'].detach() + if not training_post_glow or not hparams['two_stage'] or not self.training: + losses['kl'] = output['kl'] + if self.global_step < hparams['kl_start_steps']: + losses['kl'] = losses['kl'].detach() + else: + losses['kl'] = torch.clamp(losses['kl'], min=hparams['kl_min']) + losses['kl'] = losses['kl'] * hparams['lambda_kl'] + if hparams['dur_level'] == 'word': + self.add_dur_loss( + output['dur'], sample['mel2word'], sample['word_lengths'], sample['txt_tokens'], losses) + self.get_attn_stats(output['attn'], sample, losses) + else: + super().add_dur_loss(output['dur'], sample['mel2ph'], sample['txt_tokens'], losses) + return losses, output + else: + use_gt_dur 
= kwargs.get('infer_use_gt_dur', hparams['use_gt_dur']) + forward_post_glow = self.global_step >= hparams['post_glow_training_start'] + 1000 \ + and hparams['use_post_flow'] + spk_embed = sample.get('spk_embed') + spk_id = sample.get('spk_ids') + output = self.model( + sample['txt_tokens'], + sample['word_tokens'], + ph2word=sample['ph2word'], + word_len=sample['word_lengths'].max(), + pitch=sample.get('pitch'), + mel2ph=sample['mel2ph'] if use_gt_dur else None, + mel2word=sample['mel2word'] if hparams['profile_infer'] or hparams['use_gt_dur'] else None, + infer=True, + forward_post_glow=forward_post_glow, + spk_embed=spk_embed, + spk_id=spk_id, + two_stage=hparams['two_stage'] + ) + return output + + def validation_step(self, sample, batch_idx): + self.training_post_glow = self.global_step >= hparams['post_glow_training_start'] \ + and hparams['use_post_flow'] + return super().validation_step(sample, batch_idx) + + def save_valid_result(self, sample, batch_idx, model_out): + super(PortaSpeechFlowTask, self).save_valid_result(sample, batch_idx, model_out) + sr = hparams['audio_sample_rate'] + f0_gt = None + if sample.get('f0') is not None: + f0_gt = denorm_f0(sample['f0'][0].cpu(), sample['uv'][0].cpu()) + if self.global_step > 0: + # save FVAE result + if hparams['use_post_flow']: + wav_pred = self.vocoder.spec2wav(model_out['mel_out_fvae'][0].cpu(), f0=f0_gt) + self.logger.add_audio(f'wav_fvae_{batch_idx}', wav_pred, self.global_step, sr) + self.plot_mel(batch_idx, sample['mels'], model_out['mel_out_fvae'][0], + f'mel_fvae_{batch_idx}', f0s=f0_gt) + + def build_optimizer(self, model): + if hparams['two_stage'] and hparams['use_post_flow']: + self.optimizer = torch.optim.AdamW( + [p for name, p in self.model.named_parameters() if 'post_flow' not in name], + lr=hparams['lr'], + betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), + weight_decay=hparams['weight_decay']) + self.post_flow_optimizer = torch.optim.AdamW( + self.model.post_flow.parameters(), + lr=hparams['post_flow_lr'], + betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), + weight_decay=hparams['weight_decay']) + return [self.optimizer, self.post_flow_optimizer] + else: + self.optimizer = torch.optim.AdamW( + self.model.parameters(), + lr=hparams['lr'], + betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), + weight_decay=hparams['weight_decay']) + return [self.optimizer] + + def build_scheduler(self, optimizer): + return FastSpeechTask.build_scheduler(self, optimizer[0]) \ No newline at end of file diff --git a/tasks/tts/speech_base.py b/tasks/tts/speech_base.py new file mode 100644 index 0000000000000000000000000000000000000000..a438c9a432fe850370ee2a10c2aa7d6c0e1fb793 --- /dev/null +++ b/tasks/tts/speech_base.py @@ -0,0 +1,373 @@ +import filecmp +import os +import traceback +import numpy as np +import pandas as pd +import torch +import torch.distributed as dist +import torch.nn.functional as F +import torch.optim +import torch.utils.data +import yaml +from tqdm import tqdm +import utils +from tasks.tts.dataset_utils import BaseSpeechDataset +from tasks.tts.tts_utils import parse_mel_losses, parse_dataset_configs, load_data_preprocessor, load_data_binarizer +from tasks.tts.vocoder_infer.base_vocoder import BaseVocoder, get_vocoder_cls +from utils.audio.align import mel2token_to_dur +from utils.audio.io import save_wav +from utils.audio.pitch_extractors import extract_pitch_simple +from utils.commons.base_task import BaseTask +from utils.commons.ckpt_utils import 
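# Reading aid for the two-stage setup in PortaSpeechFlowTask above: when two_stage
# and use_post_flow are enabled, build_optimizer returns two optimizers; opt_idx 0
# updates everything except post_flow, opt_idx 1 updates only post_flow, and
# _training_step skips whichever optimizer is inactive for the current phase
# (the post-glow phase begins once global_step reaches post_glow_training_start).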
load_ckpt +from utils.commons.dataset_utils import data_loader, BaseConcatDataset +from utils.commons.hparams import hparams +from utils.commons.multiprocess_utils import MultiprocessManager +from utils.commons.tensor_utils import tensors_to_scalars +from utils.metrics.ssim import ssim +from utils.nn.model_utils import print_arch +from utils.nn.schedulers import RSQRTSchedule, NoneSchedule, WarmupSchedule +from utils.nn.seq_utils import weights_nonzero_speech +from utils.plot.plot import spec_to_figure +from utils.text.text_encoder import build_token_encoder +import matplotlib.pyplot as plt + + +class SpeechBaseTask(BaseTask): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dataset_cls = BaseSpeechDataset + self.vocoder = None + data_dir = hparams['binary_data_dir'] + if not hparams['use_word_input']: + self.token_encoder = build_token_encoder(f'{data_dir}/phone_set.json') + else: + self.token_encoder = build_token_encoder(f'{data_dir}/word_set.json') + self.padding_idx = self.token_encoder.pad() + self.eos_idx = self.token_encoder.eos() + self.seg_idx = self.token_encoder.seg() + self.saving_result_pool = None + self.saving_results_futures = None + self.mel_losses = parse_mel_losses() + self.max_tokens, self.max_sentences, \ + self.max_valid_tokens, self.max_valid_sentences = parse_dataset_configs() + + ########################## + # datasets + ########################## + @data_loader + def train_dataloader(self): + if hparams['train_sets'] != '': + train_sets = hparams['train_sets'].split("|") + # check if all train_sets have the same spk map and dictionary + binary_data_dir = hparams['binary_data_dir'] + file_to_cmp = ['phone_set.json'] + if os.path.exists(f'{binary_data_dir}/word_set.json'): + file_to_cmp.append('word_set.json') + if hparams['use_spk_id']: + file_to_cmp.append('spk_map.json') + for f in file_to_cmp: + for ds_name in train_sets: + base_file = os.path.join(binary_data_dir, f) + ds_file = os.path.join(ds_name, f) + assert filecmp.cmp(base_file, ds_file), \ + f'{f} in {ds_name} is not same with that in {binary_data_dir}.' 
+ train_dataset = BaseConcatDataset([ + self.dataset_cls(prefix='train', shuffle=True, data_dir=ds_name) for ds_name in train_sets]) + else: + train_dataset = self.dataset_cls(prefix=hparams['train_set_name'], shuffle=True) + return self.build_dataloader(train_dataset, True, self.max_tokens, self.max_sentences, + endless=hparams['endless_ds']) + + @data_loader + def val_dataloader(self): + valid_dataset = self.dataset_cls(prefix=hparams['valid_set_name'], shuffle=False) + return self.build_dataloader(valid_dataset, False, self.max_valid_tokens, self.max_valid_sentences, + batch_by_size=False) + + @data_loader + def test_dataloader(self): + test_dataset = self.dataset_cls(prefix=hparams['test_set_name'], shuffle=False) + self.test_dl = self.build_dataloader( + test_dataset, False, self.max_valid_tokens, self.max_valid_sentences, batch_by_size=False) + return self.test_dl + + def build_dataloader(self, dataset, shuffle, max_tokens=None, max_sentences=None, + required_batch_size_multiple=-1, endless=False, batch_by_size=True): + devices_cnt = torch.cuda.device_count() + if devices_cnt == 0: + devices_cnt = 1 + if required_batch_size_multiple == -1: + required_batch_size_multiple = devices_cnt + + def shuffle_batches(batches): + np.random.shuffle(batches) + return batches + + if max_tokens is not None: + max_tokens *= devices_cnt + if max_sentences is not None: + max_sentences *= devices_cnt + indices = dataset.ordered_indices() + if batch_by_size: + batch_sampler = utils.commons.dataset_utils.batch_by_size( + indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences, + required_batch_size_multiple=required_batch_size_multiple, + ) + else: + batch_sampler = [] + for i in range(0, len(indices), max_sentences): + batch_sampler.append(indices[i:i + max_sentences]) + + if shuffle: + batches = shuffle_batches(list(batch_sampler)) + if endless: + batches = [b for _ in range(1000) for b in shuffle_batches(list(batch_sampler))] + else: + batches = batch_sampler + if endless: + batches = [b for _ in range(1000) for b in batches] + num_workers = dataset.num_workers + if self.trainer.use_ddp: + num_replicas = dist.get_world_size() + rank = dist.get_rank() + batches = [x[rank::num_replicas] for x in batches if len(x) % num_replicas == 0] + return torch.utils.data.DataLoader(dataset, + collate_fn=dataset.collater, + batch_sampler=batches, + num_workers=num_workers, + pin_memory=False) + + ########################## + # scheduler and optimizer + ########################## + def build_model(self): + self.build_tts_model() + if hparams['load_ckpt'] != '': + load_ckpt(self.model, hparams['load_ckpt']) + print_arch(self.model) + return self.model + + def build_tts_model(self): + raise NotImplementedError + + def build_scheduler(self, optimizer): + if hparams['scheduler'] == 'rsqrt': + return RSQRTSchedule(optimizer, hparams['lr'], hparams['warmup_updates'], hparams['hidden_size']) + elif hparams['scheduler'] == 'warmup': + return WarmupSchedule(optimizer, hparams['lr'], hparams['warmup_updates']) + elif hparams['scheduler'] == 'step_lr': + return torch.optim.lr_scheduler.StepLR( + optimizer=optimizer, step_size=500, gamma=0.998) + else: + return NoneSchedule(optimizer, hparams['lr']) + + def build_optimizer(self, model): + self.optimizer = optimizer = torch.optim.AdamW( + model.parameters(), + lr=hparams['lr'], + betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), + weight_decay=hparams['weight_decay']) + + return optimizer + + ########################## + # training 
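# A toy illustration of the DDP batch split in build_dataloader above: each rank
# keeps every world_size-th item of a batch, and batches whose length is not
# divisible by the world size are dropped so all ranks stay in lockstep.
batch = [10, 11, 12, 13, 14, 15]
num_replicas, rank = 2, 1
assert len(batch) % num_replicas == 0
print(batch[rank::num_replicas])   # -> [11, 13, 15]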
and validation + ########################## + def _training_step(self, sample, batch_idx, _): + loss_output, _ = self.run_model(sample) + total_loss = sum([v for v in loss_output.values() if isinstance(v, torch.Tensor) and v.requires_grad]) + loss_output['batch_size'] = sample['txt_tokens'].size()[0] + return total_loss, loss_output + + def run_model(self, sample, infer=False): + """ + + :param sample: a batch of data + :param infer: bool, run in infer mode + :return: + if not infer: + return losses, model_out + if infer: + return model_out + """ + raise NotImplementedError + + def validation_start(self): + self.vocoder = get_vocoder_cls(hparams['vocoder'])() + + def validation_step(self, sample, batch_idx): + outputs = {} + outputs['losses'] = {} + outputs['losses'], model_out = self.run_model(sample) + outputs['total_loss'] = sum(outputs['losses'].values()) + outputs['nsamples'] = sample['nsamples'] + outputs = tensors_to_scalars(outputs) + if self.global_step % hparams['valid_infer_interval'] == 0 \ + and batch_idx < hparams['num_valid_plots']: + self.save_valid_result(sample, batch_idx, model_out) + return outputs + + def validation_end(self, outputs): + self.vocoder = None + return super(SpeechBaseTask, self).validation_end(outputs) + + def save_valid_result(self, sample, batch_idx, model_out): + raise NotImplementedError + + ########################## + # losses + ########################## + def add_mel_loss(self, mel_out, target, losses, postfix=''): + for loss_name, lambd in self.mel_losses.items(): + losses[f'{loss_name}{postfix}'] = getattr(self, f'{loss_name}_loss')(mel_out, target) * lambd + + def l1_loss(self, decoder_output, target): + # decoder_output : B x T x n_mel + # target : B x T x n_mel + l1_loss = F.l1_loss(decoder_output, target, reduction='none') + weights = weights_nonzero_speech(target) + l1_loss = (l1_loss * weights).sum() / weights.sum() + return l1_loss + + def mse_loss(self, decoder_output, target): + # decoder_output : B x T x n_mel + # target : B x T x n_mel + assert decoder_output.shape == target.shape + mse_loss = F.mse_loss(decoder_output, target, reduction='none') + weights = weights_nonzero_speech(target) + mse_loss = (mse_loss * weights).sum() / weights.sum() + return mse_loss + + def ssim_loss(self, decoder_output, target, bias=6.0): + # decoder_output : B x T x n_mel + # target : B x T x n_mel + assert decoder_output.shape == target.shape + weights = weights_nonzero_speech(target) + decoder_output = decoder_output[:, None] + bias + target = target[:, None] + bias + ssim_loss = 1 - ssim(decoder_output, target, size_average=False) + ssim_loss = (ssim_loss * weights).sum() / weights.sum() + return ssim_loss + + def plot_mel(self, batch_idx, spec_out, spec_gt=None, name=None, title='', f0s=None, dur_info=None): + vmin = hparams['mel_vmin'] + vmax = hparams['mel_vmax'] + if len(spec_out.shape) == 3: + spec_out = spec_out[0] + if isinstance(spec_out, torch.Tensor): + spec_out = spec_out.cpu().numpy() + if spec_gt is not None: + if len(spec_gt.shape) == 3: + spec_gt = spec_gt[0] + if isinstance(spec_gt, torch.Tensor): + spec_gt = spec_gt.cpu().numpy() + max_len = max(len(spec_gt), len(spec_out)) + if max_len - len(spec_gt) > 0: + spec_gt = np.pad(spec_gt, [[0, max_len - len(spec_gt)], [0, 0]], mode='constant', + constant_values=vmin) + if max_len - len(spec_out) > 0: + spec_out = np.pad(spec_out, [[0, max_len - len(spec_out)], [0, 0]], mode='constant', + constant_values=vmin) + spec_out = np.concatenate([spec_out, spec_gt], -1) + name = 
f'mel_val_{batch_idx}' if name is None else name + self.logger.add_figure(name, spec_to_figure( + spec_out, vmin, vmax, title=title, f0s=f0s, dur_info=dur_info), self.global_step) + + ########################## + # testing + ########################## + def test_start(self): + self.saving_result_pool = MultiprocessManager(int(os.getenv('N_PROC', os.cpu_count()))) + self.saving_results_futures = [] + self.gen_dir = os.path.join( + hparams['work_dir'], f'generated_{self.trainer.global_step}_{hparams["gen_dir_name"]}') + self.vocoder: BaseVocoder = get_vocoder_cls(hparams['vocoder'])() + os.makedirs(self.gen_dir, exist_ok=True) + os.makedirs(f'{self.gen_dir}/wavs', exist_ok=True) + os.makedirs(f'{self.gen_dir}/plot', exist_ok=True) + if hparams.get('save_mel_npy', False): + os.makedirs(f'{self.gen_dir}/mel_npy', exist_ok=True) + + def test_step(self, sample, batch_idx): + """ + + :param sample: + :param batch_idx: + :return: + """ + assert sample['txt_tokens'].shape[0] == 1, 'only support batch_size=1 in inference' + outputs = self.run_model(sample, infer=True) + text = sample['text'][0] + item_name = sample['item_name'][0] + tokens = sample['txt_tokens'][0].cpu().numpy() + mel_gt = sample['mels'][0].cpu().numpy() + mel_pred = outputs['mel_out'][0].cpu().numpy() + str_phs = self.token_encoder.decode(tokens, strip_padding=True) + base_fn = f'[{self.results_id:06d}][{item_name.replace("%", "_")}][%s]' + if text is not None: + base_fn += text.replace(":", "$3A")[:80] + base_fn = base_fn.replace(' ', '_') + gen_dir = self.gen_dir + wav_pred = self.vocoder.spec2wav(mel_pred) + self.saving_result_pool.add_job(self.save_result, args=[ + wav_pred, mel_pred, base_fn % 'P', gen_dir, str_phs]) + if hparams['save_gt']: + wav_gt = self.vocoder.spec2wav(mel_gt) + self.saving_result_pool.add_job(self.save_result, args=[ + wav_gt, mel_gt, base_fn % 'G', gen_dir, str_phs]) + print(f"Pred_shape: {mel_pred.shape}, gt_shape: {mel_gt.shape}") + return { + 'item_name': item_name, + 'text': text, + 'ph_tokens': self.token_encoder.decode(tokens.tolist()), + 'wav_fn_pred': base_fn % 'P', + 'wav_fn_gt': base_fn % 'G', + } + + @staticmethod + def save_result(wav_out, mel, base_fn, gen_dir, str_phs=None, mel2ph=None, alignment=None): + save_wav(wav_out, f'{gen_dir}/wavs/{base_fn}.wav', hparams['audio_sample_rate'], + norm=hparams['out_wav_norm']) + fig = plt.figure(figsize=(14, 10)) + spec_vmin = hparams['mel_vmin'] + spec_vmax = hparams['mel_vmax'] + heatmap = plt.pcolor(mel.T, vmin=spec_vmin, vmax=spec_vmax) + fig.colorbar(heatmap) + try: + f0 = extract_pitch_simple(wav_out) + f0 = f0 / 10 * (f0 > 0) + plt.plot(f0, c='white', linewidth=1, alpha=0.6) + if mel2ph is not None and str_phs is not None: + decoded_txt = str_phs.split(" ") + dur = mel2token_to_dur(torch.LongTensor(mel2ph)[None, :], len(decoded_txt))[0].numpy() + dur = [0] + list(np.cumsum(dur)) + for i in range(len(dur) - 1): + shift = (i % 20) + 1 + plt.text(dur[i], shift, decoded_txt[i]) + plt.hlines(shift, dur[i], dur[i + 1], colors='b' if decoded_txt[i] != '|' else 'black') + plt.vlines(dur[i], 0, 5, colors='b' if decoded_txt[i] != '|' else 'black', + alpha=1, linewidth=1) + plt.tight_layout() + plt.savefig(f'{gen_dir}/plot/{base_fn}.png', format='png') + plt.close(fig) + if hparams.get('save_mel_npy', False): + np.save(f'{gen_dir}/mel_npy/{base_fn}', mel) + if alignment is not None: + fig, ax = plt.subplots(figsize=(12, 16)) + im = ax.imshow(alignment, aspect='auto', origin='lower', + interpolation='none') + decoded_txt = str_phs.split(" ") + 
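                # label each row of the attention heatmap with its decoded phoneme so alignment errors are easy to spot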
ax.set_yticks(np.arange(len(decoded_txt))) + ax.set_yticklabels(list(decoded_txt), fontsize=6) + fig.colorbar(im, ax=ax) + fig.savefig(f'{gen_dir}/attn_plot/{base_fn}_attn.png', format='png') + plt.close(fig) + except Exception: + traceback.print_exc() + return None + + def test_end(self, outputs): + pd.DataFrame(outputs).to_csv(f'{self.gen_dir}/meta.csv') + for _1, _2 in tqdm(self.saving_result_pool.get_results(), total=len(self.saving_result_pool)): + pass + return {} diff --git a/tasks/tts/synta.py b/tasks/tts/synta.py new file mode 100644 index 0000000000000000000000000000000000000000..d01fd6ca34c526759e8918a5b97e2c1e19a003ca --- /dev/null +++ b/tasks/tts/synta.py @@ -0,0 +1,272 @@ +import os +import torch +import torch.nn.functional as F +from torch import nn + +from modules.tts.syntaspeech.syntaspeech import SyntaSpeech +from modules.tts.syntaspeech.multi_window_disc import Discriminator +from tasks.tts.fs import FastSpeechTask +from utils.audio.align import mel2token_to_dur +from utils.commons.hparams import hparams +from utils.metrics.diagonal_metrics import get_focus_rate, get_phone_coverage_rate, get_diagonal_focus_rate +from utils.nn.model_utils import num_params +import numpy as np + +from utils.plot.plot import spec_to_figure +from utils.text.text_encoder import build_token_encoder + + +class SyntaSpeechTask(FastSpeechTask): + def __init__(self): + super().__init__() + data_dir = hparams['binary_data_dir'] + self.word_encoder = build_token_encoder(f'{data_dir}/word_set.json') + self.build_disc_model() + self.mse_loss_fn = torch.nn.MSELoss() + + def build_tts_model(self): + ph_dict_size = len(self.token_encoder) + word_dict_size = len(self.word_encoder) + self.model = SyntaSpeech(ph_dict_size, word_dict_size, hparams) + + def build_disc_model(self): + disc_win_num = hparams['disc_win_num'] + h = hparams['mel_disc_hidden_size'] + self.mel_disc = Discriminator( + time_lengths=[32, 64, 128][:disc_win_num], + freq_length=80, hidden_size=h, kernel=(3, 3) + ) + self.disc_params = list(self.mel_disc.parameters()) + + def on_train_start(self): + super().on_train_start() + for n, m in self.model.named_children(): + num_params(m, model_name=n) + if hasattr(self.model, 'fvae'): + for n, m in self.model.fvae.named_children(): + num_params(m, model_name=f'fvae.{n}') + + def _training_step(self, sample, batch_idx, optimizer_idx): + loss_output = {} + loss_weights = {} + disc_start = self.global_step >= hparams["disc_start_steps"] and hparams['lambda_mel_adv'] > 0 + if optimizer_idx == 0: + ####################### + # Generator # + ####################### + loss_output, model_out = self.run_model(sample, infer=False) + self.model_out_gt = self.model_out = \ + {k: v.detach() for k, v in model_out.items() if isinstance(v, torch.Tensor)} + if disc_start: + mel_p = model_out['mel_out'] + if hasattr(self.model, 'out2mel'): + mel_p = self.model.out2mel(mel_p) + o_ = self.mel_disc(mel_p) + p_, pc_ = o_['y'], o_['y_c'] + if p_ is not None: + loss_output['a'] = self.mse_loss_fn(p_, p_.new_ones(p_.size())) + loss_weights['a'] = hparams['lambda_mel_adv'] + if pc_ is not None: + loss_output['ac'] = self.mse_loss_fn(pc_, pc_.new_ones(pc_.size())) + loss_weights['ac'] = hparams['lambda_mel_adv'] + else: + ####################### + # Discriminator # + ####################### + if disc_start and self.global_step % hparams['disc_interval'] == 0: + model_out = self.model_out_gt + mel_g = sample['mels'] + mel_p = model_out['mel_out'] + o = self.mel_disc(mel_g) + p, pc = o['y'], o['y_c'] + o_ = 
self.mel_disc(mel_p) + p_, pc_ = o_['y'], o_['y_c'] + if p_ is not None: + loss_output["r"] = self.mse_loss_fn(p, p.new_ones(p.size())) + loss_output["f"] = self.mse_loss_fn(p_, p_.new_zeros(p_.size())) + if pc_ is not None: + loss_output["rc"] = self.mse_loss_fn(pc, pc.new_ones(pc.size())) + loss_output["fc"] = self.mse_loss_fn(pc_, pc_.new_zeros(pc_.size())) + total_loss = sum([loss_weights.get(k, 1) * v for k, v in loss_output.items() if isinstance(v, torch.Tensor) and v.requires_grad]) + loss_output['batch_size'] = sample['txt_tokens'].size()[0] + return total_loss, loss_output + + def run_model(self, sample, infer=False, *args, **kwargs): + txt_tokens = sample['txt_tokens'] + word_tokens = sample['word_tokens'] + spk_embed = sample.get('spk_embed') + spk_id = sample.get('spk_ids') + if not infer: + output = self.model(txt_tokens, word_tokens, + ph2word=sample['ph2word'], + mel2word=sample['mel2word'], + mel2ph=sample['mel2ph'], + word_len=sample['word_lengths'].max(), + tgt_mels=sample['mels'], + pitch=sample.get('pitch'), + spk_embed=spk_embed, + spk_id=spk_id, + infer=False, + global_step=self.global_step, + graph_lst=sample['graph_lst'], + etypes_lst=sample['etypes_lst'] + ) + losses = {} + losses['kl_v'] = output['kl'].detach() + losses_kl = output['kl'] + losses_kl = torch.clamp(losses_kl, min=hparams['kl_min']) + losses_kl = min(self.global_step / hparams['kl_start_steps'], 1) * losses_kl + losses_kl = losses_kl * hparams['lambda_kl'] + losses['kl'] = losses_kl + self.add_mel_loss(output['mel_out'], sample['mels'], losses) + if hparams['dur_level'] == 'word': + self.add_dur_loss( + output['dur'], sample['mel2word'], sample['word_lengths'], sample['txt_tokens'], losses) + self.get_attn_stats(output['attn'], sample, losses) + else: + super(SyntaSpeechTask, self).add_dur_loss(output['dur'], sample['mel2ph'], sample['txt_tokens'], losses) + return losses, output + else: + use_gt_dur = kwargs.get('infer_use_gt_dur', hparams['use_gt_dur']) + output = self.model( + txt_tokens, word_tokens, + ph2word=sample['ph2word'], + word_len=sample['word_lengths'].max(), + pitch=sample.get('pitch'), + mel2ph=sample['mel2ph'] if use_gt_dur else None, + mel2word=sample['mel2word'] if use_gt_dur else None, + tgt_mels=sample['mels'], + infer=True, + spk_embed=spk_embed, + spk_id=spk_id, + graph_lst=sample['graph_lst'], + etypes_lst=sample['etypes_lst'] + ) + return output + + def add_dur_loss(self, dur_pred, mel2token, word_len, txt_tokens, losses=None): + T = word_len.max() + dur_gt = mel2token_to_dur(mel2token, T).float() + nonpadding = (torch.arange(T).to(dur_pred.device)[None, :] < word_len[:, None]).float() + dur_pred = dur_pred * nonpadding + dur_gt = dur_gt * nonpadding + wdur = F.l1_loss((dur_pred + 1).log(), (dur_gt + 1).log(), reduction='none') + wdur = (wdur * nonpadding).sum() / nonpadding.sum() + if hparams['lambda_word_dur'] > 0: + losses['wdur'] = wdur * hparams['lambda_word_dur'] + if hparams['lambda_sent_dur'] > 0: + sent_dur_p = dur_pred.sum(-1) + sent_dur_g = dur_gt.sum(-1) + sdur_loss = F.l1_loss(sent_dur_p, sent_dur_g, reduction='mean') + losses['sdur'] = sdur_loss.mean() * hparams['lambda_sent_dur'] + + def validation_step(self, sample, batch_idx): + return super().validation_step(sample, batch_idx) + + def save_valid_result(self, sample, batch_idx, model_out): + super(SyntaSpeechTask, self).save_valid_result(sample, batch_idx, model_out) + if self.global_step > 0 and hparams['dur_level'] == 'word': + self.logger.add_figure(f'attn_{batch_idx}', 
spec_to_figure(model_out['attn'][0]), self.global_step) + + def get_attn_stats(self, attn, sample, logging_outputs, prefix=''): + # diagonal_focus_rate + txt_lengths = sample['txt_lengths'].float() + mel_lengths = sample['mel_lengths'].float() + src_padding_mask = sample['txt_tokens'].eq(0) + target_padding_mask = sample['mels'].abs().sum(-1).eq(0) + src_seg_mask = sample['txt_tokens'].eq(self.seg_idx) + attn_ks = txt_lengths.float() / mel_lengths.float() + + focus_rate = get_focus_rate(attn, src_padding_mask, target_padding_mask).mean().data + phone_coverage_rate = get_phone_coverage_rate( + attn, src_padding_mask, src_seg_mask, target_padding_mask).mean() + diagonal_focus_rate, diag_mask = get_diagonal_focus_rate( + attn, attn_ks, mel_lengths, src_padding_mask, target_padding_mask) + logging_outputs[f'{prefix}fr'] = focus_rate.mean().data + logging_outputs[f'{prefix}pcr'] = phone_coverage_rate.mean().data + logging_outputs[f'{prefix}dfr'] = diagonal_focus_rate.mean().data + + def get_plot_dur_info(self, sample, model_out): + if hparams['dur_level'] == 'word': + T_txt = sample['word_lengths'].max() + dur_gt = mel2token_to_dur(sample['mel2word'], T_txt)[0] + dur_pred = model_out['dur'] if 'dur' in model_out else dur_gt + txt = sample['ph_words'][0].split(" ") + else: + T_txt = sample['txt_tokens'].shape[1] + dur_gt = mel2token_to_dur(sample['mel2ph'], T_txt)[0] + dur_pred = model_out['dur'] if 'dur' in model_out else dur_gt + txt = self.token_encoder.decode(sample['txt_tokens'][0].cpu().numpy()) + txt = txt.split(" ") + return {'dur_gt': dur_gt, 'dur_pred': dur_pred, 'txt': txt} + + def build_optimizer(self, model): + optimizer_gen = torch.optim.AdamW( + self.model.parameters(), + lr=hparams['lr'], + betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), + weight_decay=hparams['weight_decay']) + + optimizer_disc = torch.optim.AdamW( + self.disc_params, + lr=hparams['disc_lr'], + betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']), + **hparams["discriminator_optimizer_params"]) if len(self.disc_params) > 0 else None + + return [optimizer_gen, optimizer_disc] + + def build_scheduler(self, optimizer): + return [ + FastSpeechTask.build_scheduler(self, optimizer[0]), # Generator Scheduler + torch.optim.lr_scheduler.StepLR(optimizer=optimizer[1], # Discriminator Scheduler + **hparams["discriminator_scheduler_params"]), + ] + + def on_after_optimization(self, epoch, batch_idx, optimizer, optimizer_idx): + if self.scheduler is not None: + self.scheduler[0].step(self.global_step // hparams['accumulate_grad_batches']) + self.scheduler[1].step(self.global_step // hparams['accumulate_grad_batches']) + + ############ + # infer + ############ + def test_start(self): + super().test_start() + if hparams.get('save_attn', False): + os.makedirs(f'{self.gen_dir}/attn', exist_ok=True) + self.model.store_inverse_all() + + def test_step(self, sample, batch_idx): + assert sample['txt_tokens'].shape[0] == 1, 'only support batch_size=1 in inference' + outputs = self.run_model(sample, infer=True) + text = sample['text'][0] + item_name = sample['item_name'][0] + tokens = sample['txt_tokens'][0].cpu().numpy() + mel_gt = sample['mels'][0].cpu().numpy() + mel_pred = outputs['mel_out'][0].cpu().numpy() + mel2ph = sample['mel2ph'][0].cpu().numpy() + mel2ph_pred = None + str_phs = self.token_encoder.decode(tokens, strip_padding=True) + base_fn = f'[{batch_idx:06d}][{item_name.replace("%", "_")}][%s]' + if text is not None: + base_fn += text.replace(":", "$3A")[:80] + base_fn = 
base_fn.replace(' ', '_') + gen_dir = self.gen_dir + wav_pred = self.vocoder.spec2wav(mel_pred) + self.saving_result_pool.add_job(self.save_result, args=[ + wav_pred, mel_pred, base_fn % 'P', gen_dir, str_phs, mel2ph_pred]) + if hparams['save_gt']: + wav_gt = self.vocoder.spec2wav(mel_gt) + self.saving_result_pool.add_job(self.save_result, args=[ + wav_gt, mel_gt, base_fn % 'G', gen_dir, str_phs, mel2ph]) + if hparams.get('save_attn', False): + attn = outputs['attn'][0].cpu().numpy() + np.save(f'{gen_dir}/attn/{item_name}.npy', attn) + print(f"Pred_shape: {mel_pred.shape}, gt_shape: {mel_gt.shape}") + return { + 'item_name': item_name, + 'text': text, + 'ph_tokens': self.token_encoder.decode(tokens.tolist()), + 'wav_fn_pred': base_fn % 'P', + 'wav_fn_gt': base_fn % 'G', + } diff --git a/tasks/tts/tts_utils.py b/tasks/tts/tts_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c4b82df98677e7ba132f77b4f147a0b9aa03c1f1 --- /dev/null +++ b/tasks/tts/tts_utils.py @@ -0,0 +1,54 @@ +import importlib + +from data_gen.tts.base_binarizer import BaseBinarizer +from data_gen.tts.base_preprocess import BasePreprocessor +from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls +from utils.commons.hparams import hparams + + +def parse_dataset_configs(): + max_tokens = hparams['max_tokens'] + max_sentences = hparams['max_sentences'] + max_valid_tokens = hparams['max_valid_tokens'] + if max_valid_tokens == -1: + hparams['max_valid_tokens'] = max_valid_tokens = max_tokens + max_valid_sentences = hparams['max_valid_sentences'] + if max_valid_sentences == -1: + hparams['max_valid_sentences'] = max_valid_sentences = max_sentences + return max_tokens, max_sentences, max_valid_tokens, max_valid_sentences + + +def parse_mel_losses(): + mel_losses = hparams['mel_losses'].split("|") + loss_and_lambda = {} + for i, l in enumerate(mel_losses): + if l == '': + continue + if ':' in l: + l, lbd = l.split(":") + lbd = float(lbd) + else: + lbd = 1.0 + loss_and_lambda[l] = lbd + print("| Mel losses:", loss_and_lambda) + return loss_and_lambda + + +def load_data_preprocessor(): + preprocess_cls = hparams["preprocess_cls"] + pkg = ".".join(preprocess_cls.split(".")[:-1]) + cls_name = preprocess_cls.split(".")[-1] + preprocessor: BasePreprocessor = getattr(importlib.import_module(pkg), cls_name)() + preprocess_args = {} + preprocess_args.update(hparams['preprocess_args']) + return preprocessor, preprocess_args + + +def load_data_binarizer(): + binarizer_cls = hparams['binarizer_cls'] + pkg = ".".join(binarizer_cls.split(".")[:-1]) + cls_name = binarizer_cls.split(".")[-1] + binarizer: BaseBinarizer = getattr(importlib.import_module(pkg), cls_name)() + binarization_args = {} + binarization_args.update(hparams['binarization_args']) + return binarizer, binarization_args diff --git a/tasks/tts/vocoder_infer/__init__.py b/tasks/tts/vocoder_infer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1868dc2601363d6843f2b02a82e6fa2de375408b --- /dev/null +++ b/tasks/tts/vocoder_infer/__init__.py @@ -0,0 +1,2 @@ +from . import hifigan +from . 
import pwg \ No newline at end of file diff --git a/tasks/tts/vocoder_infer/__pycache__/__init__.cpython-36.pyc b/tasks/tts/vocoder_infer/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfd9f6ca070d45a7e78e701625b7b89cd383503a Binary files /dev/null and b/tasks/tts/vocoder_infer/__pycache__/__init__.cpython-36.pyc differ diff --git a/tasks/tts/vocoder_infer/__pycache__/__init__.cpython-37.pyc b/tasks/tts/vocoder_infer/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3ef31e84030e2d5a5fbe8a39b65c69761b84cc0 Binary files /dev/null and b/tasks/tts/vocoder_infer/__pycache__/__init__.cpython-37.pyc differ diff --git a/tasks/tts/vocoder_infer/__pycache__/base_vocoder.cpython-36.pyc b/tasks/tts/vocoder_infer/__pycache__/base_vocoder.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dcc8b211c1bf76d406e0f3f85002e2859a145300 Binary files /dev/null and b/tasks/tts/vocoder_infer/__pycache__/base_vocoder.cpython-36.pyc differ diff --git a/tasks/tts/vocoder_infer/__pycache__/base_vocoder.cpython-37.pyc b/tasks/tts/vocoder_infer/__pycache__/base_vocoder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8757e958410900a6ff10cbdfa5ade0208ecb2540 Binary files /dev/null and b/tasks/tts/vocoder_infer/__pycache__/base_vocoder.cpython-37.pyc differ diff --git a/tasks/tts/vocoder_infer/__pycache__/hifigan.cpython-36.pyc b/tasks/tts/vocoder_infer/__pycache__/hifigan.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c5d4366f520b98a07e808c956cdd35719117ec4 Binary files /dev/null and b/tasks/tts/vocoder_infer/__pycache__/hifigan.cpython-36.pyc differ diff --git a/tasks/tts/vocoder_infer/__pycache__/hifigan.cpython-37.pyc b/tasks/tts/vocoder_infer/__pycache__/hifigan.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa56665ebe13bef537d01d971e9bcac36272902e Binary files /dev/null and b/tasks/tts/vocoder_infer/__pycache__/hifigan.cpython-37.pyc differ diff --git a/tasks/tts/vocoder_infer/__pycache__/pwg.cpython-36.pyc b/tasks/tts/vocoder_infer/__pycache__/pwg.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95394ce8c6998972d72cf7a0f9b63234aa44d5b7 Binary files /dev/null and b/tasks/tts/vocoder_infer/__pycache__/pwg.cpython-36.pyc differ diff --git a/tasks/tts/vocoder_infer/__pycache__/pwg.cpython-37.pyc b/tasks/tts/vocoder_infer/__pycache__/pwg.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a70110c2076eab539adce203e5e48ed0d9f7c22c Binary files /dev/null and b/tasks/tts/vocoder_infer/__pycache__/pwg.cpython-37.pyc differ diff --git a/tasks/tts/vocoder_infer/base_vocoder.py b/tasks/tts/vocoder_infer/base_vocoder.py new file mode 100644 index 0000000000000000000000000000000000000000..0ab88f4e78be66ba1821e5a6720193b1d614f4f5 --- /dev/null +++ b/tasks/tts/vocoder_infer/base_vocoder.py @@ -0,0 +1,63 @@ +import librosa +from utils.audio import librosa_wav2spec +from utils.commons.hparams import hparams +import numpy as np + +REGISTERED_VOCODERS = {} + + +def register_vocoder(name): + def _f(cls): + REGISTERED_VOCODERS[name] = cls + return cls + + return _f + + +def get_vocoder_cls(vocoder_name): + return REGISTERED_VOCODERS.get(vocoder_name) + + +class BaseVocoder: + def spec2wav(self, mel): + """ + + :param mel: [T, 80] + :return: wav: [T'] + """ + + raise NotImplementedError + + @staticmethod + def 
wav2spec(wav_fn): + """ + + :param wav_fn: str + :return: wav, mel: [T, 80] + """ + wav_spec_dict = librosa_wav2spec(wav_fn, fft_size=hparams['fft_size'], + hop_size=hparams['hop_size'], + win_length=hparams['win_size'], + num_mels=hparams['audio_num_mel_bins'], + fmin=hparams['fmin'], + fmax=hparams['fmax'], + sample_rate=hparams['audio_sample_rate'], + loud_norm=hparams['loud_norm']) + wav = wav_spec_dict['wav'] + mel = wav_spec_dict['mel'] + return wav, mel + + @staticmethod + def wav2mfcc(wav_fn): + fft_size = hparams['fft_size'] + hop_size = hparams['hop_size'] + win_length = hparams['win_size'] + sample_rate = hparams['audio_sample_rate'] + wav, _ = librosa.core.load(wav_fn, sr=sample_rate) + mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13, + n_fft=fft_size, hop_length=hop_size, + win_length=win_length, pad_mode="constant", power=1.0) + mfcc_delta = librosa.feature.delta(mfcc, order=1) + mfcc_delta_delta = librosa.feature.delta(mfcc, order=2) + mfcc = np.concatenate([mfcc, mfcc_delta, mfcc_delta_delta]).T + return mfcc diff --git a/tasks/tts/vocoder_infer/hifigan.py b/tasks/tts/vocoder_infer/hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..fdde1058eeef1dc91710ed93dfaa63989c89ae3d --- /dev/null +++ b/tasks/tts/vocoder_infer/hifigan.py @@ -0,0 +1,31 @@ +import torch +from modules.vocoder.hifigan.hifigan import HifiGanGenerator +from tasks.tts.vocoder_infer.base_vocoder import register_vocoder, BaseVocoder +from utils.commons.ckpt_utils import load_ckpt +from utils.commons.hparams import set_hparams, hparams +from utils.commons.meters import Timer + +total_time = 0 + + +@register_vocoder('HifiGAN') +class HifiGAN(BaseVocoder): + def __init__(self): + base_dir = hparams['vocoder_ckpt'] + config_path = f'{base_dir}/config.yaml' + self.config = config = set_hparams(config_path, global_hparams=False) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = HifiGanGenerator(config) + load_ckpt(self.model, base_dir, 'model_gen') + self.model.to(self.device) + self.model.eval() + + def spec2wav(self, mel, **kwargs): + device = self.device + with torch.no_grad(): + c = torch.FloatTensor(mel).unsqueeze(0).to(device) + c = c.transpose(2, 1) + with Timer('hifigan', enable=hparams['profile_infer']): + y = self.model(c).view(-1) + wav_out = y.cpu().numpy() + return wav_out \ No newline at end of file diff --git a/tasks/tts/vocoder_infer/pwg.py b/tasks/tts/vocoder_infer/pwg.py new file mode 100644 index 0000000000000000000000000000000000000000..c9599eaf81d0b421a9223101eb8c07ccc47e0231 --- /dev/null +++ b/tasks/tts/vocoder_infer/pwg.py @@ -0,0 +1,32 @@ +import torch +from modules.vocoder.parallel_wavegan.models.parallel_wavegan import ParallelWaveGANGenerator +from tasks.tts.vocoder_infer.base_vocoder import register_vocoder, BaseVocoder +from utils.commons.ckpt_utils import load_ckpt +from utils.commons.hparams import set_hparams, hparams +from utils.commons.meters import Timer + +total_time = 0 + + +@register_vocoder('PWG') +class PWG(BaseVocoder): + def __init__(self): + base_dir = hparams['vocoder_ckpt'] + config_path = f'{base_dir}/config.yaml' + self.config = config = set_hparams(config_path, global_hparams=False) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = ParallelWaveGANGenerator(**config["generator_params"]) + load_ckpt(self.model, base_dir, 'model_gen') + self.model.to(self.device) + self.model.eval() + + def spec2wav(self, mel, **kwargs): + device = self.device + 
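        # call pattern used elsewhere in this diff (e.g. validation_start / test_start), shown here as a sketch:
        #   vocoder = get_vocoder_cls(hparams['vocoder'])()   # 'PWG' or 'HifiGAN'
        #   wav = vocoder.spec2wav(mel)                        # mel: [T, 80] numpy array -> wav: [T']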
with torch.no_grad(): + c = torch.FloatTensor(mel).unsqueeze(0).to(device) + c = c.transpose(2, 1) # [B, C, T] + z = None + with Timer('pwg', enable=hparams['profile_infer']): + y = self.model(z, c).view(-1) + wav_out = y.cpu().numpy() + return wav_out \ No newline at end of file diff --git a/tasks/vocoder/dataset_utils.py b/tasks/vocoder/dataset_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e399b9e37fde0341c9a52d2cb241b96ce8c8b822 --- /dev/null +++ b/tasks/vocoder/dataset_utils.py @@ -0,0 +1,130 @@ +import numpy as np +import torch +import torch.distributed as dist +from torch.utils.data import DistributedSampler +from utils.commons.dataset_utils import BaseDataset, collate_1d, collate_2d +from utils.commons.hparams import hparams +from utils.commons.indexed_datasets import IndexedDataset + + +class EndlessDistributedSampler(DistributedSampler): + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.shuffle = shuffle + + g = torch.Generator() + g.manual_seed(self.epoch) + if self.shuffle: + indices = [i for _ in range(1000) for i in torch.randperm( + len(self.dataset), generator=g).tolist()] + else: + indices = [i for _ in range(1000) for i in list(range(len(self.dataset)))] + indices = indices[:len(indices) // self.num_replicas * self.num_replicas] + indices = indices[self.rank::self.num_replicas] + self.indices = indices + + def __iter__(self): + return iter(self.indices) + + def __len__(self): + return len(self.indices) + + +class VocoderDataset(BaseDataset): + def __init__(self, prefix, shuffle=False): + super().__init__(shuffle) + self.hparams = hparams + self.prefix = prefix + self.data_dir = hparams['binary_data_dir'] + self.is_infer = prefix == 'test' + self.batch_max_frames = 0 if self.is_infer else hparams['max_samples'] // hparams['hop_size'] + self.hop_size = hparams['hop_size'] + self.indexed_ds = None + self.sizes = np.load(f'{self.data_dir}/{self.prefix}_lengths.npy') + self.avail_idxs = [idx for idx, s in enumerate(self.sizes) if s > self.batch_max_frames] + print(f"| {len(self.sizes) - len(self.avail_idxs)} short items are skipped in {prefix} set.") + self.sizes = [s for idx, s in enumerate(self.sizes) if s > self.batch_max_frames] + + def _get_item(self, index): + if self.indexed_ds is None: + self.indexed_ds = IndexedDataset(f'{self.data_dir}/{self.prefix}') + item = self.indexed_ds[index] + return item + + def __getitem__(self, index): + index = self.avail_idxs[index] + item = self._get_item(index) + sample = { + "id": index, + "item_name": item['item_name'], + "mel": torch.FloatTensor(item['mel']), + "wav": torch.FloatTensor(item['wav'].astype(np.float32)), + "pitch": torch.LongTensor(item['pitch']), + "f0": torch.FloatTensor(item['f0']) + } + return sample + + def collater(self, batch): + if len(batch) == 0: + return {} + + y_batch, c_batch, p_batch, f0_batch = [], [], [], [] + item_name = [] + for idx in range(len(batch)): + item_name.append(batch[idx]['item_name']) + x, c = batch[idx]['wav'], batch[idx]['mel'] + p, f0 = batch[idx]['pitch'], batch[idx]['f0'] + self._assert_ready_for_upsampling(x, c, 
self.hop_size) + if len(c) > self.batch_max_frames: + # randomly pickup with the batch_max_steps length of the part + batch_max_frames = self.batch_max_frames if self.batch_max_frames != 0 else len(c) - 1 + batch_max_steps = batch_max_frames * self.hop_size + interval_start = 0 + interval_end = len(c) - batch_max_frames + start_frame = np.random.randint(interval_start, interval_end) + start_step = start_frame * self.hop_size + y = x[start_step: start_step + batch_max_steps] + c = c[start_frame: start_frame + batch_max_frames] + p = p[start_frame: start_frame + batch_max_frames] + f0 = f0[start_frame: start_frame + batch_max_frames] + self._assert_ready_for_upsampling(y, c, self.hop_size) + else: + print(f"Removed short sample from batch (length={len(x)}).") + continue + y_batch += [y.reshape(-1, 1)] # [(T, 1), (T, 1), ...] + c_batch += [c] # [(T' C), (T' C), ...] + p_batch += [p] # [(T' C), (T' C), ...] + f0_batch += [f0] # [(T' C), (T' C), ...] + + # convert each batch to tensor, asuume that each item in batch has the same length + y_batch = collate_2d(y_batch, 0).transpose(2, 1) # (B, 1, T) + c_batch = collate_2d(c_batch, 0).transpose(2, 1) # (B, C, T') + p_batch = collate_1d(p_batch, 0) # (B, T') + f0_batch = collate_1d(f0_batch, 0) # (B, T') + + # make input noise signal batch tensor + z_batch = torch.randn(y_batch.size()) # (B, 1, T) + return { + 'z': z_batch, + 'mels': c_batch, + 'wavs': y_batch, + 'pitches': p_batch, + 'f0': f0_batch, + 'item_name': item_name + } + + @staticmethod + def _assert_ready_for_upsampling(x, c, hop_size): + """Assert the audio and feature lengths are correctly adjusted for upsamping.""" + assert len(x) == (len(c)) * hop_size diff --git a/tasks/vocoder/hifigan.py b/tasks/vocoder/hifigan.py new file mode 100755 index 0000000000000000000000000000000000000000..a07370ab84f2d5ba6b20cc37db9773c1c2879b73 --- /dev/null +++ b/tasks/vocoder/hifigan.py @@ -0,0 +1,63 @@ +import torch.nn.functional as F +from torch import nn + +from modules.vocoder.hifigan.hifigan import HifiGanGenerator, MultiPeriodDiscriminator, MultiScaleDiscriminator, \ + generator_loss, feature_loss, discriminator_loss +from modules.vocoder.hifigan.mel_utils import mel_spectrogram +from modules.vocoder.hifigan.stft_loss import MultiResolutionSTFTLoss +from tasks.vocoder.vocoder_base import VocoderBaseTask +from utils.commons.hparams import hparams +from utils.nn.model_utils import print_arch + + +class HifiGanTask(VocoderBaseTask): + def build_model(self): + self.model_gen = HifiGanGenerator(hparams) + self.model_disc = nn.ModuleDict() + self.model_disc['mpd'] = MultiPeriodDiscriminator() + self.model_disc['msd'] = MultiScaleDiscriminator() + self.stft_loss = MultiResolutionSTFTLoss() + print_arch(self.model_gen) + if hparams['load_ckpt'] != '': + self.load_ckpt(hparams['load_ckpt'], 'model_gen', 'model_gen', force=True, strict=True) + self.load_ckpt(hparams['load_ckpt'], 'model_disc', 'model_disc', force=True, strict=True) + return self.model_gen + + def _training_step(self, sample, batch_idx, optimizer_idx): + mel = sample['mels'] + y = sample['wavs'] + f0 = sample['f0'] + loss_output = {} + if optimizer_idx == 0: + ####################### + # Generator # + ####################### + y_ = self.model_gen(mel, f0) + y_mel = mel_spectrogram(y.squeeze(1), hparams).transpose(1, 2) + y_hat_mel = mel_spectrogram(y_.squeeze(1), hparams).transpose(1, 2) + loss_output['mel'] = F.l1_loss(y_hat_mel, y_mel) * hparams['lambda_mel'] + _, y_p_hat_g, fmap_f_r, fmap_f_g = self.model_disc['mpd'](y, y_, mel) + _, 
y_s_hat_g, fmap_s_r, fmap_s_g = self.model_disc['msd'](y, y_, mel) + loss_output['a_p'] = generator_loss(y_p_hat_g) * hparams['lambda_adv'] + loss_output['a_s'] = generator_loss(y_s_hat_g) * hparams['lambda_adv'] + if hparams['use_fm_loss']: + loss_output['fm_f'] = feature_loss(fmap_f_r, fmap_f_g) + loss_output['fm_s'] = feature_loss(fmap_s_r, fmap_s_g) + if hparams['use_ms_stft']: + loss_output['sc'], loss_output['mag'] = self.stft_loss(y.squeeze(1), y_.squeeze(1)) + self.y_ = y_.detach() + self.y_mel = y_mel.detach() + self.y_hat_mel = y_hat_mel.detach() + else: + ####################### + # Discriminator # + ####################### + y_ = self.y_ + # MPD + y_p_hat_r, y_p_hat_g, _, _ = self.model_disc['mpd'](y, y_.detach(), mel) + loss_output['r_p'], loss_output['f_p'] = discriminator_loss(y_p_hat_r, y_p_hat_g) + # MSD + y_s_hat_r, y_s_hat_g, _, _ = self.model_disc['msd'](y, y_.detach(), mel) + loss_output['r_s'], loss_output['f_s'] = discriminator_loss(y_s_hat_r, y_s_hat_g) + total_loss = sum(loss_output.values()) + return total_loss, loss_output diff --git a/tasks/vocoder/vocoder_base.py b/tasks/vocoder/vocoder_base.py new file mode 100644 index 0000000000000000000000000000000000000000..9a1d006647f259ec39968ec9a9d2f36b166f5851 --- /dev/null +++ b/tasks/vocoder/vocoder_base.py @@ -0,0 +1,137 @@ +import os +import torch +import torch.distributed as dist +from torch import nn +from torch.utils.data import DistributedSampler +from tasks.vocoder.dataset_utils import VocoderDataset, EndlessDistributedSampler +from utils.audio.io import save_wav +from utils.commons.base_task import BaseTask +from utils.commons.dataset_utils import data_loader +from utils.commons.hparams import hparams +from utils.commons.tensor_utils import tensors_to_scalars + + +class VocoderBaseTask(BaseTask): + def __init__(self): + super(VocoderBaseTask, self).__init__() + self.max_sentences = hparams['max_sentences'] + self.max_valid_sentences = hparams['max_valid_sentences'] + if self.max_valid_sentences == -1: + hparams['max_valid_sentences'] = self.max_valid_sentences = self.max_sentences + self.dataset_cls = VocoderDataset + + @data_loader + def train_dataloader(self): + train_dataset = self.dataset_cls('train', shuffle=True) + return self.build_dataloader(train_dataset, True, self.max_sentences, hparams['endless_ds']) + + @data_loader + def val_dataloader(self): + valid_dataset = self.dataset_cls('test', shuffle=False) + return self.build_dataloader(valid_dataset, False, self.max_valid_sentences) + + @data_loader + def test_dataloader(self): + test_dataset = self.dataset_cls('test', shuffle=False) + return self.build_dataloader(test_dataset, False, self.max_valid_sentences) + + def build_dataloader(self, dataset, shuffle, max_sentences, endless=False): + world_size = 1 + rank = 0 + if dist.is_initialized(): + world_size = dist.get_world_size() + rank = dist.get_rank() + sampler_cls = DistributedSampler if not endless else EndlessDistributedSampler + train_sampler = sampler_cls( + dataset=dataset, + num_replicas=world_size, + rank=rank, + shuffle=shuffle, + ) + return torch.utils.data.DataLoader( + dataset=dataset, + shuffle=False, + collate_fn=dataset.collater, + batch_size=max_sentences, + num_workers=dataset.num_workers, + sampler=train_sampler, + pin_memory=True, + ) + + def build_optimizer(self, model): + optimizer_gen = torch.optim.AdamW(self.model_gen.parameters(), lr=hparams['lr'], + betas=[hparams['adam_b1'], hparams['adam_b2']]) + optimizer_disc = torch.optim.AdamW(self.model_disc.parameters(), 
lr=hparams['lr'], + betas=[hparams['adam_b1'], hparams['adam_b2']]) + return [optimizer_gen, optimizer_disc] + + def build_scheduler(self, optimizer): + return { + "gen": torch.optim.lr_scheduler.StepLR( + optimizer=optimizer[0], + **hparams["generator_scheduler_params"]), + "disc": torch.optim.lr_scheduler.StepLR( + optimizer=optimizer[1], + **hparams["discriminator_scheduler_params"]), + } + + def validation_step(self, sample, batch_idx): + outputs = {} + total_loss, loss_output = self._training_step(sample, batch_idx, 0) + outputs['losses'] = tensors_to_scalars(loss_output) + outputs['total_loss'] = tensors_to_scalars(total_loss) + + if self.global_step % hparams['valid_infer_interval'] == 0 and \ + batch_idx < 10: + mels = sample['mels'] + y = sample['wavs'] + f0 = sample['f0'] + y_ = self.model_gen(mels, f0) + for idx, (wav_pred, wav_gt, item_name) in enumerate(zip(y_, y, sample["item_name"])): + wav_pred = wav_pred / wav_pred.abs().max() + if self.global_step == 0: + wav_gt = wav_gt / wav_gt.abs().max() + self.logger.add_audio(f'wav_{batch_idx}_{idx}_gt', wav_gt, self.global_step, + hparams['audio_sample_rate']) + self.logger.add_audio(f'wav_{batch_idx}_{idx}_pred', wav_pred, self.global_step, + hparams['audio_sample_rate']) + return outputs + + def test_start(self): + self.gen_dir = os.path.join(hparams['work_dir'], + f'generated_{self.trainer.global_step}_{hparams["gen_dir_name"]}') + os.makedirs(self.gen_dir, exist_ok=True) + + def test_step(self, sample, batch_idx): + mels = sample['mels'] + y = sample['wavs'] + f0 = sample['f0'] + loss_output = {} + y_ = self.model_gen(mels, f0) + gen_dir = os.path.join(hparams['work_dir'], f'generated_{self.trainer.global_step}_{hparams["gen_dir_name"]}') + os.makedirs(gen_dir, exist_ok=True) + for idx, (wav_pred, wav_gt, item_name) in enumerate(zip(y_, y, sample["item_name"])): + wav_gt = wav_gt.clamp(-1, 1) + wav_pred = wav_pred.clamp(-1, 1) + save_wav( + wav_gt.view(-1).cpu().float().numpy(), f'{gen_dir}/{item_name}_gt.wav', + hparams['audio_sample_rate']) + save_wav( + wav_pred.view(-1).cpu().float().numpy(), f'{gen_dir}/{item_name}_pred.wav', + hparams['audio_sample_rate']) + return loss_output + + def test_end(self, outputs): + return {} + + def on_before_optimization(self, opt_idx): + if opt_idx == 0: + nn.utils.clip_grad_norm_(self.model_gen.parameters(), hparams['generator_grad_norm']) + else: + nn.utils.clip_grad_norm_(self.model_disc.parameters(), hparams["discriminator_grad_norm"]) + + def on_after_optimization(self, epoch, batch_idx, optimizer, optimizer_idx): + if optimizer_idx == 0: + self.scheduler['gen'].step(self.global_step // hparams['accumulate_grad_batches']) + else: + self.scheduler['disc'].step(self.global_step // hparams['accumulate_grad_batches']) diff --git a/utils/__pycache__/os_utils.cpython-36.pyc b/utils/__pycache__/os_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e41787936b08f9c5b4be83bb7c5a8dd1d89178bb Binary files /dev/null and b/utils/__pycache__/os_utils.cpython-36.pyc differ diff --git a/utils/__pycache__/os_utils.cpython-37.pyc b/utils/__pycache__/os_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..016c36cbf40813c3a741a5d9c9972deafc15b2c0 Binary files /dev/null and b/utils/__pycache__/os_utils.cpython-37.pyc differ diff --git a/utils/__pycache__/os_utils.cpython-39.pyc b/utils/__pycache__/os_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb11c7ca907d4fa2793a7e772d0c614d592ca307 
Binary files /dev/null and b/utils/__pycache__/os_utils.cpython-39.pyc differ diff --git a/utils/audio/__init__.py b/utils/audio/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e8cc4466b27eeda4026e945a5388dca04817e8a1 --- /dev/null +++ b/utils/audio/__init__.py @@ -0,0 +1,82 @@ +import librosa +import numpy as np +import pyloudnorm as pyln + +from utils.audio.vad import trim_long_silences + + +def librosa_pad_lr(x, fsize, fshift, pad_sides=1): + '''compute right padding (final frame) or both sides padding (first and final frames) + ''' + assert pad_sides in (1, 2) + # return int(fsize // 2) + pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0] + if pad_sides == 1: + return 0, pad + else: + return pad // 2, pad // 2 + pad % 2 + + +def amp_to_db(x): + return 20 * np.log10(np.maximum(1e-5, x)) + + +def db_to_amp(x): + return 10.0 ** (x * 0.05) + + +def normalize(S, min_level_db): + return (S - min_level_db) / -min_level_db + + +def denormalize(D, min_level_db): + return (D * -min_level_db) + min_level_db + + +def librosa_wav2spec(wav_path, + fft_size=1024, + hop_size=256, + win_length=1024, + window="hann", + num_mels=80, + fmin=80, + fmax=-1, + eps=1e-6, + sample_rate=22050, + loud_norm=False, + trim_long_sil=False): + if isinstance(wav_path, str): + if trim_long_sil: + wav, _, _ = trim_long_silences(wav_path, sample_rate) + else: + wav, _ = librosa.core.load(wav_path, sr=sample_rate) + else: + wav = wav_path + + if loud_norm: + meter = pyln.Meter(sample_rate) # create BS.1770 meter + loudness = meter.integrated_loudness(wav) + wav = pyln.normalize.loudness(wav, loudness, -22.0) + if np.abs(wav).max() > 1: + wav = wav / np.abs(wav).max() + + # get amplitude spectrogram + x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size, + win_length=win_length, window=window, pad_mode="constant") + linear_spc = np.abs(x_stft) # (n_bins, T) + + # get mel basis + fmin = 0 if fmin == -1 else fmin + fmax = sample_rate / 2 if fmax == -1 else fmax + mel_basis = librosa.filters.mel(sample_rate, fft_size, num_mels, fmin, fmax) + + # calculate mel spec + mel = mel_basis @ linear_spc + mel = np.log10(np.maximum(eps, mel)) # (n_mel_bins, T) + l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1) + wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0) + wav = wav[:mel.shape[1] * hop_size] + + # log linear spec + linear_spc = np.log10(np.maximum(eps, linear_spc)) + return {'wav': wav, 'mel': mel.T, 'linear': linear_spc.T, 'mel_basis': mel_basis} diff --git a/utils/audio/__pycache__/__init__.cpython-36.pyc b/utils/audio/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..675df7303d6b037724e8a71a5802e851eafe83cb Binary files /dev/null and b/utils/audio/__pycache__/__init__.cpython-36.pyc differ diff --git a/utils/audio/__pycache__/__init__.cpython-37.pyc b/utils/audio/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2f3510417186dbe16fe9aa40b18d4e035b2c062 Binary files /dev/null and b/utils/audio/__pycache__/__init__.cpython-37.pyc differ diff --git a/utils/audio/__pycache__/align.cpython-36.pyc b/utils/audio/__pycache__/align.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d904cc3fccf750eb02f336170268c4adf36617a Binary files /dev/null and b/utils/audio/__pycache__/align.cpython-36.pyc differ diff --git a/utils/audio/__pycache__/align.cpython-37.pyc b/utils/audio/__pycache__/align.cpython-37.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..eef0d98909ad7ec45f70c11a91d9609dfc746d73 Binary files /dev/null and b/utils/audio/__pycache__/align.cpython-37.pyc differ diff --git a/utils/audio/__pycache__/cwt.cpython-36.pyc b/utils/audio/__pycache__/cwt.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0edd28df08c8a096832422487e4dd202378795ea Binary files /dev/null and b/utils/audio/__pycache__/cwt.cpython-36.pyc differ diff --git a/utils/audio/__pycache__/cwt.cpython-37.pyc b/utils/audio/__pycache__/cwt.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..081c43bd3a882d4a8d001bbb91099f1528d53176 Binary files /dev/null and b/utils/audio/__pycache__/cwt.cpython-37.pyc differ diff --git a/utils/audio/__pycache__/io.cpython-36.pyc b/utils/audio/__pycache__/io.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd2bf7b6739ad681fa71d24b806ab9ca23c81088 Binary files /dev/null and b/utils/audio/__pycache__/io.cpython-36.pyc differ diff --git a/utils/audio/__pycache__/io.cpython-37.pyc b/utils/audio/__pycache__/io.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a1f80a67a85c7e484440f5dba55f4353f689bfd Binary files /dev/null and b/utils/audio/__pycache__/io.cpython-37.pyc differ diff --git a/utils/audio/__pycache__/pitch_extractors.cpython-36.pyc b/utils/audio/__pycache__/pitch_extractors.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2ab050ccaa44fb60d8068f52f1c97e79829dfe8 Binary files /dev/null and b/utils/audio/__pycache__/pitch_extractors.cpython-36.pyc differ diff --git a/utils/audio/__pycache__/pitch_extractors.cpython-37.pyc b/utils/audio/__pycache__/pitch_extractors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8ebd46ab0025d782b0431109a922a5074a71f33 Binary files /dev/null and b/utils/audio/__pycache__/pitch_extractors.cpython-37.pyc differ diff --git a/utils/audio/__pycache__/rnnoise.cpython-36.pyc b/utils/audio/__pycache__/rnnoise.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c476768f816fd20b756fda50fdaa15ea0caf2498 Binary files /dev/null and b/utils/audio/__pycache__/rnnoise.cpython-36.pyc differ diff --git a/utils/audio/__pycache__/rnnoise.cpython-37.pyc b/utils/audio/__pycache__/rnnoise.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..626b98654cd6a6a25d24dad1999c621ee47edc2c Binary files /dev/null and b/utils/audio/__pycache__/rnnoise.cpython-37.pyc differ diff --git a/utils/audio/__pycache__/vad.cpython-36.pyc b/utils/audio/__pycache__/vad.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2b0aed911b60ebb1ad1ec22dcd75fed04f1ef28 Binary files /dev/null and b/utils/audio/__pycache__/vad.cpython-36.pyc differ diff --git a/utils/audio/__pycache__/vad.cpython-37.pyc b/utils/audio/__pycache__/vad.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ccc019e6b6c27619ff41e44db439459c89532e4 Binary files /dev/null and b/utils/audio/__pycache__/vad.cpython-37.pyc differ diff --git a/utils/audio/align.py b/utils/audio/align.py new file mode 100644 index 0000000000000000000000000000000000000000..096e593f01b51ee2d9f565666cfa8c40a90d76f1 --- /dev/null +++ b/utils/audio/align.py @@ -0,0 +1,90 @@ +import re + +import torch +import numpy as np +from textgrid import TextGrid + +from utils.text.text_encoder import is_sil_phoneme + + +def 
get_mel2ph(tg_fn, ph, mel, hop_size, audio_sample_rate, min_sil_duration=0): + ph_list = ph.split(" ") + itvs = TextGrid.fromFile(tg_fn)[1] + itvs_ = [] + for i in range(len(itvs)): + if itvs[i].maxTime - itvs[i].minTime < min_sil_duration and i > 0 and is_sil_phoneme(itvs[i].mark): + itvs_[-1].maxTime = itvs[i].maxTime + else: + itvs_.append(itvs[i]) + itvs.intervals = itvs_ + itv_marks = [itv.mark for itv in itvs] + tg_len = len([x for x in itvs if not is_sil_phoneme(x.mark)]) + ph_len = len([x for x in ph_list if not is_sil_phoneme(x)]) + assert tg_len == ph_len, (tg_len, ph_len, itv_marks, ph_list, tg_fn) + mel2ph = np.zeros([mel.shape[0]], int) + i_itv = 0 + i_ph = 0 + while i_itv < len(itvs): + itv = itvs[i_itv] + ph = ph_list[i_ph] + itv_ph = itv.mark + start_frame = int(itv.minTime * audio_sample_rate / hop_size + 0.5) + end_frame = int(itv.maxTime * audio_sample_rate / hop_size + 0.5) + if is_sil_phoneme(itv_ph) and not is_sil_phoneme(ph): + mel2ph[start_frame:end_frame] = i_ph + i_itv += 1 + elif not is_sil_phoneme(itv_ph) and is_sil_phoneme(ph): + i_ph += 1 + else: + if not ((is_sil_phoneme(itv_ph) and is_sil_phoneme(ph)) \ + or re.sub(r'\d+', '', itv_ph.lower()) == re.sub(r'\d+', '', ph.lower())): + print(f"| WARN: {tg_fn} phs are not same: ", itv_ph, ph, itv_marks, ph_list) + mel2ph[start_frame:end_frame] = i_ph + 1 + i_ph += 1 + i_itv += 1 + mel2ph[-1] = mel2ph[-2] + assert not np.any(mel2ph == 0) + T_t = len(ph_list) + dur = mel2token_to_dur(mel2ph, T_t) + return mel2ph.tolist(), dur.tolist() + + +def split_audio_by_mel2ph(audio, mel2ph, hop_size, audio_num_mel_bins): + if isinstance(audio, torch.Tensor): + audio = audio.numpy() + if isinstance(mel2ph, torch.Tensor): + mel2ph = mel2ph.numpy() + assert len(audio.shape) == 1, len(mel2ph.shape) == 1 + split_locs = [] + for i in range(1, len(mel2ph)): + if mel2ph[i] != mel2ph[i - 1]: + split_loc = i * hop_size + split_locs.append(split_loc) + + new_audio = [] + for i in range(len(split_locs) - 1): + new_audio.append(audio[split_locs[i]:split_locs[i + 1]]) + new_audio.append(np.zeros([0.5 * audio_num_mel_bins])) + return np.concatenate(new_audio) + + +def mel2token_to_dur(mel2token, T_txt=None, max_dur=None): + is_torch = isinstance(mel2token, torch.Tensor) + has_batch_dim = True + if not is_torch: + mel2token = torch.LongTensor(mel2token) + if T_txt is None: + T_txt = mel2token.max() + if len(mel2token.shape) == 1: + mel2token = mel2token[None, ...] 
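        # a 1-D mel2token is treated as a single batch item; the scatter_add below counts
        # how many frames map to each token index, which is exactly the per-token duration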
+ has_batch_dim = False + B, _ = mel2token.shape + dur = mel2token.new_zeros(B, T_txt + 1).scatter_add(1, mel2token, torch.ones_like(mel2token)) + dur = dur[:, 1:] + if max_dur is not None: + dur = dur.clamp(max=max_dur) + if not is_torch: + dur = dur.numpy() + if not has_batch_dim: + dur = dur[0] + return dur diff --git a/utils/audio/cwt.py b/utils/audio/cwt.py new file mode 100644 index 0000000000000000000000000000000000000000..9d42ffc7b40d200dcfa66a0823163f98173b50e6 --- /dev/null +++ b/utils/audio/cwt.py @@ -0,0 +1,143 @@ +import numpy as np +from pycwt import wavelet +from scipy.interpolate import interp1d + +dt = 0.005 +dj = 1 + + +def convert_continuos_f0(f0): + '''CONVERT F0 TO CONTINUOUS F0 + Args: + f0 (ndarray): original f0 sequence with the shape (T) + Return: + (ndarray): continuous f0 with the shape (T) + ''' + # get uv information as binary + f0 = np.copy(f0) + uv = (f0 == 0).astype(float) + + # get start and end of f0 + if (f0 == 0).all(): + print("| all of the f0 values are 0.") + return uv, f0 + start_f0 = f0[f0 != 0][0] + end_f0 = f0[f0 != 0][-1] + + # padding start and end of f0 sequence + start_idx = np.where(f0 == start_f0)[0][0] + end_idx = np.where(f0 == end_f0)[0][-1] + f0[:start_idx] = start_f0 + f0[end_idx:] = end_f0 + + # get non-zero frame index + nz_frames = np.where(f0 != 0)[0] + + # perform linear interpolation + f = interp1d(nz_frames, f0[nz_frames]) + cont_f0 = f(np.arange(0, f0.shape[0])) + + return uv, cont_f0 + + +def get_cont_lf0(f0, frame_period=5.0): + uv, cont_f0_lpf = convert_continuos_f0(f0) + # cont_f0_lpf = low_pass_filter(cont_f0_lpf, int(1.0 / (frame_period * 0.001)), cutoff=20) + cont_lf0_lpf = np.log(cont_f0_lpf) + return uv, cont_lf0_lpf + + +def get_lf0_cwt(lf0): + ''' + input: + signal of shape (N) + output: + Wavelet_lf0 of shape(10, N), scales of shape(10) + ''' + mother = wavelet.MexicanHat() + s0 = dt * 2 + J = 9 + + Wavelet_lf0, scales, _, _, _, _ = wavelet.cwt(np.squeeze(lf0), dt, dj, s0, J, mother) + # Wavelet.shape => (J + 1, len(lf0)) + Wavelet_lf0 = np.real(Wavelet_lf0).T + return Wavelet_lf0, scales + + +def norm_scale(Wavelet_lf0): + mean = Wavelet_lf0.mean(0)[None, :] + std = Wavelet_lf0.std(0)[None, :] + Wavelet_lf0_norm = (Wavelet_lf0 - mean) / std + return Wavelet_lf0_norm, mean, std + + +def normalize_cwt_lf0(f0, mean, std): + uv, cont_lf0_lpf = get_cont_lf0(f0) + cont_lf0_norm = (cont_lf0_lpf - mean) / std + Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_norm) + Wavelet_lf0_norm, _, _ = norm_scale(Wavelet_lf0) + + return Wavelet_lf0_norm + + +def get_lf0_cwt_norm(f0s, mean, std): + uvs = list() + cont_lf0_lpfs = list() + cont_lf0_lpf_norms = list() + Wavelet_lf0s = list() + Wavelet_lf0s_norm = list() + scaless = list() + + means = list() + stds = list() + for f0 in f0s: + uv, cont_lf0_lpf = get_cont_lf0(f0) + cont_lf0_lpf_norm = (cont_lf0_lpf - mean) / std + + Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm) # [560,10] + Wavelet_lf0_norm, mean_scale, std_scale = norm_scale(Wavelet_lf0) # [560,10],[1,10],[1,10] + + Wavelet_lf0s_norm.append(Wavelet_lf0_norm) + uvs.append(uv) + cont_lf0_lpfs.append(cont_lf0_lpf) + cont_lf0_lpf_norms.append(cont_lf0_lpf_norm) + Wavelet_lf0s.append(Wavelet_lf0) + scaless.append(scales) + means.append(mean_scale) + stds.append(std_scale) + + return Wavelet_lf0s_norm, scaless, means, stds + + +def inverse_cwt_torch(Wavelet_lf0, scales): + import torch + b = ((torch.arange(0, len(scales)).float().to(Wavelet_lf0.device)[None, None, :] + 1 + 2.5) ** (-2.5)) + lf0_rec = Wavelet_lf0 * b + 
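    # sum the scale-weighted wavelet components and re-standardize to recover a normalized log-f0 contour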
lf0_rec_sum = lf0_rec.sum(-1) + lf0_rec_sum = (lf0_rec_sum - lf0_rec_sum.mean(-1, keepdim=True)) / lf0_rec_sum.std(-1, keepdim=True) + return lf0_rec_sum + + +def inverse_cwt(Wavelet_lf0, scales): + # mother = wavelet.MexicanHat() + # lf0_rec_sum = wavelet.icwt(Wavelet_lf0[0].T, scales, dt, dj, mother) + b = ((np.arange(0, len(scales))[None, None, :] + 1 + 2.5) ** (-2.5)) + lf0_rec = Wavelet_lf0 * b + lf0_rec_sum = lf0_rec.sum(-1) + # lf0_rec_sum = lf0_rec_sum[None, ...] + lf0_rec_sum = (lf0_rec_sum - lf0_rec_sum.mean(-1, keepdims=True)) / lf0_rec_sum.std(-1, keepdims=True) + return lf0_rec_sum + + +def cwt2f0(cwt_spec, mean, std, cwt_scales): + assert len(mean.shape) == 1 and len(std.shape) == 1 and len(cwt_spec.shape) == 3 + import torch + if isinstance(cwt_spec, torch.Tensor): + f0 = inverse_cwt_torch(cwt_spec, cwt_scales) + f0 = f0 * std[:, None] + mean[:, None] + f0 = f0.exp() # [B, T] + else: + f0 = inverse_cwt(cwt_spec, cwt_scales) + f0 = f0 * std[:, None] + mean[:, None] + f0 = np.exp(f0) # [B, T] + return f0 diff --git a/utils/audio/griffin_lim.py b/utils/audio/griffin_lim.py new file mode 100644 index 0000000000000000000000000000000000000000..960132b6a1b8befaf5d0ca968f9908405323d89f --- /dev/null +++ b/utils/audio/griffin_lim.py @@ -0,0 +1,85 @@ +import librosa +import numpy as np +import torch +import torch.nn.functional as F + + +def _stft(y, hop_size, win_size, fft_size): + return librosa.stft(y=y, n_fft=fft_size, hop_length=hop_size, win_length=win_size, pad_mode='constant') + + +def _istft(y, hop_size, win_size): + return librosa.istft(y, hop_length=hop_size, win_length=win_size) + + +def griffin_lim(S, hop_size, win_size, fft_size, angles=None, n_iters=30): + angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) if angles is None else angles + S_complex = np.abs(S).astype(np.complex) + y = _istft(S_complex * angles, hop_size, win_size) + for i in range(n_iters): + angles = np.exp(1j * np.angle(_stft(y, hop_size, win_size, fft_size))) + y = _istft(S_complex * angles, hop_size, win_size) + return y + + +def istft(amp, ang, hop_size, win_size, fft_size, pad=False, window=None): + spec = amp * torch.exp(1j * ang) + spec_r = spec.real + spec_i = spec.imag + spec = torch.stack([spec_r, spec_i], -1) + if window is None: + window = torch.hann_window(win_size).to(amp.device) + if pad: + spec = F.pad(spec, [0, 0, 0, 1], mode='reflect') + wav = torch.istft(spec, fft_size, hop_size, win_size) + return wav + + +def griffin_lim_torch(S, hop_size, win_size, fft_size, angles=None, n_iters=30): + """ + + Examples: + >>> x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size, win_length=win_length, pad_mode="constant") + >>> x_stft = x_stft[None, ...] 
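    >>> # x_stft[None, ...] adds a batch dimension so amp below has the [B, n_fft, T] shape expected by this function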
+ >>> amp = np.abs(x_stft) + >>> angle_init = np.exp(2j * np.pi * np.random.rand(*x_stft.shape)) + >>> amp = torch.FloatTensor(amp) + >>> wav = griffin_lim_torch(amp, angle_init, hparams) + + :param amp: [B, n_fft, T] + :param ang: [B, n_fft, T] + :return: [B, T_wav] + """ + angles = torch.exp(2j * np.pi * torch.rand(*S.shape)) if angles is None else angles + window = torch.hann_window(win_size).to(S.device) + y = istft(S, angles, hop_size, win_size, fft_size, window=window) + for i in range(n_iters): + x_stft = torch.stft(y, fft_size, hop_size, win_size, window) + x_stft = x_stft[..., 0] + 1j * x_stft[..., 1] + angles = torch.angle(x_stft) + y = istft(S, angles, hop_size, win_size, fft_size, window=window) + return y + + +# Conversions +_mel_basis = None +_inv_mel_basis = None + + +def _build_mel_basis(audio_sample_rate, fft_size, audio_num_mel_bins, fmin, fmax): + assert fmax <= audio_sample_rate // 2 + return librosa.filters.mel(audio_sample_rate, fft_size, n_mels=audio_num_mel_bins, fmin=fmin, fmax=fmax) + + +def _linear_to_mel(spectogram, audio_sample_rate, fft_size, audio_num_mel_bins, fmin, fmax): + global _mel_basis + if _mel_basis is None: + _mel_basis = _build_mel_basis(audio_sample_rate, fft_size, audio_num_mel_bins, fmin, fmax) + return np.dot(_mel_basis, spectogram) + + +def _mel_to_linear(mel_spectrogram, audio_sample_rate, fft_size, audio_num_mel_bins, fmin, fmax): + global _inv_mel_basis + if _inv_mel_basis is None: + _inv_mel_basis = np.linalg.pinv(_build_mel_basis(audio_sample_rate, fft_size, audio_num_mel_bins, fmin, fmax)) + return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) diff --git a/utils/audio/io.py b/utils/audio/io.py new file mode 100644 index 0000000000000000000000000000000000000000..34d5d20ae13e9aa481b1bc85117ad6539af8a624 --- /dev/null +++ b/utils/audio/io.py @@ -0,0 +1,22 @@ +import subprocess + +import numpy as np +from scipy.io import wavfile + + +def save_wav(wav, path, sr, norm=False): + if norm: + wav = wav / np.abs(wav).max() + wav = wav * 32767 + wavfile.write(path[:-4] + '.wav', sr, wav.astype(np.int16)) + if path[-4:] == '.mp3': + to_mp3(path[:-4]) + + +def to_mp3(out_path): + if out_path[-4:] == '.wav': + out_path = out_path[:-4] + subprocess.check_call( + f'ffmpeg -threads 1 -loglevel error -i "{out_path}.wav" -vn -b:a 192k -y -hide_banner -async 1 "{out_path}.mp3"', + shell=True, stdin=subprocess.PIPE) + subprocess.check_call(f'rm -f "{out_path}.wav"', shell=True) diff --git a/utils/audio/pitch/__pycache__/utils.cpython-36.pyc b/utils/audio/pitch/__pycache__/utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96606f14a1cc2a2014dd0d13d40d4283a5029a69 Binary files /dev/null and b/utils/audio/pitch/__pycache__/utils.cpython-36.pyc differ diff --git a/utils/audio/pitch/__pycache__/utils.cpython-37.pyc b/utils/audio/pitch/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcaababceea8373de4de714d6e27febcd26d121a Binary files /dev/null and b/utils/audio/pitch/__pycache__/utils.cpython-37.pyc differ diff --git a/utils/audio/pitch/utils.py b/utils/audio/pitch/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..238b8022185753a7d4d9d674d189a99050c29b6f --- /dev/null +++ b/utils/audio/pitch/utils.py @@ -0,0 +1,82 @@ +import numpy as np +import torch + + +def to_lf0(f0): + f0[f0 < 1.0e-5] = 1.0e-6 + lf0 = f0.log() if isinstance(f0, torch.Tensor) else np.log(f0) + lf0[f0 < 1.0e-5] = - 1.0E+10 + return lf0 + + +def to_f0(lf0): + f0 
= np.where(lf0 <= 0, 0.0, np.exp(lf0)) + return f0.flatten() + + +def f0_to_coarse(f0, f0_bin=256, f0_max=900.0, f0_min=50.0): + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + is_torch = isinstance(f0, torch.Tensor) + f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 + + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min(), f0.min(), f0.max()) + return f0_coarse + + +def coarse_to_f0(f0_coarse, f0_bin=256, f0_max=900.0, f0_min=50.0): + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + uv = f0_coarse == 1 + f0 = f0_mel_min + (f0_coarse - 1) * (f0_mel_max - f0_mel_min) / (f0_bin - 2) + f0 = ((f0 / 1127).exp() - 1) * 700 + f0[uv] = 0 + return f0 + + +def norm_f0(f0, uv, pitch_norm='log', f0_mean=400, f0_std=100): + is_torch = isinstance(f0, torch.Tensor) + if pitch_norm == 'standard': + f0 = (f0 - f0_mean) / f0_std + if pitch_norm == 'log': + f0 = torch.log2(f0 + 1e-8) if is_torch else np.log2(f0 + 1e-8) + if uv is not None: + f0[uv > 0] = 0 + return f0 + + +def norm_interp_f0(f0, pitch_norm='log', f0_mean=None, f0_std=None): + is_torch = isinstance(f0, torch.Tensor) + if is_torch: + device = f0.device + f0 = f0.data.cpu().numpy() + uv = f0 == 0 + f0 = norm_f0(f0, uv, pitch_norm, f0_mean, f0_std) + if sum(uv) == len(f0): + f0[uv] = 0 + elif sum(uv) > 0: + f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) + if is_torch: + uv = torch.FloatTensor(uv) + f0 = torch.FloatTensor(f0) + f0 = f0.to(device) + uv = uv.to(device) + return f0, uv + + +def denorm_f0(f0, uv, pitch_norm='log', f0_mean=400, f0_std=100, pitch_padding=None, min=50, max=900): + is_torch = isinstance(f0, torch.Tensor) + if pitch_norm == 'standard': + f0 = f0 * f0_std + f0_mean + if pitch_norm == 'log': + f0 = 2 ** f0 + f0 = f0.clamp(min=min, max=max) if is_torch else np.clip(f0, a_min=min, a_max=max) + if uv is not None: + f0[uv > 0] = 0 + if pitch_padding is not None: + f0[pitch_padding] = 0 + return f0 diff --git a/utils/audio/pitch_extractors.py b/utils/audio/pitch_extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..eb19c50d55d198157b2e6adedd8a343d9c363395 --- /dev/null +++ b/utils/audio/pitch_extractors.py @@ -0,0 +1,40 @@ +import numpy as np + +PITCH_EXTRACTOR = {} + + +def register_pitch_extractor(name): + def register_pitch_extractor_(cls): + PITCH_EXTRACTOR[name] = cls + return cls + + return register_pitch_extractor_ + + +def get_pitch_extractor(name): + return PITCH_EXTRACTOR[name] + + +def extract_pitch_simple(wav): + from utils.commons.hparams import hparams + return extract_pitch(hparams['pitch_extractor'], wav, + hparams['hop_size'], hparams['audio_sample_rate'], + f0_min=hparams['f0_min'], f0_max=hparams['f0_max']) + + +def extract_pitch(extractor_name, wav_data, hop_size, audio_sample_rate, f0_min=75, f0_max=800, **kwargs): + return get_pitch_extractor(extractor_name)(wav_data, hop_size, audio_sample_rate, f0_min, f0_max, **kwargs) + + +@register_pitch_extractor('parselmouth') +def parselmouth_pitch(wav_data, hop_size, audio_sample_rate, f0_min, f0_max, + voicing_threshold=0.6, *args, **kwargs): + import parselmouth + time_step = hop_size / audio_sample_rate * 1000 
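+ # `time_step` is computed in milliseconds here; it is converted back to seconds
+ # (time_step / 1000) for Praat's to_pitch_ac below, and the resulting f0 track is
+ # zero-padded so that its length matches the number of mel frames.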
+ n_mel_frames = int(len(wav_data) // hop_size) + f0_pm = parselmouth.Sound(wav_data, audio_sample_rate).to_pitch_ac( + time_step=time_step / 1000, voicing_threshold=voicing_threshold, + pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] + pad_size = (n_mel_frames - len(f0_pm) + 1) // 2 + f0 = np.pad(f0_pm, [[pad_size, n_mel_frames - len(f0_pm) - pad_size]], mode='constant') + return f0 diff --git a/utils/audio/rnnoise.py b/utils/audio/rnnoise.py new file mode 100644 index 0000000000000000000000000000000000000000..47f4eb6471918ca8144f217580a71d1720cd8c36 --- /dev/null +++ b/utils/audio/rnnoise.py @@ -0,0 +1,48 @@ +# rnnoise.py, requirements: ffmpeg, sox, rnnoise, python +import os +import subprocess + +INSTALL_STR = """ +RNNoise library not found. Please install RNNoise (https://github.com/xiph/rnnoise) to $REPO/rnnoise: +sudo apt-get install -y autoconf automake libtool ffmpeg sox +git clone https://github.com/xiph/rnnoise.git +rm -rf rnnoise/.git +cd rnnoise +./autogen.sh && ./configure && make +cd .. +""" + + +def rnnoise(filename, out_fn=None, verbose=False, out_sample_rate=22050): + assert os.path.exists('./rnnoise/examples/rnnoise_demo'), INSTALL_STR + if out_fn is None: + out_fn = f"{filename[:-4]}.denoised.wav" + out_48k_fn = f"{out_fn}.48000.wav" + tmp0_fn = f"{out_fn}.0.wav" + tmp1_fn = f"{out_fn}.1.wav" + tmp2_fn = f"{out_fn}.2.raw" + tmp3_fn = f"{out_fn}.3.raw" + if verbose: + print("Pre-processing audio...") # wav to pcm raw + subprocess.check_call( + f'sox "{filename}" -G -r48000 "{tmp0_fn}"', shell=True, stdin=subprocess.PIPE) # convert to raw + subprocess.check_call( + f'sox -v 0.95 "{tmp0_fn}" "{tmp1_fn}"', shell=True, stdin=subprocess.PIPE) # convert to raw + subprocess.check_call( + f'ffmpeg -y -i "{tmp1_fn}" -loglevel quiet -f s16le -ac 1 -ar 48000 "{tmp2_fn}"', + shell=True, stdin=subprocess.PIPE) # convert to raw + if verbose: + print("Applying rnnoise algorithm to audio...") # rnnoise + subprocess.check_call( + f'./rnnoise/examples/rnnoise_demo "{tmp2_fn}" "{tmp3_fn}"', shell=True) + + if verbose: + print("Post-processing audio...") # pcm raw to wav + if filename == out_fn: + subprocess.check_call(f'rm -f "{out_fn}"', shell=True) + subprocess.check_call( + f'sox -t raw -r 48000 -b 16 -e signed-integer -c 1 "{tmp3_fn}" "{out_48k_fn}"', shell=True) + subprocess.check_call(f'sox "{out_48k_fn}" -G -r{out_sample_rate} "{out_fn}"', shell=True) + subprocess.check_call(f'rm -f "{tmp0_fn}" "{tmp1_fn}" "{tmp2_fn}" "{tmp3_fn}" "{out_48k_fn}"', shell=True) + if verbose: + print("Audio-filtering completed!") diff --git a/utils/audio/vad.py b/utils/audio/vad.py new file mode 100644 index 0000000000000000000000000000000000000000..cbe9c7a6417f234ae46e1754d6736b26e22b2427 --- /dev/null +++ b/utils/audio/vad.py @@ -0,0 +1,78 @@ +from skimage.transform import resize +import struct +import webrtcvad +from scipy.ndimage.morphology import binary_dilation +import librosa +import numpy as np +import pyloudnorm as pyln +import warnings + +warnings.filterwarnings("ignore", message="Possible clipped samples in output") + +int16_max = (2 ** 15) - 1 + + +def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. + :param wav: the raw waveform as a numpy array of floats + :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have. 
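+ :param path: path of the audio file to load; the function reads from disk rather than taking a waveform directly
+ :param sr: sample rate to load the file at (None keeps the file's native rate)
+ :param norm: if True, loudness-normalize the waveform to -20 LUFS (BS.1770, via pyloudnorm) before running VAD
+ :param return_raw_wav: if True, return the full un-trimmed waveform together with the voice-activity mask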
+ :return: the same waveform with silences trimmed away (length <= original wav length) + """ + + ## Voice Activation Detection + # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. + # This sets the granularity of the VAD. Should not need to be changed. + sampling_rate = 16000 + wav_raw, sr = librosa.core.load(path, sr=sr) + + if norm: + meter = pyln.Meter(sr) # create BS.1770 meter + loudness = meter.integrated_loudness(wav_raw) + wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0) + if np.abs(wav_raw).max() > 1.0: + wav_raw = wav_raw / np.abs(wav_raw).max() + + wav = librosa.resample(wav_raw, sr, sampling_rate, res_type='kaiser_best') + + vad_window_length = 30 # In milliseconds + # Number of frames to average together when performing the moving average smoothing. + # The larger this value, the larger the VAD variations must be to not get smoothed out. + vad_moving_average_width = 8 + + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(np.bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + audio_mask = resize(audio_mask, (len(wav_raw),)) > 0 + if return_raw_wav: + return wav_raw, audio_mask, sr + return wav_raw[audio_mask], audio_mask, sr diff --git a/utils/commons/__pycache__/base_task.cpython-36.pyc b/utils/commons/__pycache__/base_task.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a999e66138ff69a21ca6bd5964f44fe5339f9cb Binary files /dev/null and b/utils/commons/__pycache__/base_task.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/base_task.cpython-37.pyc b/utils/commons/__pycache__/base_task.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9420b397ddbeac36ba5c8e7973339e5b53978a8b Binary files /dev/null and b/utils/commons/__pycache__/base_task.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/ckpt_utils.cpython-36.pyc b/utils/commons/__pycache__/ckpt_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b379ec228ed3e33a82fcf9ffa3c5ec734ed90de1 Binary files /dev/null and b/utils/commons/__pycache__/ckpt_utils.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/ckpt_utils.cpython-37.pyc b/utils/commons/__pycache__/ckpt_utils.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..356399275eafcababc80541a04416c8643fe390d Binary files /dev/null and b/utils/commons/__pycache__/ckpt_utils.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/dataset_utils.cpython-36.pyc b/utils/commons/__pycache__/dataset_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77a12cccb4fb26e7508e5745a6936867d81423b0 Binary files /dev/null and b/utils/commons/__pycache__/dataset_utils.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/dataset_utils.cpython-37.pyc b/utils/commons/__pycache__/dataset_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d26e3f8e113f303a57821c62b77d000d6b8221d8 Binary files /dev/null and b/utils/commons/__pycache__/dataset_utils.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/ddp_utils.cpython-36.pyc b/utils/commons/__pycache__/ddp_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a23403daf73714c4998a2ce2de2497d51401b18 Binary files /dev/null and b/utils/commons/__pycache__/ddp_utils.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/ddp_utils.cpython-37.pyc b/utils/commons/__pycache__/ddp_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..986f189d57a1f3bfbaa176a7436d91527e4551bf Binary files /dev/null and b/utils/commons/__pycache__/ddp_utils.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/hparams.cpython-36.pyc b/utils/commons/__pycache__/hparams.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9aad7ab2abcb53f79d4d2242790d2b8941513ce6 Binary files /dev/null and b/utils/commons/__pycache__/hparams.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/hparams.cpython-37.pyc b/utils/commons/__pycache__/hparams.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..070e462d4288628a2dd7b6ac570a4aa515e2de61 Binary files /dev/null and b/utils/commons/__pycache__/hparams.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/hparams.cpython-39.pyc b/utils/commons/__pycache__/hparams.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09a76235dbf5c729fe9d95a2753d7fa08e0b7ec4 Binary files /dev/null and b/utils/commons/__pycache__/hparams.cpython-39.pyc differ diff --git a/utils/commons/__pycache__/indexed_datasets.cpython-36.pyc b/utils/commons/__pycache__/indexed_datasets.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06591943fa16839a8c8345eeca5574dde7f51c0c Binary files /dev/null and b/utils/commons/__pycache__/indexed_datasets.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/indexed_datasets.cpython-37.pyc b/utils/commons/__pycache__/indexed_datasets.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38acce9fe2f68a1c0b0e47ef81b5415a2eda337f Binary files /dev/null and b/utils/commons/__pycache__/indexed_datasets.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/meters.cpython-36.pyc b/utils/commons/__pycache__/meters.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50be1b69cc0b379fc8e7c654e0f8ff077311040c Binary files /dev/null and b/utils/commons/__pycache__/meters.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/meters.cpython-37.pyc b/utils/commons/__pycache__/meters.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..37f9d5021a05820f0a8d791e6f2cafb2ccb10271 Binary files /dev/null and b/utils/commons/__pycache__/meters.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/multiprocess_utils.cpython-36.pyc b/utils/commons/__pycache__/multiprocess_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08ea0b88aba16592775df5a9283f3a0675e1256a Binary files /dev/null and b/utils/commons/__pycache__/multiprocess_utils.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/multiprocess_utils.cpython-37.pyc b/utils/commons/__pycache__/multiprocess_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2decb33e5b5b8dfbdc7e2b4d418d2d5e690af26 Binary files /dev/null and b/utils/commons/__pycache__/multiprocess_utils.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/single_thread_env.cpython-36.pyc b/utils/commons/__pycache__/single_thread_env.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40244077266c59176dff38c8afd855489229117c Binary files /dev/null and b/utils/commons/__pycache__/single_thread_env.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/single_thread_env.cpython-37.pyc b/utils/commons/__pycache__/single_thread_env.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f51bd1a577da02a3f4904c814ffc55f11a5fc9c2 Binary files /dev/null and b/utils/commons/__pycache__/single_thread_env.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/single_thread_env.cpython-39.pyc b/utils/commons/__pycache__/single_thread_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d87de5b641fd74ebbfd3d05bd433cdae00771117 Binary files /dev/null and b/utils/commons/__pycache__/single_thread_env.cpython-39.pyc differ diff --git a/utils/commons/__pycache__/tensor_utils.cpython-36.pyc b/utils/commons/__pycache__/tensor_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2718362915196b38bdfb4b19c0a1bda90b6d7941 Binary files /dev/null and b/utils/commons/__pycache__/tensor_utils.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/tensor_utils.cpython-37.pyc b/utils/commons/__pycache__/tensor_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a7b3b37a475ad20f59a5a6f3dbc5052114a833b Binary files /dev/null and b/utils/commons/__pycache__/tensor_utils.cpython-37.pyc differ diff --git a/utils/commons/__pycache__/trainer.cpython-36.pyc b/utils/commons/__pycache__/trainer.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04ff75ff2e5b3991f4a45e6eb4b0e187f38d9b3a Binary files /dev/null and b/utils/commons/__pycache__/trainer.cpython-36.pyc differ diff --git a/utils/commons/__pycache__/trainer.cpython-37.pyc b/utils/commons/__pycache__/trainer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2588ae523e45915c9217be6ae12653fd9956282f Binary files /dev/null and b/utils/commons/__pycache__/trainer.cpython-37.pyc differ diff --git a/utils/commons/base_task.py b/utils/commons/base_task.py new file mode 100755 index 0000000000000000000000000000000000000000..d90bc062f10f2db0a2ff1bd28a68bb7e8e35f924 --- /dev/null +++ b/utils/commons/base_task.py @@ -0,0 +1,232 @@ +import logging +import os +import random +import subprocess +import sys +from datetime import datetime +import numpy as np +import torch.utils.data +from torch import nn +from 
torch.utils.tensorboard import SummaryWriter +from utils.commons.dataset_utils import data_loader +from utils.commons.hparams import hparams +from utils.commons.meters import AvgrageMeter +from utils.commons.tensor_utils import tensors_to_scalars +from utils.commons.trainer import Trainer + +torch.multiprocessing.set_sharing_strategy(os.getenv('TORCH_SHARE_STRATEGY', 'file_system')) + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') + + +class BaseTask(nn.Module): + def __init__(self, *args, **kwargs): + super(BaseTask, self).__init__() + self.current_epoch = 0 + self.global_step = 0 + self.trainer = None + self.use_ddp = False + self.gradient_clip_norm = hparams['clip_grad_norm'] + self.gradient_clip_val = hparams.get('clip_grad_value', 0) + self.model = None + self.training_losses_meter = None + self.logger: SummaryWriter = None + + ###################### + # build model, dataloaders, optimizer, scheduler and tensorboard + ###################### + def build_model(self): + raise NotImplementedError + + @data_loader + def train_dataloader(self): + raise NotImplementedError + + @data_loader + def test_dataloader(self): + raise NotImplementedError + + @data_loader + def val_dataloader(self): + raise NotImplementedError + + def build_scheduler(self, optimizer): + return None + + def build_optimizer(self, model): + raise NotImplementedError + + def configure_optimizers(self): + optm = self.build_optimizer(self.model) + self.scheduler = self.build_scheduler(optm) + if isinstance(optm, (list, tuple)): + return optm + return [optm] + + def build_tensorboard(self, save_dir, name, **kwargs): + log_dir = os.path.join(save_dir, name) + os.makedirs(log_dir, exist_ok=True) + self.logger = SummaryWriter(log_dir=log_dir, **kwargs) + + ###################### + # training + ###################### + def on_train_start(self): + pass + + def on_train_end(self): + pass + + def on_epoch_start(self): + self.training_losses_meter = {'total_loss': AvgrageMeter()} + + def on_epoch_end(self): + loss_outputs = {k: round(v.avg, 4) for k, v in self.training_losses_meter.items()} + print(f"Epoch {self.current_epoch} ended. Steps: {self.global_step}. 
{loss_outputs}") + + def _training_step(self, sample, batch_idx, optimizer_idx): + """ + + :param sample: + :param batch_idx: + :return: total loss: torch.Tensor, loss_log: dict + """ + raise NotImplementedError + + def training_step(self, sample, batch_idx, optimizer_idx=-1): + """ + + :param sample: + :param batch_idx: + :param optimizer_idx: + :return: {'loss': torch.Tensor, 'progress_bar': dict, 'tb_log': dict} + """ + loss_ret = self._training_step(sample, batch_idx, optimizer_idx) + if loss_ret is None: + return {'loss': None} + total_loss, log_outputs = loss_ret + log_outputs = tensors_to_scalars(log_outputs) + for k, v in log_outputs.items(): + if k not in self.training_losses_meter: + self.training_losses_meter[k] = AvgrageMeter() + if not np.isnan(v): + self.training_losses_meter[k].update(v) + self.training_losses_meter['total_loss'].update(total_loss.item()) + + if optimizer_idx >= 0: + log_outputs[f'lr_{optimizer_idx}'] = self.trainer.optimizers[optimizer_idx].param_groups[0]['lr'] + + progress_bar_log = log_outputs + tb_log = {f'tr/{k}': v for k, v in log_outputs.items()} + return { + 'loss': total_loss, + 'progress_bar': progress_bar_log, + 'tb_log': tb_log + } + + def on_before_optimization(self, opt_idx): + if self.gradient_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(self.parameters(), self.gradient_clip_norm) + if self.gradient_clip_val > 0: + torch.nn.utils.clip_grad_value_(self.parameters(), self.gradient_clip_val) + + def on_after_optimization(self, epoch, batch_idx, optimizer, optimizer_idx): + if self.scheduler is not None: + self.scheduler.step(self.global_step // hparams['accumulate_grad_batches']) + + ###################### + # validation + ###################### + def validation_start(self): + pass + + def validation_step(self, sample, batch_idx): + """ + + :param sample: + :param batch_idx: + :return: output: {"losses": {...}, "total_loss": float, ...} or (total loss: torch.Tensor, loss_log: dict) + """ + raise NotImplementedError + + def validation_end(self, outputs): + """ + + :param outputs: + :return: loss_output: dict + """ + all_losses_meter = {'total_loss': AvgrageMeter()} + for output in outputs: + if len(output) == 0 or output is None: + continue + if isinstance(output, dict): + assert 'losses' in output, 'Key "losses" should exist in validation output.' 
+ n = output.pop('nsamples', 1) + losses = tensors_to_scalars(output['losses']) + total_loss = output.get('total_loss', sum(losses.values())) + else: + assert len(output) == 2, 'Validation output should only consist of two elements: (total_loss, losses)' + n = 1 + total_loss, losses = output + losses = tensors_to_scalars(losses) + if isinstance(total_loss, torch.Tensor): + total_loss = total_loss.item() + for k, v in losses.items(): + if k not in all_losses_meter: + all_losses_meter[k] = AvgrageMeter() + all_losses_meter[k].update(v, n) + all_losses_meter['total_loss'].update(total_loss, n) + loss_output = {k: round(v.avg, 4) for k, v in all_losses_meter.items()} + print(f"| Validation results@{self.global_step}: {loss_output}") + return { + 'tb_log': {f'val/{k}': v for k, v in loss_output.items()}, + 'val_loss': loss_output['total_loss'] + } + + ###################### + # testing + ###################### + def test_start(self): + pass + + def test_step(self, sample, batch_idx): + return self.validation_step(sample, batch_idx) + + def test_end(self, outputs): + return self.validation_end(outputs) + + ###################### + # start training/testing + ###################### + @classmethod + def start(cls): + os.environ['MASTER_PORT'] = str(random.randint(15000, 30000)) + random.seed(hparams['seed']) + np.random.seed(hparams['seed']) + work_dir = hparams['work_dir'] + trainer = Trainer( + work_dir=work_dir, + val_check_interval=hparams['val_check_interval'], + tb_log_interval=hparams['tb_log_interval'], + max_updates=hparams['max_updates'], + num_sanity_val_steps=hparams['num_sanity_val_steps'] if not hparams['validate'] else 10000, + accumulate_grad_batches=hparams['accumulate_grad_batches'], + print_nan_grads=hparams['print_nan_grads'], + resume_from_checkpoint=hparams.get('resume_from_checkpoint', 0), + amp=hparams['amp'], + monitor_key=hparams['valid_monitor_key'], + monitor_mode=hparams['valid_monitor_mode'], + num_ckpt_keep=hparams['num_ckpt_keep'], + save_best=hparams['save_best'], + seed=hparams['seed'], + debug=hparams['debug'] + ) + if not hparams['infer']: # train + trainer.fit(cls) + else: + trainer.test(cls) + + def on_keyboard_interrupt(self): + pass diff --git a/utils/commons/ckpt_utils.py b/utils/commons/ckpt_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9c1006d5852c6cf57063ce64e773d3c40ae9500d --- /dev/null +++ b/utils/commons/ckpt_utils.py @@ -0,0 +1,66 @@ +import glob +import os +import re +import torch + + +def get_last_checkpoint(work_dir, steps=None): + checkpoint = None + last_ckpt_path = None + ckpt_paths = get_all_ckpts(work_dir, steps) + if len(ckpt_paths) > 0: + last_ckpt_path = ckpt_paths[0] + checkpoint = torch.load(last_ckpt_path, map_location='cpu') + return checkpoint, last_ckpt_path + + +def get_all_ckpts(work_dir, steps=None): + if steps is None: + ckpt_path_pattern = f'{work_dir}/model_ckpt_steps_*.ckpt' + else: + ckpt_path_pattern = f'{work_dir}/model_ckpt_steps_{steps}.ckpt' + return sorted(glob.glob(ckpt_path_pattern), + key=lambda x: -int(re.findall('.*steps\_(\d+)\.ckpt', x)[0])) + + +def load_ckpt(cur_model, ckpt_base_dir, model_name='model', force=True, strict=True): + if os.path.isfile(ckpt_base_dir): + base_dir = os.path.dirname(ckpt_base_dir) + ckpt_path = ckpt_base_dir + checkpoint = torch.load(ckpt_base_dir, map_location='cpu') + else: + base_dir = ckpt_base_dir + checkpoint, ckpt_path = get_last_checkpoint(ckpt_base_dir) + if checkpoint is not None: + state_dict = checkpoint["state_dict"] + if len([k for k in 
state_dict.keys() if '.' in k]) > 0: + state_dict = {k[len(model_name) + 1:]: v for k, v in state_dict.items() + if k.startswith(f'{model_name}.')} + else: + if '.' not in model_name: + state_dict = state_dict[model_name] + else: + base_model_name = model_name.split('.')[0] + rest_model_name = model_name[len(base_model_name) + 1:] + state_dict = { + k[len(rest_model_name) + 1:]: v for k, v in state_dict[base_model_name].items() + if k.startswith(f'{rest_model_name}.')} + if not strict: + cur_model_state_dict = cur_model.state_dict() + unmatched_keys = [] + for key, param in state_dict.items(): + if key in cur_model_state_dict: + new_param = cur_model_state_dict[key] + if new_param.shape != param.shape: + unmatched_keys.append(key) + print("| Unmatched keys: ", key, new_param.shape, param.shape) + for key in unmatched_keys: + del state_dict[key] + cur_model.load_state_dict(state_dict, strict=strict) + print(f"| load '{model_name}' from '{ckpt_path}'.") + else: + e_msg = f"| ckpt not found in {base_dir}." + if force: + assert False, e_msg + else: + print(e_msg) diff --git a/utils/commons/dataset_utils.py b/utils/commons/dataset_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..44c2ca0ce3226fa21bf9d7c7fa889b23ef9b0fa9 --- /dev/null +++ b/utils/commons/dataset_utils.py @@ -0,0 +1,247 @@ +import os +import sys +import traceback +import types +from functools import wraps +from itertools import chain +import numpy as np +import torch.utils.data +from torch.utils.data import ConcatDataset +from utils.commons.hparams import hparams + + +def collate_1d_or_2d(values, pad_idx=0, left_pad=False, shift_right=False, max_len=None, shift_id=1): + if len(values[0].shape) == 1: + return collate_1d(values, pad_idx, left_pad, shift_right, max_len, shift_id) + else: + return collate_2d(values, pad_idx, left_pad, shift_right, max_len) + + +def collate_1d(values, pad_idx=0, left_pad=False, shift_right=False, max_len=None, shift_id=1): + """Convert a list of 1d tensors into a padded 2d tensor.""" + size = max(v.size(0) for v in values) if max_len is None else max_len + res = values[0].new(len(values), size).fill_(pad_idx) + + def copy_tensor(src, dst): + assert dst.numel() == src.numel() + if shift_right: + dst[1:] = src[:-1] + dst[0] = shift_id + else: + dst.copy_(src) + + for i, v in enumerate(values): + copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)]) + return res + + +def collate_2d(values, pad_idx=0, left_pad=False, shift_right=False, max_len=None): + """Convert a list of 2d tensors into a padded 3d tensor.""" + size = max(v.size(0) for v in values) if max_len is None else max_len + res = values[0].new(len(values), size, values[0].shape[1]).fill_(pad_idx) + + def copy_tensor(src, dst): + assert dst.numel() == src.numel() + if shift_right: + dst[1:] = src[:-1] + else: + dst.copy_(src) + + for i, v in enumerate(values): + copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)]) + return res + + +def _is_batch_full(batch, num_tokens, max_tokens, max_sentences): + if len(batch) == 0: + return 0 + if len(batch) == max_sentences: + return 1 + if num_tokens > max_tokens: + return 1 + return 0 + + +def batch_by_size( + indices, num_tokens_fn, max_tokens=None, max_sentences=None, + required_batch_size_multiple=1, distributed=False +): + """ + Yield mini-batches of indices bucketed by size. Batches may contain + sequences of different lengths. 
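+ A new batch is started as soon as appending the next index would exceed max_tokens
+ (approximated as batch size times the longest sample seen so far) or max_sentences.
+
+ Example (illustrative; `indices` and `sizes` are placeholders):
+     >>> batches = batch_by_size(indices, lambda i: sizes[i], max_tokens=40000, max_sentences=64)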
+ + Args: + indices (List[int]): ordered list of dataset indices + num_tokens_fn (callable): function that returns the number of tokens at + a given index + max_tokens (int, optional): max number of tokens in each batch + (default: None). + max_sentences (int, optional): max number of sentences in each + batch (default: None). + required_batch_size_multiple (int, optional): require batch size to + be a multiple of N (default: 1). + """ + max_tokens = max_tokens if max_tokens is not None else sys.maxsize + max_sentences = max_sentences if max_sentences is not None else sys.maxsize + bsz_mult = required_batch_size_multiple + + if isinstance(indices, types.GeneratorType): + indices = np.fromiter(indices, dtype=np.int64, count=-1) + + sample_len = 0 + sample_lens = [] + batch = [] + batches = [] + for i in range(len(indices)): + idx = indices[i] + num_tokens = num_tokens_fn(idx) + sample_lens.append(num_tokens) + sample_len = max(sample_len, num_tokens) + + assert sample_len <= max_tokens, ( + "sentence at index {} of size {} exceeds max_tokens " + "limit of {}!".format(idx, sample_len, max_tokens) + ) + num_tokens = (len(batch) + 1) * sample_len + + if _is_batch_full(batch, num_tokens, max_tokens, max_sentences): + mod_len = max( + bsz_mult * (len(batch) // bsz_mult), + len(batch) % bsz_mult, + ) + batches.append(batch[:mod_len]) + batch = batch[mod_len:] + sample_lens = sample_lens[mod_len:] + sample_len = max(sample_lens) if len(sample_lens) > 0 else 0 + batch.append(idx) + if len(batch) > 0: + batches.append(batch) + return batches + + +def unpack_dict_to_list(samples): + samples_ = [] + bsz = samples.get('outputs').size(0) + for i in range(bsz): + res = {} + for k, v in samples.items(): + try: + res[k] = v[i] + except: + pass + samples_.append(res) + return samples_ + + +def remove_padding(x, padding_idx=0): + if x is None: + return None + assert len(x.shape) in [1, 2] + if len(x.shape) == 2: # [T, H] + return x[np.abs(x).sum(-1) != padding_idx] + elif len(x.shape) == 1: # [T] + return x[x != padding_idx] + + +def data_loader(fn): + """ + Decorator to make any fx with this use the lazy property + :param fn: + :return: + """ + + wraps(fn) + attr_name = '_lazy_' + fn.__name__ + + def _get_data_loader(self): + try: + value = getattr(self, attr_name) + except AttributeError: + try: + value = fn(self) # Lazy evaluation, done only once. + except AttributeError as e: + # Guard against AttributeError suppression. (Issue #142) + traceback.print_exc() + error = f'{fn.__name__}: An AttributeError was encountered: ' + str(e) + raise RuntimeError(error) from e + setattr(self, attr_name, value) # Memoize evaluation. + return value + + return _get_data_loader + + +class BaseDataset(torch.utils.data.Dataset): + def __init__(self, shuffle): + super().__init__() + self.hparams = hparams + self.shuffle = shuffle + self.sort_by_len = hparams['sort_by_len'] + self.sizes = None + + @property + def _sizes(self): + return self.sizes + + def __getitem__(self, index): + raise NotImplementedError + + def collater(self, samples): + raise NotImplementedError + + def __len__(self): + return len(self._sizes) + + def num_tokens(self, index): + return self.size(index) + + def size(self, index): + """Return an example's size as a float or tuple. This value is used when + filtering a dataset with ``--max-positions``.""" + return min(self._sizes[index], hparams['max_frames']) + + def ordered_indices(self): + """Return an ordered list of indices. 
Batches will be constructed based + on this order.""" + if self.shuffle: + indices = np.random.permutation(len(self)) + if self.sort_by_len: + indices = indices[np.argsort(np.array(self._sizes)[indices], kind='mergesort')] + else: + indices = np.arange(len(self)) + return indices + + @property + def num_workers(self): + return int(os.getenv('NUM_WORKERS', hparams['ds_workers'])) + + +class BaseConcatDataset(ConcatDataset): + def collater(self, samples): + return self.datasets[0].collater(samples) + + @property + def _sizes(self): + if not hasattr(self, 'sizes'): + self.sizes = list(chain.from_iterable([d._sizes for d in self.datasets])) + return self.sizes + + def size(self, index): + return min(self._sizes[index], hparams['max_frames']) + + def num_tokens(self, index): + return self.size(index) + + def ordered_indices(self): + """Return an ordered list of indices. Batches will be constructed based + on this order.""" + if self.datasets[0].shuffle: + indices = np.random.permutation(len(self)) + if self.datasets[0].sort_by_len: + indices = indices[np.argsort(np.array(self._sizes)[indices], kind='mergesort')] + else: + indices = np.arange(len(self)) + return indices + + @property + def num_workers(self): + return self.datasets[0].num_workers diff --git a/utils/commons/ddp_utils.py b/utils/commons/ddp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4b529198c13a1ffc622baea6e5178407b24aee8f --- /dev/null +++ b/utils/commons/ddp_utils.py @@ -0,0 +1,137 @@ +from torch.nn.parallel import DistributedDataParallel +from torch.nn.parallel.distributed import _find_tensors +import torch.optim +import torch.utils.data +import torch +from packaging import version + +class DDP(DistributedDataParallel): + """ + Override the forward call in lightning so it goes to training and validation step respectively + """ + + def forward(self, *inputs, **kwargs): # pragma: no cover + if version.parse(torch.__version__[:6]) < version.parse("1.11"): + self._sync_params() + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + assert len(self.device_ids) == 1 + if self.module.training: + output = self.module.training_step(*inputs[0], **kwargs[0]) + elif self.module.testing: + output = self.module.test_step(*inputs[0], **kwargs[0]) + else: + output = self.module.validation_step(*inputs[0], **kwargs[0]) + if torch.is_grad_enabled(): + # We'll return the output object verbatim since it is a freeform + # object. We need to find any tensors in this object, though, + # because we need to figure out which parameters were used during + # this forward pass, to ensure we short circuit reduction for any + # unused parameters. Only if `find_unused_parameters` is set. 
+ if self.find_unused_parameters: + self.reducer.prepare_for_backward(list(_find_tensors(output))) + else: + self.reducer.prepare_for_backward([]) + else: + from torch.nn.parallel.distributed import \ + logging, Join, _DDPSink, _tree_flatten_with_rref, _tree_unflatten_with_rref + with torch.autograd.profiler.record_function("DistributedDataParallel.forward"): + if torch.is_grad_enabled() and self.require_backward_grad_sync: + self.logger.set_runtime_stats_and_log() + self.num_iterations += 1 + self.reducer.prepare_for_forward() + + # Notify the join context that this process has not joined, if + # needed + work = Join.notify_join_context(self) + if work: + self.reducer._set_forward_pass_work_handle( + work, self._divide_by_initial_world_size + ) + + # Calling _rebuild_buckets before forward compuation, + # It may allocate new buckets before deallocating old buckets + # inside _rebuild_buckets. To save peak memory usage, + # call _rebuild_buckets before the peak memory usage increases + # during forward computation. + # This should be called only once during whole training period. + if torch.is_grad_enabled() and self.reducer._rebuild_buckets(): + logging.info("Reducer buckets have been rebuilt in this iteration.") + self._has_rebuilt_buckets = True + + # sync params according to location (before/after forward) user + # specified as part of hook, if hook was specified. + buffer_hook_registered = hasattr(self, 'buffer_hook') + if self._check_sync_bufs_pre_fwd(): + self._sync_buffers() + + if self._join_config.enable: + # Notify joined ranks whether they should sync in backwards pass or not. + self._check_global_requires_backward_grad_sync(is_joined_rank=False) + + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + if self.module.training: + output = self.module.training_step(*inputs[0], **kwargs[0]) + elif self.module.testing: + output = self.module.test_step(*inputs[0], **kwargs[0]) + else: + output = self.module.validation_step(*inputs[0], **kwargs[0]) + + # sync params according to location (before/after forward) user + # specified as part of hook, if hook was specified. + if self._check_sync_bufs_post_fwd(): + self._sync_buffers() + + if torch.is_grad_enabled() and self.require_backward_grad_sync: + self.require_forward_param_sync = True + # We'll return the output object verbatim since it is a freeform + # object. We need to find any tensors in this object, though, + # because we need to figure out which parameters were used during + # this forward pass, to ensure we short circuit reduction for any + # unused parameters. Only if `find_unused_parameters` is set. + if self.find_unused_parameters and not self.static_graph: + # Do not need to populate this for static graph. + self.reducer.prepare_for_backward(list(_find_tensors(output))) + else: + self.reducer.prepare_for_backward([]) + else: + self.require_forward_param_sync = False + + # TODO: DDPSink is currently enabled for unused parameter detection and + # static graph training for first iteration. 
+ if (self.find_unused_parameters and not self.static_graph) or ( + self.static_graph and self.num_iterations == 1 + ): + state_dict = { + 'static_graph': self.static_graph, + 'num_iterations': self.num_iterations, + } + + output_tensor_list, treespec, output_is_rref = _tree_flatten_with_rref( + output + ) + output_placeholders = [None for _ in range(len(output_tensor_list))] + # Do not touch tensors that have no grad_fn, which can cause issues + # such as https://github.com/pytorch/pytorch/issues/60733 + for i, output in enumerate(output_tensor_list): + if torch.is_tensor(output) and output.grad_fn is None: + output_placeholders[i] = output + + # When find_unused_parameters=True, makes tensors which require grad + # run through the DDPSink backward pass. When not all outputs are + # used in loss, this makes those corresponding tensors receive + # undefined gradient which the reducer then handles to ensure + # param.grad field is not touched and we don't error out. + passthrough_tensor_list = _DDPSink.apply( + self.reducer, + state_dict, + *output_tensor_list, + ) + for i in range(len(output_placeholders)): + if output_placeholders[i] is None: + output_placeholders[i] = passthrough_tensor_list[i] + + # Reconstruct output data structure. + output = _tree_unflatten_with_rref( + output_placeholders, treespec, output_is_rref + ) + return output diff --git a/utils/commons/hparams.py b/utils/commons/hparams.py new file mode 100644 index 0000000000000000000000000000000000000000..356fe306b0be82040ae1e938d3fca0e2567ae7c2 --- /dev/null +++ b/utils/commons/hparams.py @@ -0,0 +1,131 @@ +import argparse +import os +import yaml + +from utils.os_utils import remove_file + +global_print_hparams = True +hparams = {} + + +class Args: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + self.__setattr__(k, v) + + +def override_config(old_config: dict, new_config: dict): + for k, v in new_config.items(): + if isinstance(v, dict) and k in old_config: + override_config(old_config[k], new_config[k]) + else: + old_config[k] = v + + +def set_hparams(config='', exp_name='', hparams_str='', print_hparams=True, global_hparams=True): + if config == '' and exp_name == '': + parser = argparse.ArgumentParser(description='') + parser.add_argument('--config', type=str, default='', + help='location of the data corpus') + parser.add_argument('--exp_name', type=str, default='', help='exp_name') + parser.add_argument('-hp', '--hparams', type=str, default='', + help='location of the data corpus') + parser.add_argument('--infer', action='store_true', help='infer') + parser.add_argument('--validate', action='store_true', help='validate') + parser.add_argument('--reset', action='store_true', help='reset hparams') + parser.add_argument('--remove', action='store_true', help='remove old ckpt') + parser.add_argument('--debug', action='store_true', help='debug') + args, unknown = parser.parse_known_args() + print("| Unknow hparams: ", unknown) + else: + args = Args(config=config, exp_name=exp_name, hparams=hparams_str, + infer=False, validate=False, reset=False, debug=False, remove=False) + global hparams + assert args.config != '' or args.exp_name != '' + if args.config != '': + assert os.path.exists(args.config) + + config_chains = [] + loaded_config = set() + + def load_config(config_fn): + # deep first inheritance and avoid the second visit of one node + if not os.path.exists(config_fn): + return {} + with open(config_fn) as f: + hparams_ = yaml.safe_load(f) + loaded_config.add(config_fn) + if 'base_config' in hparams_: 
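+ # Depth-first merge: each file listed in 'base_config' is loaded (with its own bases
+ # resolved first) and merged into ret_hparams, then the current file's keys are applied
+ # on top; `loaded_config` ensures every config file is visited at most once.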
+ ret_hparams = {} + if not isinstance(hparams_['base_config'], list): + hparams_['base_config'] = [hparams_['base_config']] + for c in hparams_['base_config']: + if c.startswith('.'): + c = f'{os.path.dirname(config_fn)}/{c}' + c = os.path.normpath(c) + if c not in loaded_config: + override_config(ret_hparams, load_config(c)) + override_config(ret_hparams, hparams_) + else: + ret_hparams = hparams_ + config_chains.append(config_fn) + return ret_hparams + + saved_hparams = {} + args_work_dir = '' + if args.exp_name != '': + args_work_dir = f'checkpoints/{args.exp_name}' + ckpt_config_path = f'{args_work_dir}/config.yaml' + if os.path.exists(ckpt_config_path): + with open(ckpt_config_path) as f: + saved_hparams_ = yaml.safe_load(f) + if saved_hparams_ is not None: + saved_hparams.update(saved_hparams_) + hparams_ = {} + if args.config != '': + hparams_.update(load_config(args.config)) + if not args.reset: + hparams_.update(saved_hparams) + hparams_['work_dir'] = args_work_dir + + # Support config overriding in command line. Support list type config overriding. + # Examples: --hparams="a=1,b.c=2,d=[1 1 1]" + if args.hparams != "": + for new_hparam in args.hparams.split(","): + k, v = new_hparam.split("=") + v = v.strip("\'\" ") + config_node = hparams_ + for k_ in k.split(".")[:-1]: + config_node = config_node[k_] + k = k.split(".")[-1] + if v in ['True', 'False'] or type(config_node[k]) in [bool, list, dict]: + if type(config_node[k]) == list: + v = v.replace(" ", ",") + config_node[k] = eval(v) + else: + config_node[k] = type(config_node[k])(v) + if args_work_dir != '' and args.remove: + answer = input("REMOVE old checkpoint? Y/N [Default: N]: ") + if answer.lower() == "y": + remove_file(args_work_dir) + if args_work_dir != '' and (not os.path.exists(ckpt_config_path) or args.reset) and not args.infer: + os.makedirs(hparams_['work_dir'], exist_ok=True) + with open(ckpt_config_path, 'w') as f: + yaml.safe_dump(hparams_, f) + + hparams_['infer'] = args.infer + hparams_['debug'] = args.debug + hparams_['validate'] = args.validate + hparams_['exp_name'] = args.exp_name + global global_print_hparams + if global_hparams: + hparams.clear() + hparams.update(hparams_) + if print_hparams and global_print_hparams and global_hparams: + print('| Hparams chains: ', config_chains) + print('| Hparams: ') + for i, (k, v) in enumerate(sorted(hparams_.items())): + print(f"\033[;33;m{k}\033[0m: {v}, ", end="\n" if i % 5 == 4 else "") + print("") + global_print_hparams = False + return hparams_ diff --git a/utils/commons/indexed_datasets.py b/utils/commons/indexed_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..e15632be30d6296a3c9aa80a1f351058003698b3 --- /dev/null +++ b/utils/commons/indexed_datasets.py @@ -0,0 +1,71 @@ +import pickle +from copy import deepcopy + +import numpy as np + + +class IndexedDataset: + def __init__(self, path, num_cache=1): + super().__init__() + self.path = path + self.data_file = None + self.data_offsets = np.load(f"{path}.idx", allow_pickle=True).item()['offsets'] + self.data_file = open(f"{path}.data", 'rb', buffering=-1) + self.cache = [] + self.num_cache = num_cache + + def check_index(self, i): + if i < 0 or i >= len(self.data_offsets) - 1: + raise IndexError('index out of range') + + def __del__(self): + if self.data_file: + self.data_file.close() + + def __getitem__(self, i): + self.check_index(i) + if self.num_cache > 0: + for c in self.cache: + if c[0] == i: + return c[1] + self.data_file.seek(self.data_offsets[i]) + b = 
self.data_file.read(self.data_offsets[i + 1] - self.data_offsets[i]) + item = pickle.loads(b) + if self.num_cache > 0: + self.cache = [(i, deepcopy(item))] + self.cache[:-1] + return item + + def __len__(self): + return len(self.data_offsets) - 1 + +class IndexedDatasetBuilder: + def __init__(self, path): + self.path = path + self.out_file = open(f"{path}.data", 'wb') + self.byte_offsets = [0] + + def add_item(self, item): + s = pickle.dumps(item) + bytes = self.out_file.write(s) + self.byte_offsets.append(self.byte_offsets[-1] + bytes) + + def finalize(self): + self.out_file.close() + np.save(open(f"{self.path}.idx", 'wb'), {'offsets': self.byte_offsets}) + + +if __name__ == "__main__": + import random + from tqdm import tqdm + ds_path = '/tmp/indexed_ds_example' + size = 100 + items = [{"a": np.random.normal(size=[10000, 10]), + "b": np.random.normal(size=[10000, 10])} for i in range(size)] + builder = IndexedDatasetBuilder(ds_path) + for i in tqdm(range(size)): + builder.add_item(items[i]) + builder.finalize() + ds = IndexedDataset(ds_path) + for i in tqdm(range(10000)): + idx = random.randint(0, size - 1) + assert (ds[idx]['a'] == items[idx]['a']).all() diff --git a/utils/commons/meters.py b/utils/commons/meters.py new file mode 100644 index 0000000000000000000000000000000000000000..e38790e9f292ec843a820dad73c9795eb2ab8daa --- /dev/null +++ b/utils/commons/meters.py @@ -0,0 +1,42 @@ +import time +import torch + + +class AvgrageMeter(object): + + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + +class Timer: + timer_map = {} + + def __init__(self, name, enable=False): + if name not in Timer.timer_map: + Timer.timer_map[name] = 0 + self.name = name + self.enable = enable + + def __enter__(self): + if self.enable: + if torch.cuda.is_available(): + torch.cuda.synchronize() + self.t = time.time() + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.enable: + if torch.cuda.is_available(): + torch.cuda.synchronize() + Timer.timer_map[self.name] += time.time() - self.t + if self.enable: + print(f'[Timer] {self.name}: {Timer.timer_map[self.name]}') diff --git a/utils/commons/multiprocess_utils.py b/utils/commons/multiprocess_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e2773543c702d2819559dfde4c5febab03899790 --- /dev/null +++ b/utils/commons/multiprocess_utils.py @@ -0,0 +1,130 @@ +import os +import traceback +from functools import partial +from tqdm import tqdm + + +def chunked_worker(worker_id, args_queue=None, results_queue=None, init_ctx_func=None): + ctx = init_ctx_func(worker_id) if init_ctx_func is not None else None + while True: + args = args_queue.get() + if args == '': + return + job_idx, map_func, arg = args + try: + map_func_ = partial(map_func, ctx=ctx) if ctx is not None else map_func + if isinstance(arg, dict): + res = map_func_(**arg) + elif isinstance(arg, (list, tuple)): + res = map_func_(*arg) + else: + res = map_func_(arg) + results_queue.put((job_idx, res)) + except: + traceback.print_exc() + results_queue.put((job_idx, None)) + + +class MultiprocessManager: + def __init__(self, num_workers=None, init_ctx_func=None, multithread=False, queue_max=-1): + if multithread: + from multiprocessing.dummy import Queue, Process + else: + from multiprocessing import Queue, Process + if num_workers is None: + num_workers = int(os.getenv('N_PROC', os.cpu_count())) + self.num_workers = 
num_workers + self.results_queue = Queue(maxsize=-1) + self.jobs_pending = [] + self.args_queue = Queue(maxsize=queue_max) + self.workers = [] + self.total_jobs = 0 + self.multithread = multithread + for i in range(num_workers): + if multithread: + p = Process(target=chunked_worker, + args=(i, self.args_queue, self.results_queue, init_ctx_func)) + else: + p = Process(target=chunked_worker, + args=(i, self.args_queue, self.results_queue, init_ctx_func), + daemon=True) + self.workers.append(p) + p.start() + + def add_job(self, func, args): + if not self.args_queue.full(): + self.args_queue.put((self.total_jobs, func, args)) + else: + self.jobs_pending.append((self.total_jobs, func, args)) + self.total_jobs += 1 + + def get_results(self): + self.n_finished = 0 + while self.n_finished < self.total_jobs: + while len(self.jobs_pending) > 0 and not self.args_queue.full(): + self.args_queue.put(self.jobs_pending[0]) + self.jobs_pending = self.jobs_pending[1:] + job_id, res = self.results_queue.get() + yield job_id, res + self.n_finished += 1 + for w in range(self.num_workers): + self.args_queue.put("") + for w in self.workers: + w.join() + + def close(self): + if not self.multithread: + for w in self.workers: + w.terminate() + + def __len__(self): + return self.total_jobs + + +def multiprocess_run_tqdm(map_func, args, num_workers=None, ordered=True, init_ctx_func=None, + multithread=False, queue_max=-1, desc=None): + for i, res in tqdm( + multiprocess_run(map_func, args, num_workers, ordered, init_ctx_func, multithread, + queue_max=queue_max), + total=len(args), desc=desc): + yield i, res + + +def multiprocess_run(map_func, args, num_workers=None, ordered=True, init_ctx_func=None, multithread=False, + queue_max=-1): + """ + Multiprocessing running chunked jobs. 
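+ Results are yielded as (job_idx, result) tuples: with ordered=True they are buffered and
+ re-emitted in submission order, otherwise they are yielded in completion order.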
+ + Examples: + >>> for res in tqdm(multiprocess_run(job_func, args): + >>> print(res) + + :param map_func: + :param args: + :param num_workers: + :param ordered: + :param init_ctx_func: + :param q_max_size: + :param multithread: + :return: + """ + if num_workers is None: + num_workers = int(os.getenv('N_PROC', os.cpu_count())) + # num_workers = 1 + manager = MultiprocessManager(num_workers, init_ctx_func, multithread, queue_max=queue_max) + for arg in args: + manager.add_job(map_func, arg) + if ordered: + n_jobs = len(args) + results = ['' for _ in range(n_jobs)] + i_now = 0 + for job_i, res in manager.get_results(): + results[job_i] = res + while i_now < n_jobs and (not isinstance(results[i_now], str) or results[i_now] != ''): + yield i_now, results[i_now] + results[i_now] = None + i_now += 1 + else: + for job_i, res in manager.get_results(): + yield job_i, res + manager.close() diff --git a/utils/commons/single_thread_env.py b/utils/commons/single_thread_env.py new file mode 100644 index 0000000000000000000000000000000000000000..849219afd2cddec2ec6d489f12f60a34994bfb80 --- /dev/null +++ b/utils/commons/single_thread_env.py @@ -0,0 +1,5 @@ +import os + +os.environ["OMP_NUM_THREADS"] = "1" +os.environ['TF_NUM_INTEROP_THREADS'] = '1' +os.environ['TF_NUM_INTRAOP_THREADS'] = '1' diff --git a/utils/commons/tensor_utils.py b/utils/commons/tensor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..be4b69a4f135b95fcf18618668ed909314f24871 --- /dev/null +++ b/utils/commons/tensor_utils.py @@ -0,0 +1,92 @@ +import torch +import torch.distributed as dist + + +def reduce_tensors(metrics): + new_metrics = {} + for k, v in metrics.items(): + if isinstance(v, torch.Tensor): + dist.all_reduce(v) + v = v / dist.get_world_size() + if type(v) is dict: + v = reduce_tensors(v) + new_metrics[k] = v + return new_metrics + + +def tensors_to_scalars(tensors): + if isinstance(tensors, torch.Tensor): + tensors = tensors.item() + return tensors + elif isinstance(tensors, dict): + new_tensors = {} + for k, v in tensors.items(): + v = tensors_to_scalars(v) + new_tensors[k] = v + return new_tensors + elif isinstance(tensors, list): + return [tensors_to_scalars(v) for v in tensors] + else: + return tensors + + +def tensors_to_np(tensors): + if isinstance(tensors, dict): + new_np = {} + for k, v in tensors.items(): + if isinstance(v, torch.Tensor): + v = v.cpu().numpy() + if type(v) is dict: + v = tensors_to_np(v) + new_np[k] = v + elif isinstance(tensors, list): + new_np = [] + for v in tensors: + if isinstance(v, torch.Tensor): + v = v.cpu().numpy() + if type(v) is dict: + v = tensors_to_np(v) + new_np.append(v) + elif isinstance(tensors, torch.Tensor): + v = tensors + if isinstance(v, torch.Tensor): + v = v.cpu().numpy() + if type(v) is dict: + v = tensors_to_np(v) + new_np = v + else: + raise Exception(f'tensors_to_np does not support type {type(tensors)}.') + return new_np + + +def move_to_cpu(tensors): + ret = {} + for k, v in tensors.items(): + if isinstance(v, torch.Tensor): + v = v.cpu() + if type(v) is dict: + v = move_to_cpu(v) + ret[k] = v + return ret + + +def move_to_cuda(batch, gpu_id=0): + # base case: object can be directly moved using `cuda` or `to` + if callable(getattr(batch, 'cuda', None)): + return batch.cuda(gpu_id, non_blocking=True) + elif callable(getattr(batch, 'to', None)): + return batch.to(torch.device('cuda', gpu_id), non_blocking=True) + elif isinstance(batch, list): + for i, x in enumerate(batch): + batch[i] = move_to_cuda(x, gpu_id) + return batch + elif 
isinstance(batch, tuple): + batch = list(batch) + for i, x in enumerate(batch): + batch[i] = move_to_cuda(x, gpu_id) + return tuple(batch) + elif isinstance(batch, dict): + for k, v in batch.items(): + batch[k] = move_to_cuda(v, gpu_id) + return batch + return batch diff --git a/utils/commons/trainer.py b/utils/commons/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..2d7e0abc75a6195dade874135e61456f8ebe558f --- /dev/null +++ b/utils/commons/trainer.py @@ -0,0 +1,559 @@ +import random +import subprocess +import traceback +from datetime import datetime + +from torch.cuda.amp import GradScaler, autocast +import numpy as np +import torch.optim +import torch.utils.data +import copy +import logging +import os +import re +import sys +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import tqdm + +from utils.commons.ckpt_utils import get_last_checkpoint, get_all_ckpts +from utils.commons.ddp_utils import DDP +from utils.commons.hparams import hparams +from utils.commons.tensor_utils import move_to_cuda +from utils.os_utils import remove_file + + +class Tee(object): + def __init__(self, name, mode): + self.file = open(name, mode) + self.stdout = sys.stdout + sys.stdout = self + + def __del__(self): + sys.stdout = self.stdout + self.file.close() + + def write(self, data): + self.file.write(data) + self.stdout.write(data) + + def flush(self): + self.file.flush() + + +class Trainer: + def __init__( + self, + work_dir, + default_save_path=None, + accumulate_grad_batches=1, + max_updates=160000, + print_nan_grads=False, + val_check_interval=2000, + num_sanity_val_steps=5, + amp=False, + # tb logger + log_save_interval=100, + tb_log_interval=10, + # checkpoint + monitor_key='val_loss', + monitor_mode='min', + num_ckpt_keep=5, + save_best=True, + resume_from_checkpoint=0, + seed=1234, + debug=False, + ): + os.makedirs(work_dir, exist_ok=True) + self.work_dir = work_dir + self.accumulate_grad_batches = accumulate_grad_batches + self.max_updates = max_updates + self.num_sanity_val_steps = num_sanity_val_steps + self.print_nan_grads = print_nan_grads + self.default_save_path = default_save_path + self.resume_from_checkpoint = resume_from_checkpoint if resume_from_checkpoint > 0 else None + self.seed = seed + self.debug = debug + # model and optm + self.task = None + self.optimizers = [] + + # trainer state + self.testing = False + self.global_step = 0 + self.current_epoch = 0 + self.total_batches = 0 + + # configure checkpoint + self.monitor_key = monitor_key + self.num_ckpt_keep = num_ckpt_keep + self.save_best = save_best + self.monitor_op = np.less if monitor_mode == 'min' else np.greater + self.best_val_results = np.Inf if monitor_mode == 'min' else -np.Inf + self.mode = 'min' + + # allow int, string and gpu list + self.all_gpu_ids = [ + int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",") if x != ''] + self.num_gpus = len(self.all_gpu_ids) + self.on_gpu = self.num_gpus > 0 + self.root_gpu = 0 + logging.info(f'GPU available: {torch.cuda.is_available()}, GPU used: {self.all_gpu_ids}') + self.use_ddp = self.num_gpus > 1 + self.proc_rank = 0 + # Tensorboard logging + self.log_save_interval = log_save_interval + self.val_check_interval = val_check_interval + self.tb_log_interval = tb_log_interval + self.amp = amp + self.amp_scalar = GradScaler() + + def test(self, task_cls): + self.testing = True + self.fit(task_cls) + + def fit(self, task_cls): + if len(self.all_gpu_ids) > 1: + mp.spawn(self.ddp_run, nprocs=self.num_gpus, 
args=(task_cls, copy.deepcopy(hparams))) + else: + self.task = task_cls() + self.task.trainer = self + self.run_single_process(self.task) + return 1 + + def ddp_run(self, gpu_idx, task_cls, hparams_): + hparams.update(hparams_) + self.proc_rank = gpu_idx + self.init_ddp_connection(self.proc_rank, self.num_gpus) + if dist.get_rank() != 0 and not self.debug: + sys.stdout = open(os.devnull, "w") + sys.stderr = open(os.devnull, "w") + task = task_cls() + task.trainer = self + torch.cuda.set_device(gpu_idx) + self.root_gpu = gpu_idx + self.task = task + self.run_single_process(task) + + def run_single_process(self, task): + """Sanity check a few things before starting actual training. + + :param task: + """ + # build model, optm and load checkpoint + if self.proc_rank == 0: + self.save_terminal_logs() + if not self.testing: + self.save_codes() + + model = task.build_model() + if model is not None: + task.model = model + checkpoint, _ = get_last_checkpoint(self.work_dir, self.resume_from_checkpoint) + if checkpoint is not None: + self.restore_weights(checkpoint) + elif self.on_gpu: + task.cuda(self.root_gpu) + if not self.testing: + self.optimizers = task.configure_optimizers() + self.fisrt_epoch = True + if checkpoint is not None: + self.restore_opt_state(checkpoint) + del checkpoint + # clear cache after restore + if self.on_gpu: + torch.cuda.empty_cache() + + if self.use_ddp: + self.task = self.configure_ddp(self.task) + dist.barrier() + + task_ref = self.get_task_ref() + task_ref.trainer = self + task_ref.testing = self.testing + # link up experiment object + if self.proc_rank == 0: + task_ref.build_tensorboard(save_dir=self.work_dir, name='tb_logs') + else: + os.makedirs('tmp', exist_ok=True) + task_ref.build_tensorboard(save_dir='tmp', name='tb_tmp') + self.logger = task_ref.logger + try: + if self.testing: + self.run_evaluation(test=True) + else: + self.train() + except KeyboardInterrupt as e: + traceback.print_exc() + task_ref.on_keyboard_interrupt() + + #################### + # valid and test + #################### + def run_evaluation(self, test=False): + eval_results = self.evaluate(self.task, test, tqdm_desc='Valid' if not test else 'test', + max_batches=hparams['eval_max_batches']) + if eval_results is not None and 'tb_log' in eval_results: + tb_log_output = eval_results['tb_log'] + self.log_metrics_to_tb(tb_log_output) + if self.proc_rank == 0 and not test: + self.save_checkpoint(epoch=self.current_epoch, logs=eval_results) + + def evaluate(self, task, test=False, tqdm_desc='Valid', max_batches=None): + if max_batches == -1: + max_batches = None + # enable eval mode + task.zero_grad() + task.eval() + torch.set_grad_enabled(False) + + task_ref = self.get_task_ref() + if test: + ret = task_ref.test_start() + if ret == 'EXIT': + return + else: + task_ref.validation_start() + outputs = [] + dataloader = task_ref.test_dataloader() if test else task_ref.val_dataloader() + pbar = tqdm.tqdm(dataloader, desc=tqdm_desc, total=max_batches, dynamic_ncols=True, unit='step', + disable=self.root_gpu > 0) + # give model a chance to do something with the outputs (and method defined) + for batch_idx, batch in enumerate(pbar): + if batch is None: # pragma: no cover + continue + # stop short when on fast_dev_run (sets max_batch=1) + if max_batches is not None and batch_idx >= max_batches: + break + + # make dataloader_idx arg in validation_step optional + if self.on_gpu: + batch = move_to_cuda(batch, self.root_gpu) + args = [batch, batch_idx] + if self.use_ddp: + output = task(*args) + else: + if 
test: + output = task_ref.test_step(*args) + else: + output = task_ref.validation_step(*args) + # track outputs for collation + outputs.append(output) + # give model a chance to do something with the outputs (and method defined) + if test: + eval_results = task_ref.test_end(outputs) + else: + eval_results = task_ref.validation_end(outputs) + # enable train mode again + task.train() + torch.set_grad_enabled(True) + return eval_results + + #################### + # train + #################### + def train(self): + task_ref = self.get_task_ref() + task_ref.on_train_start() + if self.num_sanity_val_steps > 0: + # run tiny validation (if validation defined) to make sure program won't crash during val + self.evaluate(self.task, False, 'Sanity Val', max_batches=self.num_sanity_val_steps) + # clear cache before training + if self.on_gpu: + torch.cuda.empty_cache() + dataloader = task_ref.train_dataloader() + epoch = self.current_epoch + # run all epochs + while True: + # set seed for distributed sampler (enables shuffling for each epoch) + if self.use_ddp and hasattr(dataloader.sampler, 'set_epoch'): + dataloader.sampler.set_epoch(epoch) + # update training progress in trainer and model + task_ref.current_epoch = epoch + self.current_epoch = epoch + # total batches includes multiple val checks + self.batch_loss_value = 0 # accumulated grads + # before epoch hook + task_ref.on_epoch_start() + + # run epoch + train_pbar = tqdm.tqdm(dataloader, initial=self.global_step, total=float('inf'), + dynamic_ncols=True, unit='step', disable=self.root_gpu > 0) + for batch_idx, batch in enumerate(train_pbar): + if self.global_step % self.val_check_interval == 0 and not self.fisrt_epoch: + self.run_evaluation() + pbar_metrics, tb_metrics = self.run_training_batch(batch_idx, batch) + train_pbar.set_postfix(**pbar_metrics) + self.fisrt_epoch = False + # when metrics should be logged + if (self.global_step + 1) % self.tb_log_interval == 0: + # logs user requested information to logger + self.log_metrics_to_tb(tb_metrics) + + self.global_step += 1 + task_ref.global_step = self.global_step + if self.global_step > self.max_updates: + print("| Training end..") + break + # epoch end hook + task_ref.on_epoch_end() + epoch += 1 + if self.global_step > self.max_updates: + break + task_ref.on_train_end() + + def run_training_batch(self, batch_idx, batch): + if batch is None: + return {} + all_progress_bar_metrics = [] + all_log_metrics = [] + task_ref = self.get_task_ref() + for opt_idx, optimizer in enumerate(self.optimizers): + if optimizer is None: + continue + # make sure only the gradients of the current optimizer's paramaters are calculated + # in the training step to prevent dangling gradients in multiple-optimizer setup. 
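+                # (illustrative note) e.g. with two optimizers, such as a generator/discriminator pair,
+                # every parameter is frozen first and only the parameters owned by the optimizer
+                # currently being stepped are re-enabled below.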
+ if len(self.optimizers) > 1: + for param in task_ref.parameters(): + param.requires_grad = False + for group in optimizer.param_groups: + for param in group['params']: + param.requires_grad = True + + # forward pass + with autocast(enabled=self.amp): + if self.on_gpu: + batch = move_to_cuda(copy.copy(batch), self.root_gpu) + args = [batch, batch_idx, opt_idx] + if self.use_ddp: + output = self.task(*args) + else: + output = task_ref.training_step(*args) + loss = output['loss'] + if loss is None: + continue + progress_bar_metrics = output['progress_bar'] + log_metrics = output['tb_log'] + # accumulate loss + loss = loss / self.accumulate_grad_batches + + # backward pass + if loss.requires_grad: + if self.amp: + self.amp_scalar.scale(loss).backward() + else: + loss.backward() + + # track progress bar metrics + all_log_metrics.append(log_metrics) + all_progress_bar_metrics.append(progress_bar_metrics) + + if loss is None: + continue + + # nan grads + if self.print_nan_grads: + has_nan_grad = False + for name, param in task_ref.named_parameters(): + if (param.grad is not None) and torch.isnan(param.grad.float()).any(): + print("| NaN params: ", name, param, param.grad) + has_nan_grad = True + if has_nan_grad: + exit(0) + + # gradient update with accumulated gradients + if (self.global_step + 1) % self.accumulate_grad_batches == 0: + task_ref.on_before_optimization(opt_idx) + if self.amp: + self.amp_scalar.step(optimizer) + self.amp_scalar.update() + else: + optimizer.step() + optimizer.zero_grad() + task_ref.on_after_optimization(self.current_epoch, batch_idx, optimizer, opt_idx) + + # collapse all metrics into one dict + all_progress_bar_metrics = {k: v for d in all_progress_bar_metrics for k, v in d.items()} + all_log_metrics = {k: v for d in all_log_metrics for k, v in d.items()} + return all_progress_bar_metrics, all_log_metrics + + #################### + # load and save checkpoint + #################### + def restore_weights(self, checkpoint): + # load model state + task_ref = self.get_task_ref() + + for k, v in checkpoint['state_dict'].items(): + getattr(task_ref, k).load_state_dict(v) + + if self.on_gpu: + task_ref.cuda(self.root_gpu) + # load training state (affects trainer only) + self.best_val_results = checkpoint['checkpoint_callback_best'] + self.global_step = checkpoint['global_step'] + self.current_epoch = checkpoint['epoch'] + task_ref.global_step = self.global_step + + # wait for all models to restore weights + if self.use_ddp: + # wait for all processes to catch up + dist.barrier() + + def restore_opt_state(self, checkpoint): + if self.testing: + return + # restore the optimizers + optimizer_states = checkpoint['optimizer_states'] + for optimizer, opt_state in zip(self.optimizers, optimizer_states): + if optimizer is None: + return + try: + optimizer.load_state_dict(opt_state) + # move optimizer to GPU 1 weight at a time + if self.on_gpu: + for state in optimizer.state.values(): + for k, v in state.items(): + if isinstance(v, torch.Tensor): + state[k] = v.cuda(self.root_gpu) + except ValueError: + print("| WARMING: optimizer parameters not match !!!") + try: + if dist.is_initialized() and dist.get_rank() > 0: + return + except Exception as e: + print(e) + return + did_restore = True + return did_restore + + def save_checkpoint(self, epoch, logs=None): + monitor_op = np.less + ckpt_path = f'{self.work_dir}/model_ckpt_steps_{self.global_step}.ckpt' + logging.info(f'Epoch {epoch:05d}@{self.global_step}: saving model to {ckpt_path}') + self._atomic_save(ckpt_path) + for old_ckpt 
in get_all_ckpts(self.work_dir)[self.num_ckpt_keep:]: + remove_file(old_ckpt) + logging.info(f'Delete ckpt: {os.path.basename(old_ckpt)}') + current = None + if logs is not None and self.monitor_key in logs: + current = logs[self.monitor_key] + if current is not None and self.save_best: + if monitor_op(current, self.best_val_results): + best_filepath = f'{self.work_dir}/model_ckpt_best.pt' + self.best_val_results = current + logging.info( + f'Epoch {epoch:05d}@{self.global_step}: {self.monitor_key} reached {current:0.5f}. ' + f'Saving model to {best_filepath}') + self._atomic_save(best_filepath) + + def _atomic_save(self, filepath): + checkpoint = self.dump_checkpoint() + tmp_path = str(filepath) + ".part" + torch.save(checkpoint, tmp_path, _use_new_zipfile_serialization=False) + os.replace(tmp_path, filepath) + + def dump_checkpoint(self): + checkpoint = {'epoch': self.current_epoch, 'global_step': self.global_step, + 'checkpoint_callback_best': self.best_val_results} + # save optimizers + optimizer_states = [] + for i, optimizer in enumerate(self.optimizers): + if optimizer is not None: + optimizer_states.append(optimizer.state_dict()) + + checkpoint['optimizer_states'] = optimizer_states + task_ref = self.get_task_ref() + checkpoint['state_dict'] = { + k: v.state_dict() for k, v in task_ref.named_children() if len(list(v.parameters())) > 0} + return checkpoint + + #################### + # DDP + #################### + def configure_ddp(self, task): + task = DDP(task, device_ids=[self.root_gpu], find_unused_parameters=True) + random.seed(self.seed) + np.random.seed(self.seed) + return task + + def init_ddp_connection(self, proc_rank, world_size): + root_node = '127.0.0.1' + root_node = self.resolve_root_node_address(root_node) + os.environ['MASTER_ADDR'] = root_node + dist.init_process_group('nccl', rank=proc_rank, world_size=world_size) + + def resolve_root_node_address(self, root_node): + if '[' in root_node: + name = root_node.split('[')[0] + number = root_node.split(',')[0] + if '-' in number: + number = number.split('-')[0] + number = re.sub('[^0-9]', '', number) + root_node = name + number + return root_node + + #################### + # utils + #################### + def get_task_ref(self): + from utils.commons.base_task import BaseTask + task: BaseTask = self.task.module if isinstance(self.task, DDP) else self.task + return task + + def log_metrics_to_tb(self, metrics, step=None): + """Logs the metric dict passed in. 
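+
+        Example of an accepted input (illustrative): a flat dict such as
+        {'total_loss': torch.tensor(0.42), 'lr': 2e-4}; tensor values are converted
+        to Python scalars before being written to TensorBoard.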
+ + :param metrics: + """ + # turn all tensors to scalars + scalar_metrics = self.metrics_to_scalars(metrics) + + step = step if step is not None else self.global_step + # log actual metrics + if self.proc_rank == 0: + self.log_metrics(self.logger, scalar_metrics, step=step) + + @staticmethod + def log_metrics(logger, metrics, step=None): + for k, v in metrics.items(): + if isinstance(v, torch.Tensor): + v = v.item() + logger.add_scalar(k, v, step) + + def metrics_to_scalars(self, metrics): + new_metrics = {} + for k, v in metrics.items(): + if isinstance(v, torch.Tensor): + v = v.item() + + if type(v) is dict: + v = self.metrics_to_scalars(v) + + new_metrics[k] = v + + return new_metrics + + def save_terminal_logs(self): + t = datetime.now().strftime('%Y%m%d%H%M%S') + os.makedirs(f'{self.work_dir}/terminal_logs', exist_ok=True) + Tee(f'{self.work_dir}/terminal_logs/log_{t}.txt', 'w') + + def save_codes(self): + if len(hparams['save_codes']) > 0: + t = datetime.now().strftime('%Y%m%d%H%M%S') + code_dir = f'{self.work_dir}/codes/{t}' + subprocess.check_call(f'mkdir -p "{code_dir}"', shell=True) + for c in hparams['save_codes']: + if os.path.exists(c): + subprocess.check_call( + f'rsync -aR ' + f'--include="*.py" ' + f'--include="*.yaml" ' + f'--exclude="__pycache__" ' + f'--include="*/" ' + f'--exclude="*" ' + f'"./{c}" "{code_dir}/"', + shell=True) + print(f"| Copied codes to {code_dir}.") diff --git a/utils/metrics/__pycache__/diagonal_metrics.cpython-36.pyc b/utils/metrics/__pycache__/diagonal_metrics.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83c4e9b8968df3b1a3112237ca3bd29eb574779a Binary files /dev/null and b/utils/metrics/__pycache__/diagonal_metrics.cpython-36.pyc differ diff --git a/utils/metrics/__pycache__/diagonal_metrics.cpython-37.pyc b/utils/metrics/__pycache__/diagonal_metrics.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7f5a381ac3adab5c59053ef058419ce003c03b7 Binary files /dev/null and b/utils/metrics/__pycache__/diagonal_metrics.cpython-37.pyc differ diff --git a/utils/metrics/__pycache__/ssim.cpython-36.pyc b/utils/metrics/__pycache__/ssim.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f354b07c41e2e2cc96302cdc93a80d72c3467b69 Binary files /dev/null and b/utils/metrics/__pycache__/ssim.cpython-36.pyc differ diff --git a/utils/metrics/__pycache__/ssim.cpython-37.pyc b/utils/metrics/__pycache__/ssim.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c53c00a2e3de33627bda41f998d7e7a3c181e938 Binary files /dev/null and b/utils/metrics/__pycache__/ssim.cpython-37.pyc differ diff --git a/utils/metrics/diagonal_metrics.py b/utils/metrics/diagonal_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..ba9807c1a594b38632c4731391e2d4fa3289037b --- /dev/null +++ b/utils/metrics/diagonal_metrics.py @@ -0,0 +1,74 @@ +import torch + + +def get_focus_rate(attn, src_padding_mask=None, tgt_padding_mask=None): + ''' + attn: bs x L_t x L_s + ''' + if src_padding_mask is not None: + attn = attn * (1 - src_padding_mask.float())[:, None, :] + + if tgt_padding_mask is not None: + attn = attn * (1 - tgt_padding_mask.float())[:, :, None] + + focus_rate = attn.max(-1).values.sum(-1) + focus_rate = focus_rate / attn.sum(-1).sum(-1) + return focus_rate + + +def get_phone_coverage_rate(attn, src_padding_mask=None, src_seg_mask=None, tgt_padding_mask=None): + ''' + attn: bs x L_t x L_s + ''' + src_mask = attn.new(attn.size(0), 
attn.size(-1)).bool().fill_(False) + if src_padding_mask is not None: + src_mask |= src_padding_mask + if src_seg_mask is not None: + src_mask |= src_seg_mask + + attn = attn * (1 - src_mask.float())[:, None, :] + if tgt_padding_mask is not None: + attn = attn * (1 - tgt_padding_mask.float())[:, :, None] + + phone_coverage_rate = attn.max(1).values.sum(-1) + # phone_coverage_rate = phone_coverage_rate / attn.sum(-1).sum(-1) + phone_coverage_rate = phone_coverage_rate / (1 - src_mask.float()).sum(-1) + return phone_coverage_rate + + +def get_diagonal_focus_rate(attn, attn_ks, target_len, src_padding_mask=None, tgt_padding_mask=None, + band_mask_factor=5, band_width=50): + ''' + attn: bx x L_t x L_s + attn_ks: shape: tensor with shape [batch_size], input_lens/output_lens + + diagonal: y=k*x (k=attn_ks, x:output, y:input) + 1 0 0 + 0 1 0 + 0 0 1 + y>=k*(x-width) and y<=k*(x+width):1 + else:0 + ''' + # width = min(target_len/band_mask_factor, 50) + width1 = target_len / band_mask_factor + width2 = target_len.new(target_len.size()).fill_(band_width) + width = torch.where(width1 < width2, width1, width2).float() + base = torch.ones(attn.size()).to(attn.device) + zero = torch.zeros(attn.size()).to(attn.device) + x = torch.arange(0, attn.size(1)).to(attn.device)[None, :, None].float() * base + y = torch.arange(0, attn.size(2)).to(attn.device)[None, None, :].float() * base + cond = (y - attn_ks[:, None, None] * x) + cond1 = cond + attn_ks[:, None, None] * width[:, None, None] + cond2 = cond - attn_ks[:, None, None] * width[:, None, None] + mask1 = torch.where(cond1 < 0, zero, base) + mask2 = torch.where(cond2 > 0, zero, base) + mask = mask1 * mask2 + + if src_padding_mask is not None: + attn = attn * (1 - src_padding_mask.float())[:, None, :] + if tgt_padding_mask is not None: + attn = attn * (1 - tgt_padding_mask.float())[:, :, None] + + diagonal_attn = attn * mask + diagonal_focus_rate = diagonal_attn.sum(-1).sum(-1) / attn.sum(-1).sum(-1) + return diagonal_focus_rate, mask diff --git a/utils/metrics/dtw.py b/utils/metrics/dtw.py new file mode 100644 index 0000000000000000000000000000000000000000..829e8e160355f8729b8e478bc4a24ca8597df58e --- /dev/null +++ b/utils/metrics/dtw.py @@ -0,0 +1,160 @@ +from numpy import array, zeros, full, argmin, inf, ndim +from scipy.spatial.distance import cdist +from math import isinf + + +def dtw(x, y, dist, warp=1, w=inf, s=1.0): + """ + Computes Dynamic Time Warping (DTW) of two sequences. + + :param array x: N1*M array + :param array y: N2*M array + :param func dist: distance used as cost measure + :param int warp: how many shifts are computed. + :param int w: window size limiting the maximal distance between indices of matched entries |i,j|. + :param float s: weight applied on off-diagonal moves of the path. As s gets larger, the warping path is increasingly biased towards the diagonal + Returns the minimum distance, the cost matrix, the accumulated cost matrix, and the wrap path. 
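+
+    Illustrative usage (a minimal sketch; the lambda distance below is an assumed
+    stand-in for any element-wise cost function, the sequences mirror the __main__ demo):
+
+    >>> x = [0, 0, 1, 1, 2, 4, 2, 1, 2, 0]
+    >>> y = [1, 1, 1, 2, 2, 2, 2, 3, 2, 0]
+    >>> d, cost, acc, path = dtw(x, y, dist=lambda a, b: abs(a - b))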
+ """ + assert len(x) + assert len(y) + assert isinf(w) or (w >= abs(len(x) - len(y))) + assert s > 0 + r, c = len(x), len(y) + if not isinf(w): + D0 = full((r + 1, c + 1), inf) + for i in range(1, r + 1): + D0[i, max(1, i - w):min(c + 1, i + w + 1)] = 0 + D0[0, 0] = 0 + else: + D0 = zeros((r + 1, c + 1)) + D0[0, 1:] = inf + D0[1:, 0] = inf + D1 = D0[1:, 1:] # view + for i in range(r): + for j in range(c): + if (isinf(w) or (max(0, i - w) <= j <= min(c, i + w))): + D1[i, j] = dist(x[i], y[j]) + C = D1.copy() + jrange = range(c) + for i in range(r): + if not isinf(w): + jrange = range(max(0, i - w), min(c, i + w + 1)) + for j in jrange: + min_list = [D0[i, j]] + for k in range(1, warp + 1): + i_k = min(i + k, r) + j_k = min(j + k, c) + min_list += [D0[i_k, j] * s, D0[i, j_k] * s] + D1[i, j] += min(min_list) + if len(x) == 1: + path = zeros(len(y)), range(len(y)) + elif len(y) == 1: + path = range(len(x)), zeros(len(x)) + else: + path = _traceback(D0) + return D1[-1, -1], C, D1, path + + +def accelerated_dtw(x, y, dist, warp=1): + """ + Computes Dynamic Time Warping (DTW) of two sequences in a faster way. + Instead of iterating through each element and calculating each distance, + this uses the cdist function from scipy (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html) + + :param array x: N1*M array + :param array y: N2*M array + :param string or func dist: distance parameter for cdist. When string is given, cdist uses optimized functions for the distance metrics. + If a string is passed, the distance function can be 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'wminkowski', 'yule'. + :param int warp: how many shifts are computed. + Returns the minimum distance, the cost matrix, the accumulated cost matrix, and the wrap path. 
+ """ + assert len(x) + assert len(y) + if ndim(x) == 1: + x = x.reshape(-1, 1) + if ndim(y) == 1: + y = y.reshape(-1, 1) + r, c = len(x), len(y) + D0 = zeros((r + 1, c + 1)) + D0[0, 1:] = inf + D0[1:, 0] = inf + D1 = D0[1:, 1:] + D0[1:, 1:] = cdist(x, y, dist) + C = D1.copy() + for i in range(r): + for j in range(c): + min_list = [D0[i, j]] + for k in range(1, warp + 1): + min_list += [D0[min(i + k, r), j], + D0[i, min(j + k, c)]] + D1[i, j] += min(min_list) + if len(x) == 1: + path = zeros(len(y)), range(len(y)) + elif len(y) == 1: + path = range(len(x)), zeros(len(x)) + else: + path = _traceback(D0) + return D1[-1, -1], C, D1, path + + +def _traceback(D): + i, j = array(D.shape) - 2 + p, q = [i], [j] + while (i > 0) or (j > 0): + tb = argmin((D[i, j], D[i, j + 1], D[i + 1, j])) + if tb == 0: + i -= 1 + j -= 1 + elif tb == 1: + i -= 1 + else: # (tb == 2): + j -= 1 + p.insert(0, i) + q.insert(0, j) + return array(p), array(q) + + +if __name__ == '__main__': + w = inf + s = 1.0 + if 1: # 1-D numeric + from sklearn.metrics.pairwise import manhattan_distances + + x = [0, 0, 1, 1, 2, 4, 2, 1, 2, 0] + y = [1, 1, 1, 2, 2, 2, 2, 3, 2, 0] + dist_fun = manhattan_distances + w = 1 + # s = 1.2 + elif 0: # 2-D numeric + from sklearn.metrics.pairwise import euclidean_distances + + x = [[0, 0], [0, 1], [1, 1], [1, 2], [2, 2], [4, 3], [2, 3], [1, 1], [2, 2], [0, 1]] + y = [[1, 0], [1, 1], [1, 1], [2, 1], [4, 3], [4, 3], [2, 3], [3, 1], [1, 2], [1, 0]] + dist_fun = euclidean_distances + else: # 1-D list of strings + from nltk.metrics.distance import edit_distance + + # x = ['we', 'shelled', 'clams', 'for', 'the', 'chowder'] + # y = ['class', 'too'] + x = ['i', 'soon', 'found', 'myself', 'muttering', 'to', 'the', 'walls'] + y = ['see', 'drown', 'himself'] + # x = 'we talked about the situation'.split() + # y = 'we talked about the situation'.split() + dist_fun = edit_distance + dist, cost, acc, path = dtw(x, y, dist_fun, w=w, s=s) + + # Vizualize + from matplotlib import pyplot as plt + + plt.imshow(cost.T, origin='lower', cmap=plt.cm.Reds, interpolation='nearest') + plt.plot(path[0], path[1], '-o') # relation + plt.xticks(range(len(x)), x) + plt.yticks(range(len(y)), y) + plt.xlabel('x') + plt.ylabel('y') + plt.axis('tight') + if isinf(w): + plt.title('Minimum distance: {}, slope weight: {}'.format(dist, s)) + else: + plt.title('Minimum distance: {}, window widht: {}, slope weight: {}'.format(dist, w, s)) + plt.show() diff --git a/utils/metrics/laplace_var.py b/utils/metrics/laplace_var.py new file mode 100644 index 0000000000000000000000000000000000000000..ec6f5f8d877195e7ee512d7e9f6f8a879d3ef32c --- /dev/null +++ b/utils/metrics/laplace_var.py @@ -0,0 +1,4 @@ +import scipy.ndimage + +def laplace_var(x): + return scipy.ndimage.laplace(x).var() diff --git a/utils/metrics/pitch_distance.py b/utils/metrics/pitch_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..3bc11424a9f75270fc7eb5ef98731129e25ff715 --- /dev/null +++ b/utils/metrics/pitch_distance.py @@ -0,0 +1,102 @@ +import numpy as np +import matplotlib.pyplot as plt +from numba import jit + +import torch + + +@jit +def time_warp(costs): + dtw = np.zeros_like(costs) + dtw[0, 1:] = np.inf + dtw[1:, 0] = np.inf + eps = 1e-4 + for i in range(1, costs.shape[0]): + for j in range(1, costs.shape[1]): + dtw[i, j] = costs[i, j] + min(dtw[i - 1, j], dtw[i, j - 1], dtw[i - 1, j - 1]) + return dtw + + +def align_from_distances(distance_matrix, debug=False, return_mindist=False): + # for each position in spectrum 1, returns best 
match position in spectrum2 + # using monotonic alignment + dtw = time_warp(distance_matrix) + + i = distance_matrix.shape[0] - 1 + j = distance_matrix.shape[1] - 1 + results = [0] * distance_matrix.shape[0] + while i > 0 and j > 0: + results[i] = j + i, j = min([(i - 1, j), (i, j - 1), (i - 1, j - 1)], key=lambda x: dtw[x[0], x[1]]) + + if debug: + visual = np.zeros_like(dtw) + visual[range(len(results)), results] = 1 + plt.matshow(visual) + plt.show() + if return_mindist: + return results, dtw[-1, -1] + return results + + +def get_local_context(input_f, max_window=32, scale_factor=1.): + # input_f: [S, 1], support numpy array or torch tensor + # return hist: [S, max_window * 2], list of list + T = input_f.shape[0] + # max_window = int(max_window * scale_factor) + derivative = [[0 for _ in range(max_window * 2)] for _ in range(T)] + + for t in range(T): # travel the time series + for feat_idx in range(-max_window, max_window): + if t + feat_idx < 0 or t + feat_idx >= T: + value = 0 + else: + value = input_f[t + feat_idx] + derivative[t][feat_idx + max_window] = value + return derivative + + +def cal_localnorm_dist(src, tgt, src_len, tgt_len): + local_src = torch.tensor(get_local_context(src)) + local_tgt = torch.tensor(get_local_context(tgt, scale_factor=tgt_len / src_len)) + + local_norm_src = (local_src - local_src.mean(-1).unsqueeze(-1)) # / local_src.std(-1).unsqueeze(-1) # [T1, 32] + local_norm_tgt = (local_tgt - local_tgt.mean(-1).unsqueeze(-1)) # / local_tgt.std(-1).unsqueeze(-1) # [T2, 32] + + dists = torch.cdist(local_norm_src[None, :, :], local_norm_tgt[None, :, :]) # [1, T1, T2] + return dists + + +## here is API for one sample +def LoNDTWDistance(src, tgt): + # src: [S] + # tgt: [T] + dists = cal_localnorm_dist(src, tgt, src.shape[0], tgt.shape[0]) # [1, S, T] + costs = dists.squeeze(0) # [S, T] + alignment, min_distance = align_from_distances(costs.T.cpu().detach().numpy(), return_mindist=True) # [T] + return alignment, min_distance + +# if __name__ == '__main__': +# # utils from ns +# from utils.pitch_utils import denorm_f0 +# from tasks.singing.fsinging import FastSingingDataset +# from utils.hparams import hparams, set_hparams +# +# set_hparams() +# +# train_ds = FastSingingDataset('test') +# +# # Test One sample case +# sample = train_ds[0] +# amateur_f0 = sample['f0'] +# prof_f0 = sample['prof_f0'] +# +# amateur_uv = sample['uv'] +# amateur_padding = sample['mel2ph'] == 0 +# prof_uv = sample['prof_uv'] +# prof_padding = sample['prof_mel2ph'] == 0 +# amateur_f0_denorm = denorm_f0(amateur_f0, amateur_uv, hparams, pitch_padding=amateur_padding) +# prof_f0_denorm = denorm_f0(prof_f0, prof_uv, hparams, pitch_padding=prof_padding) +# alignment, min_distance = LoNDTWDistance(amateur_f0_denorm, prof_f0_denorm) +# print(min_distance) +# python utils/pitch_distance.py --config egs/datasets/audio/molar/svc_ppg.yaml diff --git a/utils/metrics/ssim.py b/utils/metrics/ssim.py new file mode 100644 index 0000000000000000000000000000000000000000..cb8c6a47b14fbd450a6717a21236906d6de9679f --- /dev/null +++ b/utils/metrics/ssim.py @@ -0,0 +1,84 @@ +""" +Adapted from https://github.com/Po-Hsun-Su/pytorch-ssim +""" + +import torch +import torch.nn.functional as F +from torch.autograd import Variable +import numpy as np +from math import exp + + +def gaussian(window_size, sigma): + gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)]) + return gauss / gauss.sum() + + +def create_window(window_size, channel): + _1D_window = 
gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) + window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous()) + return window + + +def _ssim(img1, img2, window, window_size, channel, size_average=True): + mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) + mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq + sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq + sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 + + C1 = 0.01 ** 2 + C2 = 0.03 ** 2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + else: + return ssim_map.mean(1) + + +class SSIM(torch.nn.Module): + def __init__(self, window_size=11, size_average=True): + super(SSIM, self).__init__() + self.window_size = window_size + self.size_average = size_average + self.channel = 1 + self.window = create_window(window_size, self.channel) + + def forward(self, img1, img2): + (_, channel, _, _) = img1.size() + + if channel == self.channel and self.window.data.type() == img1.data.type(): + window = self.window + else: + window = create_window(self.window_size, channel) + + if img1.is_cuda: + window = window.cuda(img1.get_device()) + window = window.type_as(img1) + + self.window = window + self.channel = channel + + return _ssim(img1, img2, window, self.window_size, channel, self.size_average) + + +window = None + + +def ssim(img1, img2, window_size=11, size_average=True): + (_, channel, _, _) = img1.size() + global window + if window is None: + window = create_window(window_size, channel) + if img1.is_cuda: + window = window.cuda(img1.get_device()) + window = window.type_as(img1) + return _ssim(img1, img2, window, window_size, channel, size_average) diff --git a/utils/nn/__pycache__/model_utils.cpython-36.pyc b/utils/nn/__pycache__/model_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e37bdf7a33c7a9464157c0eec4b671c6c11a84d Binary files /dev/null and b/utils/nn/__pycache__/model_utils.cpython-36.pyc differ diff --git a/utils/nn/__pycache__/model_utils.cpython-37.pyc b/utils/nn/__pycache__/model_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e052aab8ca0db08f9d79082c2fce32a766c0812e Binary files /dev/null and b/utils/nn/__pycache__/model_utils.cpython-37.pyc differ diff --git a/utils/nn/__pycache__/schedulers.cpython-36.pyc b/utils/nn/__pycache__/schedulers.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccb8de3ace020cb7ce17591872d8e37601fbbeb7 Binary files /dev/null and b/utils/nn/__pycache__/schedulers.cpython-36.pyc differ diff --git a/utils/nn/__pycache__/schedulers.cpython-37.pyc b/utils/nn/__pycache__/schedulers.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe6e1f1eac3f7447ab3c1a8fd633a9412a80214a Binary files /dev/null and b/utils/nn/__pycache__/schedulers.cpython-37.pyc differ diff --git a/utils/nn/__pycache__/seq_utils.cpython-36.pyc b/utils/nn/__pycache__/seq_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..727b331c30a171e52bc23ade3af671cd54a68bfe 
Binary files /dev/null and b/utils/nn/__pycache__/seq_utils.cpython-36.pyc differ diff --git a/utils/nn/__pycache__/seq_utils.cpython-37.pyc b/utils/nn/__pycache__/seq_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3bbf8ffccb44edf07ecaafbc2f49f11052dac93 Binary files /dev/null and b/utils/nn/__pycache__/seq_utils.cpython-37.pyc differ diff --git a/utils/nn/model_utils.py b/utils/nn/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b81200e9a2629ac4d791a37d31d5f13330aefd30 --- /dev/null +++ b/utils/nn/model_utils.py @@ -0,0 +1,14 @@ +import numpy as np + + +def print_arch(model, model_name='model'): + print(f"| {model_name} Arch: ", model) + num_params(model, model_name=model_name) + + +def num_params(model, print_out=True, model_name="model"): + parameters = filter(lambda p: p.requires_grad, model.parameters()) + parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000 + if print_out: + print(f'| {model_name} Trainable Parameters: %.3fM' % parameters) + return parameters diff --git a/utils/nn/schedulers.py b/utils/nn/schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..c91969dd8e01a8342488e060592700f3957c3651 --- /dev/null +++ b/utils/nn/schedulers.py @@ -0,0 +1,57 @@ +class NoneSchedule(object): + def __init__(self, optimizer, lr): + self.optimizer = optimizer + self.constant_lr = lr + self.step(0) + + def step(self, num_updates): + self.lr = self.constant_lr + for param_group in self.optimizer.param_groups: + param_group['lr'] = self.lr + return self.lr + + def get_lr(self): + return self.optimizer.param_groups[0]['lr'] + + def get_last_lr(self): + return self.get_lr() + + +class RSQRTSchedule(NoneSchedule): + def __init__(self, optimizer, lr, warmup_updates, hidden_size): + self.optimizer = optimizer + self.constant_lr = lr + self.warmup_updates = warmup_updates + self.hidden_size = hidden_size + self.lr = lr + for param_group in optimizer.param_groups: + param_group['lr'] = self.lr + self.step(0) + + def step(self, num_updates): + constant_lr = self.constant_lr + warmup = min(num_updates / self.warmup_updates, 1.0) + rsqrt_decay = max(self.warmup_updates, num_updates) ** -0.5 + rsqrt_hidden = self.hidden_size ** -0.5 + self.lr = max(constant_lr * warmup * rsqrt_decay * rsqrt_hidden, 1e-7) + for param_group in self.optimizer.param_groups: + param_group['lr'] = self.lr + return self.lr + + +class WarmupSchedule(NoneSchedule): + def __init__(self, optimizer, lr, warmup_updates): + self.optimizer = optimizer + self.constant_lr = self.lr = lr + self.warmup_updates = warmup_updates + for param_group in optimizer.param_groups: + param_group['lr'] = self.lr + self.step(0) + + def step(self, num_updates): + constant_lr = self.constant_lr + warmup = min(num_updates / self.warmup_updates, 1.0) + self.lr = max(constant_lr * warmup, 1e-7) + for param_group in self.optimizer.param_groups: + param_group['lr'] = self.lr + return self.lr diff --git a/utils/nn/seq_utils.py b/utils/nn/seq_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1308bf7d1806a6c36de9c8af5e9d217eaefa7b56 --- /dev/null +++ b/utils/nn/seq_utils.py @@ -0,0 +1,305 @@ +from collections import defaultdict +import torch +import torch.nn.functional as F + + +def make_positions(tensor, padding_idx): + """Replace non-padding symbols with their position numbers. + + Position numbers begin at padding_idx+1. Padding symbols are ignored. 
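+
+    Illustrative example (assuming padding_idx=0):
+
+    >>> make_positions(torch.tensor([[7, 8, 0]]), padding_idx=0)
+    tensor([[1, 2, 0]])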
+ """ + # The series of casts and type-conversions here are carefully + # balanced to both work with ONNX export and XLA. In particular XLA + # prefers ints, cumsum defaults to output longs, and ONNX doesn't know + # how to handle the dtype kwarg in cumsum. + mask = tensor.ne(padding_idx).int() + return ( + torch.cumsum(mask, dim=1).type_as(mask) * mask + ).long() + padding_idx + + +def softmax(x, dim): + return F.softmax(x, dim=dim, dtype=torch.float32) + + +def sequence_mask(lengths, maxlen, dtype=torch.bool): + if maxlen is None: + maxlen = lengths.max() + mask = ~(torch.ones((len(lengths), maxlen)).to(lengths.device).cumsum(dim=1).t() > lengths).t() + mask.type(dtype) + return mask + + +def weights_nonzero_speech(target): + # target : B x T x mel + # Assign weight 1.0 to all labels except for padding (id=0). + dim = target.size(-1) + return target.abs().sum(-1, keepdim=True).ne(0).float().repeat(1, 1, dim) + + +INCREMENTAL_STATE_INSTANCE_ID = defaultdict(lambda: 0) + + +def _get_full_incremental_state_key(module_instance, key): + module_name = module_instance.__class__.__name__ + + # assign a unique ID to each module instance, so that incremental state is + # not shared across module instances + if not hasattr(module_instance, '_instance_id'): + INCREMENTAL_STATE_INSTANCE_ID[module_name] += 1 + module_instance._instance_id = INCREMENTAL_STATE_INSTANCE_ID[module_name] + + return '{}.{}.{}'.format(module_name, module_instance._instance_id, key) + + +def get_incremental_state(module, incremental_state, key): + """Helper for getting incremental state for an nn.Module.""" + full_key = _get_full_incremental_state_key(module, key) + if incremental_state is None or full_key not in incremental_state: + return None + return incremental_state[full_key] + + +def set_incremental_state(module, incremental_state, key, value): + """Helper for setting incremental state for an nn.Module.""" + if incremental_state is not None: + full_key = _get_full_incremental_state_key(module, key) + incremental_state[full_key] = value + + +def fill_with_neg_inf(t): + """FP16-compatible function that fills a tensor with -inf.""" + return t.float().fill_(float('-inf')).type_as(t) + + +def fill_with_neg_inf2(t): + """FP16-compatible function that fills a tensor with -inf.""" + return t.float().fill_(-1e8).type_as(t) + + +def select_attn(attn_logits, type='best'): + """ + + :param attn_logits: [n_layers, B, n_head, T_sp, T_txt] + :return: + """ + encdec_attn = torch.stack(attn_logits, 0).transpose(1, 2) + # [n_layers * n_head, B, T_sp, T_txt] + encdec_attn = (encdec_attn.reshape([-1, *encdec_attn.shape[2:]])).softmax(-1) + if type == 'best': + indices = encdec_attn.max(-1).values.sum(-1).argmax(0) + encdec_attn = encdec_attn.gather( + 0, indices[None, :, None, None].repeat(1, 1, encdec_attn.size(-2), encdec_attn.size(-1)))[0] + return encdec_attn + elif type == 'mean': + return encdec_attn.mean(0) + + +def make_pad_mask(lengths, xs=None, length_dim=-1): + """Make mask tensor containing indices of padded part. + Args: + lengths (LongTensor or List): Batch of lengths (B,). + xs (Tensor, optional): The reference tensor. + If set, masks will be the same shape as this tensor. + length_dim (int, optional): Dimension indicator of the above tensor. + See the example. + Returns: + Tensor: Mask tensor containing indices of padded part. + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (including 1.2) + Examples: + With only lengths. 
+ >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[0, 0, 0, 0 ,0], + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] + With the reference tensor. + >>> xs = torch.zeros((3, 2, 4)) + >>> make_pad_mask(lengths, xs) + tensor([[[0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 0, 0, 1], + [0, 0, 0, 1]], + [[0, 0, 1, 1], + [0, 0, 1, 1]]], dtype=torch.uint8) + >>> xs = torch.zeros((3, 2, 6)) + >>> make_pad_mask(lengths, xs) + tensor([[[0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1]], + [[0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1]], + [[0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1]]], dtype=torch.uint8) + With the reference tensor and dimension indicator. + >>> xs = torch.zeros((3, 6, 6)) + >>> make_pad_mask(lengths, xs, 1) + tensor([[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1]], + [[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1]], + [[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1]]], dtype=torch.uint8) + >>> make_pad_mask(lengths, xs, 2) + tensor([[[0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 1]], + [[0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 1, 1]], + [[0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1]]], dtype=torch.uint8) + """ + if length_dim == 0: + raise ValueError("length_dim cannot be 0: {}".format(length_dim)) + + if not isinstance(lengths, list): + lengths = lengths.tolist() + bs = int(len(lengths)) + if xs is None: + maxlen = int(max(lengths)) + else: + maxlen = xs.size(length_dim) + + seq_range = torch.arange(0, maxlen, dtype=torch.int64) + seq_range_expand = seq_range.unsqueeze(0).expand(bs, maxlen) + seq_length_expand = seq_range_expand.new(lengths).unsqueeze(-1) + mask = seq_range_expand >= seq_length_expand + + if xs is not None: + assert xs.size(0) == bs, (xs.size(0), bs) + + if length_dim < 0: + length_dim = xs.dim() + length_dim + # ind = (:, None, ..., None, :, , None, ..., None) + ind = tuple( + slice(None) if i in (0, length_dim) else None for i in range(xs.dim()) + ) + mask = mask[ind].expand_as(xs).to(xs.device) + return mask + + +def make_non_pad_mask(lengths, xs=None, length_dim=-1): + """Make mask tensor containing indices of non-padded part. + Args: + lengths (LongTensor or List): Batch of lengths (B,). + xs (Tensor, optional): The reference tensor. + If set, masks will be the same shape as this tensor. + length_dim (int, optional): Dimension indicator of the above tensor. + See the example. + Returns: + ByteTensor: mask tensor containing indices of padded part. + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (including 1.2) + Examples: + With only lengths. + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[1, 1, 1, 1 ,1], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0]] + With the reference tensor. 
+ >>> xs = torch.zeros((3, 2, 4)) + >>> make_non_pad_mask(lengths, xs) + tensor([[[1, 1, 1, 1], + [1, 1, 1, 1]], + [[1, 1, 1, 0], + [1, 1, 1, 0]], + [[1, 1, 0, 0], + [1, 1, 0, 0]]], dtype=torch.uint8) + >>> xs = torch.zeros((3, 2, 6)) + >>> make_non_pad_mask(lengths, xs) + tensor([[[1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0]], + [[1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0]], + [[1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0]]], dtype=torch.uint8) + With the reference tensor and dimension indicator. + >>> xs = torch.zeros((3, 6, 6)) + >>> make_non_pad_mask(lengths, xs, 1) + tensor([[[1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0]], + [[1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], + [[1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]]], dtype=torch.uint8) + >>> make_non_pad_mask(lengths, xs, 2) + tensor([[[1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0]], + [[1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0]], + [[1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0]]], dtype=torch.uint8) + """ + return ~make_pad_mask(lengths, xs, length_dim) + + +def get_mask_from_lengths(lengths): + max_len = torch.max(lengths).item() + ids = torch.arange(0, max_len).to(lengths.device) + mask = (ids < lengths.unsqueeze(1)).bool() + return mask + + +def group_hidden_by_segs(h, seg_ids, max_len): + """ + + :param h: [B, T, H] + :param seg_ids: [B, T] + :return: h_ph: [B, T_ph, H] + """ + B, T, H = h.shape + h_gby_segs = h.new_zeros([B, max_len + 1, H]).scatter_add_(1, seg_ids[:, :, None].repeat([1, 1, H]), h) + all_ones = h.new_ones(h.shape[:2]) + cnt_gby_segs = h.new_zeros([B, max_len + 1]).scatter_add_(1, seg_ids, all_ones).contiguous() + h_gby_segs = h_gby_segs[:, 1:] + cnt_gby_segs = cnt_gby_segs[:, 1:] + h_gby_segs = h_gby_segs / torch.clamp(cnt_gby_segs[:, :, None], min=1) + return h_gby_segs, cnt_gby_segs diff --git a/utils/os_utils.py b/utils/os_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4567d17c398c535884600cdd86a36a823acb886f --- /dev/null +++ b/utils/os_utils.py @@ -0,0 +1,20 @@ +import os +import subprocess + + +def link_file(from_file, to_file): + subprocess.check_call( + f'ln -s "`realpath --relative-to="{os.path.dirname(to_file)}" "{from_file}"`" "{to_file}"', shell=True) + + +def move_file(from_file, to_file): + subprocess.check_call(f'mv "{from_file}" "{to_file}"', shell=True) + + +def copy_file(from_file, to_file): + subprocess.check_call(f'cp -r "{from_file}" "{to_file}"', shell=True) + + +def remove_file(*fns): + for f in fns: + subprocess.check_call(f'rm -rf "{f}"', shell=True) diff --git a/utils/plot/__pycache__/plot.cpython-36.pyc b/utils/plot/__pycache__/plot.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3dae051d7004f6c0ce52782bc1396d6fe2b370e8 Binary files /dev/null and b/utils/plot/__pycache__/plot.cpython-36.pyc differ diff --git a/utils/plot/__pycache__/plot.cpython-37.pyc b/utils/plot/__pycache__/plot.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51af86f6ce9b0381b0b6affc45c9f0d569fec29b Binary files /dev/null and 
b/utils/plot/__pycache__/plot.cpython-37.pyc differ diff --git a/utils/plot/plot.py b/utils/plot/plot.py new file mode 100644 index 0000000000000000000000000000000000000000..9d7fc02cef69fa5517228437156e687ca054efc8 --- /dev/null +++ b/utils/plot/plot.py @@ -0,0 +1,51 @@ +import matplotlib + +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import numpy as np +import torch + +LINE_COLORS = ['w', 'r', 'orange', 'k', 'cyan', 'm', 'b', 'lime', 'g', 'brown', 'navy'] + + +def spec_to_figure(spec, vmin=None, vmax=None, title='', f0s=None, dur_info=None): + if isinstance(spec, torch.Tensor): + spec = spec.cpu().numpy() + H = spec.shape[1] // 2 + fig = plt.figure(figsize=(12, 6)) + plt.title(title) + plt.pcolor(spec.T, vmin=vmin, vmax=vmax) + if dur_info is not None: + assert isinstance(dur_info, dict) + txt = dur_info['txt'] + dur_gt = dur_info['dur_gt'] + if isinstance(dur_gt, torch.Tensor): + dur_gt = dur_gt.cpu().numpy() + dur_gt = np.cumsum(dur_gt).astype(int) + for i in range(len(dur_gt)): + shift = (i % 8) + 1 + plt.text(dur_gt[i], shift * 4, txt[i]) + plt.vlines(dur_gt[i], 0, H // 2, colors='b') # blue is gt + plt.xlim(0, dur_gt[-1]) + if 'dur_pred' in dur_info: + dur_pred = dur_info['dur_pred'] + if isinstance(dur_pred, torch.Tensor): + dur_pred = dur_pred.cpu().numpy() + dur_pred = np.cumsum(dur_pred).astype(int) + for i in range(len(dur_pred)): + shift = (i % 8) + 1 + plt.text(dur_pred[i], H + shift * 4, txt[i]) + plt.vlines(dur_pred[i], H, H * 1.5, colors='r') # red is pred + plt.xlim(0, max(dur_gt[-1], dur_pred[-1])) + if f0s is not None: + ax = plt.gca() + ax2 = ax.twinx() + if not isinstance(f0s, dict): + f0s = {'f0': f0s} + for i, (k, f0) in enumerate(f0s.items()): + if isinstance(f0, torch.Tensor): + f0 = f0.cpu().numpy() + ax2.plot(f0, label=k, c=LINE_COLORS[i], linewidth=1, alpha=0.5) + ax2.set_ylim(0, 1000) + ax2.legend() + return fig diff --git a/utils/text/__pycache__/text_encoder.cpython-36.pyc b/utils/text/__pycache__/text_encoder.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07b013465e3fabb8fc742f674311012fe5660b26 Binary files /dev/null and b/utils/text/__pycache__/text_encoder.cpython-36.pyc differ diff --git a/utils/text/__pycache__/text_encoder.cpython-37.pyc b/utils/text/__pycache__/text_encoder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..afe5aa3888b16c3669ba931d0b14ac2ab1d4bc72 Binary files /dev/null and b/utils/text/__pycache__/text_encoder.cpython-37.pyc differ diff --git a/utils/text/__pycache__/text_norm.cpython-36.pyc b/utils/text/__pycache__/text_norm.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a92623fdda26c2419d370f2940ff834668777fb2 Binary files /dev/null and b/utils/text/__pycache__/text_norm.cpython-36.pyc differ diff --git a/utils/text/__pycache__/text_norm.cpython-37.pyc b/utils/text/__pycache__/text_norm.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66d5b8306a33fb43cc4c42bf27b96abf130df94f Binary files /dev/null and b/utils/text/__pycache__/text_norm.cpython-37.pyc differ diff --git a/utils/text/encoding.py b/utils/text/encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..f09f514613fd44a27450fe7c04cbdf5ebfbe78a8 --- /dev/null +++ b/utils/text/encoding.py @@ -0,0 +1,9 @@ +import chardet + + +def get_encoding(file): + with open(file, 'rb') as f: + encoding = chardet.detect(f.read())['encoding'] + if encoding == 'GB2312': + encoding = 'GB18030' + return encoding diff 
--git a/utils/text/text_encoder.py b/utils/text/text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..09555af09720382a795712f0fdd9b711c5b19e02 --- /dev/null +++ b/utils/text/text_encoder.py @@ -0,0 +1,263 @@ +import json +import re +import six +from six.moves import range # pylint: disable=redefined-builtin + +PAD = "" +EOS = "" +UNK = "" +SEG = "|" +PUNCS = '!,.?;:' +RESERVED_TOKENS = [PAD, EOS, UNK] +NUM_RESERVED_TOKENS = len(RESERVED_TOKENS) +PAD_ID = RESERVED_TOKENS.index(PAD) # Normally 0 +EOS_ID = RESERVED_TOKENS.index(EOS) # Normally 1 +UNK_ID = RESERVED_TOKENS.index(UNK) # Normally 2 + +if six.PY2: + RESERVED_TOKENS_BYTES = RESERVED_TOKENS +else: + RESERVED_TOKENS_BYTES = [bytes(PAD, "ascii"), bytes(EOS, "ascii")] + +# Regular expression for unescaping token strings. +# '\u' is converted to '_' +# '\\' is converted to '\' +# '\213;' is converted to unichr(213) +_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);") +_ESCAPE_CHARS = set(u"\\_u;0123456789") + + +def strip_ids(ids, ids_to_strip): + """Strip ids_to_strip from the end ids.""" + ids = list(ids) + while ids and ids[-1] in ids_to_strip: + ids.pop() + return ids + + +class TextEncoder(object): + """Base class for converting from ints to/from human readable strings.""" + + def __init__(self, num_reserved_ids=NUM_RESERVED_TOKENS): + self._num_reserved_ids = num_reserved_ids + + @property + def num_reserved_ids(self): + return self._num_reserved_ids + + def encode(self, s): + """Transform a human-readable string into a sequence of int ids. + + The ids should be in the range [num_reserved_ids, vocab_size). Ids [0, + num_reserved_ids) are reserved. + + EOS is not appended. + + Args: + s: human-readable string to be converted. + + Returns: + ids: list of integers + """ + return [int(w) + self._num_reserved_ids for w in s.split()] + + def decode(self, ids, strip_extraneous=False): + """Transform a sequence of int ids into a human-readable string. + + EOS is not expected in ids. + + Args: + ids: list of integers to be converted. + strip_extraneous: bool, whether to strip off extraneous tokens + (EOS and PAD). + + Returns: + s: human-readable string. + """ + if strip_extraneous: + ids = strip_ids(ids, list(range(self._num_reserved_ids or 0))) + return " ".join(self.decode_list(ids)) + + def decode_list(self, ids): + """Transform a sequence of int ids into a their string versions. + + This method supports transforming individual input/output ids to their + string versions so that sequence to/from text conversions can be visualized + in a human readable format. + + Args: + ids: list of integers to be converted. + + Returns: + strs: list of human-readable string. + """ + decoded_ids = [] + for id_ in ids: + if 0 <= id_ < self._num_reserved_ids: + decoded_ids.append(RESERVED_TOKENS[int(id_)]) + else: + decoded_ids.append(id_ - self._num_reserved_ids) + return [str(d) for d in decoded_ids] + + @property + def vocab_size(self): + raise NotImplementedError() + + +class TokenTextEncoder(TextEncoder): + """Encoder based on a user-supplied vocabulary (file or list).""" + + def __init__(self, + vocab_filename, + reverse=False, + vocab_list=None, + replace_oov=None, + num_reserved_ids=NUM_RESERVED_TOKENS): + """Initialize from a file or list, one token per line. + + Handling of reserved tokens works as follows: + - When initializing from a list, we add reserved tokens to the vocab. + - When initializing from a file, we do not add reserved tokens to the vocab. 
+ - When saving vocab files, we save reserved tokens to the file. + + Args: + vocab_filename: If not None, the full filename to read vocab from. If this + is not None, then vocab_list should be None. + reverse: Boolean indicating if tokens should be reversed during encoding + and decoding. + vocab_list: If not None, a list of elements of the vocabulary. If this is + not None, then vocab_filename should be None. + replace_oov: If not None, every out-of-vocabulary token seen when + encoding will be replaced by this string (which must be in vocab). + num_reserved_ids: Number of IDs to save for reserved tokens like . + """ + super(TokenTextEncoder, self).__init__(num_reserved_ids=num_reserved_ids) + self._reverse = reverse + self._replace_oov = replace_oov + if vocab_filename: + self._init_vocab_from_file(vocab_filename) + else: + assert vocab_list is not None + self._init_vocab_from_list(vocab_list) + self.pad_index = self.token_to_id[PAD] + self.eos_index = self.token_to_id[EOS] + self.unk_index = self.token_to_id[UNK] + self.seg_index = self.token_to_id[SEG] if SEG in self.token_to_id else self.eos_index + + def encode(self, s): + """Converts a space-separated string of tokens to a list of ids.""" + sentence = s + tokens = sentence.strip().split() + if self._replace_oov is not None: + tokens = [t if t in self.token_to_id else self._replace_oov + for t in tokens] + ret = [self.token_to_id[tok] for tok in tokens] + return ret[::-1] if self._reverse else ret + + def decode(self, ids, strip_eos=False, strip_padding=False): + if strip_padding and self.pad() in list(ids): + pad_pos = list(ids).index(self.pad()) + ids = ids[:pad_pos] + if strip_eos and self.eos() in list(ids): + eos_pos = list(ids).index(self.eos()) + ids = ids[:eos_pos] + return " ".join(self.decode_list(ids)) + + def decode_list(self, ids): + seq = reversed(ids) if self._reverse else ids + return [self._safe_id_to_token(i) for i in seq] + + @property + def vocab_size(self): + return len(self.id_to_token) + + def __len__(self): + return self.vocab_size + + def _safe_id_to_token(self, idx): + return self.id_to_token.get(idx, "ID_%d" % idx) + + def _init_vocab_from_file(self, filename): + """Load vocab from a file. + + Args: + filename: The file to load vocabulary from. + """ + with open(filename) as f: + tokens = [token.strip() for token in f.readlines()] + + def token_gen(): + for token in tokens: + yield token + + self._init_vocab(token_gen(), add_reserved_tokens=False) + + def _init_vocab_from_list(self, vocab_list): + """Initialize tokens from a list of tokens. + + It is ok if reserved tokens appear in the vocab list. They will be + removed. The set of tokens in vocab_list should be unique. + + Args: + vocab_list: A list of tokens. 
+ """ + + def token_gen(): + for token in vocab_list: + if token not in RESERVED_TOKENS: + yield token + + self._init_vocab(token_gen()) + + def _init_vocab(self, token_generator, add_reserved_tokens=True): + """Initialize vocabulary with tokens from token_generator.""" + + self.id_to_token = {} + non_reserved_start_index = 0 + + if add_reserved_tokens: + self.id_to_token.update(enumerate(RESERVED_TOKENS)) + non_reserved_start_index = len(RESERVED_TOKENS) + + self.id_to_token.update( + enumerate(token_generator, start=non_reserved_start_index)) + + # _token_to_id is the reverse of _id_to_token + self.token_to_id = dict((v, k) for k, v in six.iteritems(self.id_to_token)) + + def pad(self): + return self.pad_index + + def eos(self): + return self.eos_index + + def unk(self): + return self.unk_index + + def seg(self): + return self.seg_index + + def store_to_file(self, filename): + """Write vocab file to disk. + + Vocab files have one token per line. The file ends in a newline. Reserved + tokens are written to the vocab file as well. + + Args: + filename: Full path of the file to store the vocab to. + """ + with open(filename, "w") as f: + for i in range(len(self.id_to_token)): + f.write(self.id_to_token[i] + "\n") + + def sil_phonemes(self): + return [p for p in self.id_to_token.values() if is_sil_phoneme(p)] + + +def build_token_encoder(token_list_file): + token_list = json.load(open(token_list_file)) + return TokenTextEncoder(None, vocab_list=token_list, replace_oov='') + + +def is_sil_phoneme(p): + return p == '' or not p[0].isalpha() diff --git a/utils/text/text_norm.py b/utils/text/text_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..863c2fb235e209f25cce954ec9b585cb6fe13c96 --- /dev/null +++ b/utils/text/text_norm.py @@ -0,0 +1,797 @@ +# coding=utf-8 +# Authors: +# 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git) +# 2019.9 Jiayu DU +# +# requirements: +# - python 3.X +# notes: python 2.X WILL fail or produce misleading results + +import sys, os, argparse, codecs, string, re + +# ================================================================================ # +# basic constant +# ================================================================================ # +CHINESE_DIGIS = u'零一二三四五六七八九' +BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖' +BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖' +SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万' +SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬' +LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载' +LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載' +SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万' +SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬' + +ZERO_ALT = u'〇' +ONE_ALT = u'幺' +TWO_ALTS = [u'两', u'兩'] + +POSITIVE = [u'正', u'正'] +NEGATIVE = [u'负', u'負'] +POINT = [u'点', u'點'] +# PLUS = [u'加', u'加'] +# SIL = [u'杠', u'槓'] + +# 中文数字系统类型 +NUMBERING_TYPES = ['low', 'mid', 'high'] + +CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \ + '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)' +CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \ + '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \ + '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \ + '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \ + '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \ + '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)' + +# 
punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git) +CHINESE_PUNC_STOP = '!?。。' +CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏' +CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP + + +# ================================================================================ # +# basic class +# ================================================================================ # +class ChineseChar(object): + """ + 中文字符 + 每个字符对应简体和繁体, + e.g. 简体 = '负', 繁体 = '負' + 转换时可转换为简体或繁体 + """ + + def __init__(self, simplified, traditional): + self.simplified = simplified + self.traditional = traditional + # self.__repr__ = self.__str__ + + def __str__(self): + return self.simplified or self.traditional or None + + def __repr__(self): + return self.__str__() + + +class ChineseNumberUnit(ChineseChar): + """ + 中文数字/数位字符 + 每个字符除繁简体外还有一个额外的大写字符 + e.g. '陆' 和 '陸' + """ + + def __init__(self, power, simplified, traditional, big_s, big_t): + super(ChineseNumberUnit, self).__init__(simplified, traditional) + self.power = power + self.big_s = big_s + self.big_t = big_t + + def __str__(self): + return '10^{}'.format(self.power) + + @classmethod + def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False): + + if small_unit: + return ChineseNumberUnit(power=index + 1, + simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1]) + elif numbering_type == NUMBERING_TYPES[0]: + return ChineseNumberUnit(power=index + 8, + simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) + elif numbering_type == NUMBERING_TYPES[1]: + return ChineseNumberUnit(power=(index + 2) * 4, + simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) + elif numbering_type == NUMBERING_TYPES[2]: + return ChineseNumberUnit(power=pow(2, index + 3), + simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) + else: + raise ValueError( + 'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type)) + + +class ChineseNumberDigit(ChineseChar): + """ + 中文数字字符 + """ + + def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None): + super(ChineseNumberDigit, self).__init__(simplified, traditional) + self.value = value + self.big_s = big_s + self.big_t = big_t + self.alt_s = alt_s + self.alt_t = alt_t + + def __str__(self): + return str(self.value) + + @classmethod + def create(cls, i, v): + return ChineseNumberDigit(i, v[0], v[1], v[2], v[3]) + + +class ChineseMath(ChineseChar): + """ + 中文数位字符 + """ + + def __init__(self, simplified, traditional, symbol, expression=None): + super(ChineseMath, self).__init__(simplified, traditional) + self.symbol = symbol + self.expression = expression + self.big_s = simplified + self.big_t = traditional + + +CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath + + +class NumberSystem(object): + """ + 中文数字系统 + """ + pass + + +class MathSymbol(object): + """ + 用于中文数字系统的数学符号 (繁/简体), e.g. 
+    positive = ['正', '正']
+    negative = ['负', '負']
+    point = ['点', '點']
+    """
+
+    def __init__(self, positive, negative, point):
+        self.positive = positive
+        self.negative = negative
+        self.point = point
+
+    def __iter__(self):
+        for v in self.__dict__.values():
+            yield v
+
+
+# class OtherSymbol(object):
+#     """
+#     Other symbols
+#     """
+#
+#     def __init__(self, sil):
+#         self.sil = sil
+#
+#     def __iter__(self):
+#         for v in self.__dict__.values():
+#             yield v
+
+
+# ================================================================================ #
+# basic utils
+# ================================================================================ #
+def create_system(numbering_type=NUMBERING_TYPES[1]):
+    """
+    Create and return a Chinese numbering system of the given type (default: mid).
+    NUMBERING_TYPES = ['low', 'mid', 'high'] are the Chinese numbering-system types:
+        low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
+        mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
+        high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
+    Returns the corresponding number system.
+    """
+
+    # chinese number units of '亿' and larger
+    all_larger_units = zip(
+        LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
+    larger_units = [CNU.create(i, v, numbering_type, False)
+                    for i, v in enumerate(all_larger_units)]
+    # chinese number units of '十, 百, 千, 万'
+    all_smaller_units = zip(
+        SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
+    smaller_units = [CNU.create(i, v, small_unit=True)
+                     for i, v in enumerate(all_smaller_units)]
+    # digits
+    chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS,
+                        BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
+    digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
+    digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
+    digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
+    digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
+
+    # symbols
+    positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
+    negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
+    point_cn = CM(POINT[0], POINT[1], '.', lambda x,
+                  y: float(str(x) + '.'
+ str(y))) + # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y))) + system = NumberSystem() + system.units = smaller_units + larger_units + system.digits = digits + system.math = MathSymbol(positive_cn, negative_cn, point_cn) + # system.symbols = OtherSymbol(sil_cn) + return system + + +def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]): + def get_symbol(char, system): + for u in system.units: + if char in [u.traditional, u.simplified, u.big_s, u.big_t]: + return u + for d in system.digits: + if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]: + return d + for m in system.math: + if char in [m.traditional, m.simplified]: + return m + + def string2symbols(chinese_string, system): + int_string, dec_string = chinese_string, '' + for p in [system.math.point.simplified, system.math.point.traditional]: + if p in chinese_string: + int_string, dec_string = chinese_string.split(p) + break + return [get_symbol(c, system) for c in int_string], \ + [get_symbol(c, system) for c in dec_string] + + def correct_symbols(integer_symbols, system): + """ + 一百八 to 一百八十 + 一亿一千三百万 to 一亿 一千万 三百万 + """ + + if integer_symbols and isinstance(integer_symbols[0], CNU): + if integer_symbols[0].power == 1: + integer_symbols = [system.digits[1]] + integer_symbols + + if len(integer_symbols) > 1: + if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU): + integer_symbols.append( + CNU(integer_symbols[-2].power - 1, None, None, None, None)) + + result = [] + unit_count = 0 + for s in integer_symbols: + if isinstance(s, CND): + result.append(s) + unit_count = 0 + elif isinstance(s, CNU): + current_unit = CNU(s.power, None, None, None, None) + unit_count += 1 + + if unit_count == 1: + result.append(current_unit) + elif unit_count > 1: + for i in range(len(result)): + if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power: + result[-i - 1] = CNU(result[-i - 1].power + + current_unit.power, None, None, None, None) + return result + + def compute_value(integer_symbols): + """ + Compute the value. + When current unit is larger than previous unit, current unit * all previous units will be used as all previous units. + e.g. 
'两千万' = 2000 * 10000 not 2000 + 10000 + """ + value = [0] + last_power = 0 + for s in integer_symbols: + if isinstance(s, CND): + value[-1] = s.value + elif isinstance(s, CNU): + value[-1] *= pow(10, s.power) + if s.power > last_power: + value[:-1] = list(map(lambda v: v * + pow(10, s.power), value[:-1])) + last_power = s.power + value.append(0) + return sum(value) + + system = create_system(numbering_type) + int_part, dec_part = string2symbols(chinese_string, system) + int_part = correct_symbols(int_part, system) + int_str = str(compute_value(int_part)) + dec_str = ''.join([str(d.value) for d in dec_part]) + if dec_part: + return '{0}.{1}'.format(int_str, dec_str) + else: + return int_str + + +def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False, + traditional=False, alt_zero=False, alt_one=False, alt_two=True, + use_zeros=True, use_units=True): + def get_value(value_string, use_zeros=True): + + striped_string = value_string.lstrip('0') + + # record nothing if all zeros + if not striped_string: + return [] + + # record one digits + elif len(striped_string) == 1: + if use_zeros and len(value_string) != len(striped_string): + return [system.digits[0], system.digits[int(striped_string)]] + else: + return [system.digits[int(striped_string)]] + + # recursively record multiple digits + else: + result_unit = next(u for u in reversed( + system.units) if u.power < len(striped_string)) + result_string = value_string[:-result_unit.power] + return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:]) + + system = create_system(numbering_type) + + int_dec = number_string.split('.') + if len(int_dec) == 1: + int_string = int_dec[0] + dec_string = "" + elif len(int_dec) == 2: + int_string = int_dec[0] + dec_string = int_dec[1] + else: + raise ValueError( + "invalid input num string with more than one dot: {}".format(number_string)) + + if use_units and len(int_string) > 1: + result_symbols = get_value(int_string) + else: + result_symbols = [system.digits[int(c)] for c in int_string] + dec_symbols = [system.digits[int(c)] for c in dec_string] + if dec_string: + result_symbols += [system.math.point] + dec_symbols + + if alt_two: + liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t, + system.digits[2].big_s, system.digits[2].big_t) + for i, v in enumerate(result_symbols): + if isinstance(v, CND) and v.value == 2: + next_symbol = result_symbols[i + + 1] if i < len(result_symbols) - 1 else None + previous_symbol = result_symbols[i - 1] if i > 0 else None + if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): + if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): + result_symbols[i] = liang + + # if big is True, '两' will not be used and `alt_two` has no impact on output + if big: + attr_name = 'big_' + if traditional: + attr_name += 't' + else: + attr_name += 's' + else: + if traditional: + attr_name = 'traditional' + else: + attr_name = 'simplified' + + result = ''.join([getattr(s, attr_name) for s in result_symbols]) + + # if not use_zeros: + # result = result.strip(getattr(system.digits[0], attr_name)) + + if alt_zero: + result = result.replace( + getattr(system.digits[0], attr_name), system.digits[0].alt_s) + + if alt_one: + result = result.replace( + getattr(system.digits[1], attr_name), system.digits[1].alt_s) + + for i, p in enumerate(POINT): + if result.startswith(p): + return CHINESE_DIGIS[0] + result + + # ^10, 11, .., 19 + if len(result) >= 2 and result[1] in 
[SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
+                                          SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \
+            result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]:
+        result = result[1:]
+
+    return result
+
+
+# ================================================================================ #
+# different types of rewriters
+# ================================================================================ #
+class Cardinal:
+    """
+    CARDINAL class
+    """
+
+    def __init__(self, cardinal=None, chntext=None):
+        self.cardinal = cardinal
+        self.chntext = chntext
+
+    def chntext2cardinal(self):
+        return chn2num(self.chntext)
+
+    def cardinal2chntext(self):
+        return num2chn(self.cardinal)
+
+
+class Digit:
+    """
+    DIGIT class
+    """
+
+    def __init__(self, digit=None, chntext=None):
+        self.digit = digit
+        self.chntext = chntext
+
+    # def chntext2digit(self):
+    #     return chn2num(self.chntext)
+
+    def digit2chntext(self):
+        return num2chn(self.digit, alt_two=False, use_units=False)
+
+
+class TelePhone:
+    """
+    TELEPHONE class
+    """
+
+    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
+        self.telephone = telephone
+        self.raw_chntext = raw_chntext
+        self.chntext = chntext
+
+    # def chntext2telephone(self):
+    #     sil_parts = self.raw_chntext.split('<SIL>')
+    #     self.telephone = '-'.join([
+    #         str(chn2num(p)) for p in sil_parts
+    #     ])
+    #     return self.telephone
+
+    def telephone2chntext(self, fixed=False):
+
+        if fixed:
+            sil_parts = self.telephone.split('-')
+            self.raw_chntext = '<SIL>'.join([
+                num2chn(part, alt_two=False, use_units=False) for part in sil_parts
+            ])
+            self.chntext = self.raw_chntext.replace('<SIL>', '')
+        else:
+            sp_parts = self.telephone.strip('+').split()
+            self.raw_chntext = '<SP>'.join([
+                num2chn(part, alt_two=False, use_units=False) for part in sp_parts
+            ])
+            self.chntext = self.raw_chntext.replace('<SP>', '')
+        return self.chntext
+
+
+class Fraction:
+    """
+    FRACTION class
+    """
+
+    def __init__(self, fraction=None, chntext=None):
+        self.fraction = fraction
+        self.chntext = chntext
+
+    def chntext2fraction(self):
+        denominator, numerator = self.chntext.split('分之')
+        return chn2num(numerator) + '/' + chn2num(denominator)
+
+    def fraction2chntext(self):
+        numerator, denominator = self.fraction.split('/')
+        return num2chn(denominator) + '分之' + num2chn(numerator)
+
+
+class Date:
+    """
+    DATE class
+    """
+
+    def __init__(self, date=None, chntext=None):
+        self.date = date
+        self.chntext = chntext
+
+    # def chntext2date(self):
+    #     chntext = self.chntext
+    #     try:
+    #         year, other = chntext.strip().split('年', maxsplit=1)
+    #         year = Digit(chntext=year).digit2chntext() + '年'
+    #     except ValueError:
+    #         other = chntext
+    #         year = ''
+    #     if other:
+    #         try:
+    #             month, day = other.strip().split('月', maxsplit=1)
+    #             month = Cardinal(chntext=month).chntext2cardinal() + '月'
+    #         except ValueError:
+    #             day = chntext
+    #             month = ''
+    #         if day:
+    #             day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
+    #     else:
+    #         month = ''
+    #         day = ''
+    #     date = year + month + day
+    #     self.date = date
+    #     return self.date
+
+    def date2chntext(self):
+        date = self.date
+        try:
+            year, other = date.strip().split('年', 1)
+            year = Digit(digit=year).digit2chntext() + '年'
+        except ValueError:
+            other = date
+            year = ''
+        if other:
+            try:
+                month, day = other.strip().split('月', 1)
+                month = Cardinal(cardinal=month).cardinal2chntext() + '月'
+            except ValueError:
+                day = date
+                month = ''
+            if day:
+                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
+        else:
+            month = ''
+            day = ''
+        chntext = year + month + day
+        self.chntext = chntext
+        return self.chntext
+
+
+class Money:
+    """
+    MONEY class
+    """
+
+    def __init__(self, money=None, chntext=None):
+        self.money = money
+        self.chntext = chntext
+
+    # def chntext2money(self):
+    #     return self.money
+
+    def money2chntext(self):
+        money = self.money
+        pattern = re.compile(r'(\d+(\.\d+)?)')
+        matchers = pattern.findall(money)
+        if matchers:
+            for matcher in matchers:
+                money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext())
+        self.chntext = money
+        return self.chntext
+
+
+class Percentage:
+    """
+    PERCENTAGE class
+    """
+
+    def __init__(self, percentage=None, chntext=None):
+        self.percentage = percentage
+        self.chntext = chntext
+
+    def chntext2percentage(self):
+        return chn2num(self.chntext.strip().strip('百分之')) + '%'
+
+    def percentage2chntext(self):
+        return '百分之' + num2chn(self.percentage.strip().strip('%'))
+
+
+# ================================================================================ #
+# NSW Normalizer
+# ================================================================================ #
+class NSWNormalizer:
+    def __init__(self, raw_text):
+        self.raw_text = '^' + raw_text + '$'
+        self.norm_text = ''
+
+    def _particular(self):
+        text = self.norm_text
+        pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('particular')
+            for matcher in matchers:
+                text = text.replace(matcher[0], matcher[1] + '2' + matcher[2], 1)
+        self.norm_text = text
+        return self.norm_text
+
+    def normalize(self, remove_punc=True):
+        text = self.raw_text
+
+        # normalize dates
+        pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('date')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
+
+        # normalize money amounts
+        pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('money')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
+
+        # normalize landline / mobile phone numbers
+        # mobile
+        # http://www.jihaoba.com/news/show/13680
+        # China Mobile: 139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
+        # China Unicom: 130、131、132、156、155、186、185、176
+        # China Telecom: 133、153、189、180、181、177
+        pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('telephone')
+            for matcher in matchers:
+                text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
+        # landline
+        pattern = re.compile(r"\D((0(10|2[0-9]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('fixed telephone')
+            for matcher in matchers:
+                text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
+
+        # normalize fractions
+        pattern = re.compile(r"(\d+/\d+)")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('fraction')
+            for matcher in matchers:
+                text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
+
+        # normalize percentages
+        text = text.replace('%', '%')
+        pattern = re.compile(r"(\d+(\.\d+)?%)")
+        matchers = pattern.findall(text)
+        if matchers:
+            # print('percentage')
+            for matcher in matchers:
+                text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
+
+        # normalize plain numbers followed by quantifiers
+        pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?"
+ COM_QUANTIFIERS) + matchers = pattern.findall(text) + if matchers: + # print('cardinal+quantifier') + for matcher in matchers: + text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) + + # 规范化小数 + pattern = re.compile(r"(\d+\.\d+)") + matchers = pattern.findall(text) + if matchers: + # print('cardinal') + for matcher in matchers: + text = text.replace(matcher, Cardinal(cardinal=matcher).cardinal2chntext(), 1) + + # 规范化数字编号 + pattern = re.compile(r"(\d{4,32})") + matchers = pattern.findall(text) + if matchers: + # print('digit') + for matcher in matchers: + text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1) + + # 规范化其他数字 + pattern = re.compile(r"(\d+(\.\d+)?)") + matchers = pattern.findall(text) + if matchers: + # print('cardinal') + for matcher in matchers: + text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) + + self.norm_text = text + self._particular() + + text = self.norm_text.lstrip('^').rstrip('$') + if remove_punc: + # Punctuations removal + old_chars = CHINESE_PUNC_LIST + string.punctuation # includes all CN and EN punctuations + new_chars = ' ' * len(old_chars) + del_chars = '' + text = text.translate(str.maketrans(old_chars, new_chars, del_chars)) + return text + + +def nsw_test_case(raw_text): + print('I:' + raw_text) + print('O:' + NSWNormalizer(raw_text).normalize()) + print('') + + +def nsw_test(): + nsw_test_case('固话:0595-23865596或者23880880。') + nsw_test_case('手机:+86 19859213959或者15659451527。') + nsw_test_case('分数:32477/76391。') + nsw_test_case('百分数:80.03%。') + nsw_test_case('编号:31520181154418。') + nsw_test_case('纯数:2983.07克或12345.60米。') + nsw_test_case('日期:1999年2月20日或09年3月15号。') + nsw_test_case('金钱:12块5,34.5元,20.1万, 40多块钱') + nsw_test_case('特殊:O2O或B2C。') + nsw_test_case('3456万吨') + nsw_test_case('2938478321947个') + nsw_test_case('938') + nsw_test_case('今天吃了115个小笼包231个馒头') + nsw_test_case('有62%的概率') + + +if __name__ == '__main__': + # nsw_test() + + p = argparse.ArgumentParser() + p.add_argument('ifile', help='input filename, assume utf-8 encoding') + p.add_argument('ofile', help='output filename') + p.add_argument('--to_upper', action='store_true', help='convert to upper case') + p.add_argument('--to_lower', action='store_true', help='convert to lower case') + p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.") + p.add_argument('--log_interval', type=int, default=10000, help='log interval in number of processed lines') + args = p.parse_args() + + ifile = codecs.open(args.ifile, 'r', 'utf8') + ofile = codecs.open(args.ofile, 'w+', 'utf8') + + n = 0 + for l in ifile: + key = '' + text = '' + if args.has_key: + cols = l.split(maxsplit=1) + key = cols[0] + if len(cols) == 2: + text = cols[1] + else: + text = '' + else: + text = l + + # cases + if args.to_upper and args.to_lower: + sys.stderr.write('text norm: to_upper OR to_lower?') + exit(1) + if args.to_upper: + text = text.upper() + if args.to_lower: + text = text.lower() + + # NSW(Non-Standard-Word) normalization + text = NSWNormalizer(text).normalize() + + # + if args.has_key: + ofile.write(key + '\t' + text) + else: + ofile.write(text) + + n += 1 + if n % args.log_interval == 0: + sys.stderr.write("text norm: {} lines done.\n".format(n)) + + sys.stderr.write("text norm: {} lines done in total.\n".format(n)) + + ifile.close() + ofile.close()
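For orientation, here is a minimal usage sketch of the NSWNormalizer added above; it is an illustration rather than part of the patch, it assumes the repository root is on PYTHONPATH so the module imports as utils.text.text_norm, and the sample sentences are the ones exercised by nsw_test().

    # Minimal sketch: run the NSW (non-standard word) normalizer on a few of the
    # sentences from nsw_test(); assumes the repo root is on PYTHONPATH.
    from utils.text.text_norm import NSWNormalizer

    samples = [
        '固话:0595-23865596或者23880880。',   # landline numbers
        '百分数:80.03%。',                     # percentage
        '日期:1999年2月20日或09年3月15号。',   # dates
        '金钱:12块5,34.5元,20.1万, 40多块钱',  # money amounts
    ]
    for raw in samples:
        # normalize() rewrites digits, dates, money, phone numbers, fractions and
        # percentages into Chinese characters; with remove_punc=True (the default)
        # it also replaces Chinese and English punctuation with spaces.
        print(NSWNormalizer(raw).normalize())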