From 46823e7bce473fc37260c008307e1be407ae5151 Mon Sep 17 00:00:00 2001 From: dogeystamp Date: Fri, 12 Jul 2024 16:59:59 +0800 Subject: [PATCH] [hanzi-flash] single mode + misc modifications - indexing changed - readme updated - output is now stdout --- hanzi-flash/README.md | 74 +++++++++++++++++++++++++++++++++++--- hanzi-flash/hanzi_flash.py | 59 +++++++++++++++++++++++------- 2 files changed, 115 insertions(+), 18 deletions(-) diff --git a/hanzi-flash/README.md b/hanzi-flash/README.md index 17e4da6..c41826f 100644 --- a/hanzi-flash/README.md +++ b/hanzi-flash/README.md @@ -1,10 +1,74 @@ # hanzi-flash -Generate flashcards for a range of frequent hanzi characters. +Generate flashcards in CSV format for a range of frequent hanzi words. -There are multiple sources for words: +This is based on the +[hsk CSV](https://github.com/plaktos/hsk_csv) +repo, including common usage words graded by difficulty. +These form the vocabulary of the HSK (Hanyu Shuiping Kaoshi, the standardized Chinese proficiency exam). -- [hanziDB CSV](https://github.com/ruddfawcett/hanziDB.csv) (purely frequency based) -- [hsk CSV](https://github.com/plaktos/hsk_csv) (common usage words, graded by difficulty) +## usage -Be careful to keep the filenames as `hanzi_db.csv` and `hsk*.csv` because the script tries to recognize these. +This script requires the HSK vocabulary in a CSV file. +The expected format is word, pronunciation in pinyin, and definition. +You may combine all levels into a single file like so: + +``` +git clone https://github.com/plaktos/hsk_csv +cd hsk_csv +cat hsk*.csv > all_hsk.csv +``` + +To use the script, put this `all_hsk.csv` file in the same directory, or pass the path explicitly with the `-i/--input` flag. +CSV output goes to stdout, which can be redirected to a file. +For example, this generates a flashcard deck for the entire HSK vocabulary: + +``` +python hanzi_flash.py -i ./all_hsk.csv > output.csv +``` + +## ranges + +HSK's 6 levels have increasingly large vocabulary. 
+This script can help you divide this into more digestible chunks. +Specify the `-s/--start` and `-e/--end` options to only output a range of words (1-indexed, inclusive). +For example, the first 50 words: + +``` +python hanzi_flash.py -s 1 -e 50 +``` + +Or, the next 50: + +``` +python hanzi_flash.py -s 51 -e 100 +``` + +Once generated, use your flashcard app's merge feature after importing both of these decks. + +## single character mode + +HSK's vocabulary is in words, not in individual characters. +Pass the `-S/--single` flag to break up the words into characters. +The flashcard will have a single character, and the answer will be its pronunciations and example words containing it. +This is intended as a supplement to the regular word flashcard decks. + +Single mode respects the range options above, +and only outputs new, unique characters +that appear first in the given range. +It will also not duplicate flashcards for words that are single characters. + +For example, take the following invocations, with and without single mode: + +``` +$ python hanzi_flash.py -s 17 -e 19 +电脑,diàn nǎo (computer) +电视,diàn shì (television) +电影,diàn yǐng (movie) +$ python hanzi_flash.py -s 17 -e 19 --single +脑,nǎo / 电脑 +视,shì / 电视 +影,yǐng / 电影 +``` + +Single mode only picks out the new characters (电 was learned before the given range `17-19`). diff --git a/hanzi-flash/hanzi_flash.py b/hanzi-flash/hanzi_flash.py index be34340..4f81f16 100644 --- a/hanzi-flash/hanzi_flash.py +++ b/hanzi-flash/hanzi_flash.py @@ -2,31 +2,64 @@ """ Generate flashcards for a range of frequent hanzi characters. -Based on https://github.com/ruddfawcett/hanziDB.csv +See attached README for more information. 
""" import csv import itertools import argparse -import re +import sys from pathlib import Path parser = argparse.ArgumentParser() -parser.add_argument("-s", "--start", default=0, type=int) +parser.add_argument("-s", "--start", default=1, type=int) parser.add_argument("-e", "--end", default=99999999, type=int) -parser.add_argument("-O", "--output", default="hanzi_flash.csv", type=Path) -parser.add_argument("-i", "--input", default="hanzi_db.csv", type=Path) +parser.add_argument("-i", "--input", default="all_hsk.csv", type=Path) +parser.add_argument( + "-S", + "--single", + action="store_true", + help="Output unique single characters instead of words.", +) args = parser.parse_args() -offset = 1 +prev: set[str] = set() +"""Characters from previous single character card decks.""" -fname: str = args.input.stem -if fname.startswith("hsk"): - offset = 0 +single: set[str] = set() +"""Already single characters.""" + +uniq: dict[str, set[str]] = {} +"""Character to words mapping.""" + +prons: dict[str, set[str]] = {} +"""Character to pronunciations mapping.""" with open(args.input) as csv_file: reader = csv.reader(csv_file) - with open(args.output, "w") as outp_file: - writer = csv.writer(outp_file) - for row in itertools.islice(reader, args.start, args.end + 1): - writer.writerow([row[offset], f"{row[offset+1]} ({row[offset+2]})"]) + writer = csv.writer(sys.stdout) + start = 0 if args.single else args.start - 1 + for i, row in enumerate(itertools.islice(reader, start, args.end)): + word, pron, mean = row[:3] + if args.single: + if len(word) > 1: + for sound, char in zip(pron.lower().split(), word): + if i < args.start - 1: + prev.add(char) + elif char not in prev: + if char not in uniq: + uniq[char] = set() + prons[char] = set() + uniq[char].add(word) + prons[char].add(sound) + else: + single.add(word[0]) + else: + writer.writerow([word, f"{pron} ({mean})"]) + + if args.single: + for char in uniq: + if char not in single: + writer.writerow( + [char, f"{', 
'.join(prons[char])} / {' '.join(uniq[char])}"] + )