[hanzi-flash] Support HSK CSV word lists as input

2024-06-14 16:04:55 -04:00 · 2024-06-14 16:04:55 -04:00 · 7a5f5f1dc0
commit 7a5f5f1dc0
parent 4cdfbd93a2
2 changed files with 14 additions and 2 deletions
--- a/hanzi-flash/README.md
+++ b/hanzi-flash/README.md
@ -2,4 +2,9 @@

 Generate flashcards for a range of frequent hanzi characters.

-You need the [hanziDB CSV](https://github.com/ruddfawcett/hanziDB.csv) file for this to work.
+There are multiple sources for words:
+
+- [hanziDB CSV](https://github.com/ruddfawcett/hanziDB.csv) (purely frequency based)
+- [HSK CSV](https://github.com/plaktos/hsk_csv) (common usage words, graded by difficulty)
+
+Keep the filenames as `hanzi_db.csv` and `hsk*.csv`: the script checks whether the input filename starts with `hsk` to decide which columns to read.
--- a/hanzi-flash/hanzi_flash.py
+++ b/hanzi-flash/hanzi_flash.py
@ -8,6 +8,7 @@ Based on https://github.com/ruddfawcett/hanziDB.csv
 import csv
 import itertools
 import argparse
+import re
 from pathlib import Path

 parser = argparse.ArgumentParser()
@ -17,9 +18,15 @@ parser.add_argument("-O", "--output", default="hanzi_flash.csv", type=Path)
 parser.add_argument("-i", "--input", default="hanzi_db.csv", type=Path)
 args = parser.parse_args()

+offset = 1
+
+fname: str = args.input.stem
+if fname.startswith("hsk"):
+    offset = 0
+
 with open(args.input) as csv_file:
    reader = csv.reader(csv_file)
    with open(args.output, "w") as outp_file:
        writer = csv.writer(outp_file)
        for row in itertools.islice(reader, args.start, args.end + 1):
-            writer.writerow([row[1], f"{row[2]} ({row[3]})"])
+            writer.writerow([row[offset], f"{row[offset+1]} ({row[offset+2]})"])