diff --git a/dictionary/sqlite/create_pinyin_db_schema.py b/dictionary/sqlite/create_pinyin_db_schema.py
new file mode 100644
index 0000000..6e7d19f
--- /dev/null
+++ b/dictionary/sqlite/create_pinyin_db_schema.py
@@ -0,0 +1,50 @@
+"""Create the SQLite schema for the pinyin dictionary.
+
+Creates ``pinyin_data.sqlite3`` in the current working directory (if it does
+not already exist) with the ``pinyin_data`` table and its two indexes.  Safe
+to re-run: every statement uses ``IF NOT EXISTS``.
+"""
+import sqlite3
+from contextlib import closing
+
+# Path to the SQLite database file, relative to the current directory
+DB_FILE = 'pinyin_data.sqlite3'
+
+# SQL statement to create the pinyin_data table
+# hz    hanzi
+# py    pinyin
+# abbr  abbreviation
+# freq  frequency
+CREATE_TABLE_SQL = """
+CREATE TABLE IF NOT EXISTS pinyin_data (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    hz TEXT NOT NULL,
+    py TEXT NOT NULL,
+    abbr TEXT NOT NULL,
+    freq REAL NOT NULL
+);
+"""
+
+# SQL statements to create indexes on the pinyin and abbreviation columns
+CREATE_INDEX_SQL = (
+    "CREATE INDEX IF NOT EXISTS idx_pinyin ON pinyin_data(py);",
+    "CREATE INDEX IF NOT EXISTS idx_abbr ON pinyin_data(abbr);",
+)
+
+
+def main():
+    """Create the database file, the table, and both indexes."""
+    # sqlite3.connect() creates the file if it doesn't exist; closing()
+    # releases the connection even when a statement raises.
+    with closing(sqlite3.connect(DB_FILE)) as conn:
+        # "with conn" commits on success and rolls back on error.
+        with conn:
+            conn.execute(CREATE_TABLE_SQL)
+            for statement in CREATE_INDEX_SQL:
+                conn.execute(statement)
+
+    print("Database, table, and indexes created successfully.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dictionary/sqlite/insert_pinyin_to_sqlite_db.py b/dictionary/sqlite/insert_pinyin_to_sqlite_db.py
new file mode 100644
index 0000000..6579a59
--- /dev/null
+++ b/dictionary/sqlite/insert_pinyin_to_sqlite_db.py
@@ -0,0 +1,60 @@
+"""Load the Google pinyin raw dictionary into the pinyin_data table.
+
+Reads ``../google_pinyin_rawdict_utf16_65105_freq.txt`` (UTF-16, one
+space-separated entry per line) and inserts every entry whose third field
+is ``0`` into ``pinyin_data.sqlite3``.  Both paths are relative to the
+current working directory.  The table must already exist, and rows are
+appended on every run, so running the script twice duplicates the data.
+"""
+import sqlite3
+from contextlib import closing
+
+DB_FILE = 'pinyin_data.sqlite3'
+RAW_DICT_FILE = '../google_pinyin_rawdict_utf16_65105_freq.txt'
+
+INSERT_SQL = 'INSERT INTO pinyin_data (hz, py, abbr, freq) VALUES (?, ?, ?, ?)'
+
+
+def parse_line(line):
+    """Turn one raw dictionary line into an ``(hz, py, abbr, freq)`` row.
+
+    Returns ``None`` for lines that must not be inserted: blank or truncated
+    lines, and entries whose third field is not ``'0'``.  Raises
+    ``ValueError`` if the score field is not a number.
+    """
+    # split() with no argument drops surrounding whitespace and collapses
+    # runs of spaces, so no empty token can reach the abbreviation step.
+    fields = line.split()
+    # Expected layout: phrase, score, '0', then one or more pinyin syllables.
+    if len(fields) < 4 or fields[2] != '0':
+        return None
+
+    phrase, score = fields[0], fields[1]
+    syllables = fields[3:]
+    pinyin = ''.join(syllables)
+    abbreviation = ''.join(syllable[0] for syllable in syllables)
+    return phrase, pinyin, abbreviation, float(score)
+
+
+def load_rows(path):
+    """Read the dictionary file at *path* and return the rows to insert."""
+    with open(path, 'r', encoding='utf-16') as file:
+        return [row for row in map(parse_line, file) if row is not None]
+
+
+def main():
+    """Insert every accepted dictionary entry in a single transaction."""
+    rows = load_rows(RAW_DICT_FILE)
+
+    # closing() releases the connection even when the insert raises.
+    with closing(sqlite3.connect(DB_FILE)) as conn:
+        # "with conn" commits on success and rolls back on error, so a
+        # failed run never leaves a partial import behind.
+        with conn:
+            conn.executemany(INSERT_SQL, rows)
+
+    print("Inserted", len(rows), "rows into pinyin_data.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dictionary/sqlite/pinyin_data.sqlite3 b/dictionary/sqlite/pinyin_data.sqlite3
new file mode 100644
index 0000000..636804e
Binary files /dev/null and b/dictionary/sqlite/pinyin_data.sqlite3 differ