feat: 校验编码

This commit is contained in:
起个名字好难的喵
2025-07-26 10:30:48 +08:00
parent 37b859e26a
commit a24eb4d2ee
8 changed files with 599 additions and 534 deletions

View File

@@ -4,6 +4,7 @@ import json
import os
import subprocess
import re
import chardet
from packaging.version import parse
from semver import VersionInfo
@@ -414,6 +415,68 @@ def check_position_ids(positions):
return validation_issues, corrections
# ==================== 验证修复编码 ====================
def detect_encoding(file_path, read_size=2048):
    """Guess a file's text encoding from its leading bytes.

    Args:
        file_path: Path of the file to sample.
        read_size: Number of bytes read from the start of the file and
            passed to ``chardet.detect``.

    Returns:
        An ``(encoding, confidence)`` tuple taken from the chardet result,
        or ``(None, 0)`` when the file cannot be read or detection fails.
    """
    try:
        with open(file_path, 'rb') as f:
            raw = f.read(read_size)
        result = chardet.detect(raw)
        return result['encoding'], result['confidence']
    except Exception:
        # Best-effort: any read/detection failure is reported as "unknown".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so the scan can still be interrupted.
        return None, 0
def fix_encoding_name(enc, file_path=None):
    """Map a detected encoding name to the codec used for decoding.

    Args:
        enc: Encoding name as reported by the detector; may be ``None`` or
            empty.
        file_path: Path of the file being examined. Only used when ``enc``
            is ASCII, to check whether the whole file is valid UTF-8.

    Returns:
        ``None`` when ``enc`` is empty; ``'utf-8'`` when ``enc`` is ASCII
        and the entire file decodes as UTF-8; ``'gb18030'`` when ``enc`` is
        ASCII but the file cannot be read or is not valid UTF-8, or when
        ``enc`` is one of gb2312 / gbk / windows-1252 / iso-8859-1 /
        gb18030; otherwise the lower-cased ``enc``.
    """
    if not enc:
        return None
    enc = enc.lower()
    if enc == 'ascii':
        # Detection only saw a prefix of the file, so an "ascii" verdict
        # says nothing about later bytes: probe the full content.
        try:
            with open(file_path, 'rb') as f:
                f.read().decode('utf-8')
        except (OSError, TypeError, ValueError):
            # OSError: unreadable file; TypeError: file_path is None;
            # ValueError: covers UnicodeDecodeError (not valid UTF-8).
            return 'gb18030'
        return 'utf-8'
    if enc in ('gb2312', 'gbk', 'windows-1252', 'iso-8859-1', 'gb18030'):
        return 'gb18030'
    return enc
def convert_to_utf8(file_path, original_encoding):
    """Rewrite a file in place as UTF-8.

    The file is decoded with the codec chosen by ``fix_encoding_name`` and
    written back to the same path encoded as UTF-8. Bytes that cannot be
    decoded are substituted (``errors='replace'``), so the conversion can
    be lossy. Success or failure is printed; exceptions are not raised to
    the caller.

    Args:
        file_path: Path of the file to convert.
        original_encoding: Encoding name reported by detection.
    """
    try:
        encoding = fix_encoding_name(original_encoding, file_path)
        # newline='' turns off newline translation on both read and write,
        # so the file's existing line endings survive the re-encoding
        # instead of being rewritten to the platform default.
        with open(file_path, 'r', encoding=encoding, errors='replace', newline='') as f:
            content = f.read()
        with open(file_path, 'w', encoding='utf-8', newline='') as f:
            f.write(content)
        # Separate the detected name from the codec actually used; printed
        # back-to-back they were unreadable (e.g. "gbkgb18030").
        print(f"[✔] Converted to UTF-8: {file_path} (from {original_encoding} -> {encoding})")
    except Exception as e:
        print(f"[✖] Failed to convert: {file_path} | Error: {e}")
def process_file(file_path, target_extensions=None):
    """Convert one file to UTF-8 when its detected encoding is not UTF-8.

    Args:
        file_path: Path of the file to inspect.
        target_extensions: Optional iterable of suffixes; when non-empty,
            files whose lower-cased path ends with none of them are
            ignored.

    The file is left alone when detection returns no encoding or a
    confidence below 0.7 (a warning is printed), or when the detected
    encoding is already ``utf-8``.
    """
    if target_extensions:
        for suffix in target_extensions:
            if file_path.lower().endswith(suffix):
                break
        else:
            # No suffix matched: this file is outside the requested set.
            return

    encoding, confidence = detect_encoding(file_path)
    undetermined = encoding is None or confidence < 0.7
    if undetermined:
        print(f"[⚠️] Unknown encoding: {file_path} | Detected: {encoding}, Conf: {confidence:.2f}")
        return

    # Files already reported as UTF-8 need no conversion.
    if encoding.lower() != 'utf-8':
        convert_to_utf8(file_path, encoding)
def scan_and_convert(path, target_extensions=None):
    """Run ``process_file`` on a single file or on every file under a directory.

    Args:
        path: A file path or a directory path. Directories are walked
            recursively with ``os.walk``.
        target_extensions: Passed through unchanged to ``process_file``.

    Prints a "not found" message when ``path`` is neither a file nor a
    directory.
    """
    if os.path.isfile(path):
        process_file(path, target_extensions)
        return

    if not os.path.isdir(path):
        print(f"❌ Path not found: {path}")
        return

    for root, _subdirs, names in os.walk(path):
        for name in names:
            process_file(os.path.join(root, name), target_extensions)
# ==================== 主验证逻辑 ====================
def initialize_data(data, file_path):
@@ -614,6 +677,7 @@ def main():
all_notices = [] # 初始化 all_notices 变量
if os.path.isfile(path) and path.endswith('.json'):
scan_and_convert(path)
# print(f"\n🔍 校验文件: {path}")
notices = validate_file(path, auto_fix)
if notices:
@@ -629,6 +693,7 @@ def main():
if file.endswith('.json'):
file_path = os.path.join(root, file)
print(f"\n🔍 校验文件: {file_path}")
scan_and_convert(file_path)
notices = validate_file(file_path, auto_fix)
if notices:
all_notices.extend([f"{file_path}: {n}" for n in notices])