ZIP文件解压缩并转码工具
本工具是用 Python 编写的一个脚本,主要用于解压缩 zip 文件,并将其中的文本文件转码为 UTF-8 编码.因为我们在网上下载下来的很多文件都是 GBK 编码的,解压后文本内容都是乱码,非常不方便,所以特此写一个工具来解压缩 zip 文件,并转码为 UTF-8 编码.
依赖:
本工具依赖 iconv 和 unar 两个工具,请事先自行安装:
```bash
sudo apt install iconv unar
```
否则无法正常使用.
使用方法:
在压缩文件所在的目录下执行以下命令:
```bash
python unzipv.py 待解压缩的zip文件.zip
```
为了方便使用,也可以将文件放在 /usr/local/bin 目录下,并为这个文件起一个好记的名字,然后执行以下命令:
```bash
sudo cp unzipv.py /usr/local/bin/unzipv
sudo chmod +x /usr/local/bin/unzipv
```
然后就可以在任何地方直接使用 unzipv 命令来解压缩和转码文件了.
```bash
unzipv 待解压缩的zip文件.zip
```
代码内容:
直接复制代码,并将代码保存至 unzipv.py 文件中,就可以使用了.
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import subprocess
import argparse
from pathlib import Path
# Lower-case filename suffixes that mark a file as a candidate for conversion.
FILE_EXTENSIONS = {
    '.c', '.cpp', '.h', '.hpp',
    '.java', '.js', '.py',
    '.css', '.html', '.json', '.xml',
    '.md', '.txt',
    '.cfg', '.conf', '.ini', '.log',
}


def is_text_file(filepath):
    """Return True when the suffix of *filepath* is listed in FILE_EXTENSIONS.

    Only the name is inspected (case-insensitively); the file is never opened.
    """
    _, suffix = os.path.splitext(filepath)
    return suffix.lower() in FILE_EXTENSIONS
def detect_encoding_with_iconv(filepath):
    """Guess the encoding of *filepath* by trial-decoding it with ``iconv``.

    Each candidate encoding is tried by asking ``iconv`` to convert the file
    to UTF-8 and discard the output; an exit status of 0 means the file
    decoded cleanly in that encoding.

    Args:
        filepath: Path of the file to probe.

    Returns:
        ``'UTF-8'`` or ``'GBK'`` for the first candidate that decodes without
        error, or ``None`` when neither does, when ``iconv`` is not installed,
        or when every probe times out.
    """
    # UTF-8 has to be probed first. Plain ASCII and most multi-byte UTF-8
    # text also decode without error as GBK, so probing GBK first would
    # report files that are already UTF-8 as GBK and the caller would then
    # re-encode them into mojibake.
    for candidate in ('UTF-8', 'GBK'):
        try:
            probe = subprocess.run(
                ['iconv', '-f', candidate, '-t', 'UTF-8',
                 '-o', '/dev/null', filepath],
                capture_output=True,
                timeout=5,
            )
        except (subprocess.TimeoutExpired, FileNotFoundError):
            # Probe hung or iconv is missing: fall through to the next one.
            continue
        if probe.returncode == 0:
            return candidate
    return None
def convert_file_encoding(filepath):
    """Convert *filepath* from GBK to UTF-8 in place when it is detected as GBK.

    The converted text is written to a uniquely named temporary file in the
    same directory and then moved over the original, so a failed conversion
    never leaves a half-written file behind.

    Args:
        filepath: Path of the file to convert.

    Returns:
        ``True`` if the file was converted, ``False`` if the conversion was
        attempted but failed, and ``None`` if the file was skipped (already
        UTF-8, or its encoding could not be determined).
    """
    import shutil
    import tempfile

    encoding = detect_encoding_with_iconv(filepath)
    if encoding == 'UTF-8':
        print(f"Skipping (already UTF-8): {filepath}")
        return None
    if encoding != 'GBK':
        print(f"Skipping (unknown encoding or binary): {filepath}")
        return None

    print(f"Converting: {filepath}")
    tmp_path = None
    try:
        # A unique scratch file avoids clobbering an unrelated sibling that
        # happens to be called "<name>.tmp". It lives in the same directory
        # so the final os.replace() stays on one filesystem.
        fd, tmp_path = tempfile.mkstemp(
            prefix='.unzipv-',
            suffix='.tmp',
            dir=os.path.dirname(filepath) or '.',
        )
        os.close(fd)

        result = subprocess.run(
            ['iconv', '-f', 'GBK', '-t', 'UTF-8', filepath, '-o', tmp_path],
            capture_output=True,
            timeout=30,
        )
        if result.returncode != 0:
            print(" -> Conversion failed")
            return False

        # Carry the original permission bits over to the replacement file.
        shutil.copymode(filepath, tmp_path)
        os.replace(tmp_path, filepath)
        tmp_path = None  # Moved into place; nothing left to clean up.
        print(" -> Converted successfully")
        return True
    except Exception as e:
        print(f" -> Error: {e}")
        return False
    finally:
        # Remove the scratch file on every failure path.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
def process_directory(directory, existed_files=None):
    """Walk *directory* and convert every eligible text file found in it.

    Args:
        directory: Root of the tree to walk.
        existed_files: Optional collection of file paths to leave untouched.
            Paths are compared as the strings produced by joining the walked
            directory with the file name.

    Returns:
        A ``(converted, skipped, errors)`` tuple of counts. Files listed in
        *existed_files* and files that ``is_text_file`` rejects are not
        counted at all.
    """
    untouchable = set() if existed_files is None else existed_files
    converted = skipped = failed = 0

    for parent, _subdirs, names in os.walk(directory):
        for name in names:
            path = os.path.join(parent, name)
            if path in untouchable or not is_text_file(path):
                continue
            outcome = convert_file_encoding(path)
            if outcome is True:
                converted += 1
            elif outcome is False:
                failed += 1
            else:
                skipped += 1

    return converted, skipped, failed
def extract_zip_with_unar(zip_path, extract_dir):
    """Unpack *zip_path* into *extract_dir* by running the ``unar`` command.

    ``unar`` is invoked with ``-o <extract_dir>`` and ``-f`` and is given up
    to 300 seconds to finish. Progress and failures are reported on stdout.

    Returns:
        ``True`` when ``unar`` exits with status 0, otherwise ``False``
        (including when ``unar`` is not installed or any other error occurs).
    """
    print(f"Extracting: {zip_path}")
    print(f"Destination: {extract_dir}")

    command = ['unar', '-o', extract_dir, '-f', zip_path]
    try:
        completed = subprocess.run(command, capture_output=True, timeout=300)
        if completed.returncode != 0:
            print(f"Extraction failed: {completed.stderr.decode('utf-8', errors='ignore')}")
            return False
        print("Extracted successfully")
        return True
    except FileNotFoundError:
        # The unar executable itself could not be launched.
        print("Error: 'unar' command not found. Please install unar first.")
        print(" Ubuntu/Debian: sudo apt install unar")
        print(" macOS: brew install unar")
        return False
    except Exception as e:
        print(f"Extraction error: {e}")
        return False
def get_all_files(directory):
    """Return the set of paths of every file found beneath *directory*.

    Each path is the walked directory joined with the file name, so the
    paths are absolute exactly when *directory* is. A missing directory
    yields an empty set.
    """
    return {
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    }
def main():
    """Command-line entry point: extract an archive, then convert its text files.

    Parses the archive path and the optional ``-d/--directory`` destination,
    extracts the archive with ``unar``, converts the newly extracted GBK text
    files to UTF-8 and prints a summary. Exits with status 1 when the archive
    does not exist or extraction fails.
    """
    parser = argparse.ArgumentParser(
        description='Extract zip file and convert GBK text files to UTF-8'
    )
    parser.add_argument('zip_file', help='Path to the zip file')
    parser.add_argument(
        '-d', '--directory', default='.',
        help='Destination directory (default: current directory)',
    )
    args = parser.parse_args()

    if not os.path.exists(args.zip_file):
        print(f"Error: File not found: {args.zip_file}")
        sys.exit(1)

    extract_dir = os.path.abspath(args.directory)
    if not os.path.exists(extract_dir):
        os.makedirs(extract_dir, exist_ok=True)

    # Snapshot what is already in the destination so that only files created
    # by this extraction are converted afterwards.
    # NOTE: unar runs with -f, so a pre-existing file that the archive
    # overwrites is still in this snapshot and will not be converted.
    existed_files = get_all_files(extract_dir)

    if not extract_zip_with_unar(args.zip_file, extract_dir):
        sys.exit(1)

    print()
    print("=" * 60)
    print("Converting GBK files to UTF-8...")
    print("=" * 60)
    print()

    # Walk the whole destination. Picking the first sub-directory returned by
    # os.listdir() could select an unrelated, pre-existing folder and would
    # miss files extracted to the destination root or to other directories.
    converted, skipped, errors = process_directory(extract_dir, existed_files)

    print()
    print("=" * 60)
    print("Summary:")
    print(f" Converted: {converted}")
    print(f" Skipped: {skipped}")
    print(f" Errors: {errors}")
    print("=" * 60)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()