Skip to content

ZIP文件解压缩并转码工具

本工具是用 Python 编写的一个脚本,用于解压缩 zip 文件,并把其中的文本文件转码为 UTF-8 编码.网上下载的很多压缩包里的文件是 GBK 编码的,在使用 UTF-8 的系统上打开时文本内容都是乱码,非常不方便.因此特地写了这个工具:先解压缩 zip 文件,再把其中 GBK 编码的文本文件转码为 UTF-8 编码.

依赖:

本工具依赖 iconv 和 unar 两个命令行工具,使用前必须事先安装.在 Debian/Ubuntu 上 iconv 通常已随系统自带(由 libc-bin 提供),一般只需安装 unar:

bash
sudo apt install unar

否则无法正常使用.

使用方法:

在压缩文件所在的目录下执行以下命令:

bash
python3 unzipv.py 待解压缩的zip文件.zip

为了方便使用,也可以把脚本复制到 /usr/local/bin 目录下,起一个好记的名字(例如 unzipv),并为它加上可执行权限:

bash
sudo cp unzipv.py /usr/local/bin/unzipv
sudo chmod +x /usr/local/bin/unzipv

然后就可以在任何地方直接使用 unzipv 命令来解压缩和转码文件了.

bash
unzipv 待解压缩的zip文件.zip

代码内容:

直接复制代码,并将代码保存至 unzipv.py 文件中,就可以使用了.

python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import subprocess
import argparse
from pathlib import Path

# Lower-case filename suffixes treated as text, i.e. the only files that are
# considered for GBK -> UTF-8 conversion.
FILE_EXTENSIONS = {
    '.c', '.cpp', '.h', '.hpp',
    '.txt', '.md', '.log',
    '.json', '.xml', '.ini', '.cfg', '.conf',
    '.py', '.java', '.js', '.css', '.html',
}

def is_text_file(filepath):
    """Return True when the suffix of *filepath* is listed in FILE_EXTENSIONS.

    The suffix is compared case-insensitively. Only the name is inspected;
    the file itself is never opened.
    """
    _, suffix = os.path.splitext(filepath)
    return suffix.lower() in FILE_EXTENSIONS

def detect_encoding_with_iconv(filepath):
    """Guess the encoding of *filepath* by trial-decoding it with ``iconv``.

    Each candidate encoding is checked by asking ``iconv`` to convert the
    file from that encoding to UTF-8 with the output discarded; an exit
    status of 0 means the whole file decoded cleanly.

    Returns:
        'UTF-8' if the file is valid UTF-8 (this includes pure ASCII and
        empty files), 'GBK' if it is not valid UTF-8 but decodes as GBK,
        or None if neither decoding succeeds, ``iconv`` is not installed,
        or a probe times out.
    """
    # Order matters: UTF-8 validation is strict, whereas GBK accepts almost
    # any pair of high bytes. Pure ASCII and a lot of multi-byte UTF-8 text
    # are therefore also "valid GBK"; probing GBK first would make the caller
    # re-encode files that are already UTF-8 and turn them into mojibake.
    for candidate in ('UTF-8', 'GBK'):
        try:
            result = subprocess.run(
                ['iconv', '-f', candidate, '-t', 'UTF-8', '-o', '/dev/null', filepath],
                capture_output=True,
                timeout=5
            )
        except (subprocess.TimeoutExpired, FileNotFoundError):
            # Best effort: a failed probe just means "not this encoding".
            continue
        if result.returncode == 0:
            return candidate

    return None

def convert_file_encoding(filepath):
    """Convert *filepath* from GBK to UTF-8 in place when it is detected as GBK.

    The encoding is determined with detect_encoding_with_iconv(). A GBK file
    is converted by ``iconv`` into a scratch file in the same directory,
    which then replaces the original. Files detected as UTF-8, or whose
    encoding could not be determined, are left untouched.

    Returns:
        True if the file was converted, False if the conversion was
        attempted but failed, None if the file was skipped.
    """
    # Function-scope imports keep this block self-contained.
    import shutil
    import tempfile

    encoding = detect_encoding_with_iconv(filepath)

    if encoding == 'UTF-8':
        print(f"Skipping (already UTF-8): {filepath}")
        return None
    if encoding != 'GBK':
        print(f"Skipping (unknown encoding or binary): {filepath}")
        return None

    print(f"Converting: {filepath}")
    tmp_path = None
    try:
        # A unique scratch file in the same directory: a fixed
        # "<name>.tmp" could overwrite (and later delete) a real file of
        # that name shipped in the archive, and staying on the same
        # filesystem keeps os.replace() a plain rename.
        fd, tmp_path = tempfile.mkstemp(
            prefix='.unzipv-',
            suffix='.tmp',
            dir=os.path.dirname(filepath) or '.'
        )
        os.close(fd)

        result = subprocess.run(
            ['iconv', '-f', 'GBK', '-t', 'UTF-8', filepath, '-o', tmp_path],
            capture_output=True,
            timeout=30
        )
        if result.returncode != 0:
            print("  -> Conversion failed")
            return False

        # The scratch file was created with restrictive permissions; carry
        # over the original mode bits (e.g. the executable bit of scripts).
        shutil.copymode(filepath, tmp_path)
        os.replace(tmp_path, filepath)
        tmp_path = None  # Consumed by the rename; nothing left to clean up.
        print("  -> Converted successfully")
        return True
    except (OSError, subprocess.SubprocessError) as e:
        print(f"  -> Error: {e}")
        return False
    finally:
        # Remove the scratch file on every failure path.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

def process_directory(directory, existed_files=None):
    """Walk *directory* and convert every eligible text file to UTF-8.

    A file is eligible when its path is not in *existed_files* and
    is_text_file() accepts it; each eligible file is handed to
    convert_file_encoding().

    Args:
        directory: Root of the tree to walk.
        existed_files: Optional collection of file paths to leave alone.
            Paths are compared as produced by os.path.join() on the
            os.walk() results.

    Returns:
        A (converted, skipped, errors) tuple of counts, one per outcome of
        convert_file_encoding(): True, None and False respectively.
    """
    excluded = set() if existed_files is None else existed_files

    n_converted = n_skipped = n_failed = 0

    for current_dir, _subdirs, names in os.walk(directory):
        for name in names:
            path = os.path.join(current_dir, name)

            # Guard clauses: ignore excluded paths and non-text names.
            if path in excluded:
                continue
            if not is_text_file(path):
                continue

            outcome = convert_file_encoding(path)
            if outcome is True:
                n_converted += 1
            elif outcome is False:
                n_failed += 1
            else:
                n_skipped += 1

    return n_converted, n_skipped, n_failed

def extract_zip_with_unar(zip_path, extract_dir):
    """Extract the archive *zip_path* into *extract_dir* using ``unar``.

    ``unar`` is invoked as ``unar -o <extract_dir> -f <zip_path>`` with a
    300 second timeout. Progress and error messages are printed to stdout.

    Returns:
        True if ``unar`` exited with status 0. False if it exited with a
        non-zero status, is not installed, or running it raised any other
        exception.
    """
    print(f"Extracting: {zip_path}")
    print(f"Destination: {extract_dir}")
    command = ['unar', '-o', extract_dir, '-f', zip_path]
    try:
        completed = subprocess.run(command, capture_output=True, timeout=300)
        if completed.returncode != 0:
            error_text = completed.stderr.decode('utf-8', errors='ignore')
            print(f"Extraction failed: {error_text}")
            return False
        print("Extracted successfully")
        return True
    except FileNotFoundError:
        # The 'unar' executable is missing; tell the user how to get it.
        for hint in (
            "Error: 'unar' command not found. Please install unar first.",
            "  Ubuntu/Debian: sudo apt install unar",
            "  macOS: brew install unar",
        ):
            print(hint)
        return False
    except Exception as e:
        print(f"Extraction error: {e}")
        return False

def get_all_files(directory):
    """Return the set of paths of all files found under *directory*.

    The tree is walked recursively with os.walk(); each path is the walked
    directory joined with the file name. Directories themselves are not
    included. A directory that cannot be walked yields an empty set.
    """
    return {
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    }

def main():
    """Command-line entry point: extract an archive, then convert GBK text to UTF-8.

    Command line:
        zip_file          Path to the archive to extract.
        -d, --directory   Destination directory (default: current directory);
                          created if it does not exist.

    Files that were already present under the destination before the
    extraction are recorded first and excluded from conversion. A summary
    of converted / skipped / failed files is printed at the end.

    Exits with status 1 if the archive does not exist or extraction fails.
    """
    parser = argparse.ArgumentParser(
        description='Extract zip file and convert GBK text files to UTF-8'
    )
    parser.add_argument('zip_file', help='Path to the zip file')
    parser.add_argument('-d', '--directory', default='.', help='Destination directory (default: current directory)')
    args = parser.parse_args()

    if not os.path.exists(args.zip_file):
        print(f"Error: File not found: {args.zip_file}")
        sys.exit(1)

    extract_dir = os.path.abspath(args.directory)

    if not os.path.exists(extract_dir):
        os.makedirs(extract_dir, exist_ok=True)

    # Snapshot taken BEFORE extraction so that unrelated files already in
    # the destination are never touched by the conversion step.
    existed_files = get_all_files(extract_dir)

    if not extract_zip_with_unar(args.zip_file, extract_dir):
        sys.exit(1)

    print()
    print("=" * 60)
    print("Converting GBK files to UTF-8...")
    print("=" * 60)
    print()

    # Walk the whole destination rather than "the first subdirectory found":
    # os.listdir() order is arbitrary, so that choice could pick an unrelated
    # pre-existing directory and would miss files extracted at the top level
    # or into any other directory. The existed_files snapshot already limits
    # the conversion to newly created files.
    # NOTE: a pre-existing file that the extraction overwrote is still in the
    # snapshot and is therefore skipped.
    converted, skipped, errors = process_directory(extract_dir, existed_files)

    print()
    print("=" * 60)
    print("Summary:")
    print(f"  Converted: {converted}")
    print(f"  Skipped:   {skipped}")
    print(f"  Errors:    {errors}")
    print("=" * 60)

if __name__ == '__main__':
    main()