# Copyright (c) 2025 SparkAudio
# 2025 Xinsheng Wang (w.xinshawn@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Description:
This script contains a collection of functions designed to handle various
file reading and writing operations. It provides utilities to read from files,
write data to files, and perform file manipulation tasks.
"""
import os
import json
import csv
from tqdm import tqdm
from typing import List, Dict, Any, Set, Union
from pathlib import Path
from omegaconf import OmegaConf, DictConfig
def resolve_symbolic_link(symbolic_link_path: Path) -> Path:
"""
    Resolves the target of a symbolic link.
    Args:
        symbolic_link_path (Path): The path to the symbolic link.
    Returns:
        Path: The path the symbolic link points to, resolved relative to the link's directory.
    """
    link_directory = os.path.dirname(symbolic_link_path)
    target_path_relative = os.readlink(symbolic_link_path)
    return Path(os.path.join(link_directory, target_path_relative))
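# Example usage (illustrative sketch; "data/link_to_wav" is a hypothetical symlink whose
# stored target is the relative path "../audio/utt_0001.wav"):
#   resolve_symbolic_link(Path("data/link_to_wav"))
#   # -> Path("data/../audio/utt_0001.wav")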
def write_jsonl(metadata: List[dict], file_path: Path) -> None:
"""Writes a list of dictionaries to a JSONL file.
Args:
metadata : List[dict]
A list of dictionaries, each representing a piece of meta.
file_path : Path
The file path to save the JSONL file
This function writes each dictionary in the list to a new line in the specified file.
"""
with open(file_path, "w", encoding="utf-8") as f:
for meta in tqdm(metadata, desc="writing jsonl"):
# Convert dictionary to JSON string and write it to the file with a newline
json_str = json.dumps(meta, ensure_ascii=False) + "\n"
f.write(json_str)
print(f"jsonl saved to {file_path}")
def read_jsonl(file_path: Path) -> List[dict]:
"""
Reads a JSONL file and returns a list of dictionaries.
    Args:
        file_path (Path): The path to the JSONL file to be read.
    Returns:
        List[dict]: A list of dictionaries parsed from each line of the JSONL file.
"""
metadata = []
# Open the file for reading
with open(file_path, "r", encoding="utf-8") as f:
# Split the file into lines
lines = f.read().splitlines()
# Process each line
for line in lines:
# Convert JSON string back to dictionary and append to list
meta = json.loads(line)
metadata.append(meta)
# Return the list of metadata
return metadata
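# Example usage (illustrative sketch; assumes a file in the format produced by write_jsonl above):
#   records = read_jsonl(Path("metadata.jsonl"))
#   print(records[0]["index"])  # -> "utt_0001"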
def read_json_as_jsonl(file_path: Path) -> List[dict]:
    """
    Reads a JSON file that maps keys to metadata dictionaries and flattens it into a list.
    Args:
        file_path (Path): The path to the JSON file to be read.
    Returns:
        List[dict]: One dictionary per key (sorted), each augmented with an 'index' field
            holding the original key.
    """
    metadata = []
with open(file_path, 'r', encoding='utf-8') as infile:
data = json.load(infile)
for k in sorted(data.keys()):
meta = {'index': k}
meta.update(data[k])
metadata.append(meta)
return metadata
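# Example usage (illustrative sketch; "metadata.json" is a hypothetical file containing
# {"utt_0001": {"text": "hello"}, "utt_0002": {"text": "world"}}):
#   records = read_json_as_jsonl(Path("metadata.json"))
#   # -> [{"index": "utt_0001", "text": "hello"}, {"index": "utt_0002", "text": "world"}]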
def decode_unicode_strings(meta: Dict[str, Any]) -> Dict[str, Any]:
    """
    Decodes escaped unicode sequences (e.g. '\\u4f60') in all string values of a metadata dict.
    Args:
        meta (Dict[str, Any]): The metadata dictionary to process.
    Returns:
        Dict[str, Any]: A copy of the dictionary with string values unescaped;
            non-string values are left unchanged.
    """
    processed_meta = {}
for k, v in meta.items():
if isinstance(v, str):
processed_meta[k] = v.encode("utf-8").decode("unicode_escape")
else:
processed_meta[k] = v
return processed_meta
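# Example usage (illustrative sketch; the value below is a hypothetical escaped string):
#   decode_unicode_strings({"text": "\\u4f60\\u597d", "score": 1})
#   # -> {"text": "你好", "score": 1}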
def load_config(config_path: Path) -> DictConfig:
"""Loads a configuration file and optionally merges it with a base configuration.
Args:
config_path (Path): Path to the configuration file.
"""
# Load the initial configuration from the given path
config = OmegaConf.load(config_path)
# Check if there is a base configuration specified and merge if necessary
if config.get("base_config", None) is not None:
base_config = OmegaConf.load(config["base_config"])
config = OmegaConf.merge(base_config, config)
return config
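# Example usage (illustrative sketch; the YAML file names and keys are hypothetical):
#   base.yaml:    batch_size: 8
#                 lr: 0.0001
#   config.yaml:  base_config: base.yaml
#                 batch_size: 16
#   cfg = load_config(Path("config.yaml"))
#   # -> cfg.batch_size == 16 (the child config overrides the base), cfg.lr == 0.0001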
def jsonl_to_csv(jsonl_file_path: str, csv_file_path: str) -> None:
"""
Converts a JSONL file to a CSV file.
This function reads a JSONL file, determines all unique keys present in the file,
and writes the data to a CSV file with columns for all these keys.
"""
all_keys = set()
data_rows = []
# Read the JSONL file once to extract keys and collect data
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
for line in file:
data = json.loads(line.strip())
data_rows.append(data)
all_keys.update(data.keys())
# Convert the set of keys to a sorted list for consistent column order
sorted_keys = sorted(all_keys)
# Write the data to a CSV file
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=sorted_keys)
# Write the header row
writer.writeheader()
# Write each row of data
for data in data_rows:
writer.writerow(data)
print(f"CSV file has been created at {csv_file_path}")
def save_metadata(data, filename, headers=None):
"""
Save metadata to a file.
Args:
data (list of dict): Metadata to be saved.
filename (str): Name of the file to save the metadata.
        headers (list of str, optional): The order of column names to save; defaults to
            the keys of the first dictionary in data if not provided.
"""
# Set headers to keys from the first dictionary in data if not explicitly provided
if headers is None:
headers = list(data[0].keys())
with open(filename, "w", encoding="utf-8") as file:
# Write the headers to the file
file.write("|".join(headers) + "\n")
for entry in data:
# Retrieve values in the order of headers, replacing any '|' characters with a space to prevent formatting errors
formatted_values = [str(entry.get(key, "")).replace("|", " ") for key in headers]
# Write the formatted values to the file
file.write("|".join(formatted_values) + "\n")
def read_metadata(filename, headers=None):
"""
Read metadata from a file.
Args:
filename (str): The file from which to read the metadata.
Returns:
list of dict: The metadata read from the file.
list of str: The headers used in the file.
"""
with open(filename, "r", encoding="utf-8") as file:
lines = file.readlines()
data = []
# Set headers from the first line of the file if not provided
if headers is None:
headers = lines[0].strip().split("|")
lines = lines[1:]
for line in lines:
line = line.strip()
# Skip empty lines
if not line:
continue
# Split the line by '|' and pair with headers to form a dictionary
entry_data = dict(zip(headers, line.split("|")))
data.append(entry_data)
return data, headers
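# Example usage (illustrative sketch; reads the pipe-delimited format written by save_metadata above):
#   rows, headers = read_metadata("metadata.txt")
#   # -> headers == ["index", "text"]
#   #    rows == [{"index": "utt_0001", "text": "hello world"}]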