# Copyright (c) 2025 SparkAudio
# 2025 Xinsheng Wang (w.xinshawn@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Description:
This script contains a collection of functions designed to handle various
file reading and writing operations. It provides utilities to read from files,
write data to files, and perform file manipulation tasks.
"""
import os
import json
import csv
from tqdm import tqdm
from typing import List, Dict, Any, Set, Union
from pathlib import Path
from omegaconf import OmegaConf, DictConfig
def resolve_symbolic_link(symbolic_link_path: Path) -> Path:
"""
    Resolves the target of a symbolic link.
    Args:
        symbolic_link_path (Path): The path to the symbolic link.
    Returns:
        Path: The path the symbolic link points to, resolved relative to the link's directory.
    """
    link_directory = os.path.dirname(symbolic_link_path)
    target_path_relative = os.readlink(symbolic_link_path)
    return Path(os.path.join(link_directory, target_path_relative))
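# Example usage (illustrative sketch; "data/link_to_wav" is a hypothetical symlink whose
# stored target is the relative path "../audio/utt_0001.wav"):
#   resolve_symbolic_link(Path("data/link_to_wav"))
#   # -> Path("data/../audio/utt_0001.wav")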
def write_jsonl(metadata: List[dict], file_path: Path) -> None:
"""Writes a list of dictionaries to a JSONL file.
Args:
metadata : List[dict]
A list of dictionaries, each representing a piece of meta.
file_path : Path
The file path to save the JSONL file
This function writes each dictionary in the list to a new line in the specified file.
"""
with open(file_path, "w", encoding="utf-8") as f:
for meta in tqdm(metadata, desc="writing jsonl"):
# Convert dictionary to JSON string and write it to the file with a newline
json_str = json.dumps(meta, ensure_ascii=False) + "\n"
f.write(json_str)
print(f"jsonl saved to {file_path}")
def read_jsonl(file_path: Path) -> List[dict]:
"""
Reads a JSONL file and returns a list of dictionaries.
    Args:
        file_path (Path): The path to the JSONL file to be read.
    Returns:
        List[dict]: A list of dictionaries parsed from each line of the JSONL file.
"""
metadata = []
# Open the file for reading
with open(file_path, "r", encoding="utf-8") as f:
# Split the file into lines
lines = f.read().splitlines()
# Process each line
for line in lines:
# Convert JSON string back to dictionary and append to list
meta = json.loads(line)
metadata.append(meta)
# Return the list of metadata
return metadata
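# Example usage (illustrative sketch; assumes a file in the format produced by write_jsonl above):
#   records = read_jsonl(Path("metadata.jsonl"))
#   print(records[0]["index"])  # -> "utt_0001"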
def read_json_as_jsonl(file_path: Path) -> List[dict]:
    """
    Reads a JSON file that maps keys to metadata dictionaries and flattens it into a list.
    Args:
        file_path (Path): The path to the JSON file to be read.
    Returns:
        List[dict]: One dictionary per key (sorted), each augmented with an 'index' field
            holding the original key.
    """
    metadata = []
with open(file_path, 'r', encoding='utf-8') as infile:
data = json.load(infile)
for k in sorted(data.keys()):
meta = {'index': k}
meta.update(data[k])
metadata.append(meta)
return metadata
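# Example usage (illustrative sketch; "metadata.json" is a hypothetical file containing
# {"utt_0001": {"text": "hello"}, "utt_0002": {"text": "world"}}):
#   records = read_json_as_jsonl(Path("metadata.json"))
#   # -> [{"index": "utt_0001", "text": "hello"}, {"index": "utt_0002", "text": "world"}]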
def decode_unicode_strings(meta: Dict[str, Any]) -> Dict[str, Any]:
    """
    Decodes escaped unicode sequences (e.g. '\\u4f60') in all string values of a metadata dict.
    Args:
        meta (Dict[str, Any]): The metadata dictionary to process.
    Returns:
        Dict[str, Any]: A copy of the dictionary with string values unescaped;
            non-string values are left unchanged.
    """
    processed_meta = {}
for k, v in meta.items():
if isinstance(v, str):
processed_meta[k] = v.encode("utf-8").decode("unicode_escape")
else:
processed_meta[k] = v
return processed_meta
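# Example usage (illustrative sketch; the value below is a hypothetical escaped string):
#   decode_unicode_strings({"text": "\\u4f60\\u597d", "score": 1})
#   # -> {"text": "你好", "score": 1}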
def load_config(config_path: Path) -> DictConfig:
"""Loads a configuration file and optionally merges it with a base configuration.
Args:
config_path (Path): Path to the configuration file.
"""
# Load the initial configuration from the given path
config = OmegaConf.load(config_path)
# Check if there is a base configuration specified and merge if necessary
if config.get("base_config", None) is not None:
base_config = OmegaConf.load(config["base_config"])
config = OmegaConf.merge(base_config, config)
return config
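# Example usage (illustrative sketch; the YAML file names and keys are hypothetical):
#   base.yaml:    batch_size: 8
#                 lr: 0.0001
#   config.yaml:  base_config: base.yaml
#                 batch_size: 16
#   cfg = load_config(Path("config.yaml"))
#   # -> cfg.batch_size == 16 (the child config overrides the base), cfg.lr == 0.0001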
def jsonl_to_csv(jsonl_file_path: str, csv_file_path: str) -> None:
"""
Converts a JSONL file to a CSV file.
This function reads a JSONL file, determines all unique keys present in the file,
and writes the data to a CSV file with columns for all these keys.
"""
all_keys = set()
data_rows = []
# Read the JSONL file once to extract keys and collect data
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
for line in file:
data = json.loads(line.strip())
data_rows.append(data)
all_keys.update(data.keys())
# Convert the set of keys to a sorted list for consistent column order
sorted_keys = sorted(all_keys)
# Write the data to a CSV file
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=sorted_keys)
# Write the header row
writer.writeheader()
# Write each row of data
for data in data_rows:
writer.writerow(data)
print(f"CSV file has been created at {csv_file_path}")
def save_metadata(data, filename, headers=None):
"""
Save metadata to a file.
Args:
data (list of dict): Metadata to be saved.
filename (str): Name of the file to save the metadata.
        headers (list of str, optional): The order of column names to save; defaults to
            the keys of the first dictionary in data if not provided.
"""
# Set headers to keys from the first dictionary in data if not explicitly provided
if headers is None:
headers = list(data[0].keys())
with open(filename, "w", encoding="utf-8") as file:
# Write the headers to the file
file.write("|".join(headers) + "\n")
for entry in data:
# Retrieve values in the order of headers, replacing any '|' characters with a space to prevent formatting errors
formatted_values = [str(entry.get(key, "")).replace("|", " ") for key in headers]
# Write the formatted values to the file
file.write("|".join(formatted_values) + "\n")
def read_metadata(filename, headers=None):
"""
Read metadata from a file.
Args:
filename (str): The file from which to read the metadata.
Returns:
list of dict: The metadata read from the file.
list of str: The headers used in the file.
"""
with open(filename, "r", encoding="utf-8") as file:
lines = file.readlines()
data = []
# Set headers from the first line of the file if not provided
if headers is None:
headers = lines[0].strip().split("|")
lines = lines[1:]
for line in lines:
line = line.strip()
# Skip empty lines
if not line:
continue
# Split the line by '|' and pair with headers to form a dictionary
entry_data = dict(zip(headers, line.split("|")))
data.append(entry_data)
return data, headers
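# Example usage (illustrative sketch; reads the pipe-delimited format written by save_metadata above):
#   rows, headers = read_metadata("metadata.txt")
#   # -> headers == ["index", "text"]
#   #    rows == [{"index": "utt_0001", "text": "hello world"}]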