# Hosting-page residue captured along with the file (not part of the script):
# Yiqiao Jin | Initial Commit | bdafe83 | raw | history blame | 2.15 kB
import os
import random
import shutil
# Directory holding the original JSON data. This is a relative path, so it is
# resolved against the current working directory when the script runs.
base_dir = 'real_review/original_data/'
# Destination for the sampled copies: a 'selected_files' subdirectory of base_dir.
selected_base_dir = os.path.join(base_dir, 'selected_files')
# Fraction of the discovered JSON files to sample (1%).
sample_fraction = 0.01


def find_json_files(search_dir, exclude_dir=None):
    """Return the paths of every '.json' file found under search_dir.

    Args:
        search_dir: Directory to walk recursively.
        exclude_dir: Optional directory whose whole subtree is skipped.

    Returns:
        A list of file paths (each joined onto the walked root), in
        os.walk order.
    """
    excluded = None if exclude_dir is None else os.path.abspath(exclude_dir)
    found = []
    for root, dirs, files in os.walk(search_dir):
        if excluded is not None:
            # Prune in place so os.walk never descends into the excluded tree.
            dirs[:] = [
                name for name in dirs
                if os.path.abspath(os.path.join(root, name)) != excluded
            ]
        found.extend(
            os.path.join(root, name) for name in files if name.endswith('.json')
        )
    return found


def count_to_select(total, fraction=0.01):
    """Return how many files to sample out of total.

    The result is fraction of total rounded down, but at least 1 whenever
    there is something to choose from, and never more than total. An empty
    population yields 0 so that random.sample is not asked for more items
    than exist.
    """
    if total <= 0:
        return 0
    return min(total, max(1, int(total * fraction)))


def copy_preserving_structure(paths, source_root, dest_root):
    """Copy each path into dest_root, keeping its location relative to source_root.

    Missing destination directories are created as needed.
    """
    for src_path in paths:
        relative_path = os.path.relpath(src_path, source_root)
        dest_file_path = os.path.join(dest_root, relative_path)
        os.makedirs(os.path.dirname(dest_file_path), exist_ok=True)
        shutil.copy(src_path, dest_file_path)


# The destination lives inside base_dir, so it is excluded from the search;
# otherwise copies made by a previous run would be picked up and re-sampled.
json_files = find_json_files(base_dir, exclude_dir=selected_base_dir)

# 1% of the JSON files, with a minimum of 1 (or 0 when none were found).
num_files_to_select = count_to_select(len(json_files), sample_fraction)
selected_files = random.sample(json_files, num_files_to_select)

# Print the number of selected files for reference
print(f"Selected {num_files_to_select} file(s):")

copy_preserving_structure(selected_files, base_dir, selected_base_dir)

# Print confirmation message after all files have been successfully copied
print("File copying completed.")