move_files.py
from pathlib import Path
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor, as_completed

def determine_class(row):
    if row["common_name"].lower() == "hairy vetch" and row["TargetWeed"]:
        return "hairy vetch"
    if row["TargetWeed"]:
        return "grass" if row["group"].lower() == "monocot" else "broadleaf"
    else:
        return "non_target"
# Paths
image_folder1 = Path("/mnt/research-projects/s/screberg/longterm_images/semifield-cutouts")
image_folder2 = Path("/mnt/research-projects/s/screberg/GROW_DATA/semifield-cutouts")
image_folder3 = Path("/home/psa_images/SemiF-AnnotationPipeline/data/semifield-cutouts")
state_prefix = "MD"
label_csvs = Path("labels/md_covers").glob("*.csv")
dest = Path("data") # Destination directory for copied images
# Load CSV data
dfs = [pd.read_csv(csv) for csv in label_csvs]
df = pd.concat(dfs, ignore_index=True)
# Filter rows based on batch prefix
df = df[df["batch_id"].str.contains(state_prefix)]
df = df.drop_duplicates(subset=["cutout_id"])
df = df[df["common_name"] != "unknown"]
print(f"Total number of images: {len(df)}")
# Create a new column for class
df["class"] = df.apply(determine_class, axis=1)
# Perform train/val split (90% train, 10% val)
train_df, val_df = train_test_split(df, test_size=0.1, stratify=df["class"], random_state=42)
print(f"Number of training images: {len(train_df)}")
print(f"Number of validation images: {len(val_df)}")
# print(f"Number of target weed train images: {train_df[train_df['class'] == True].shape[0]}")
print(f"Number of target weed grass train images: {train_df[train_df['class'] == 'grass'].shape[0]}")
print(f"Number of target weed broadleaf train images: {train_df[train_df['class'] == 'broadleaf'].shape[0]}")
print(f"Number of target weed hairy vetch train images: {train_df[train_df['class'] == 'hairy vetch'].shape[0]}")
print(f"Number of non-target weed train images: {train_df[train_df['class'] == 'non_target'].shape[0]}")
print(f"Number of target grass weed val images: {val_df[val_df['class'] == 'grass'].shape[0]}")
print(f"Number of target broadleaf weed val images: {val_df[val_df['class'] == 'broadleaf'].shape[0]}")
print(f"Number of target hairy vetch weed val images: {val_df[val_df['class'] == 'hairy vetch'].shape[0]}")
print(f"Number of non-target weed val images: {val_df[val_df['class'] == 'non_target'].shape[0]}")
# print(f"Number of non-target weed train images: {train_df[train_df['class'] == False].shape[0]}")
# print(f"Number of target weed val images: {val_df[val_df['class'] == True].shape[0]}")
# print(f"Number of non-target weed val images: {val_df[val_df['class'] == False].shape[0]}")

# Helper function to copy a single image
def copy_single_image(row, subset, dest):
    """Copy a single image based on the DataFrame row."""
    image_name = row["cutout_id"] + ".jpg"
    source = image_folder1 / row["batch_id"] / image_name
    if not source.exists():
        source = image_folder2 / row["batch_id"] / image_name
    if not source.exists():
        source = image_folder3 / row["batch_id"] / image_name
    if source.exists():
        # targetweed = row["TargetWeed"]
        targetweed = row["class"]
        # Set the target folder based on class and subset (train/val)
        # target_folder = dest / subset / ("target_grass" if targetweed else "non_target")
        target_folder = dest / subset / targetweed
        target_folder.mkdir(exist_ok=True, parents=True)
        target = target_folder / image_name
        shutil.copy2(source, target)
    else:
        print(f"Image {source} not found")

# Function to copy images using a thread pool
def copy_images_parallel(df, subset, dest, max_workers=20):
    """Copy images in parallel using ThreadPoolExecutor."""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(copy_single_image, row, subset, dest) for _, row in df.iterrows()]
        for future in as_completed(futures):
            try:
                future.result()  # Raise exceptions if any occurred during execution
            except Exception as e:
                print(f"Error copying image: {e}")

# Create 'train' and 'val' directories and copy images in parallel
copy_images_parallel(train_df, "train", dest)
copy_images_parallel(val_df, "val", dest)
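
# Optional post-copy sanity check (a minimal sketch, assuming the layout created
# above: data/<subset>/<class>/<cutout_id>.jpg): count the files that actually
# landed in each class folder.
for subset in ("train", "val"):
    subset_dir = dest / subset
    if not subset_dir.exists():
        continue
    for class_dir in sorted(p for p in subset_dir.iterdir() if p.is_dir()):
        n_images = sum(1 for _ in class_dir.glob("*.jpg"))
        print(f"{subset}/{class_dir.name}: {n_images} images")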