The datasets linked below were used, and the code below serves as the data loader that feeds the training stage. Since the examples generally rely on the HuggingFace Datasets library, this conversion turns an arbitrary segmentation dataset into the datasets format.

import os

import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from datasets import Dataset

class WaterDatasetLoader:
    def __init__(self, dataset_root, image_subfolder, annotation_subfolder):
        self.dataset_root = dataset_root
        self.image_subfolder = image_subfolder
        self.annotation_subfolder = annotation_subfolder
        self.image_paths = []
        self.annotation_paths = []
        self.load_paths()

    def load_paths(self):
        for root, dirs, files in os.walk(self.dataset_root):
            if self.image_subfolder in root:
                for filename in files:
                    if filename.endswith(".png"):
                        image_path = os.path.join(root, filename)
                        # derive the mask path by swapping the image subfolder for the annotation subfolder
                        annotation_path = image_path.replace(self.image_subfolder, self.annotation_subfolder)
                        if os.path.exists(annotation_path):
                            self.image_paths.append(image_path)
                            self.annotation_paths.append(annotation_path)
                        else:
                            print(f"Warning: Annotation file not found for image {image_path}")
        if len(self.image_paths) != len(self.annotation_paths):
            print("Warning: Mismatch between the number of images and annotations.")

    @staticmethod
    def load_and_preprocess_image(image_path, target_size=(256, 256), dtype=np.uint8):
        # load the image, force 3 channels, and resize to the target input size
        img = Image.open(image_path).convert("RGB")
        img = img.resize(target_size)
        img = np.array(img, dtype=dtype)
        return img

    @staticmethod
    def load_and_preprocess_annotation(annotation_path, target_size=(256, 256), dtype=np.uint8):
        # resize the mask with nearest-neighbor so label values are not interpolated
        ann = Image.open(annotation_path)
        ann = ann.resize(target_size, Image.NEAREST)
        # scale mask values from {0, 255} to {0, 1}
        ann = np.array(ann, dtype=dtype) / 255
        return ann

    def create_dataset(self):
        images = [self.load_and_preprocess_image(path) for path in self.image_paths]
        annotations = [self.load_and_preprocess_annotation(path) for path in self.annotation_paths]
        images = np.array(images, dtype=np.uint8)
        annotations = np.array(annotations, dtype=np.uint8)

        # dataset_dict = {
        #     "image": [Image.fromarray(img, 'L') for img in images[:, :, :, 0]],
        #     "label": [Image.fromarray(ann, 'L') for ann in annotations[:, :, :, 0]],
        # }  
        images_train, images_test, annotations_train, annotations_test = train_test_split(images, annotations, test_size=0.2, random_state=42)
        # annotations are assumed to be 3-channel PNGs; only the first channel is kept as the "label" mask
        train_dataset_dict = {
            "image": [Image.fromarray(img, 'RGB') for img in images_train],
            "label": [Image.fromarray(ann, 'L') for ann in annotations_train[:, :, :, 0]],
        }
        test_dataset_dict = {
            "image": [Image.fromarray(img, 'RGB') for img in images_test],
            "label": [Image.fromarray(ann, 'L') for ann in annotations_test[:, :, :, 0]],
        }
        train_dataset = Dataset.from_dict(train_dataset_dict)
        test_dataset = Dataset.from_dict(test_dataset_dict)
        # return Dataset.from_dict(dataset_dict)
        return train_dataset, test_dataset
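
For reference, a minimal usage sketch is given below; the dataset root and subfolder names here are hypothetical placeholders and should be replaced with the actual paths of the datasets linked above.

loader = WaterDatasetLoader(
    dataset_root="water_segmentation_dataset",   # hypothetical path
    image_subfolder="images",                    # hypothetical subfolder name
    annotation_subfolder="annotations",          # hypothetical subfolder name
)
train_dataset, test_dataset = loader.create_dataset()
print(len(train_dataset), len(test_dataset))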

The code snippet below was taken from the fine-tuning notebook linked in the references below.

from torch.utils.data import Dataset as TorchDataset

class SAMDataset(TorchDataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item["image"]
        ground_truth_mask = np.array(item["label"])
        # derive a bounding-box prompt from the ground-truth mask
        # (get_bounding_box comes from the referenced notebook; a sketch is given below)
        prompt = get_bounding_box(ground_truth_mask)
        # prepare the image and box prompt in the format SAM expects
        inputs = self.processor(image, input_boxes=[[prompt]], return_tensors="pt")
        # drop the batch dimension added by the processor
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["ground_truth_mask"] = ground_truth_mask
        return inputs
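
The get_bounding_box helper is not defined in this snippet; it comes from the referenced notebook. A sketch along those lines is shown below: it takes the bounding box of the non-zero mask pixels and adds a small random perturbation so the prompt is not a perfectly tight box during fine-tuning.

def get_bounding_box(ground_truth_map):
    # coordinates of all foreground (non-zero) pixels in the mask
    y_indices, x_indices = np.where(ground_truth_map > 0)
    x_min, x_max = np.min(x_indices), np.max(x_indices)
    y_min, y_max = np.min(y_indices), np.max(y_indices)
    # perturb the box by a few pixels so the model does not rely on an exact box
    H, W = ground_truth_map.shape
    x_min = max(0, x_min - np.random.randint(0, 20))
    x_max = min(W, x_max + np.random.randint(0, 20))
    y_min = max(0, y_min - np.random.randint(0, 20))
    y_max = min(H, y_max + np.random.randint(0, 20))
    return [x_min, y_min, x_max, y_max]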

Here, the dataset and the pre-trained SAM processor are used to prepare each item for the Segment Anything Model (SAM). A bounding-box prompt is derived from the ground-truth mask in the label, the image is paired with that prompt and the mask, and the result is passed to the model. This preparation is what fine-tuning the mask decoder requires. However, as the Test and Finetune sections show, when there is no ground-truth mask or any other guiding prompt in the input, a satisfactory result cannot be obtained.
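
To connect the pieces, a hedged sketch of how the prepared items are typically fed to training is shown below; the checkpoint name follows the referenced notebook, and the batch size is an arbitrary choice.

from torch.utils.data import DataLoader
from transformers import SamProcessor

# processor matching the SAM checkpoint used in the referenced notebook
processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

# train_dataset is the HuggingFace Dataset returned by WaterDatasetLoader above
train_sam_dataset = SAMDataset(dataset=train_dataset, processor=processor)
train_dataloader = DataLoader(train_sam_dataset, batch_size=2, shuffle=True)

batch = next(iter(train_dataloader))
print(batch["pixel_values"].shape)  # e.g. torch.Size([2, 3, 1024, 1024])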


  • [https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Fine_tune_SAM_(segment_anything)_on_a_custom_dataset.ipynb](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Fine_tune_SAM_(segment_anything)_on_a_custom_dataset.ipynb)

  • link