Commit d3110caa authored by Paul G

init

parent 072a1158
.venv
.venv/Lib/site-packages
%% Cell type:code id: tags:
``` python
import os
import random
# Path to the folder containing the files to delete
folder_path = "D:/Studium/Masterarbeit/Einarbeitung/Codebeispiele/detecting_anomalies/data/cell_images/parasitized"
# List of file names in the folder
file_list = os.listdir(folder_path)
# Number of files to delete
num_files_to_delete = 0
# Randomly select the files to delete
files_to_delete = random.sample(file_list, num_files_to_delete)
# Loop over the selected files and delete them
for file_name in files_to_delete:
    file_path = os.path.join(folder_path, file_name)
    os.remove(file_path)
    print(f"Deleted file {file_name}.")
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
# Source
https://github.com/bnsreenu/python_for_microscopists/blob/master/260_image_anomaly_detection_using_autoencoders/260_image_anomaly_detection_using_autoencoders.py
``Notes``\
Detecting anomalous images using autoencoders (classifying an entire image as either normal or anomalous).\
Here we use both the reconstruction error and a kernel density estimate (KDE) of the vectors in the latent space.
We treat the output of the autoencoder's bottleneck layer as the latent space.\
This code uses the malaria dataset, but it can easily be adapted to other applications.
Data from: https://data.lhncbc.nlm.nih.gov/public/Malaria/cell_images.zip
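To make the two signals concrete before building the full pipeline, here is a minimal toy sketch (made-up arrays, not this notebook's data or encoder): the reconstruction error is a plain per-image MSE, and the latent density comes from a KDE fit on "normal" vectors. Note that scikit-learn's `score_samples` returns the *log* of the density, which is why the density threshold used later can be a large number.
%% Cell type:code id: tags:
``` python
# Toy illustration only: the arrays below are random stand-ins.
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(42)

# Signal 1: reconstruction error (MSE between an input and its reconstruction).
img = rng.random((128, 128, 3))
reconstruction = img + rng.normal(0, 0.01, img.shape)  # pretend autoencoder output
mse = np.mean((img - reconstruction) ** 2)
print("Reconstruction error (MSE):", mse)

# Signal 2: latent density. Fit a KDE on flattened latent vectors of normal
# images, then score a new vector. score_samples returns the LOG density.
normal_latents = rng.random((100, 16))  # stand-in for encoder outputs
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(normal_latents)
log_density = kde.score_samples(rng.random((1, 16)))[0]
print("Log density of new sample:", log_density)
```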
%% Cell type:code id: tags:
``` python
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import random
```
%% Cell type:code id: tags:
``` python
#Size of our input images
SIZE = 128

#Define generators for training, validation and also anomaly data.
batch_size = 64
datagen = ImageDataGenerator(rescale=1./255)

train_generator = datagen.flow_from_directory(
    'data/cell_images/uninfected_train/',
    target_size=(SIZE, SIZE),
    batch_size=batch_size,
    class_mode='input'
)

validation_generator = datagen.flow_from_directory(
    'data/cell_images/uninfected_test/',
    target_size=(SIZE, SIZE),
    batch_size=batch_size,
    class_mode='input'
)

anomaly_generator = datagen.flow_from_directory(
    'data/cell_images/parasitized/',
    target_size=(SIZE, SIZE),
    batch_size=batch_size,
    class_mode='input'
)
```
%% Cell type:code id: tags:
``` python
#Define the autoencoder.
#Try to make the bottleneck layer size as small as possible to make it easy for
#density calculations and also picking appropriate thresholds.
#Encoder
model = Sequential()
model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(SIZE, SIZE, 3)))
model.add(MaxPooling2D((2, 2), padding='same'))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2), padding='same'))
model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2), padding='same'))
#Decoder
model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(3, (3, 3), activation='sigmoid', padding='same'))
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
model.summary()
#Fit the model.
history = model.fit(
    train_generator,
    steps_per_epoch=500 // batch_size,
    epochs=1000,
    validation_data=validation_generator,
    validation_steps=75 // batch_size,
    shuffle=True)

#Plot the training and validation loss at each epoch
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'y', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
```
%% Output
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 128, 128, 64) 1792
max_pooling2d (MaxPooling2 (None, 64, 64, 64) 0
D)
conv2d_1 (Conv2D) (None, 64, 64, 32) 18464
max_pooling2d_1 (MaxPoolin (None, 32, 32, 32) 0
g2D)
conv2d_2 (Conv2D) (None, 32, 32, 16) 4624
max_pooling2d_2 (MaxPoolin (None, 16, 16, 16) 0
g2D)
conv2d_3 (Conv2D) (None, 16, 16, 16) 2320
up_sampling2d (UpSampling2 (None, 32, 32, 16) 0
D)
conv2d_4 (Conv2D) (None, 32, 32, 32) 4640
up_sampling2d_1 (UpSamplin (None, 64, 64, 32) 0
g2D)
conv2d_5 (Conv2D) (None, 64, 64, 64) 18496
up_sampling2d_2 (UpSamplin (None, 128, 128, 64) 0
g2D)
conv2d_6 (Conv2D) (None, 128, 128, 3) 1731
=================================================================
Total params: 52067 (203.39 KB)
Trainable params: 52067 (203.39 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
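%% Cell type:markdown id: tags:
The hard-coded `steps_per_epoch=500 // batch_size` assumes roughly 500 training images on disk. A sketch of a more robust alternative, deriving the step counts from the generators themselves (behaviour otherwise unchanged):
%% Cell type:code id: tags:
``` python
# Derive step counts from the generators instead of hard-coding image counts.
# .samples is the number of files the DirectoryIterator found on disk.
steps_per_epoch = max(1, train_generator.samples // batch_size)
validation_steps = max(1, validation_generator.samples // batch_size)
print(f"steps_per_epoch={steps_per_epoch}, validation_steps={validation_steps}")
```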
%% Cell type:code id: tags:
``` python
# Get all batches generated by the datagen and pick a batch for prediction.
# Just to test the model.
data_batch = []  # Capture all training batches as a numpy array
img_num = 0
while img_num <= train_generator.batch_index:  # gets each generated batch of size batch_size
    data = next(train_generator)
    data_batch.append(data[0])
    img_num = img_num + 1

predicted = model.predict(data_batch[0])  # Predict on the first batch of images

# Sanity check: view a few images and their reconstructions
image_number = random.randint(0, predicted.shape[0] - 1)
plt.figure(figsize=(12, 6))
plt.subplot(121)
plt.imshow(data_batch[0][image_number])
plt.subplot(122)
plt.imshow(predicted[image_number])
plt.show()

# Examine the reconstruction error between our validation data (good/normal images)
# and the anomaly images
validation_error = model.evaluate(validation_generator)
anomaly_error = model.evaluate(anomaly_generator)
print("Recon. error for the validation (normal) data is: ", validation_error)
print("Recon. error for the anomaly data is: ", anomaly_error)
```
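%% Cell type:markdown id: tags:
The two aggregate numbers above hide the spread of the per-image errors. Before picking thresholds it can help to look at the full distributions; the sketch below (reusing the model and generators defined earlier) computes a per-image MSE for one normal and one anomaly batch and plots both histograms:
%% Cell type:code id: tags:
``` python
# Per-image reconstruction error for one normal batch and one anomaly batch.
normal_batch = next(validation_generator)[0]
anomaly_batch = next(anomaly_generator)[0]

def per_image_mse(batch):
    recon = model.predict(batch, verbose=0)
    return np.mean((batch - recon) ** 2, axis=(1, 2, 3))

plt.hist(per_image_mse(normal_batch), bins=30, alpha=0.5, label='normal')
plt.hist(per_image_mse(anomaly_batch), bins=30, alpha=0.5, label='anomaly')
plt.xlabel('Reconstruction error (MSE)')
plt.ylabel('Image count')
plt.legend()
plt.show()
```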
%% Cell type:code id: tags:
``` python
#Let us extract (or build) the encoder network, with trained weights.
#This is used to get the compressed output (latent space) of the input image.
#The compressed output is then used to calculate the KDE.
encoder_model = Sequential()
encoder_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(SIZE, SIZE, 3), weights=model.layers[0].get_weights()))
encoder_model.add(MaxPooling2D((2, 2), padding='same'))
encoder_model.add(Conv2D(32, (3, 3), activation='relu', padding='same', weights=model.layers[2].get_weights()))
encoder_model.add(MaxPooling2D((2, 2), padding='same'))
encoder_model.add(Conv2D(16, (3, 3), activation='relu', padding='same', weights=model.layers[4].get_weights()))
encoder_model.add(MaxPooling2D((2, 2), padding='same'))
encoder_model.summary()

########################################################
# Calculate KDE using sklearn
from sklearn.neighbors import KernelDensity

#Get the encoded output of the input images = latent space
encoded_images = encoder_model.predict(train_generator)

# Flatten the encoder output because KDE from sklearn takes 1D vectors as input
encoder_output_shape = encoder_model.output_shape  #Here, we have 16x16x16
out_vector_shape = encoder_output_shape[1]*encoder_output_shape[2]*encoder_output_shape[3]
encoded_images_vector = [np.reshape(img, (out_vector_shape)) for img in encoded_images]

#Fit KDE to the image latent data
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(encoded_images_vector)

#Calculate density and reconstruction error to find their mean values for
#good and anomaly images. We use these means and sigmas to set thresholds.
def calc_density_and_recon_error(batch_images):
    density_list = []
    recon_error_list = []
    for im in range(batch_images.shape[0]):
        img = batch_images[im]
        img = img[np.newaxis, :, :, :]
        encoded_img = encoder_model.predict(img)  # Compressed version of the image from the encoder
        encoded_img = [np.reshape(v, (out_vector_shape)) for v in encoded_img]  # Flatten the compressed image
        density = kde.score_samples(encoded_img)[0]  # Density (log-likelihood) score for the new image
        reconstruction_error = model.evaluate(img, img, batch_size=1, verbose=0)[0]  # MSE between input and reconstruction
        density_list.append(density)
        recon_error_list.append(reconstruction_error)

    average_density = np.mean(np.array(density_list))
    stdev_density = np.std(np.array(density_list))
    average_recon_error = np.mean(np.array(recon_error_list))
    stdev_recon_error = np.std(np.array(recon_error_list))

    return average_density, stdev_density, average_recon_error, stdev_recon_error

#Get average and std dev. of density and recon. error for uninfected and anomaly (parasitized) images.
#For this, generate a batch of images for each.
train_batch = next(train_generator)[0]
anomaly_batch = next(anomaly_generator)[0]

uninfected_values = calc_density_and_recon_error(train_batch)
anomaly_values = calc_density_and_recon_error(anomaly_batch)
```
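%% Cell type:markdown id: tags:
The KDE bandwidth of 0.2 above is a guess. A sketch of a more principled choice using scikit-learn's cross-validated grid search (the candidate grid is arbitrary; `encoded_images_vector` is the flattened latent data from the previous cell):
%% Cell type:code id: tags:
``` python
# Cross-validated bandwidth selection for the KDE.
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

grid = GridSearchCV(
    KernelDensity(kernel='gaussian'),
    {'bandwidth': np.logspace(-2, 1, 10)},  # candidates from 0.01 to 10
    cv=3,
)
grid.fit(np.array(encoded_images_vector))
print("Best bandwidth:", grid.best_params_['bandwidth'])
kde = grid.best_estimator_  # replace the hand-tuned KDE
```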
%% Cell type:code id: tags:
``` python
#Now, input unknown images and classify them as good or anomaly
def check_anomaly(img_path):
    density_threshold = 2500  # Set this value based on the above exercise
    reconstruction_error_threshold = 0.004  # Set this value based on the above exercise
    img = Image.open(img_path).convert('RGB')  # Ensure 3 channels
    img = np.array(img.resize((128, 128), Image.LANCZOS))  # Image.ANTIALIAS was removed in Pillow 10
    plt.imshow(img)
    img = img / 255.
    img = img[np.newaxis, :, :, :]
    encoded_img = encoder_model.predict(img)
    encoded_img = [np.reshape(v, (out_vector_shape)) for v in encoded_img]
    density = kde.score_samples(encoded_img)[0]
    reconstruction_error = model.evaluate(img, img, batch_size=1, verbose=0)[0]
    if density < density_threshold or reconstruction_error > reconstruction_error_threshold:
        print("The image is an anomaly")
    else:
        print("The image is NOT an anomaly")

#Load a couple of test images and verify whether they are reported as anomalies.
import glob
para_file_paths = glob.glob('cell_images2/parasitized/images/*')
uninfected_file_paths = glob.glob('cell_images2/uninfected_train/images/*')

#Anomaly image verification
num = random.randint(0, len(para_file_paths) - 1)
check_anomaly(para_file_paths[num])

#Good/normal image verification
num = random.randint(0, len(uninfected_file_paths) - 1)
check_anomaly(uninfected_file_paths[num])
```
absl-py==1.4.0
asttokens==2.2.1
astunparse==1.6.3
backcall==0.2.0
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
colorama==0.4.6
comm==0.1.3
contourpy==1.1.0
cycler==0.11.0
debugpy==1.6.7
decorator==5.1.1
executing==1.2.0
flatbuffers==23.5.26
fonttools==4.41.1
gast==0.4.0
google-auth==2.22.0
google-auth-oauthlib==1.0.0
google-pasta==0.2.0
grpcio==1.56.2
h5py==3.9.0
idna==3.4
imageio==2.31.1
ipykernel==6.25.0
ipython==8.14.0
jedi==0.18.2
jupyter_client==8.3.0
jupyter_core==5.3.1
keras==2.13.1
kiwisolver==1.4.4
lazy_loader==0.3
libclang==16.0.6
Markdown==3.4.4
MarkupSafe==2.1.3
matplotlib==3.7.2
matplotlib-inline==0.1.6
nest-asyncio==1.5.6
networkx==3.1
numpy==1.24.3
oauthlib==3.2.2
opencv-python==4.8.0.74
opt-einsum==3.3.0
packaging==23.1
pandas==2.0.3
parso==0.8.3
pickleshare==0.7.5
Pillow==10.0.0
platformdirs==3.9.1
prompt-toolkit==3.0.39
protobuf==4.23.4
psutil==5.9.5
pure-eval==0.2.2
pyasn1==0.5.0
pyasn1-modules==0.3.0
Pygments==2.15.1
pyparsing==3.0.9
python-dateutil==2.8.2
pytz==2023.3
PyWavelets==1.4.1
pywin32==306
pyzmq==25.1.0
requests==2.31.0
requests-oauthlib==1.3.1
rsa==4.9
scikit-image==0.21.0
scipy==1.11.1
six==1.16.0
stack-data==0.6.2
tensorboard==2.13.0
tensorboard-data-server==0.7.1
tensorflow==2.13.0
tensorflow-estimator==2.13.0
tensorflow-intel==2.13.0
tensorflow-io-gcs-filesystem==0.31.0
termcolor==2.3.0
tifffile==2023.7.18
tornado==6.3.2
traitlets==5.9.0
typing_extensions==4.5.0
tzdata==2023.3
urllib3==1.26.16
wcwidth==0.2.6
Werkzeug==2.3.6
wrapt==1.15.0