Spaces:

Littlehongman
/

CLIPGPT-ImageCaptioner

Sleeping

App Files Files Community

Littlehongman committed on May 10, 2023

Commit

9c68392

1 Parent(s): ac3b99b

feat: add html & references

Browse files

Files changed (1) hide show

app.py +35 -5

app.py CHANGED Viewed

@@ -35,6 +35,28 @@ st.write(
         flex: 1 1 calc(50% - 1rem);
         min-width: calc(50% - 1rem);
     }
     </style>""",
     unsafe_allow_html=True,
 )
@@ -42,7 +64,7 @@ st.write(
 # Render Streamlit page
 st.title("Image Captioner")
 st.markdown(
-    "This app generates Image Caption using OpenAI's [GPT-2](https://openai.com/research/better-language-models) and [CLIP](https://openai.com/research/clip) model."
 )
@@ -59,11 +81,11 @@ select_file = image_select(
     # captions=["A cat", "Another cat", "Oh look, a cat!", "Guess what, a cat..."],
 )
 upload_file = st.file_uploader("Upload an image:", type=['png','jpg','jpeg'])
-st.markdown("<hr/>", unsafe_allow_html=True)
 # Checking the Format of the page
 if upload_file or select_file:
@@ -74,7 +96,7 @@ if upload_file or select_file:
         img = Image.open(upload_file)
     elif select_file:
-        st.text(select_file)
         img = Image.open(requests.get(select_file, stream=True).raw)
@@ -90,9 +112,17 @@ if upload_file or select_file:
 # Model information
 with st.expander("See model architecture"):
-    st.write("")
     model_img = Image.open('./model.png')
-    st.image(model_img, width=500)

         flex: 1 1 calc(50% - 1rem);
         min-width: calc(50% - 1rem);
     }
+    .separator {
+        display: flex;
+        align-items: center;
+        text-align: center;
+    }
+    .separator::before,
+    .separator::after {
+        content: '';
+        flex: 1;
+        border-bottom: 1px solid #000;
+    }
+    .separator:not(:empty)::before {
+        margin-right: .25em;
+    }
+    .separator:not(:empty)::after {
+        margin-left: .25em;
+    }
     </style>""",
     unsafe_allow_html=True,
 )
 # Render Streamlit page
 st.title("Image Captioner")
 st.markdown(
+    "This app utilizes OpenAI's [GPT-2](https://openai.com/research/better-language-models) and [CLIP](https://openai.com/research/clip) models to generate image captions. The model architecture was inspired by [ClipCap: CLIP Prefix for Image Captioning](https://arxiv.org/abs/2111.09734), which uses the CLIP encoding as a prefix and fine-tunes a GPT-2 model to generate the caption."
 )
     # captions=["A cat", "Another cat", "Oh look, a cat!", "Guess what, a cat..."],
 )
+st.markdown("<div class='separator'>Or</div>", unsafe_allow_html=True)
 upload_file = st.file_uploader("Upload an image:", type=['png','jpg','jpeg'])
 # Checking the Format of the page
 if upload_file or select_file:
         img = Image.open(upload_file)
     elif select_file:
+        # st.text(select_file)
         img = Image.open(requests.get(select_file, stream=True).raw)
 # Model information
 with st.expander("See model architecture"):
+    st.markdown(
+        """
+        Steps:
+        1.  Feed image into CLIP Image Encoder to get image embedding
+        2.  Map image embedding into text embedding shape
+        3.  Feed Text into GPT-2 Text Embedder to get a text embedding
+        4.  Concatenate two embeddings and feed into GPT-2 Attention Layers
+        """)
+    st.write(" \nModel Architecture:  ")
     model_img = Image.open('./model.png')
+    st.image(model_img, width=450)