cbh123 · taradepan · Dec 14, 2023 · Dec 18, 2023 · Dec 18, 2023
diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@ Make a [Replicate](https://replicate.com), [OpenAI](https://beta.openai.com/), a
 ```
 export OPENAI_API_KEY=<token>
 export ELEVENLABS_API_KEY=<eleven-token>
+export GEMINI_API_KEY=<gemini-token>
 ```
 
 Make a new voice in Eleven and get the voice id of that voice using their [get voices](https://elevenlabs.io/docs/api-reference/voices) API, or by clicking the flask icon next to the voice in the VoiceLab tab.
@@ -39,7 +40,10 @@ python capture.py
 ```
 In another terminal, run the narrator:
 
-```bash
+```bash
 python narrator.py
 ```
-
+The default model is GPT-4 Vision. To use Gemini Pro Vision instead, pass `-m gemini`:
+```bash
+python narrator.py -m gemini
+```
diff --git a/narrator.py b/narrator.py
@@ -6,9 +6,14 @@
 import simpleaudio as sa
 import errno
 from elevenlabs import generate, play, set_api_key, voices
+import google.generativeai as genai
+import PIL.Image
+import argparse
+
 
 client = OpenAI()
 
+
 set_api_key(os.environ.get("ELEVENLABS_API_KEY"))
 
 def encode_image(image_path):
@@ -74,28 +79,67 @@ def analyze_image(base64_image, script):
 
 
 def main():
+    """Narrate the newest captured frame in a loop using the chosen model.
+
+    ``-m/--model`` selects GPT-4 Vision (the default) or Gemini Pro Vision.
+    """
+    parser = argparse.ArgumentParser(description="Image narration script with model selection.")
+    parser.add_argument("-m", "--model", choices=["gpt-4", "gemini"], default="gpt-4", help="Select the AI model (default: gpt-4)")
+    args = parser.parse_args()
     script = []
 
-    while True:
-        # path to your image
-        image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
+    if args.model == "gpt-4":
+        print("using GPT-4 Vision")
+        print("👀 David is watching...")
+
+        while True:
+            # path to your image
+            image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
+
+            # getting the base64 encoding
+            base64_image = encode_image(image_path)
+            analysis = analyze_image(base64_image, script=script)
+
+            print("🎙️ David says:")
+            print(analysis)
 
-        # getting the base64 encoding
-        base64_image = encode_image(image_path)
+            play_audio(analysis)
 
-        # analyze posture
+            script = script + [{"role": "assistant", "content": analysis}]
+
+            # wait for 5 seconds
+            time.sleep(5)
+
+    elif args.model == "gemini":  # argparse `choices` rules out any other value
+        genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
+        model = genai.GenerativeModel('gemini-pro-vision')  # built once; reused for every frame
+        print("using Gemini Pro Vision")
         print("👀 David is watching...")
-        analysis = analyze_image(base64_image, script=script)
 
-        print("🎙️ David says:")
-        print(analysis)
+        while True:
+            # path to your image
+            image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
+
+            img = PIL.Image.open(image_path)
+            # Earlier narrations go after the instructions, and only once some exist.
+            history = ("Refer to these previous narrations:\n" + "\n".join(script)) if script else ""
+            response = model.generate_content(["""
+                You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
+                Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
+                """ + history, img])
+
+            response_text = response.text
+
+            print("🎙️ David says:")
+            print(response_text)
+
+            play_audio(response_text)
 
-        play_audio(analysis)
+            script = script + [response_text]
 
-        script = script + [{"role": "assistant", "content": analysis}]
+            # wait for 5 seconds
+            time.sleep(5)
 
-        # wait for 5 seconds
-        time.sleep(5)
 
 
 if __name__ == "__main__":

diff --git a/requirements.txt b/requirements.txt
@@ -39,3 +39,4 @@ typing_extensions==4.8.0
 urllib3==2.0.7
 wcwidth==0.2.10
 websockets==12.0
+google-generativeai==0.3.1