Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Make a [Replicate](https://replicate.com), [OpenAI](https://beta.openai.com/), a
```
export OPENAI_API_KEY=<token>
export ELEVENLABS_API_KEY=<eleven-token>
export GEMINI_API_KEY=<Gemini-api-key>
```

Make a new voice in Eleven and get the voice id of that voice using their [get voices](https://elevenlabs.io/docs/api-reference/voices) API, or by clicking the flask icon next to the voice in the VoiceLab tab.
Expand All @@ -39,7 +40,10 @@ python capture.py
```
In another terminal, run the narrator:

```bash
python narrator.py
```

The default model is GPT-4. To use Gemini Pro Vision instead, pass the `-m gemini` flag:
```bash
python narrator.py -m gemini
```
70 changes: 57 additions & 13 deletions narrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@
import simpleaudio as sa
import errno
from elevenlabs import generate, play, set_api_key, voices
import google.generativeai as genai
import PIL.Image
import argparse


# Module-level OpenAI client, constructed with no explicit arguments.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment
# (the README asks the user to export it) — confirm against the OpenAI SDK.
client = OpenAI()


# Pass the ElevenLabs API key from the environment to the elevenlabs library.
# os.environ.get returns None when ELEVENLABS_API_KEY is not set.
set_api_key(os.environ.get("ELEVENLABS_API_KEY"))

def encode_image(image_path):
Expand Down Expand Up @@ -74,28 +79,67 @@ def analyze_image(base64_image, script):


# Base instruction sent to Gemini with every frame.
_GEMINI_PROMPT = """
You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
"""

# Seconds to pause between two consecutive narrations.
_NARRATION_INTERVAL_SECONDS = 5


def _build_gemini_prompt(previous_narrations):
    """Return the Gemini text prompt, followed by earlier narrations if any.

    The previous implementation used the instruction sentence as the
    *separator* of ``str.join``, so it was missing with zero or one earlier
    narration and was wedged between narrations otherwise. Here the
    instruction is emitted once, ahead of a newline-separated history.
    """
    if not previous_narrations:
        return _GEMINI_PROMPT
    history = "\n".join(previous_narrations)
    return _GEMINI_PROMPT + "Refer to these previous narrations:\n" + history


def _run_gpt4_narrator(image_path):
    """Narrate the frame at ``image_path`` forever using GPT-4 Vision."""
    print("using GPT-4 Vision")
    print("👀 David is watching...")

    # Chat-style history: one {"role": "assistant", ...} entry per narration.
    script = []
    while True:
        # getting the base64 encoding
        base64_image = encode_image(image_path)
        analysis = analyze_image(base64_image, script=script)

        print("🎙️ David says:")
        print(analysis)

        play_audio(analysis)

        script = script + [{"role": "assistant", "content": analysis}]

        time.sleep(_NARRATION_INTERVAL_SECONDS)


def _run_gemini_narrator(image_path):
    """Narrate the frame at ``image_path`` forever using Gemini Pro Vision."""
    genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
    print("using Gemini Pro Vision")
    print("👀 David is watching...")

    # The model handle does not change between frames, so build it once.
    model = genai.GenerativeModel('gemini-pro-vision')

    # Plain-text history: one string per narration.
    script = []
    while True:
        # Keep the image open only for the duration of the request so the
        # file handle is released before the next frame is read.
        with PIL.Image.open(image_path) as img:
            response = model.generate_content([_build_gemini_prompt(script), img])

        response_text = response.text

        print("🎙️ David says:")
        print(response_text)

        play_audio(response_text)

        script = script + [response_text]

        time.sleep(_NARRATION_INTERVAL_SECONDS)


def main():
    """Parse the command line and run the selected narrator loop.

    Accepts ``-m/--model`` with the values ``gpt-4`` (default) or ``gemini``.
    Whichever narrator is selected reads ``./frames/frame.jpg`` relative to
    the current working directory, prints and plays a narration, then waits
    before repeating. The loop does not return.
    """
    parser = argparse.ArgumentParser(description="Image narration script with model selection.")
    parser.add_argument("-m", "--model", choices=["gpt-4", "gemini"], default="gpt-4", help="Select the AI model (default: gpt-4)")
    args = parser.parse_args()

    # path to your image
    image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")

    # argparse ``choices`` already rejects any other value, so a two-way
    # branch is exhaustive and no case normalisation is needed.
    if args.model == "gemini":
        _run_gemini_narrator(image_path)
    else:
        _run_gpt4_narrator(image_path)


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,4 @@ typing_extensions==4.8.0
urllib3==2.0.7
wcwidth==0.2.10
websockets==12.0
google-generativeai==0.3.1