-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_ig_posts.py
More file actions
75 lines (63 loc) · 2.8 KB
/
preprocess_ig_posts.py
File metadata and controls
75 lines (63 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
import argparse
from datetime import datetime
def _build_media_entry(media_item):
    """Build one output entry for a single item of a post's ``media`` list.

    Args:
        media_item: Mapping describing one media file. The keys read here are
            ``uri``, ``title`` and ``creation_timestamp``; all are optional.

    Returns:
        A dict with a ``filepath`` key (``None`` when ``uri`` is absent) and,
        when at least one metadata field could be derived, a ``metadata`` dict.
    """
    entry = {"filepath": media_item.get("uri")}

    # Insertion order is what ends up in the JSON file: every real field is
    # followed (or, for the title, preceded) by an empty ``_blank_N`` key.
    # NOTE(review): the blanks look like spacer rows for whatever consumes
    # this file -- confirm with the consumer before removing them.
    metadata = {}

    caption = media_item.get("title")
    # The literal string "null" is treated like a missing caption.
    if caption not in (None, "null", ""):
        metadata["_Caption"] = caption
        metadata["_blank_1"] = ""

    if "uri" in media_item:
        metadata["_Filepath"] = media_item["uri"]
        metadata["_blank_2"] = ""

    if "creation_timestamp" in media_item:
        raw_timestamp = media_item["creation_timestamp"]
        try:
            # No tz argument: the timestamp is rendered in the local timezone
            # of the machine running the script.
            title = datetime.fromtimestamp(int(raw_timestamp)).strftime(
                "%b %d, %Y %H:%M"
            )
        except (TypeError, ValueError, OverflowError, OSError):
            # Not an integer, or outside the range the platform can convert:
            # fall back to the raw value so the entry still carries a title.
            title = str(raw_timestamp)
        # The fallback title is stored in ``metadata`` exactly like the
        # formatted one, so consumers find ``_title`` in a single place.
        metadata["_blank_3"] = ""
        metadata["_title"] = title

    if metadata:
        entry["metadata"] = metadata
    return entry


def convert_posts_to_zod_schema(input_json_path, output_json_path):
    """Flatten a posts JSON file into a list of per-file entries.

    The input file must contain a JSON array of post objects. A post that has
    a ``media`` key contributes one entry per media item (see
    ``_build_media_entry``); any other post contributes a single entry holding
    only its ``uri`` as ``filepath``. Entries are written in input order.

    Args:
        input_json_path: Path of the UTF-8 JSON file to read.
        output_json_path: Path of the JSON file to write (overwritten if it
            exists), indented by two spaces.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    with open(input_json_path, 'r', encoding='utf-8') as f:
        posts = json.load(f)

    file_list = []
    for post in posts:
        if "media" in post:
            file_list.extend(
                _build_media_entry(media_item) for media_item in post["media"]
            )
        else:
            # Posts without a media array carry their file directly; no
            # metadata is emitted for them.
            file_list.append({"filepath": post.get("uri")})

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(file_list, f, indent=2)
def main():
    """Read the two required file paths from the command line and convert."""
    cli = argparse.ArgumentParser(
        description="Convert Instagram posts JSON to Zod schema format."
    )
    # Both options are mandatory and differ only in flag name and help text.
    required_options = (
        ("--input-json", "Path to input JSON file."),
        ("--output-json", "Path to output JSON file."),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)

    options = cli.parse_args()
    convert_posts_to_zod_schema(options.input_json, options.output_json)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()