Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions CHANGELOG.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,34 @@
{
"metadata": {
"lastUpdated": "2026-01-11T15:34:03Z",
"currentVersion": "0.1.1",
"lastUpdated": "2026-01-13T15:12:41Z",
"currentVersion": "0.1.2",
"projectType": "python",
"totalReleases": 2
"totalReleases": 3
},
"releases": [
{
"version": "0.1.2",
"project_type": "python",
"date": "2026-01-13",
"pr_number": 10,
"raw_summary": "## Summary by CodeRabbit\n\n## ๋ฆด๋ฆฌ์Šค ๋…ธํŠธ\n\n* **์ƒˆ๋กœ์šด ๊ธฐ๋Šฅ**\n * Instagram ๊ฒŒ์‹œ๋ฌผ์˜ ์บ๋Ÿฌ์…€ ์ด๋ฏธ์ง€๋ฅผ ๋ชจ๋‘ ์ถ”์ถœํ•˜๋„๋ก ๊ฐœ์„ \n * ์ž‘์„ฑ์ž์˜ ํ”„๋กœํ•„ ์ด๋ฏธ์ง€ ์ •๋ณด ์ถ”์ถœ ๊ธฐ๋Šฅ ์ถ”๊ฐ€\n\n* **๊ธฐํƒ€**\n * ๋ฒ„์ „ ์—…๋ฐ์ดํŠธ ๋ฐ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๊ฐฑ์‹ ",
"parsed_changes": {
"์ƒˆ๋กœ์šด_๊ธฐ๋Šฅ": {
"title": "์ƒˆ๋กœ์šด ๊ธฐ๋Šฅ",
"items": [
"Instagram ๊ฒŒ์‹œ๋ฌผ์˜ ์บ๋Ÿฌ์…€ ์ด๋ฏธ์ง€๋ฅผ ๋ชจ๋‘ ์ถ”์ถœํ•˜๋„๋ก ๊ฐœ์„ ",
"์ž‘์„ฑ์ž์˜ ํ”„๋กœํ•„ ์ด๋ฏธ์ง€ ์ •๋ณด ์ถ”์ถœ ๊ธฐ๋Šฅ ์ถ”๊ฐ€"
]
},
"๊ธฐํƒ€": {
"title": "๊ธฐํƒ€",
"items": [
"๋ฒ„์ „ ์—…๋ฐ์ดํŠธ ๋ฐ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๊ฐฑ์‹ "
]
}
},
"parse_method": "markdown"
},
{
"version": "0.1.1",
"project_type": "python",
Expand Down
17 changes: 15 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
# Changelog

**ํ˜„์žฌ ๋ฒ„์ „:** 0.1.1
**๋งˆ์ง€๋ง‰ ์—…๋ฐ์ดํŠธ:** 2026-01-11T15:34:03Z
**ํ˜„์žฌ ๋ฒ„์ „:** 0.1.2
**๋งˆ์ง€๋ง‰ ์—…๋ฐ์ดํŠธ:** 2026-01-13T15:12:41Z

---

## [0.1.2] - 2026-01-13

**PR:** #10

**์ƒˆ๋กœ์šด ๊ธฐ๋Šฅ**
- Instagram ๊ฒŒ์‹œ๋ฌผ์˜ ์บ๋Ÿฌ์…€ ์ด๋ฏธ์ง€๋ฅผ ๋ชจ๋‘ ์ถ”์ถœํ•˜๋„๋ก ๊ฐœ์„ 
- ์ž‘์„ฑ์ž์˜ ํ”„๋กœํ•„ ์ด๋ฏธ์ง€ ์ •๋ณด ์ถ”์ถœ ๊ธฐ๋Šฅ ์ถ”๊ฐ€

**๊ธฐํƒ€**
- ๋ฒ„์ „ ์—…๋ฐ์ดํŠธ ๋ฐ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๊ฐฑ์‹ 

---

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# MapSee-AI

<!-- ์ˆ˜์ •ํ•˜์ง€๋งˆ์„ธ์š” ์ž๋™์œผ๋กœ ๋™๊ธฐํ™” ๋ฉ๋‹ˆ๋‹ค -->
## ์ตœ์‹  ๋ฒ„์ „ : v0.0.11 (2026-01-11)
## ์ตœ์‹  ๋ฒ„์ „ : v0.1.1 (2026-01-11)

[์ „์ฒด ๋ฒ„์ „ ๊ธฐ๋ก ๋ณด๊ธฐ](CHANGELOG.md)

Expand Down
138 changes: 113 additions & 25 deletions src/services/scraper/platforms/instagram_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,27 +74,110 @@ def parse_instagram_description(self, description: str) -> dict:

async def extract_instagram_image_urls(self) -> list[str]:
"""
Instagram ์ด๋ฏธ์ง€ URL ์ถ”์ถœ (cdninstagram.com ๋„๋ฉ”์ธ๋งŒ)
Instagram ๊ฒŒ์‹œ๊ธ€ ์ด๋ฏธ์ง€ URL ์ถ”์ถœ (์บ๋Ÿฌ์…€ ์Šฌ๋ผ์ด๋“œ ๋„ค๋น„๊ฒŒ์ด์…˜ ํฌํ•จ)

์บ๋Ÿฌ์…€์˜ ๊ฒฝ์šฐ Next ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜๋ฉฐ ๋ชจ๋“  ์ด๋ฏธ์ง€๋ฅผ ์ˆ˜์ง‘ํ•ฉ๋‹ˆ๋‹ค.

Returns:
list[str]: ์ด๋ฏธ์ง€ URL ๋ชฉ๋ก
list[str]: ๊ฒŒ์‹œ๊ธ€ ์ด๋ฏธ์ง€ URL ๋ชฉ๋ก (๋‹ค๋ฅธ ๊ฒŒ์‹œ๊ธ€ ์ธ๋„ค์ผ ์ œ์™ธ)
"""
image_urls = await self.browser_controller.page.evaluate('''() => {
const imgs = document.querySelectorAll('img[src*="cdninstagram.com"]');
const urls = [];
imgs.forEach(img => {
const src = img.src;
// ํ”„๋กœํ•„ ์ด๋ฏธ์ง€ ์ œ์™ธ (๋ณดํ†ต ์ž‘์€ ํฌ๊ธฐ)
if (src && !src.includes('150x150') && !src.includes('44x44')) {
urls.push(src);
}
});
// ์ค‘๋ณต ์ œ๊ฑฐ
return [...new Set(urls)];
page = self.browser_controller.page

# 1. ์บ๋Ÿฌ์…€ ์กด์žฌ ์—ฌ๋ถ€ ํ™•์ธ
has_carousel = await page.evaluate('''() => {
return !!document.querySelector('ul._acay');
}''')
logger.info(f"์ด๋ฏธ์ง€ URL ์ถ”์ถœ: {len(image_urls)}๊ฐœ")

if has_carousel:
# 2. ์บ๋Ÿฌ์…€: ์Šฌ๋ผ์ด๋“œ๋ฅผ ๋„˜๊ธฐ๋ฉฐ ๋ชจ๋“  ์ด๋ฏธ์ง€ ์ˆ˜์ง‘
image_urls = await self._extract_carousel_images()
else:
# 3. ๋‹จ์ผ ์ด๋ฏธ์ง€: article ๋‚ด ๋ฉ”์ธ ์ด๋ฏธ์ง€ ์ถ”์ถœ
image_urls = await page.evaluate('''() => {
const article = document.querySelector('article');
if (!article) return [];

const mainImg = article.querySelector('div._aagv img[src*="cdninstagram.com"]');
return mainImg && mainImg.src ? [mainImg.src] : [];
}''')

logger.info(f"๊ฒŒ์‹œ๊ธ€ ์ด๋ฏธ์ง€ URL ์ถ”์ถœ: {len(image_urls)}๊ฐœ")
return image_urls

async def _extract_carousel_images(self) -> list[str]:
    """Collect every image URL from an Instagram carousel, in slide order.

    Advances the carousel by clicking its Next button via JavaScript (a
    direct DOM click, so overlays or visibility quirks cannot block it),
    harvesting whichever slide images are loaded after each advance.
    First-seen order is preserved and duplicate URLs are dropped.

    Returns:
        list[str]: All carousel image URLs, in post order.
    """
    # Local import: asyncio is only needed for the inter-slide wait and may
    # not be imported at module level in this file.
    import asyncio

    page = self.browser_controller.page
    collected_urls: list[str] = []  # list keeps slide order
    seen_urls: set[str] = set()     # O(1) duplicate check

    async def collect_current_images() -> None:
        # Pull cdninstagram-hosted images from the currently rendered slides.
        # NOTE(review): 'ul._acay' / 'li._acaz' are Instagram's obfuscated
        # class names and may change without notice — verify periodically.
        urls = await page.evaluate('''() => {
            const carousel = document.querySelector('ul._acay');
            if (!carousel) return [];

            const imgs = carousel.querySelectorAll('li._acaz img[src*="cdninstagram.com"]');
            return Array.from(imgs).map(img => img.src).filter(Boolean);
        }''')
        for url in urls:
            if url not in seen_urls:
                seen_urls.add(url)
                collected_urls.append(url)

    # Images visible before any navigation.
    await collect_current_images()

    # Total slide count, read from the carousel indicator dots; falls back
    # to 1 when no dots are found (treated as a single slide).
    total_slides = await page.evaluate('''() => {
        // ์บ๋Ÿฌ์…€ ์ธ๋””์ผ€์ดํ„ฐ ๋„ํŠธ ๊ฐœ์ˆ˜๋กœ ์ „์ฒด ์Šฌ๋ผ์ด๋“œ ์ˆ˜ ํ™•์ธ
        const dots = document.querySelectorAll('div._acnb');
        return dots.length || 1;
    }''')

    logger.info(f"์บ๋Ÿฌ์…€ ์Šฌ๋ผ์ด๋“œ ๊ฐœ์ˆ˜: {total_slides}")

    # Click Next (total_slides - 1) times, collecting after each advance.
    # NOTE(review): aria-label "Next" looks locale-dependent — confirm it
    # still matches on non-English Instagram locales.
    for _ in range(total_slides - 1):
        clicked = await page.evaluate('''() => {
            const btn = document.querySelector('button[aria-label="Next"]');
            if (btn) {
                btn.click();
                return true;
            }
            return false;
        }''')

        if not clicked:
            # No Next button found — end of the carousel (or the DOM changed).
            break

        await asyncio.sleep(0.4)  # allow the next slide's image to load
        await collect_current_images()

    # collected_urls is already a fresh list owned by this call; the
    # original's list(...) copy was redundant.
    return collected_urls

async def extract_author_profile_image(self) -> str | None:
    """Extract the post author's profile-image URL from the loaded page.

    Looks for an <img> whose alt text contains "profile picture" and, when
    found, returns its src; logs success only when a URL was extracted.

    Returns:
        str | None: Profile-image URL, or None when no matching element exists.
    """
    page = self.browser_controller.page
    profile_url = await page.evaluate('''() => {
        // ํ”„๋กœํ•„ ์ด๋ฏธ์ง€ ์…€๋ ‰ํ„ฐ: alt ์†์„ฑ์— "profile picture" ํฌํ•จ
        const profileImg = document.querySelector('img[alt*="profile picture"]');
        return profileImg ? profileImg.src : null;
    }''')
    if not profile_url:
        return profile_url
    logger.info("์ž‘์„ฑ์ž ํ”„๋กœํ•„ ์ด๋ฏธ์ง€ URL ์ถ”์ถœ ์™„๋ฃŒ")
    return profile_url

async def scrape_instagram_post(self, url: str, classification: UrlClassification) -> dict:
"""
Instagram ๊ฒŒ์‹œ๊ธ€/๋ฆด์Šค ์Šคํฌ๋ž˜ํ•‘
Expand All @@ -109,16 +192,16 @@ async def scrape_instagram_post(self, url: str, classification: UrlClassificatio
Raises:
HTTPException: ์Šคํฌ๋ž˜ํ•‘ ์‹คํŒจ ์‹œ
"""
logger.info(f"[1/5] Instagram ์Šคํฌ๋ž˜ํ•‘ ์‹œ์ž‘: {url} (type={classification.content_type})")
logger.info(f"[1/6] Instagram ์Šคํฌ๋ž˜ํ•‘ ์‹œ์ž‘: {url} (type={classification.content_type})")

async with async_playwright() as playwright:
try:
# [2/5] ๋ธŒ๋ผ์šฐ์ € ์ƒ์„ฑ
logger.info("[2/5] ๋ธŒ๋ผ์šฐ์ € ์ดˆ๊ธฐํ™”...")
# [2/6] ๋ธŒ๋ผ์šฐ์ € ์ƒ์„ฑ
logger.info("[2/6] ๋ธŒ๋ผ์šฐ์ € ์ดˆ๊ธฐํ™”...")
await self.browser_controller.create_browser_and_context(playwright)

# [3/5] ํŽ˜์ด์ง€ ๋กœ๋“œ
logger.info("[3/5] ํŽ˜์ด์ง€ ๋กœ๋“œ...")
# [3/6] ํŽ˜์ด์ง€ ๋กœ๋“œ
logger.info("[3/6] ํŽ˜์ด์ง€ ๋กœ๋“œ...")
response = await self.browser_controller.load_page(url)

if response and response.status >= 400:
Expand All @@ -128,8 +211,8 @@ async def scrape_instagram_post(self, url: str, classification: UrlClassificatio
detail=f"Instagram ์‘๋‹ต ์˜ค๋ฅ˜: {response.status}"
)

# [4/5] ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ถ”์ถœ
logger.info("[4/5] ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ถ”์ถœ...")
# [4/6] ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ถ”์ถœ
logger.info("[4/6] ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ถ”์ถœ...")
open_graph_metadata = await self.browser_controller.extract_open_graph_tags()

# og:description ํŒŒ์‹ฑ
Expand All @@ -141,10 +224,14 @@ async def scrape_instagram_post(self, url: str, classification: UrlClassificatio
f"likes={parsed_metadata['likes_count']}, comments={parsed_metadata['comments_count']}"
)

# [5/5] ์ด๋ฏธ์ง€ URL ์ถ”์ถœ
logger.info("[5/5] ์ด๋ฏธ์ง€ URL ์ถ”์ถœ...")
# [5/6] ์ด๋ฏธ์ง€ URL ์ถ”์ถœ
logger.info("[5/6] ๊ฒŒ์‹œ๊ธ€ ์ด๋ฏธ์ง€ URL ์ถ”์ถœ...")
image_urls = await self.extract_instagram_image_urls()

# [6/6] ์ž‘์„ฑ์ž ํ”„๋กœํ•„ ์ด๋ฏธ์ง€ URL ์ถ”์ถœ
logger.info("[6/6] ์ž‘์„ฑ์ž ํ”„๋กœํ•„ ์ด๋ฏธ์ง€ URL ์ถ”์ถœ...")
author_profile_image_url = await self.extract_author_profile_image()

return {
"platform": classification.platform,
"content_type": classification.content_type,
Expand All @@ -156,7 +243,8 @@ async def scrape_instagram_post(self, url: str, classification: UrlClassificatio
"posted_at": parsed_metadata["posted_at"],
"hashtags": parsed_metadata["hashtags"],
"og_image": open_graph_metadata.get('image'),
"image_urls": image_urls
"image_urls": image_urls,
"author_profile_image_url": author_profile_image_url
}

except HTTPException:
Expand Down
6 changes: 3 additions & 3 deletions version.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@
# - ๋ฒ„์ „์€ ํ•ญ์ƒ ๋†’์€ ๋ฒ„์ „์œผ๋กœ ์ž๋™ ๋™๊ธฐํ™”๋ฉ๋‹ˆ๋‹ค
# ===================================================================

version: "0.1.1"
version_code: 14 # app build number
version: "0.1.2"
version_code: 15 # app build number
project_type: "python" # spring, flutter, react, react-native, react-native-expo, node, python, basic
metadata:
last_updated: "2026-01-11 15:32:48"
last_updated: "2026-01-13 15:05:48"
last_updated_by: "Cassiiopeia"
default_branch: "main"
integrated_from: "SUH-DEVOPS-TEMPLATE"
Expand Down