Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from bundle
Telegram MTProto MCP server with userbot watcher, chat/DM parser and context builders
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
parse_chat.py
"""
parse_chat.py — generic chat parser built on top of the telegram-mcp MCP server.

Usage:
    python3 parse_chat.py <chat_id> [--limit 1000] [--out output.jsonl]

Examples:
    python3 parse_chat.py 123456789                      # direct messages
    python3 parse_chat.py -100123456789 --limit 500      # group
    python3 parse_chat.py 123456789 --out dm.jsonl       # write to a file
"""

import argparse
import json
import subprocess
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Optional

CONTEXT_DIR = Path(__file__).parent / "context"
CHATS_DIR = CONTEXT_DIR / "chats"
PEOPLE_DIR = CONTEXT_DIR / "people"

PAGE_SIZE = 100  # messages per page
PAUSE = 1.5  # pause between requests (seconds)


def _run_mcporter(tool: str, params: dict, timeout: float) -> str:
    """Run ``mcporter call telegram.<tool>`` and return its stdout.

    Args:
        tool: Tool name without the ``telegram.`` prefix.
        params: Tool arguments, passed on the command line as ``key=value``
            in the dict's insertion order.
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Raises:
        RuntimeError: If mcporter exits with a non-zero return code.
    """
    args = ["mcporter", "call", f"telegram.{tool}"]
    args.extend(f"{key}={value}" for key, value in params.items())

    result = subprocess.run(args, capture_output=True, text=True, timeout=timeout)
    if result.returncode != 0:
        raise RuntimeError(f"mcporter error: {result.stderr.strip()}")
    return result.stdout


def mcporter_call(tool: str, **kwargs) -> list[dict]:
    """Call an MCP tool through mcporter and return the parsed messages.

    The tool output is expected to be text with one message per line; see
    ``parse_messages_output`` for the line format.

    Raises:
        RuntimeError: If mcporter exits with a non-zero return code.
    """
    return parse_messages_output(_run_mcporter(tool, kwargs, timeout=60))


def parse_messages_output(raw: str) -> list[dict]:
    """Parse the text output of a message-listing tool.

    Each message line looks like
    ``ID: 123 | Name | Date: ... [| reply to 122] | Message: text``.
    Lines that do not start with ``ID:`` or whose ID is not an integer are
    skipped.

    Returns:
        A list of dicts with keys ``id``, ``sender``, ``date``, ``text`` and
        ``reply_to`` (``None`` when the line has no reply marker).
    """
    messages = []
    for line in raw.splitlines():
        if not line.startswith("ID:"):
            continue

        # Split the metadata header from the body first: the body is free
        # text and may itself contain "|" or the words "reply to".
        header, _, body = line.partition("Message:")
        fields = header.split("|")

        try:
            msg_id = int(fields[0].replace("ID:", "").strip())
        except ValueError:
            continue

        sender = fields[1].strip() if len(fields) > 1 else ""
        date = fields[2].replace("Date:", "").strip() if len(fields) > 2 else ""

        reply_to = None
        for field in fields[2:]:
            if "reply to" not in field:
                continue
            try:
                reply_to = int(field.split("reply to", 1)[1].strip())
            except ValueError:
                pass  # malformed reply marker: keep the message, drop the link

        messages.append({
            "id": msg_id,
            "sender": sender,
            "date": date,
            "text": body.strip(),
            "reply_to": reply_to,
        })
    return messages


def _write_jsonl(path: Path, records: list[dict]) -> None:
    """Overwrite *path* with one JSON object per line, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)


def _append_to_global_log(log_path: Path, chat_id: int, messages: list[dict]) -> int:
    """Append messages not yet present in the global log; return how many were added.

    Records are identified by ``(chat_id, id)`` rather than ``id`` alone, so
    that equal message ids from different chats do not suppress each other.
    NOTE(review): this assumes message ids are not unique across chats —
    confirm against the Telegram API. Existing records without a ``chat_id``
    field no longer block new records with the same ``id``.
    """
    known = set()
    if log_path.exists():
        with open(log_path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    record = json.loads(line)
                    known.add((record.get("chat_id"), record["id"]))
                except (ValueError, KeyError, AttributeError, TypeError):
                    continue  # skip blank or foreign lines in the shared log

    # The log lives in CONTEXT_DIR, which may not exist yet when --out
    # points somewhere else.
    log_path.parent.mkdir(parents=True, exist_ok=True)

    added = 0
    with open(log_path, "a", encoding="utf-8") as f:
        for msg in messages:
            key = (chat_id, msg["id"])
            if key in known:
                continue
            known.add(key)
            f.write(json.dumps({**msg, "chat_id": chat_id}, ensure_ascii=False) + "\n")
            added += 1
    return added


def parse_chat(chat_id: int, limit: int = 5000, out_file: Optional[str] = None) -> list[dict]:
    """Download a chat via ``iter_all_messages``, paginating by ``offset_id``.

    Messages are fetched in pages of up to 500 until *limit* is reached, a
    page comes back empty or short, or a request fails. Whatever was
    collected is then written to *out_file* (default:
    ``CONTEXT_DIR/raw_<chat_id>.jsonl``) and merged into the global
    ``CONTEXT_DIR/messages.jsonl`` log.

    Args:
        chat_id: Chat or user ID.
        limit: Maximum number of messages to collect.
        out_file: Optional path of the output ``.jsonl`` file.

    Returns:
        The collected messages, in the order they were received.
    """
    all_messages: list[dict] = []
    seen_ids: set[int] = set()
    offset_id = 0
    total = 0
    batch = min(PAGE_SIZE * 5, 500)  # 500 per request

    print(f"[parse_chat] Парсю чат {chat_id}, лимит: {limit}")

    while total < limit:
        fetch = min(batch, limit - total)
        print(f"[parse_chat] Запрос {fetch} сообщений (offset_id={offset_id})...")

        params = {"chat_id": chat_id, "limit": fetch}
        if offset_id:
            params["offset_id"] = offset_id

        try:
            # A non-zero exit code raises RuntimeError here, so a failed call
            # is reported as an error instead of looking like "end of chat".
            raw = _run_mcporter("iter_all_messages", params, timeout=120)
        except (OSError, RuntimeError, subprocess.SubprocessError) as e:
            print(f"[parse_chat] Ошибка: {e}")
            break

        msgs = parse_messages_output(raw)
        if not msgs:
            print("[parse_chat] Пусто — конец чата.")
            break

        # Guard against a page that repeats what we already have (e.g. if
        # offset_id is not honoured): never store a message twice.
        fresh = [m for m in msgs if m["id"] not in seen_ids]
        if not fresh:
            print("[parse_chat] Страница не содержит новых сообщений — останавливаюсь.")
            break

        seen_ids.update(m["id"] for m in fresh)
        all_messages.extend(fresh)
        total += len(fresh)
        print(f"[parse_chat] Получено: {len(fresh)} | Всего: {total}")

        if len(msgs) < fetch:
            print("[parse_chat] Последняя партия.")
            break

        # The oldest ID of this page is the offset for the next request.
        offset_id = min(m["id"] for m in msgs)
        time.sleep(PAUSE)

    print(f"[parse_chat] Итого: {total} сообщений")

    # Save the raw dump.
    out = Path(out_file) if out_file else CONTEXT_DIR / f"raw_{chat_id}.jsonl"
    _write_jsonl(out, all_messages)
    print(f"[parse_chat] Сохранено в: {out}")

    # Merge into the global messages.jsonl log without duplicating records.
    new_count = _append_to_global_log(CONTEXT_DIR / "messages.jsonl", chat_id, all_messages)
    print(f"[parse_chat] Добавлено в messages.jsonl: {new_count} новых записей")

    return all_messages


def print_summary(messages: list[dict]):
    """Print per-sender message counts and the dates of the list's two ends.

    The last list element is reported as the first message and the first
    element as the last one, i.e. the list is treated as newest-first.
    """
    senders = Counter(m.get("sender", "?") for m in messages)

    print("\n=== Участники ===")
    for name, count in senders.most_common():
        print(f"  {name}: {count} сообщений")

    if messages:
        print(f"\nПервое: {messages[-1].get('date', '?')}")
        print(f"Последнее: {messages[0].get('date', '?')}")


def main():
    """Command-line entry point: parse a chat and optionally print statistics."""
    parser = argparse.ArgumentParser(description="Парсер чатов через telegram-mcp")
    parser.add_argument("chat_id", type=int, help="ID чата или пользователя")
    parser.add_argument("--limit", type=int, default=1000, help="Макс. кол-во сообщений (default: 1000)")
    parser.add_argument("--out", type=str, help="Путь к выходному файлу (.jsonl)")
    parser.add_argument("--stats", action="store_true", help="Показать статистику")
    args = parser.parse_args()

    messages = parse_chat(args.chat_id, args.limit, args.out)

    # Statistics are shown on request, and by default when no --out is given.
    if args.stats or not args.out:
        print_summary(messages)


if __name__ == "__main__":
    main()