| | """ |
| | Updated: supports large tables using LongTable + docx export. |
| | Processor module. |
| | Expose: generate_reports_from_csv(input_csv: str, out_dir: str) -> dict |
| | Produces: out_dir/analysis_output.csv, out_dir/report.pdf, out_dir/report.docx (optional) |
| | """ |
| |
|
| | import os,re,sys,csv,logging |
| | from datetime import datetime |
| | from pathlib import Path |
| | import pandas as pd |
| | import numpy as np |
| | import matplotlib.pyplot as plt |
| | from wordcloud import WordCloud, STOPWORDS |
| | from transformers import pipeline |
| | from sklearn.feature_extraction.text import CountVectorizer |
| | from sklearn.decomposition import LatentDirichletAllocation |
| | |
| | |
| | from reportlab.platypus import (SimpleDocTemplate, Paragraph, Spacer, PageBreak, |
| | TableStyle, Image, LongTable) |
| | from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle |
| | from reportlab.lib import colors |
| | from reportlab.lib.pagesizes import A4 |
| | from reportlab.lib.units import inch |
| | from reportlab.lib.enums import TA_LEFT |
| |
|
| | |
# Optional dependency: python-docx enables the DOCX export path.
# The flag is set only after both imports succeed.
try:
    from docx import Document
    from docx.shared import Inches
    DOCX_AVAILABLE = True
except Exception:
    DOCX_AVAILABLE = False
| |
|
# Hard dependency: the companion sentiment_analysis.py module supplies
# init_anchors() and classify() (used by generate_reports_from_csv below).
# Fail fast at import time with a clear message if it cannot be loaded.
try:
    import sentiment_analysis
except Exception as e:
    raise RuntimeError(f"Failed to import sentiment_analysis.py: {e}")
| |
|
# Module logger for pipeline progress and warnings.
logger = logging.getLogger("processor")
logger.setLevel(logging.INFO)


# --- Tunables -------------------------------------------------------------
CSV_ENCODING = "utf-8"
MAX_ROWS = None  # cap on rows processed; None means process all rows
TOPIC_COUNT = 3  # upper bound on LDA topic count


# Character budget for the post "teaser" text shown in report tables.
TEASER_CHAR_LIMIT = 900


# Matches relative-time phrases like "3 hours ago", "yesterday", "just now".
# NOTE(review): parse_relative_time() below compiles its own inline pattern,
# so this module-level regex appears unused here — confirm before removing.
RELATIVE_TIME_RE = re.compile(
    r'(?:(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\s*ago)|\b(yesterday|today|just now|now)\b',
    flags=re.IGNORECASE
)


# Pick GPU (device 0) when torch + CUDA are available, else CPU (-1).
try:
    import torch
    device = 0 if torch.cuda.is_available() else -1
except Exception:
    device = -1
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
def parse_relative_time(s: str, ref: pd.Timestamp):
    """
    Convert a relative-time phrase ("3 hours ago", "yesterday", "just now")
    into a concrete Timestamp anchored at *ref*. Returns pd.NaT when the
    phrase cannot be interpreted.
    """
    if not isinstance(s, str) or not s.strip():
        return pd.NaT
    phrase = s.strip().lower()
    if phrase in ("just now", "now"):
        return ref
    if phrase == "today":
        return pd.Timestamp(ref.date())
    if phrase == "yesterday":
        return ref - pd.Timedelta(days=1)
    # Normalise articles so "an hour ago" becomes "1 hour ago".
    phrase = re.sub(r'\b(an|a)\b', '1', phrase)
    match = re.search(r'(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\s*ago', phrase)
    if not match:
        return pd.NaT
    qty = int(match.group(1))
    unit = match.group(2).lower()
    # Months/years are approximated as 30/365 days respectively.
    unit_deltas = (
        (("second", "sec", "s"), pd.Timedelta(seconds=qty)),
        (("minute", "min", "m"), pd.Timedelta(minutes=qty)),
        (("hour", "hr", "h"), pd.Timedelta(hours=qty)),
        (("day", "d"), pd.Timedelta(days=qty)),
        (("week", "w"), pd.Timedelta(weeks=qty)),
        (("month", "mo"), pd.Timedelta(days=qty * 30)),
        (("year", "yr", "y"), pd.Timedelta(days=qty * 365)),
    )
    for aliases, delta in unit_deltas:
        if unit in aliases:
            return ref - delta
    return pd.NaT
| |
|
def clean_text(text: str) -> str:
    """Strip URLs, @mentions, #hashtags and non-letters; collapse whitespace and lowercase."""
    if not isinstance(text, str):
        return ""
    # Applied in order: drop noise tokens first, then normalise characters.
    substitutions = (
        (r"http\S+", ""),
        (r"@\w+", ""),
        (r"#\w+", ""),
        (r"[^A-Za-z\s]", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.lower().strip()
| |
|
def chunked(iterable, size):
    """Yield consecutive slices of *iterable*, each containing at most *size* items."""
    start = 0
    total = len(iterable)
    while start < total:
        yield iterable[start:start + size]
        start += size
| |
|
| |
|
def teaser(s, n=TEASER_CHAR_LIMIT):
    """Return *s* trimmed to roughly *n* characters, cut at a word boundary with an ellipsis."""
    if not isinstance(s, str):
        return ""
    s = s.strip()
    if len(s) <= n:
        return s
    truncated = s[:n - 1].rsplit(" ", 1)[0]
    return truncated + " ..."
| |
|
def parse_score(x):
    """
    Parse a raw score cell into an int.

    Handles ints, "1,234"-style thousands separators and a leading sign.
    Returns np.nan for NaN input or for text containing no digits.
    """
    if pd.isna(x):
        return np.nan
    s = str(x).replace(",", "")
    m = re.search(r"(-?\d+)", s)
    # BUGFIX(cleanup): the original had a second re.findall(r"\d+", ...)
    # fallback, but it was unreachable — if `-?\d+` matches nothing, the
    # string contains no digits at all. Behavior is unchanged.
    return int(m.group(1)) if m else np.nan
| |
|
def parse_time_value(v, ref_ts):
    """
    Parse an absolute or relative time value into a Timestamp.

    Tries native datetime types, then absolute string formats, then
    "... ago"-style phrases anchored at *ref_ts*. Returns pd.NaT otherwise.
    """
    if isinstance(v, (pd.Timestamp, datetime)):
        return pd.to_datetime(v)
    if pd.isna(v):
        return pd.NaT
    text = str(v).strip()
    # Absolute formats first (ISO dates etc.).
    try:
        absolute = pd.to_datetime(text, errors='coerce', utc=None)
    except Exception:
        absolute = pd.NaT
    if pd.notna(absolute):
        return absolute
    # Fall back to relative phrases ("3 hours ago").
    relative = parse_relative_time(text, ref_ts)
    return pd.to_datetime(relative) if pd.notna(relative) else pd.NaT
| |
|
| | def compile_list(lst): return [re.compile(pat, flags=re.IGNORECASE) for pat in lst] |
| |
|
| |
|
| | |
# --- India-specific keyword heuristics ------------------------------------
# Raw regex pattern lists, one per "nature" category. Each list is compiled
# case-insensitively below and consumed by determine_nature().
PRO_INDIA = [r"\bjai hind\b", r"\bvande mataram\b", r"\bpro india\b", r"\bpro-india\b", r"\bsupport (?:india|modi|bjp)\b", r"\bproud of india\b", r"\bindia is great\b"]
ANTI_INDIA = [r"\banti[- ]?india\b", r"\banti national\b", r"\btraitor\b", r"\banti-india\b", r"\bkill india\b", r"\bboycott india\b"]
CRITICAL_GOVT = [r"\bmodi sucks\b", r"\bcorrupt government\b", r"\bgovernment (?:is )?failing\b", r"\b(criticis|criticize|criticising) (?:government|modi|bjp)\b", r"\bpolicy (?:failure|fail)\b", r"\banti-corruption\b", r"\bmisgovern(ance|ing)\b", r"\bgovernment (?:policy|policies)"]
SUPPORT_OPPOSITION = [r"\bsupport (?:congress|aam aadmi|aap|opposition)\b", r"\bvot(e|ing) for .*opposition\b"]
SEPARATIST = [r"\bazadi\b", r"\bseparatist\b", r"\bsecede\b", r"\bindependence for\b"]
COMMUNAL = [r"\bcommunal\b", r"\breligious (?:tension|hatred)\b", r"\breligious\b", r"\bminority\b"]
CALL_TO_ACTION = [r"\bprotest\b", r"\bboycott\b", r"\bjoin (?:the )?protest\b", r"\bstrike\b", r"\brally\b", r"\baction\b"]
CONSPIRACY = [r"\bforeign funded\b", r"\bdeep state\b", r"\bconspiracy\b", r"\bwestern plot\b", r"\bcia\b", r"\bsecret agenda\b"]


# Precompiled, case-insensitive versions of the lists above.
PRO_INDIA_RE = compile_list(PRO_INDIA); ANTI_INDIA_RE = compile_list(ANTI_INDIA)
CRITICAL_GOVT_RE = compile_list(CRITICAL_GOVT); SUPPORT_OPPOSITION_RE = compile_list(SUPPORT_OPPOSITION)
SEPARATIST_RE = compile_list(SEPARATIST); COMMUNAL_RE = compile_list(COMMUNAL)
CALL_TO_ACTION_RE = compile_list(CALL_TO_ACTION); CONSPIRACY_RE = compile_list(CONSPIRACY)
| |
|
| |
|
def text_matches_any(text, patterns):
    """Return True when any compiled pattern in *patterns* matches *text* (falsy text is treated as "")."""
    subject = text or ""
    return any(pat.search(subject) for pat in patterns)
| |
|
def determine_nature(text, sentiment_label):
    """
    Map a post's text and classifier label to an India-specific nature tag.

    Priority: behavioural keywords in the text, then the classifier's
    domain labels verbatim, then weaker keyword heuristics, then generic
    POSITIVE/NEGATIVE labels; "neutral" otherwise.
    """
    t = (text or "").lower()

    # 1) Strong behavioural signals in the text itself.
    behavioural_checks = (
        (SEPARATIST_RE, "separatist"),
        (CALL_TO_ACTION_RE, "call-to-action"),
        (COMMUNAL_RE, "communal"),
        (CONSPIRACY_RE, "conspiratorial"),
    )
    for patterns, tag in behavioural_checks:
        if text_matches_any(t, patterns):
            return tag

    # 2) Trust the classifier's domain-specific labels verbatim.
    s = str(sentiment_label)
    label_to_nature = {
        "Pro-India": "pro-india",
        "Anti-India": "anti-india",
        "Pro-Government": "pro-government",
        "Anti-Government": "anti-government",
    }
    if s in label_to_nature:
        return label_to_nature[s]

    # 3) Weaker keyword heuristics over the text.
    fallback_checks = (
        (ANTI_INDIA_RE, "anti-india"),
        (PRO_INDIA_RE, "pro-india"),
        (CRITICAL_GOVT_RE, "critical-of-government"),
        (SUPPORT_OPPOSITION_RE, "supportive-of-opposition"),
    )
    for patterns, tag in fallback_checks:
        if text_matches_any(t, patterns):
            return tag

    # 4) Generic sentiment labels (e.g. "POSITIVE"/"NEGATIVE").
    s_upper = s.upper()
    if "POS" in s_upper:
        return "supportive"
    if "NEG" in s_upper:
        return "critical"
    return "neutral"
| |
|
| | |
# Keywords signalling potentially dangerous content, with a precompiled
# whole-word, case-insensitive regex.
# NOTE(review): `pattern` is not referenced by is_dangerous() below — the
# current flagging logic uses sentiment alone; confirm whether keyword
# matching was meant to be part of the check before removing either.
danger_keywords = ["kill","attack","bomb","violence","terror","terrorist","militant",
                   "insurgency","boycott","protest","call to action"]
pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, danger_keywords)) + r')\b',
                     flags=re.IGNORECASE)
| |
|
def is_dangerous(text, sentiment):
    """Flag a post as potentially dangerous: classified Anti-India with non-empty text."""
    # Sentiment is checked first so empty/odd text is never touched for
    # posts that are not Anti-India (preserves short-circuit behaviour).
    if str(sentiment).upper() != "ANTI-INDIA":
        return False
    return text.strip() != ""
| |
|
def generate_reports_from_csv(input_csv: str, out_dir: str) -> dict:
    """
    Run the full analysis pipeline over a scraped Reddit-posts CSV.

    Steps: load and normalise columns, classify sentiment via the
    sentiment_analysis module, derive an India-specific "nature" tag,
    LDA topic modeling, flag dangerous posts, render charts, then emit
    report.pdf, analysis_output.csv and (when python-docx is installed)
    report.docx into *out_dir*.

    Returns:
        dict with keys 'pdf', 'csv', 'docx' mapping to produced file paths
        ('' for docx when it was not created).
    """
    logger.info("Running processing pipeline on %s", input_csv)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): sys.exit() in a library function terminates the host
    # process; kept for backward compatibility with existing callers, but
    # raising an exception would be friendlier.
    if not os.path.exists(input_csv):
        print("CSV file not found:", input_csv); sys.exit(1)

    print("Loading CSV:", input_csv)
    try:
        df_raw = pd.read_csv(input_csv, encoding=CSV_ENCODING, low_memory=False)
    except Exception as e:
        print("Error reading CSV:", e); sys.exit(1)

    if MAX_ROWS:
        df_raw = df_raw.head(MAX_ROWS)

    # Expected column names in the scraper's CSV layout.
    title_col = "Title"
    reference_col = "Reference"
    subreddit_col = "Subreddit"
    score_col = "Score"
    comment_col = "Comments"
    time_col = "Time"
    author_col = "Author"
    desc_col = "Description"
    url_col = "Url"

    if not any(c in df_raw.columns for c in [title_col, comment_col, desc_col]):
        print("No text column detected. CSV columns:", list(df_raw.columns)); sys.exit(1)

    def _col(name, default=""):
        # BUGFIX: the original guarded each assignment with `if name else
        # default`, which was always true (the names are non-empty string
        # literals) and crashed with KeyError on CSVs missing optional
        # columns. Guard on actual column presence instead.
        if name in df_raw.columns:
            return df_raw[name]
        return pd.Series([default] * len(df_raw), index=df_raw.index)

    df = pd.DataFrame()
    df["orig_index"] = df_raw.index.astype(str)
    df["title"] = _col(title_col).fillna("").astype(str)
    df["reference"] = _col(reference_col).astype(str)
    df["subreddit"] = _col(subreddit_col, "N/A")
    df["raw_score"] = _col(score_col, np.nan)
    df["comment"] = _col(comment_col).fillna("").astype(str)
    df["time_raw"] = _col(time_col, "")
    df["username"] = _col(author_col, "N/A")
    df["description"] = _col(desc_col).fillna("").astype(str)
    df["url"] = _col(url_col, "")

    df["text_for_analysis"] = (df["title"] + " " + df["comment"] + " " + df["description"]).str.strip()
    # Fallback for rows whose combined text is empty: join every string cell.
    empty_mask = df["text_for_analysis"] == ""
    df.loc[empty_mask, "text_for_analysis"] = df.loc[empty_mask, :].apply(
        lambda r: " ".join(str(v) for v in r.values if isinstance(v, str) and v.strip() != ""),
        axis=1,
    )
    df["clean_text"] = df["text_for_analysis"].apply(clean_text)
    df["score"] = df["raw_score"].apply(parse_score)

    # Anchor relative timestamps ("3 hours ago") at the CSV file's mtime.
    try:
        ref_ts = pd.to_datetime(os.path.getmtime(input_csv), unit='s')
    except Exception:
        ref_ts = pd.Timestamp.now()
    df["created_at"] = df["time_raw"].apply(lambda x: parse_time_value(x, ref_ts))

    # --- Sentiment -------------------------------------------------------
    print("Loading sentiment model...")
    sentiment_analysis.init_anchors()

    preds = []
    for text in df["clean_text"]:
        out = sentiment_analysis.classify(text)
        if "error" in out:
            # Degrade gracefully per row instead of aborting the batch.
            preds.append(("NEUTRAL", 0.0))
        else:
            preds.append((out.get("label", "NEUTRAL"), float(out.get("confidence", 0.0))))
    df["sentiment"] = [label for label, _ in preds]
    df["sentiment_score"] = [score for _, score in preds]

    df["nature"] = [
        determine_nature(text, sentiment)
        for text, sentiment in zip(df["clean_text"], df["sentiment"])
    ]

    # --- Topic modeling --------------------------------------------------
    print("Performing topic modeling...")
    vectorizer = CountVectorizer(stop_words="english", min_df=2)
    try:
        X = vectorizer.fit_transform(df["clean_text"])
    except Exception as e:
        print("Topic vectorization failed:", e); X = None

    if X is None or X.shape[0] < 3 or len(vectorizer.get_feature_names_out()) < 5:
        # Too little text/vocabulary to model topics meaningfully.
        df["topic"] = np.nan
        topic_counts = pd.Series(dtype=int)
    else:
        n_topics = min(TOPIC_COUNT, X.shape[0])
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda.fit(X)
        df["topic"] = lda.transform(X).argmax(axis=1)
        topic_counts = df["topic"].value_counts().sort_index()

    df["dangerous"] = df.apply(lambda r: is_dangerous(r["clean_text"], r["sentiment"]), axis=1)
    dangerous_tweets = df[df["dangerous"]].copy()
    print(f"Flagged {len(dangerous_tweets)} potentially dangerous posts.")

    # BUGFIX: sent_counts was previously computed inside the visuals
    # try-block below; any plotting failure then caused a NameError when
    # building the PDF. Compute the summaries up front instead.
    sent_counts = df["sentiment"].value_counts()
    nature_counts = df["nature"].value_counts()
    total = len(df)

    # --- Visuals ---------------------------------------------------------
    # BUGFIX: images were saved under out_dir but later existence-checked
    # (and embedded) from the current working directory, so they were never
    # included in the PDF unless the CWD happened to be out_dir. Use one
    # path per image throughout.
    sentiment_png = out_dir / "sentiment.png"
    topics_png = out_dir / "topics.png"
    danger_png = out_dir / "danger_wc.png"
    try:
        plt.figure(figsize=(6, 4))
        sent_counts.plot(kind="bar")
        plt.title("Sentiment Distribution")
        plt.tight_layout()
        plt.savefig(sentiment_png, dpi=150)
        plt.close()

        if "topic" in df and df["topic"].notna().any():
            plt.figure(figsize=(6, 4))
            topic_counts.plot(kind="bar")
            plt.title("Topic Distribution")
            plt.tight_layout()
            plt.savefig(topics_png, dpi=150)
            plt.close()

        if not dangerous_tweets.empty:
            wc_text = " ".join(dangerous_tweets["clean_text"].tolist())
            wc = WordCloud(width=1000, height=400, background_color="white", stopwords=set(STOPWORDS)).generate(wc_text)
            plt.figure(figsize=(12, 5))
            plt.imshow(wc, interpolation="bilinear")
            plt.axis("off")
            plt.tight_layout()
            plt.savefig(danger_png, dpi=150)
            plt.close()
    except Exception as e:
        logger.warning("Visuals generation failed: %s", e)

    # --- PDF report ------------------------------------------------------
    print("Building PDF report (LongTable for large tables)...")
    pdf_out = out_dir / "report.pdf"
    styles = getSampleStyleSheet()
    styleN = styles["Normal"]
    styleH = styles["Heading2"]
    title_style = styles["Title"]
    tweet_paragraph_style = ParagraphStyle("TweetStyle", parent=styles["BodyText"], fontSize=9, leading=11, spaceAfter=6, alignment=TA_LEFT)

    # BUGFIX: the original configured an A4/margined SimpleDocTemplate here
    # but then constructed a second, default-margin template right before
    # doc.build(), silently discarding the layout settings (and passed a
    # Path where reportlab expects a filename string). Build with the one
    # configured template.
    doc = SimpleDocTemplate(str(pdf_out), pagesize=A4, rightMargin=36, leftMargin=36, topMargin=36, bottomMargin=36)
    elements = []
    elements.append(Paragraph("Reddit Posts Report (CSV Source) — India-specific Nature", title_style))
    elements.append(Spacer(1, 8))
    elements.append(Paragraph(f"Total Posts Processed: {len(df)}", styleN))
    elements.append(Spacer(1, 8))

    elements.append(Paragraph("Sentiment Analysis Summary", styleH))
    for label, count in sent_counts.items():
        pct = count / total * 100 if total > 0 else 0
        elements.append(Paragraph(f"{label}: {count} posts ({pct:.1f}%)", styleN))
    elements.append(Spacer(1, 6))
    if sentiment_png.exists():
        elements.append(Image(str(sentiment_png), width=5.5*inch, height=3*inch))
    elements.append(Spacer(1, 12))

    if not topic_counts.empty:
        elements.append(Paragraph("Topic Modeling Summary", styleH))
        for idx, val in topic_counts.items():
            elements.append(Paragraph(f"Topic {int(idx)}: {int(val)} posts", styleN))
        elements.append(Spacer(1, 6))
        if topics_png.exists():
            elements.append(Image(str(topics_png), width=5.5*inch, height=3*inch))
        elements.append(Spacer(1, 12))

    elements.append(Paragraph("Nature (India-specific) Summary", styleH))
    for label, count in nature_counts.items():
        pct = count / total * 100 if total > 0 else 0
        elements.append(Paragraph(f"{label}: {count} posts ({pct:.1f}%)", styleN))
    elements.append(Spacer(1, 12))

    elements.append(Paragraph("Flagged Potentially Dangerous Posts", styleH))
    elements.append(Spacer(1, 6))
    if dangerous_tweets.empty:
        elements.append(Paragraph("No dangerous posts detected.", styleN))
    else:
        header = ["Post (teaser)", "Subreddit", "Author", "Sentiment", "Nature", "Topic", "Date"]
        lt_data = [header]
        for _, row in dangerous_tweets.iterrows():
            date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
            lt_data.append([
                Paragraph(teaser(row["text_for_analysis"], TEASER_CHAR_LIMIT), tweet_paragraph_style),
                row["subreddit"] if pd.notna(row["subreddit"]) else "N/A",
                row["username"] if pd.notna(row["username"]) else "N/A",
                row["sentiment"],
                row["nature"],
                str(int(row["topic"])) if not pd.isna(row["topic"]) else "N/A",
                date_str,
            ])
        col_widths = [3.0*inch, 0.7*inch, 0.8*inch, 0.6*inch, 0.8*inch, 0.5*inch, 1.0*inch]
        lt = LongTable(lt_data, colWidths=col_widths, repeatRows=1)
        lt.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#4F81BD")),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('GRID', (0, 0), (-1, -1), 0.25, colors.grey),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, -1), 8),
            ('LEFTPADDING', (0, 0), (-1, -1), 4),
            ('RIGHTPADDING', (0, 0), (-1, -1), 4),
        ]))
        elements.append(lt)
        elements.append(Spacer(1, 12))
        if danger_png.exists():
            elements.append(Paragraph("Word Cloud of Flagged Posts", styleH))
            elements.append(Image(str(danger_png), width=5.5*inch, height=2.6*inch))

    elements.append(PageBreak())

    elements.append(Paragraph("All Collected Posts", styles['Heading2']))
    all_lt_data = [["Date", "Subreddit", "Author", "Score", "Nature", "Post (teaser)"]]
    for _, row in df.iterrows():
        date_str = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
        all_lt_data.append([
            date_str,
            row["subreddit"] if pd.notna(row["subreddit"]) else "N/A",
            row["username"] if pd.notna(row["username"]) else "N/A",
            str(row["score"]) if not pd.isna(row["score"]) else "N/A",
            row["nature"],
            Paragraph(teaser(row["text_for_analysis"], TEASER_CHAR_LIMIT), tweet_paragraph_style),
        ])
    all_col_widths = [1.0*inch, 1.0*inch, 1.0*inch, 0.7*inch, 0.9*inch, 2.8*inch]
    all_lt = LongTable(all_lt_data, colWidths=all_col_widths, repeatRows=1)
    all_lt.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#4F81BD")),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('GRID', (0, 0), (-1, -1), 0.25, colors.grey),
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('LEFTPADDING', (0, 0), (-1, -1), 4),
        ('RIGHTPADDING', (0, 0), (-1, -1), 4),
    ]))
    elements.append(all_lt)

    doc.build(elements)
    print("✅ PDF saved as:", pdf_out)

    # --- Enriched CSV (retry loop: Windows file locks from Excel etc.) ---
    csv_out = out_dir / "analysis_output.csv"
    df_out = df.copy()
    df_out["created_at_str"] = df_out["created_at"].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else "")

    import time
    for attempt in range(3):
        try:
            df_out.to_csv(csv_out, index=False, encoding="utf-8")
            print("✅ Enriched CSV saved as:", csv_out)
            break
        except PermissionError:
            if attempt < 2:
                print(f"⚠️ Permission denied saving CSV (file locked?). Retrying {attempt+1}/3 in 1s...")
                time.sleep(1)
            else:
                print("❌ FAILED to save CSV. The file is likely open in another program (Excel/VS Code).")

    # --- Optional DOCX export -------------------------------------------
    # BUGFIX: DOCX_OUTPUT was referenced in the return statement even when
    # python-docx was unavailable, raising NameError. Initialise the path
    # variable before branching.
    docx_out = None
    if not DOCX_AVAILABLE:
        print("python-docx not installed — skipping DOCX export. Install via: pip install python-docx")
    else:
        docx_out = out_dir / "report.docx"
        try:
            print("Building DOCX report...")
            # Renamed local from `docx` to avoid shadowing the docx package.
            document = Document()
            document.add_heading("Reddit Posts Report (India-specific Nature)", level=1)
            document.add_paragraph(f"Total Posts Processed: {len(df)}")
            document.add_heading("Sentiment Analysis Summary", level=2)
            for label, count in sent_counts.items():
                pct = count / total * 100 if total > 0 else 0
                document.add_paragraph(f"{label}: {count} posts ({pct:.1f}%)")

            document.add_heading("Nature Summary", level=2)
            for label, count in nature_counts.items():
                pct = count / total * 100 if total > 0 else 0
                document.add_paragraph(f"{label}: {count} posts ({pct:.1f}%)")

            sample_n = min(200, len(df))
            document.add_heading(f"Sample of First {sample_n} Posts", level=2)
            table = document.add_table(rows=1, cols=6)
            for cell, heading in zip(table.rows[0].cells,
                                     ["Date", "Subreddit", "Author", "Score", "Nature", "Post (teaser)"]):
                cell.text = heading
            for _, row in df.head(sample_n).iterrows():
                cells = table.add_row().cells
                cells[0].text = row["created_at"].strftime("%Y-%m-%d %H:%M") if pd.notna(row["created_at"]) else "N/A"
                cells[1].text = str(row["subreddit"]) if pd.notna(row["subreddit"]) else "N/A"
                cells[2].text = str(row["username"]) if pd.notna(row["username"]) else "N/A"
                cells[3].text = str(row["score"]) if not pd.isna(row["score"]) else "N/A"
                cells[4].text = str(row["nature"])
                cells[5].text = teaser(row["text_for_analysis"], 300)

            document.save(docx_out)
            print("✅ DOCX saved as:", docx_out)
        except Exception as e:
            logger.exception("DOCX creation failed: %s", e)
            # Remove any partially-written file.
            try:
                docx_out.unlink(missing_ok=True)
            except Exception:
                pass

    logger.info("Processor: finished, files at %s", out_dir)
    return {
        "pdf": str(pdf_out),
        "csv": str(csv_out),
        "docx": str(docx_out) if docx_out is not None and docx_out.exists() else "",
    }
| | |
| |
|