vault_audit_tools/commands/
entity_churn.rs

1//! Multi-day entity churn analysis with intelligent ephemeral pattern detection.
2//!
3//! Tracks entity lifecycle across multiple audit log files to identify:
4//! - New entities appearing each day
5//! - Returning vs. churned entities
6//! - Entity persistence patterns
7//! - Authentication method usage trends
8//! - **Ephemeral entities** using data-driven pattern learning
9//!
10//! # Usage
11//!
12//! ```bash
13//! # Analyze entity churn across a week
14//! vault-audit entity-churn day1.log day2.log day3.log day4.log day5.log day6.log day7.log
15//!
16//! # With baseline for accurate new entity detection
17//! vault-audit entity-churn *.log --baseline baseline_entities.json
18//!
19//! # With entity mappings for enriched display names
20//! vault-audit entity-churn *.log --baseline baseline.json --entity-map entity_mappings.json
21//!
22//! # Export detailed churn data with ephemeral analysis
23//! vault-audit entity-churn *.log --output entity_churn.json
24//!
25//! # Export as CSV format
26//! vault-audit entity-churn *.log --output entity_churn.csv --format csv
27//! ```
28//!
29//! # Ephemeral Pattern Detection
30//!
31//! The command uses a sophisticated two-pass analysis to detect ephemeral entities
32//! (e.g., CI/CD pipeline entities, temporary build entities) with confidence scoring:
33//!
34//! **Pass 1: Data Collection**
35//! - Track all entities across log files
36//! - Record first/last seen times and files
37//! - Count login activity per entity
38//!
39//! **Pass 2: Pattern Learning & Classification**
40//! - Learn patterns from entities that appeared 1-2 days
41//! - Identify naming patterns (e.g., `github-repo:org/repo:ref:branch`)
42//! - Calculate confidence scores (0.0-1.0) based on:
43//!   - Days active (1 day = high confidence, 2 days = medium)
44//!   - Similar entities on same mount path
45//!   - Activity levels (low login counts)
46//!   - Gaps in activity (reduces confidence for sporadic access)
47//!
48//! # Output
49//!
50//! ## Entity Lifecycle Classification:
51//! - **new_day_N**: Entities first seen on day N (not in baseline)
52//! - **pre_existing_baseline**: Entities that existed before analysis period
53//!
54//! ## Activity Patterns:
55//! - **consistent**: Appeared in most/all log files
56//! - **sporadic**: Appeared intermittently with gaps
57//! - **declining**: Activity decreased over time
58//! - **single_burst**: Appeared only once
59//!
60//! ## Ephemeral Detection:
61//! - Confidence levels: High (≥70%), Medium (50-69%), Low (40-49%)
62//! - Detailed reasoning for each classification
63//! - Top ephemeral entities by confidence
64//! - Pattern statistics and mount path analysis
65//!
66//! # JSON Output Fields
67//!
68//! When using `--output`, each entity record includes:
69//! - `entity_id`: Vault entity identifier
70//! - `display_name`: Human-readable name
71//! - `first_seen_file` / `first_seen_time`: When first observed
72//! - `last_seen_file` / `last_seen_time`: When last observed
73//! - `files_appeared`: List of log files entity was active in
74//! - `total_logins`: Total login count across all files
75//! - `lifecycle`: Entity lifecycle classification
76//! - `activity_pattern`: Behavioral pattern classification
77//! - `is_ephemeral_pattern`: Boolean flag for ephemeral detection
78//! - `ephemeral_confidence`: Confidence score (0.0-1.0)
79//! - `ephemeral_reasons`: Array of human-readable reasons
80//!
81//! Only tracks entities that performed login operations (paths ending in `/login`).
82
83use crate::audit::types::AuditEntry;
84use crate::utils::progress::ProgressBar;
85use anyhow::{Context, Result};
86use chrono::{DateTime, Utc};
87use serde::{Deserialize, Serialize};
88use std::collections::{HashMap, HashSet};
89use std::fs::File;
90use std::io::{BufRead, BufReader};
91use std::path::Path;
92
93/// Entity mapping from baseline CSV files
/// Entity mapping from baseline CSV files
///
/// One record per entity id in the JSON map produced by `preprocess-entities`;
/// used to enrich churn records with historical audit-log data.
#[derive(Debug, Serialize, Deserialize)]
struct EntityMapping {
    // Display name recorded in historical audit logs.
    display_name: String,
    // Auth mount path the entity logged in through.
    mount_path: String,
    // NOTE(review): fields below are marked dead_code but first_seen,
    // last_seen and login_count are read when building churn records —
    // the attributes may be stale; left untouched here.
    #[allow(dead_code)]
    mount_accessor: String,
    #[allow(dead_code)]
    login_count: usize,
    #[allow(dead_code)]
    first_seen: String,
    #[allow(dead_code)]
    last_seen: String,
}
107
108/// Represents an entity's churn status
109#[derive(Debug, Serialize, Clone)]
110struct EntityChurnRecord {
111    entity_id: String,
112    display_name: String,
113    mount_path: String,
114    mount_type: String,
115    token_type: String,
116    first_seen_file: String,
117    first_seen_time: DateTime<Utc>,
118    last_seen_file: String,
119    last_seen_time: DateTime<Utc>,
120    files_appeared: Vec<String>,
121    total_logins: usize,
122    lifecycle: String, // "new_day_1", "new_day_2", "new_day_3", "pre_existing"
123    activity_pattern: String, // "consistent", "sporadic", "declining", "single_burst", "unknown"
124    is_ephemeral_pattern: bool,
125    ephemeral_confidence: f32, // 0.0 to 1.0
126    ephemeral_reasons: Vec<String>,
127    // Baseline metadata (if entity existed in baseline)
128    #[serde(skip_serializing_if = "Option::is_none")]
129    baseline_entity_name: Option<String>,
130    #[serde(skip_serializing_if = "Option::is_none")]
131    baseline_created: Option<String>,
132    #[serde(skip_serializing_if = "Option::is_none")]
133    baseline_alias_name: Option<String>,
134    #[serde(skip_serializing_if = "Option::is_none")]
135    baseline_mount_path: Option<String>,
136    // Entity-map metadata (from historical audit logs via preprocess-entities)
137    #[serde(skip_serializing_if = "Option::is_none")]
138    historical_display_name: Option<String>,
139    #[serde(skip_serializing_if = "Option::is_none")]
140    historical_first_seen: Option<String>,
141    #[serde(skip_serializing_if = "Option::is_none")]
142    historical_last_seen: Option<String>,
143    #[serde(skip_serializing_if = "Option::is_none")]
144    historical_login_count: Option<usize>,
145}
146
147/// CSV-compatible representation of entity churn record
/// CSV-compatible representation of entity churn record
///
/// Flattened form of `EntityChurnRecord`: timestamps become RFC 3339
/// strings, lists become single delimited strings, and optional
/// baseline/historical fields are rendered as empty strings so every
/// row has the same columns.
#[derive(Debug, Serialize)]
struct EntityChurnRecordCsv {
    entity_id: String,
    display_name: String,
    mount_path: String,
    mount_type: String,
    token_type: String,
    first_seen_file: String,
    first_seen_time: String,
    last_seen_file: String,
    last_seen_time: String,
    files_appeared: String, // Comma-separated list
    // Derived at conversion time: number of distinct files appeared in.
    days_active: usize,
    total_logins: usize,
    lifecycle: String,
    activity_pattern: String,
    is_ephemeral_pattern: bool,
    ephemeral_confidence: f32,
    ephemeral_reasons: String, // Semicolon-separated list
    // Optional fields below default to "" when absent in the source record.
    baseline_entity_name: String,
    baseline_created: String,
    baseline_alias_name: String,
    baseline_mount_path: String,
    historical_display_name: String,
    historical_first_seen: String,
    historical_last_seen: String,
    historical_login_count: String,
}
176
177impl From<EntityChurnRecord> for EntityChurnRecordCsv {
178    fn from(record: EntityChurnRecord) -> Self {
179        EntityChurnRecordCsv {
180            entity_id: record.entity_id,
181            display_name: record.display_name,
182            mount_path: record.mount_path,
183            mount_type: record.mount_type,
184            token_type: record.token_type,
185            first_seen_file: record.first_seen_file,
186            first_seen_time: record.first_seen_time.to_rfc3339(),
187            last_seen_file: record.last_seen_file,
188            last_seen_time: record.last_seen_time.to_rfc3339(),
189            files_appeared: record.files_appeared.join(", "),
190            days_active: record.files_appeared.len(),
191            total_logins: record.total_logins,
192            lifecycle: record.lifecycle,
193            activity_pattern: record.activity_pattern,
194            is_ephemeral_pattern: record.is_ephemeral_pattern,
195            ephemeral_confidence: record.ephemeral_confidence,
196            ephemeral_reasons: record.ephemeral_reasons.join("; "),
197            baseline_entity_name: record.baseline_entity_name.unwrap_or_default(),
198            baseline_created: record.baseline_created.unwrap_or_default(),
199            baseline_alias_name: record.baseline_alias_name.unwrap_or_default(),
200            baseline_mount_path: record.baseline_mount_path.unwrap_or_default(),
201            historical_display_name: record.historical_display_name.unwrap_or_default(),
202            historical_first_seen: record.historical_first_seen.unwrap_or_default(),
203            historical_last_seen: record.historical_last_seen.unwrap_or_default(),
204            historical_login_count: record
205                .historical_login_count
206                .map(|n| n.to_string())
207                .unwrap_or_default(),
208        }
209    }
210}
211
/// Per-file (per-day) summary counters collected during the first pass.
#[derive(Debug)]
struct DailyStats {
    #[allow(dead_code)]
    file_name: String,
    // Entities first seen in this file (not present in any earlier file).
    new_entities: usize,
    // Distinct already-known entities that logged in again in this file.
    returning_entities: usize,
    // Total login operations observed in this file.
    total_logins: usize,
}
220
/// Analyzes entity behavior patterns to detect ephemeral entities
#[derive(Debug)]
struct EphemeralPatternAnalyzer {
    // Number of log files in this analysis run (one per day).
    total_files: usize,
    // Patterns learned from entities active on at most 2 days.
    short_lived_patterns: Vec<ShortLivedPattern>,
}
227
/// Snapshot of a short-lived (1-2 day) entity, used as a reference
/// pattern when scoring other entities for ephemerality.
#[derive(Debug)]
struct ShortLivedPattern {
    days_active: usize,
    display_name: String,
    mount_path: String,
}
234
235impl EphemeralPatternAnalyzer {
236    fn new(total_files: usize) -> Self {
237        Self {
238            total_files,
239            short_lived_patterns: Vec::new(),
240        }
241    }
242
243    /// Learn patterns from entities that appeared 1-2 days (potential ephemeral patterns)
244    fn learn_from_entities(&mut self, entities: &HashMap<String, EntityChurnRecord>) {
245        for entity in entities.values() {
246            let days_active = entity.files_appeared.len();
247
248            // Learn from entities that appeared 1-2 days only
249            if days_active <= 2 {
250                self.short_lived_patterns.push(ShortLivedPattern {
251                    days_active,
252                    display_name: entity.display_name.clone(),
253                    mount_path: entity.mount_path.clone(),
254                });
255            }
256        }
257    }
258
259    /// Analyze an entity and determine if it matches ephemeral patterns
260    fn analyze_entity(&self, entity: &EntityChurnRecord) -> (bool, f32, Vec<String>) {
261        let days_active = entity.files_appeared.len();
262        let mut confidence = 0.0;
263        let mut reasons = Vec::new();
264
265        // Strong indicators (high confidence)
266        if days_active == 1 {
267            confidence += 0.5;
268            reasons.push(format!("Appeared only 1 day ({})", entity.first_seen_file));
269        } else if days_active == 2 {
270            confidence += 0.3;
271            reasons.push(format!(
272                "Appeared only 2 days: {}, {}",
273                entity.files_appeared.first().unwrap_or(&String::new()),
274                entity.files_appeared.last().unwrap_or(&String::new())
275            ));
276        }
277
278        // Pattern matching: Check if display name follows patterns seen in other short-lived entities
279        if days_active <= 2 {
280            // Count how many other short-lived entities share similar patterns
281            let similar_count = self
282                .short_lived_patterns
283                .iter()
284                .filter(|p| {
285                    // Same mount path
286                    if p.mount_path == entity.mount_path && p.days_active <= 2 {
287                        return true;
288                    }
289                    // Similar naming pattern (e.g., github-repo:* or airflow-*)
290                    if entity.display_name.contains(':') && p.display_name.contains(':') {
291                        let entity_prefix = entity.display_name.split(':').next().unwrap_or("");
292                        let pattern_prefix = p.display_name.split(':').next().unwrap_or("");
293                        if entity_prefix == pattern_prefix && !entity_prefix.is_empty() {
294                            return true;
295                        }
296                    }
297                    false
298                })
299                .count();
300
301            if similar_count > 5 {
302                confidence += 0.2;
303                reasons.push(format!(
304                    "Matches pattern seen in {} other short-lived entities",
305                    similar_count
306                ));
307            } else if similar_count > 0 {
308                confidence += 0.1;
309                reasons.push(format!(
310                    "Similar to {} other short-lived entities",
311                    similar_count
312                ));
313            }
314        }
315
316        // Low activity indicator
317        if entity.total_logins <= 5 && days_active <= 2 {
318            confidence += 0.1;
319            reasons.push(format!(
320                "Low activity: only {} login(s)",
321                entity.total_logins
322            ));
323        }
324
325        // Non-continuous appearance (sporadic pattern suggests not churned, just periodic)
326        if days_active >= 2 {
327            let first_day_idx = entity.files_appeared.first().and_then(|f| {
328                f.split('_')
329                    .next_back()
330                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
331            });
332            let last_day_idx = entity.files_appeared.last().and_then(|f| {
333                f.split('_')
334                    .next_back()
335                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
336            });
337
338            if let (Some(first), Some(last)) = (first_day_idx, last_day_idx) {
339                let span = last - first + 1;
340                if span > days_active {
341                    // Gaps in activity - reduce confidence
342                    confidence *= 0.7;
343                    reasons.push(
344                        "Has gaps in activity (possibly sporadic access, not churned)".to_string(),
345                    );
346                }
347            }
348        }
349
350        // Cap confidence and determine ephemeral status
351        confidence = f32::min(confidence, 1.0);
352        let is_ephemeral = confidence >= 0.4; // Threshold for classification
353
354        // Add absence indicator if not seen in recent files
355        if is_ephemeral && days_active < self.total_files {
356            reasons.push(format!(
357                "Not seen in most recent {} file(s)",
358                self.total_files - days_active
359            ));
360        }
361
362        (is_ephemeral, confidence, reasons)
363    }
364
365    /// Determine activity pattern based on appearance across files
366    fn classify_activity_pattern(&self, entity: &EntityChurnRecord) -> String {
367        let days_active = entity.files_appeared.len();
368
369        if days_active == 1 {
370            return "single_burst".to_string();
371        }
372
373        if days_active == self.total_files {
374            return "consistent".to_string();
375        }
376
377        if days_active >= (self.total_files * 2) / 3 {
378            return "consistent".to_string();
379        }
380
381        // Check if activity is declining (appeared early but stopped)
382        if let (Some(_first_file), Some(last_file)) =
383            (entity.files_appeared.first(), entity.files_appeared.last())
384        {
385            // Simple heuristic: if last seen was in first half of files, it's declining
386            let last_file_num = last_file
387                .split('_')
388                .next_back()
389                .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
390                .unwrap_or(self.total_files);
391
392            if last_file_num < self.total_files / 2 {
393                return "declining".to_string();
394            }
395        }
396
397        if days_active <= 2 {
398            return "single_burst".to_string();
399        }
400
401        "sporadic".to_string()
402    }
403}
404
/// Format `n` with comma thousands separators (e.g. 1234567 -> "1,234,567").
fn format_number(n: usize) -> String {
    let digits = n.to_string();
    // Group the decimal digits in threes from the right, then rejoin
    // left-to-right with commas between groups.
    let mut groups: Vec<&[u8]> = digits.as_bytes().rchunks(3).collect();
    groups.reverse();
    groups
        .iter()
        .map(|g| std::str::from_utf8(g).expect("decimal digits are ASCII"))
        .collect::<Vec<&str>>()
        .join(",")
}
416
417fn get_file_size(path: &str) -> Result<u64> {
418    Ok(std::fs::metadata(path)?.len())
419}
420
421fn load_entity_mappings(path: &str) -> Result<HashMap<String, EntityMapping>> {
422    let file = File::open(path).context("Failed to open entity map file")?;
423    let mappings: HashMap<String, EntityMapping> =
424        serde_json::from_reader(file).context("Failed to parse entity map JSON")?;
425    Ok(mappings)
426}
427
/// One row of the `entity-list` baseline (JSON or CSV).
///
/// All fields except `entity_id` default to empty/false so the same
/// struct deserializes from either format, which may omit columns.
#[derive(Debug, Deserialize, Clone)]
#[allow(dead_code)]
struct BaselineEntity {
    entity_id: String,
    // Fields from entity-list (Vault API) - full metadata
    #[serde(default)]
    entity_name: String,
    #[serde(default)]
    entity_disabled: bool,
    #[serde(default)]
    entity_created: String,
    #[serde(default)]
    entity_updated: String,
    #[serde(default)]
    alias_id: String,
    #[serde(default)]
    alias_name: String,
    #[serde(default)]
    mount_path: String,
    #[serde(default)]
    mount_type: String,
    #[serde(default)]
    mount_accessor: String,
    #[serde(default)]
    alias_created: String,
    #[serde(default)]
    alias_updated: String,
    #[serde(default)]
    alias_metadata: String,
}
458
459impl BaselineEntity {
460    /// Get the best available name (entity_name if available, otherwise alias_name)
461    fn get_name(&self) -> String {
462        if !self.entity_name.is_empty() {
463            self.entity_name.clone()
464        } else if !self.alias_name.is_empty() {
465            self.alias_name.clone()
466        } else {
467            String::new()
468        }
469    }
470
471    /// Get the entity creation time
472    fn get_created(&self) -> String {
473        self.entity_created.clone()
474    }
475}
476
477fn load_baseline_entities(path: &str) -> Result<HashMap<String, BaselineEntity>> {
478    let file = File::open(path).context("Failed to open baseline entities file")?;
479
480    // Check if it's JSON or CSV based on file extension
481    let path_lower = path.to_lowercase();
482    if path_lower.ends_with(".json") {
483        // JSON format from entity-list with --format json
484        let entities: Vec<BaselineEntity> =
485            serde_json::from_reader(file).context("Failed to parse baseline entities JSON")?;
486        Ok(entities
487            .into_iter()
488            .map(|e| (e.entity_id.clone(), e))
489            .collect())
490    } else {
491        // CSV format (default entity-list output)
492        let mut reader = csv::Reader::from_reader(file);
493        let mut entities = HashMap::new();
494
495        for result in reader.deserialize() {
496            let entity: BaselineEntity = result.context("Failed to parse baseline CSV row")?;
497            // Use first occurrence of each entity_id (entities can have multiple aliases)
498            entities.entry(entity.entity_id.clone()).or_insert(entity);
499        }
500
501        Ok(entities)
502    }
503}
504
505pub fn run(
506    log_files: &[String],
507    entity_map: Option<&str>,
508    baseline_entities: Option<&str>,
509    output: Option<&str>,
510    format: Option<&str>,
511) -> Result<()> {
512    println!("\n=== Multi-Day Entity Churn Analysis ===\n");
513    println!("Analyzing {} log files:", log_files.len());
514    for (i, file) in log_files.iter().enumerate() {
515        let size = get_file_size(file)?;
516        println!(
517            "  Day {}: {} ({:.2} GB)",
518            i + 1,
519            file,
520            size as f64 / 1_000_000_000.0
521        );
522    }
523    println!();
524
525    // Load baseline entities if provided
526    let baseline = if let Some(path) = baseline_entities {
527        println!(
528            "Loading baseline entity list (Vault API metadata) from {}...",
529            path
530        );
531        let baseline_set = load_baseline_entities(path)?;
532        println!(
533            "Loaded {} pre-existing entities from Vault API baseline",
534            format_number(baseline_set.len())
535        );
536        println!();
537        Some(baseline_set)
538    } else {
539        println!("No baseline entity list provided. Cannot distinguish truly NEW entities from pre-existing.");
540        println!("   All Day 1 entities will be marked as 'pre_existing_or_new_day_1'.");
541        println!("   To get accurate results, run: ./vault-audit entity-list --output baseline_entities.json\n");
542        None
543    };
544
545    // Load entity mappings if provided (historical data from audit logs)
546    let entity_mappings = if let Some(path) = entity_map {
547        println!(
548            "Loading historical entity mappings (audit log enrichment) from {}...",
549            path
550        );
551        let mappings = load_entity_mappings(path)?;
552        println!(
553            "Loaded {} entity mappings with historical audit log data",
554            format_number(mappings.len())
555        );
556        println!();
557        Some(mappings)
558    } else {
559        None
560    };
561
562    // Track all entities across all files
563    let mut entities: HashMap<String, EntityChurnRecord> = HashMap::new();
564    let mut daily_stats: Vec<DailyStats> = Vec::new();
565
566    // Process each log file in order
567    for (file_idx, log_file) in log_files.iter().enumerate() {
568        let file_name = Path::new(log_file)
569            .file_name()
570            .unwrap()
571            .to_string_lossy()
572            .to_string();
573
574        println!("\nProcessing Day {} ({})...", file_idx + 1, file_name);
575
576        let file = File::open(log_file)
577            .with_context(|| format!("Failed to open log file: {}", log_file))?;
578        let file_size = get_file_size(log_file)? as usize;
579
580        let reader = BufReader::new(file);
581        let mut progress = ProgressBar::new(file_size, "Processing");
582
583        let mut new_entities_this_file = 0;
584        let mut returning_entities_this_file = HashSet::new();
585        let mut logins_this_file = 0;
586        let mut bytes_processed = 0;
587
588        for line in reader.lines() {
589            let line = line.context("Failed to read line from log file")?;
590            bytes_processed += line.len() + 1; // +1 for newline
591
592            // Update progress periodically
593            if bytes_processed % 10_000 == 0 {
594                progress.update(bytes_processed.min(file_size));
595            }
596
597            let trimmed = line.trim();
598            if trimmed.is_empty() {
599                continue;
600            }
601
602            let entry: AuditEntry = match serde_json::from_str(trimmed) {
603                Ok(e) => e,
604                Err(_) => continue,
605            };
606
607            // Only process login operations (auth paths ending in /login)
608            let Some(ref request) = entry.request else {
609                continue;
610            };
611            let Some(ref path) = request.path else {
612                continue;
613            };
614            if !path.ends_with("/login") {
615                continue;
616            }
617
618            logins_this_file += 1;
619
620            // Extract entity info
621            let Some(ref auth) = entry.auth else {
622                continue;
623            };
624            let Some(ref entity_id) = auth.entity_id else {
625                continue;
626            };
627
628            let display_name = auth
629                .display_name
630                .clone()
631                .unwrap_or_else(|| entity_id.clone());
632            let mount_path = request.path.clone().unwrap_or_default();
633            let mount_type = request.mount_type.clone().unwrap_or_default();
634            let token_type = auth.token_type.clone().unwrap_or_default();
635
636            // Parse timestamp
637            let first_seen_time = chrono::DateTime::parse_from_rfc3339(&entry.time)
638                .ok()
639                .map(|dt| dt.with_timezone(&Utc))
640                .unwrap_or_else(Utc::now);
641
642            // Check if this entity exists from a previous file
643            if let Some(entity_record) = entities.get_mut(entity_id) {
644                // Returning entity
645                entity_record.total_logins += 1;
646                entity_record.last_seen_file = file_name.clone();
647                entity_record.last_seen_time = first_seen_time;
648                if !entity_record.files_appeared.contains(&file_name) {
649                    entity_record.files_appeared.push(file_name.clone());
650                }
651                returning_entities_this_file.insert(entity_id.clone());
652            } else {
653                // New entity (first time across all files)
654                new_entities_this_file += 1;
655
656                // Determine lifecycle based on baseline and which file this is
657                let lifecycle = if let Some(ref baseline_set) = baseline {
658                    if baseline_set.contains_key(entity_id) {
659                        "pre_existing_baseline".to_string()
660                    } else {
661                        // Not in baseline, so truly NEW during analysis period
662                        match file_idx {
663                            0 => "new_day_1".to_string(),
664                            1 => "new_day_2".to_string(),
665                            2 => "new_day_3".to_string(),
666                            _ => format!("new_day_{}", file_idx + 1),
667                        }
668                    }
669                } else {
670                    // No baseline provided, can't distinguish
671                    match file_idx {
672                        0 => "pre_existing_or_new_day_1".to_string(),
673                        1 => "new_day_2".to_string(),
674                        2 => "new_day_3".to_string(),
675                        _ => format!("new_day_{}", file_idx + 1),
676                    }
677                };
678
679                // Get baseline metadata if entity exists in baseline
680                let (
681                    baseline_entity_name,
682                    baseline_created,
683                    baseline_alias_name,
684                    baseline_mount_path,
685                ) = if let Some(ref baseline_map) = baseline {
686                    if let Some(baseline_entity) = baseline_map.get(entity_id) {
687                        let name = baseline_entity.get_name();
688                        let created = baseline_entity.get_created();
689                        (
690                            if !name.is_empty() { Some(name) } else { None },
691                            if !created.is_empty() {
692                                Some(created)
693                            } else {
694                                None
695                            },
696                            if !baseline_entity.alias_name.is_empty() {
697                                Some(baseline_entity.alias_name.clone())
698                            } else {
699                                None
700                            },
701                            if !baseline_entity.mount_path.is_empty() {
702                                Some(baseline_entity.mount_path.clone())
703                            } else {
704                                None
705                            },
706                        )
707                    } else {
708                        (None, None, None, None)
709                    }
710                } else {
711                    (None, None, None, None)
712                };
713
714                // Fetch historical data from entity_mappings
715                let (
716                    historical_display_name,
717                    historical_first_seen,
718                    historical_last_seen,
719                    historical_login_count,
720                ) = if let Some(ref mappings) = entity_mappings {
721                    if let Some(mapping) = mappings.get(entity_id) {
722                        (
723                            Some(mapping.display_name.clone()),
724                            Some(mapping.first_seen.clone()),
725                            Some(mapping.last_seen.clone()),
726                            Some(mapping.login_count),
727                        )
728                    } else {
729                        (None, None, None, None)
730                    }
731                } else {
732                    (None, None, None, None)
733                };
734
735                entities.insert(
736                    entity_id.clone(),
737                    EntityChurnRecord {
738                        entity_id: entity_id.clone(),
739                        display_name: display_name.clone(),
740                        mount_path: mount_path.clone(),
741                        mount_type: mount_type.clone(),
742                        token_type: token_type.clone(),
743                        first_seen_file: file_name.clone(),
744                        first_seen_time,
745                        last_seen_file: file_name.clone(),
746                        last_seen_time: first_seen_time,
747                        files_appeared: vec![file_name.clone()],
748                        total_logins: 1,
749                        lifecycle,
750                        activity_pattern: "unknown".to_string(), // Will be computed in second pass
751                        is_ephemeral_pattern: false,             // Will be computed in second pass
752                        ephemeral_confidence: 0.0,               // Will be computed in second pass
753                        ephemeral_reasons: Vec::new(),           // Will be computed in second pass
754                        baseline_entity_name,
755                        baseline_created,
756                        baseline_alias_name,
757                        baseline_mount_path,
758                        historical_display_name,
759                        historical_first_seen,
760                        historical_last_seen,
761                        historical_login_count,
762                    },
763                );
764            }
765        }
766
767        progress.finish();
768
769        daily_stats.push(DailyStats {
770            file_name,
771            new_entities: new_entities_this_file,
772            returning_entities: returning_entities_this_file.len(),
773            total_logins: logins_this_file,
774        });
775
776        println!(
777            "Day {} Summary: {} new entities, {} returning, {} logins",
778            file_idx + 1,
779            format_number(new_entities_this_file),
780            format_number(returning_entities_this_file.len()),
781            format_number(logins_this_file)
782        );
783    }
784
785    // === SECOND PASS: Analyze patterns and classify entities ===
786    println!("\nAnalyzing entity behavior patterns...");
787
788    let mut analyzer = EphemeralPatternAnalyzer::new(log_files.len());
789
790    // Step 1: Learn patterns from short-lived entities
791    analyzer.learn_from_entities(&entities);
792    println!(
793        "Learned from {} short-lived entity patterns",
794        format_number(analyzer.short_lived_patterns.len())
795    );
796
797    // Step 2: Classify all entities using learned patterns
798    let entity_ids: Vec<String> = entities.keys().cloned().collect();
799    for entity_id in entity_ids {
800        if let Some(entity) = entities.get_mut(&entity_id) {
801            // Classify activity pattern
802            entity.activity_pattern = analyzer.classify_activity_pattern(entity);
803
804            // Analyze for ephemeral patterns
805            let (is_ephemeral, confidence, reasons) = analyzer.analyze_entity(entity);
806            entity.is_ephemeral_pattern = is_ephemeral;
807            entity.ephemeral_confidence = confidence;
808            entity.ephemeral_reasons = reasons;
809        }
810    }
811
812    // Generate final report
813    println!("\n=== Entity Churn Analysis ===\n");
814
815    println!("Daily Breakdown:");
816    for (idx, stats) in daily_stats.iter().enumerate() {
817        println!(
818            "  Day {}: {} new, {} returning, {} total logins",
819            idx + 1,
820            format_number(stats.new_entities),
821            format_number(stats.returning_entities),
822            format_number(stats.total_logins)
823        );
824    }
825
826    // Lifecycle classification
827    let mut lifecycle_counts: HashMap<String, usize> = HashMap::new();
828    let mut entities_by_file_count: HashMap<usize, usize> = HashMap::new();
829
830    for entity in entities.values() {
831        *lifecycle_counts
832            .entry(entity.lifecycle.clone())
833            .or_insert(0) += 1;
834        *entities_by_file_count
835            .entry(entity.files_appeared.len())
836            .or_insert(0) += 1;
837    }
838
839    println!("\nEntity Lifecycle Classification:");
840    let mut lifecycle_vec: Vec<_> = lifecycle_counts.iter().collect();
841    lifecycle_vec.sort_by_key(|(k, _)| *k);
842    for (lifecycle, count) in lifecycle_vec {
843        println!("  {}: {}", lifecycle, format_number(*count));
844    }
845
846    println!("\nEntity Persistence:");
847    for day_count in 1..=log_files.len() {
848        if let Some(count) = entities_by_file_count.get(&day_count) {
849            let label = if day_count == 1 {
850                "Appeared 1 day only"
851            } else if day_count == log_files.len() {
852                "Appeared all days (persistent)"
853            } else {
854                "Appeared some days"
855            };
856            println!(
857                "  {} day(s): {} entities ({})",
858                day_count,
859                format_number(*count),
860                label
861            );
862        }
863    }
864
865    // Activity pattern analysis
866    let mut activity_pattern_counts: HashMap<String, usize> = HashMap::new();
867    let mut ephemeral_entities = Vec::new();
868
869    for entity in entities.values() {
870        *activity_pattern_counts
871            .entry(entity.activity_pattern.clone())
872            .or_insert(0) += 1;
873
874        if entity.is_ephemeral_pattern {
875            ephemeral_entities.push(entity.clone());
876        }
877    }
878
879    println!("\nActivity Pattern Distribution:");
880    let mut pattern_vec: Vec<_> = activity_pattern_counts.iter().collect();
881    pattern_vec.sort_by(|a, b| b.1.cmp(a.1));
882    for (pattern, count) in pattern_vec {
883        println!("  {}: {}", pattern, format_number(*count));
884    }
885
886    println!("\nEphemeral Entity Detection:");
887    println!(
888        "  Detected {} likely ephemeral entities (confidence ≥ 0.4)",
889        format_number(ephemeral_entities.len())
890    );
891
892    if !ephemeral_entities.is_empty() {
893        // Sort by confidence
894        ephemeral_entities.sort_by(|a, b| {
895            b.ephemeral_confidence
896                .partial_cmp(&a.ephemeral_confidence)
897                .unwrap_or(std::cmp::Ordering::Equal)
898        });
899
900        println!("  Top 10 by confidence:");
901        for (idx, entity) in ephemeral_entities.iter().take(10).enumerate() {
902            println!(
903                "    {}. {} (confidence: {:.1}%)",
904                idx + 1,
905                entity.display_name,
906                entity.ephemeral_confidence * 100.0
907            );
908            for reason in &entity.ephemeral_reasons {
909                println!("       - {}", reason);
910            }
911        }
912
913        // Breakdown by confidence ranges
914        let high_conf = ephemeral_entities
915            .iter()
916            .filter(|e| e.ephemeral_confidence >= 0.7)
917            .count();
918        let med_conf = ephemeral_entities
919            .iter()
920            .filter(|e| e.ephemeral_confidence >= 0.5 && e.ephemeral_confidence < 0.7)
921            .count();
922        let low_conf = ephemeral_entities
923            .iter()
924            .filter(|e| e.ephemeral_confidence >= 0.4 && e.ephemeral_confidence < 0.5)
925            .count();
926
927        println!("\n  Confidence distribution:");
928        println!("    High (≥70%): {}", format_number(high_conf));
929        println!("    Medium (50-69%): {}", format_number(med_conf));
930        println!("    Low (40-49%): {}", format_number(low_conf));
931    }
932
933    // Mount path breakdown
934    let mut mount_stats: HashMap<String, (usize, String)> = HashMap::new();
935    for entity in entities.values() {
936        let entry = mount_stats
937            .entry(entity.mount_path.clone())
938            .or_insert((0, entity.mount_type.clone()));
939        entry.0 += 1;
940    }
941
942    println!("\nTop Authentication Methods (Total Entities):");
943    let mut mount_vec: Vec<_> = mount_stats.iter().collect();
944    mount_vec.sort_by(|a, b| b.1 .0.cmp(&a.1 .0));
945
946    for (idx, (path, (count, mount_type))) in mount_vec.iter().take(20).enumerate() {
947        println!(
948            "  {}. {} ({}): {}",
949            idx + 1,
950            path,
951            mount_type,
952            format_number(*count)
953        );
954    }
955
956    // Calculate GitHub duplication if present
957    let github_entities: Vec<_> = entities
958        .values()
959        .filter(|e| e.mount_path.contains("/github"))
960        .collect();
961
962    if !github_entities.is_empty() {
963        println!("\n=== GitHub Entity Analysis ===");
964        println!(
965            "Total GitHub entities: {}",
966            format_number(github_entities.len())
967        );
968
969        // Extract repo names and count duplicates
970        let mut repo_counts: HashMap<String, usize> = HashMap::new();
971        for entity in &github_entities {
972            // Extract repo from "github-repo:org/repo:..." pattern
973            if let Some(repo) = entity.display_name.split(':').nth(1) {
974                *repo_counts.entry(repo.to_string()).or_insert(0) += 1;
975            }
976        }
977
978        println!("Unique repositories: {}", format_number(repo_counts.len()));
979        println!("\nTop repositories by entity count:");
980        let mut repo_vec: Vec<_> = repo_counts.iter().collect();
981        repo_vec.sort_by(|a, b| b.1.cmp(a.1));
982
983        for (idx, (repo, count)) in repo_vec.iter().take(20).enumerate() {
984            if **count > 1 {
985                println!(
986                    "  {}. {}: {} entities",
987                    idx + 1,
988                    repo,
989                    format_number(**count)
990                );
991            }
992        }
993    }
994
995    // Export to file if requested
996    if let Some(output_path) = output {
997        let mut entities_vec: Vec<_> = entities.into_values().collect();
998        entities_vec.sort_by(|a, b| a.first_seen_time.cmp(&b.first_seen_time));
999
1000        // Determine format from parameter or file extension
1001        let output_format = format.unwrap_or_else(|| {
1002            if output_path.ends_with(".csv") {
1003                "csv"
1004            } else {
1005                "json"
1006            }
1007        });
1008
1009        println!(
1010            "\nExporting detailed entity records to {} (format: {})...",
1011            output_path, output_format
1012        );
1013
1014        let output_file = File::create(output_path)
1015            .with_context(|| format!("Failed to create output file: {}", output_path))?;
1016
1017        match output_format {
1018            "csv" => {
1019                let mut writer = csv::Writer::from_writer(output_file);
1020                for entity in &entities_vec {
1021                    let csv_record: EntityChurnRecordCsv = entity.clone().into();
1022                    writer
1023                        .serialize(&csv_record)
1024                        .context("Failed to write CSV record")?;
1025                }
1026                writer.flush().context("Failed to flush CSV writer")?;
1027            }
1028            _ => {
1029                // Default to JSON
1030                serde_json::to_writer_pretty(output_file, &entities_vec)
1031                    .context("Failed to write JSON output")?;
1032            }
1033        }
1034
1035        println!(
1036            "Exported {} entity records",
1037            format_number(entities_vec.len())
1038        );
1039    }
1040
1041    println!("\n=== Analysis Complete ===\n");
1042    Ok(())
1043}