vault_audit_tools/commands/
entity_churn.rs

1//! Multi-day entity churn analysis with intelligent ephemeral pattern detection.
2//!
3//! Tracks entity lifecycle across multiple audit log files to identify:
4//! - New entities appearing each day
5//! - Returning vs. churned entities
6//! - Entity persistence patterns
7//! - Authentication method usage trends
8//! - **Ephemeral entities** using data-driven pattern learning
9//!
10//! # Usage
11//!
12//! ```bash
13//! # Analyze entity churn across a week
14//! vault-audit entity-churn day1.log day2.log day3.log day4.log day5.log day6.log day7.log
15//!
16//! # With baseline for accurate new entity detection
17//! vault-audit entity-churn *.log --baseline baseline_entities.json
18//!
19//! # With entity mappings for enriched display names
20//! vault-audit entity-churn *.log --baseline baseline.json --entity-map entity_mappings.json
21//!
22//! # Export detailed churn data with ephemeral analysis
23//! vault-audit entity-churn *.log --output entity_churn.json
24//!
25//! # Export as CSV format
26//! vault-audit entity-churn *.log --output entity_churn.csv --format csv
27//! ```
28//!
29//! # Ephemeral Pattern Detection
30//!
31//! The command uses a sophisticated two-pass analysis to detect ephemeral entities
32//! (e.g., CI/CD pipeline entities, temporary build entities) with confidence scoring:
33//!
34//! **Pass 1: Data Collection**
35//! - Track all entities across log files
36//! - Record first/last seen times and files
37//! - Count login activity per entity
38//!
39//! **Pass 2: Pattern Learning & Classification**
40//! - Learn patterns from entities that appeared 1-2 days
41//! - Identify naming patterns (e.g., `github-repo:org/repo:ref:branch`)
42//! - Calculate confidence scores (0.0-1.0) based on:
43//!   - Days active (1 day = high confidence, 2 days = medium)
44//!   - Similar entities on same mount path
45//!   - Activity levels (low login counts)
46//!   - Gaps in activity (reduces confidence for sporadic access)
47//!
48//! # Output
49//!
50//! ## Entity Lifecycle Classification:
51//! - **new_day_N**: Entities first seen on day N (not in baseline)
52//! - **pre_existing_baseline**: Entities that existed before analysis period
53//!
54//! ## Activity Patterns:
55//! - **consistent**: Appeared in most/all log files
56//! - **sporadic**: Appeared intermittently with gaps
57//! - **declining**: Activity decreased over time
58//! - **single_burst**: Appeared only once
59//!
60//! ## Ephemeral Detection:
61//! - Confidence levels: High (≥70%), Medium (50-69%), Low (40-49%)
62//! - Detailed reasoning for each classification
63//! - Top ephemeral entities by confidence
64//! - Pattern statistics and mount path analysis
65//!
66//! # JSON Output Fields
67//!
68//! When using `--output`, each entity record includes:
69//! - `entity_id`: Vault entity identifier
70//! - `display_name`: Human-readable name
71//! - `first_seen_file` / `first_seen_time`: When first observed
72//! - `last_seen_file` / `last_seen_time`: When last observed
73//! - `files_appeared`: List of log files entity was active in
74//! - `total_logins`: Total login count across all files
75//! - `lifecycle`: Entity lifecycle classification
76//! - `activity_pattern`: Behavioral pattern classification
77//! - `is_ephemeral_pattern`: Boolean flag for ephemeral detection
78//! - `ephemeral_confidence`: Confidence score (0.0-1.0)
79//! - `ephemeral_reasons`: Array of human-readable reasons
80//!
81//! Only tracks entities that performed login operations (paths ending in `/login`).
82
83use crate::audit::types::AuditEntry;
84use crate::utils::progress::ProgressBar;
85use crate::utils::reader::open_file;
86use anyhow::{Context, Result};
87use chrono::{DateTime, Utc};
88use serde::{Deserialize, Serialize};
89use std::collections::{HashMap, HashSet};
90use std::fs::File;
91use std::io::{BufRead, BufReader};
92use std::path::Path;
93
/// Entity mapping from baseline CSV files
///
/// Deserialized from the JSON map produced by `preprocess-entities`, keyed by
/// entity id (see `load_entity_mappings`). `first_seen`, `last_seen` and
/// `login_count` feed the `historical_*` enrichment fields on
/// `EntityChurnRecord`; the `#[allow(dead_code)]` markers appear stale for
/// those three — they are read when building records.
#[derive(Debug, Serialize, Deserialize)]
struct EntityMapping {
    // Human-readable name captured from historical audit logs.
    display_name: String,
    // Auth mount path the entity historically logged in through.
    mount_path: String,
    // Accessor of the auth mount; carried for completeness, not read here.
    #[allow(dead_code)]
    mount_accessor: String,
    // Historical login total from the preprocessing run.
    #[allow(dead_code)]
    login_count: usize,
    // Historical first/last observation timestamps (kept as strings).
    #[allow(dead_code)]
    first_seen: String,
    #[allow(dead_code)]
    last_seen: String,
}
108
/// Represents an entity's churn status
///
/// One record per entity id, accumulated across all log files during the
/// first pass; the classification fields (`activity_pattern`, ephemeral
/// flags) are filled in by `EphemeralPatternAnalyzer` in the second pass.
#[derive(Debug, Serialize, Clone)]
struct EntityChurnRecord {
    entity_id: String,
    // Display name from the first login observed for this entity.
    display_name: String,
    // Populated from `request.path` of the login call, so this holds the
    // full login path (ending in "/login"), not just the mount prefix.
    mount_path: String,
    mount_type: String,
    token_type: String,
    // File / timestamp of the first observed login.
    first_seen_file: String,
    first_seen_time: DateTime<Utc>,
    // File / timestamp of the most recent observed login.
    last_seen_file: String,
    last_seen_time: DateTime<Utc>,
    // Distinct file names the entity logged in from, in processing order.
    files_appeared: Vec<String>,
    total_logins: usize,
    lifecycle: String, // "new_day_N", "pre_existing_baseline", "pre_existing_or_new_day_1"
    activity_pattern: String, // "consistent", "sporadic", "declining", "single_burst", "unknown"
    is_ephemeral_pattern: bool,
    ephemeral_confidence: f32, // 0.0 to 1.0
    ephemeral_reasons: Vec<String>,
    // Baseline metadata (if entity existed in baseline)
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_entity_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_created: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_alias_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_mount_path: Option<String>,
    // Entity-map metadata (from historical audit logs via preprocess-entities)
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_display_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_first_seen: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_last_seen: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_login_count: Option<usize>,
}
147
/// CSV-compatible representation of entity churn record
///
/// Flat, mostly-string mirror of `EntityChurnRecord`: timestamps become
/// RFC 3339 strings, list fields are joined into single delimited strings,
/// and every `Option` collapses to an empty string so each row serializes
/// with the same columns. Built via the `From<EntityChurnRecord>` impl.
#[derive(Debug, Serialize)]
struct EntityChurnRecordCsv {
    entity_id: String,
    display_name: String,
    mount_path: String,
    mount_type: String,
    token_type: String,
    first_seen_file: String,
    first_seen_time: String,
    last_seen_file: String,
    last_seen_time: String,
    files_appeared: String, // Comma-separated list
    // Number of distinct files the entity appeared in.
    days_active: usize,
    total_logins: usize,
    lifecycle: String,
    activity_pattern: String,
    is_ephemeral_pattern: bool,
    ephemeral_confidence: f32,
    ephemeral_reasons: String, // Semicolon-separated list
    // Optional baseline/historical fields: empty string when absent.
    baseline_entity_name: String,
    baseline_created: String,
    baseline_alias_name: String,
    baseline_mount_path: String,
    historical_display_name: String,
    historical_first_seen: String,
    historical_last_seen: String,
    historical_login_count: String,
}
177
178impl From<EntityChurnRecord> for EntityChurnRecordCsv {
179    fn from(record: EntityChurnRecord) -> Self {
180        EntityChurnRecordCsv {
181            entity_id: record.entity_id,
182            display_name: record.display_name,
183            mount_path: record.mount_path,
184            mount_type: record.mount_type,
185            token_type: record.token_type,
186            first_seen_file: record.first_seen_file,
187            first_seen_time: record.first_seen_time.to_rfc3339(),
188            last_seen_file: record.last_seen_file,
189            last_seen_time: record.last_seen_time.to_rfc3339(),
190            files_appeared: record.files_appeared.join(", "),
191            days_active: record.files_appeared.len(),
192            total_logins: record.total_logins,
193            lifecycle: record.lifecycle,
194            activity_pattern: record.activity_pattern,
195            is_ephemeral_pattern: record.is_ephemeral_pattern,
196            ephemeral_confidence: record.ephemeral_confidence,
197            ephemeral_reasons: record.ephemeral_reasons.join("; "),
198            baseline_entity_name: record.baseline_entity_name.unwrap_or_default(),
199            baseline_created: record.baseline_created.unwrap_or_default(),
200            baseline_alias_name: record.baseline_alias_name.unwrap_or_default(),
201            baseline_mount_path: record.baseline_mount_path.unwrap_or_default(),
202            historical_display_name: record.historical_display_name.unwrap_or_default(),
203            historical_first_seen: record.historical_first_seen.unwrap_or_default(),
204            historical_last_seen: record.historical_last_seen.unwrap_or_default(),
205            historical_login_count: record
206                .historical_login_count
207                .map(|n| n.to_string())
208                .unwrap_or_default(),
209        }
210    }
211}
212
/// Per-log-file summary gathered during the first pass, printed in the
/// "Daily Breakdown" section of the final report.
#[derive(Debug)]
struct DailyStats {
    // Name of the source log file (not read after construction).
    #[allow(dead_code)]
    file_name: String,
    // Entities seen for the first time (across all files) in this file.
    new_entities: usize,
    // Distinct entities already known from an earlier file.
    returning_entities: usize,
    // Number of login requests observed in this file.
    total_logins: usize,
}
221
/// Analyzes entity behavior patterns to detect ephemeral entities
///
/// Built once per run: `learn_from_entities` harvests patterns from
/// short-lived entities, then `analyze_entity` scores every entity against
/// that learned set.
#[derive(Debug)]
struct EphemeralPatternAnalyzer {
    // Number of log files in the analysis window (one per day).
    total_files: usize,
    // Patterns harvested from entities active on at most 2 days.
    short_lived_patterns: Vec<ShortLivedPattern>,
}
228
/// Naming/mount fingerprint of one short-lived entity (active <= 2 days),
/// used as evidence when scoring other entities for ephemerality.
#[derive(Debug)]
struct ShortLivedPattern {
    // How many files (days) the source entity appeared in (1 or 2).
    days_active: usize,
    // Display name; the prefix before the first ':' is used for matching.
    display_name: String,
    // Mount path; exact equality counts as a pattern match.
    mount_path: String,
}
235
236impl EphemeralPatternAnalyzer {
237    fn new(total_files: usize) -> Self {
238        Self {
239            total_files,
240            short_lived_patterns: Vec::new(),
241        }
242    }
243
244    /// Learn patterns from entities that appeared 1-2 days (potential ephemeral patterns)
245    fn learn_from_entities(&mut self, entities: &HashMap<String, EntityChurnRecord>) {
246        for entity in entities.values() {
247            let days_active = entity.files_appeared.len();
248
249            // Learn from entities that appeared 1-2 days only
250            if days_active <= 2 {
251                self.short_lived_patterns.push(ShortLivedPattern {
252                    days_active,
253                    display_name: entity.display_name.clone(),
254                    mount_path: entity.mount_path.clone(),
255                });
256            }
257        }
258    }
259
260    /// Analyze an entity and determine if it matches ephemeral patterns
261    fn analyze_entity(&self, entity: &EntityChurnRecord) -> (bool, f32, Vec<String>) {
262        let days_active = entity.files_appeared.len();
263        let mut confidence = 0.0;
264        let mut reasons = Vec::new();
265
266        // Strong indicators (high confidence)
267        if days_active == 1 {
268            confidence += 0.5;
269            reasons.push(format!("Appeared only 1 day ({})", entity.first_seen_file));
270        } else if days_active == 2 {
271            confidence += 0.3;
272            reasons.push(format!(
273                "Appeared only 2 days: {}, {}",
274                entity.files_appeared.first().unwrap_or(&String::new()),
275                entity.files_appeared.last().unwrap_or(&String::new())
276            ));
277        }
278
279        // Pattern matching: Check if display name follows patterns seen in other short-lived entities
280        if days_active <= 2 {
281            // Count how many other short-lived entities share similar patterns
282            let similar_count = self
283                .short_lived_patterns
284                .iter()
285                .filter(|p| {
286                    // Same mount path
287                    if p.mount_path == entity.mount_path && p.days_active <= 2 {
288                        return true;
289                    }
290                    // Similar naming pattern (e.g., github-repo:* or airflow-*)
291                    if entity.display_name.contains(':') && p.display_name.contains(':') {
292                        let entity_prefix = entity.display_name.split(':').next().unwrap_or("");
293                        let pattern_prefix = p.display_name.split(':').next().unwrap_or("");
294                        if entity_prefix == pattern_prefix && !entity_prefix.is_empty() {
295                            return true;
296                        }
297                    }
298                    false
299                })
300                .count();
301
302            if similar_count > 5 {
303                confidence += 0.2;
304                reasons.push(format!(
305                    "Matches pattern seen in {} other short-lived entities",
306                    similar_count
307                ));
308            } else if similar_count > 0 {
309                confidence += 0.1;
310                reasons.push(format!(
311                    "Similar to {} other short-lived entities",
312                    similar_count
313                ));
314            }
315        }
316
317        // Low activity indicator
318        if entity.total_logins <= 5 && days_active <= 2 {
319            confidence += 0.1;
320            reasons.push(format!(
321                "Low activity: only {} login(s)",
322                entity.total_logins
323            ));
324        }
325
326        // Non-continuous appearance (sporadic pattern suggests not churned, just periodic)
327        if days_active >= 2 {
328            let first_day_idx = entity.files_appeared.first().and_then(|f| {
329                f.split('_')
330                    .next_back()
331                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
332            });
333            let last_day_idx = entity.files_appeared.last().and_then(|f| {
334                f.split('_')
335                    .next_back()
336                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
337            });
338
339            if let (Some(first), Some(last)) = (first_day_idx, last_day_idx) {
340                let span = last - first + 1;
341                if span > days_active {
342                    // Gaps in activity - reduce confidence
343                    confidence *= 0.7;
344                    reasons.push(
345                        "Has gaps in activity (possibly sporadic access, not churned)".to_string(),
346                    );
347                }
348            }
349        }
350
351        // Cap confidence and determine ephemeral status
352        confidence = f32::min(confidence, 1.0);
353        let is_ephemeral = confidence >= 0.4; // Threshold for classification
354
355        // Add absence indicator if not seen in recent files
356        if is_ephemeral && days_active < self.total_files {
357            reasons.push(format!(
358                "Not seen in most recent {} file(s)",
359                self.total_files - days_active
360            ));
361        }
362
363        (is_ephemeral, confidence, reasons)
364    }
365
366    /// Determine activity pattern based on appearance across files
367    fn classify_activity_pattern(&self, entity: &EntityChurnRecord) -> String {
368        let days_active = entity.files_appeared.len();
369
370        if days_active == 1 {
371            return "single_burst".to_string();
372        }
373
374        if days_active == self.total_files {
375            return "consistent".to_string();
376        }
377
378        if days_active >= (self.total_files * 2) / 3 {
379            return "consistent".to_string();
380        }
381
382        // Check if activity is declining (appeared early but stopped)
383        if let (Some(_first_file), Some(last_file)) =
384            (entity.files_appeared.first(), entity.files_appeared.last())
385        {
386            // Simple heuristic: if last seen was in first half of files, it's declining
387            let last_file_num = last_file
388                .split('_')
389                .next_back()
390                .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
391                .unwrap_or(self.total_files);
392
393            if last_file_num < self.total_files / 2 {
394                return "declining".to_string();
395            }
396        }
397
398        if days_active <= 2 {
399            return "single_burst".to_string();
400        }
401
402        "sporadic".to_string()
403    }
404}
405
/// Render a count with comma thousands separators (e.g. 1234567 -> "1,234,567").
fn format_number(n: usize) -> String {
    let digits = n.to_string();
    let len = digits.len();
    // One extra byte per comma keeps this to a single allocation.
    let mut out = String::with_capacity(len + len / 3);
    for (pos, ch) in digits.chars().enumerate() {
        // A comma goes before this digit whenever the number of digits still
        // to come (including this one) is a positive multiple of three.
        if pos > 0 && (len - pos) % 3 == 0 {
            out.push(',');
        }
        out.push(ch);
    }
    out
}
417
418fn get_file_size(path: &str) -> Result<u64> {
419    Ok(std::fs::metadata(path)?.len())
420}
421
422fn load_entity_mappings(path: &str) -> Result<HashMap<String, EntityMapping>> {
423    let file = File::open(path).context("Failed to open entity map file")?;
424    let mappings: HashMap<String, EntityMapping> =
425        serde_json::from_reader(file).context("Failed to parse entity map JSON")?;
426    Ok(mappings)
427}
428
/// One entity row from a baseline export (JSON or CSV) as produced by the
/// `entity-list` command.
///
/// Every field except `entity_id` is `#[serde(default)]`, so partial exports
/// (e.g. alias-only CSV rows) still deserialize with empty strings.
#[derive(Debug, Deserialize, Clone)]
#[allow(dead_code)]
struct BaselineEntity {
    entity_id: String,
    // Fields from entity-list (Vault API) - full metadata
    #[serde(default)]
    entity_name: String,
    #[serde(default)]
    entity_disabled: bool,
    #[serde(default)]
    entity_created: String,
    #[serde(default)]
    entity_updated: String,
    // Alias fields: an entity can have multiple aliases, one per auth mount.
    #[serde(default)]
    alias_id: String,
    #[serde(default)]
    alias_name: String,
    #[serde(default)]
    mount_path: String,
    #[serde(default)]
    mount_type: String,
    #[serde(default)]
    mount_accessor: String,
    #[serde(default)]
    alias_created: String,
    #[serde(default)]
    alias_updated: String,
    #[serde(default)]
    alias_metadata: String,
}
459
460impl BaselineEntity {
461    /// Get the best available name (entity_name if available, otherwise alias_name)
462    fn get_name(&self) -> String {
463        if !self.entity_name.is_empty() {
464            self.entity_name.clone()
465        } else if !self.alias_name.is_empty() {
466            self.alias_name.clone()
467        } else {
468            String::new()
469        }
470    }
471
472    /// Get the entity creation time
473    fn get_created(&self) -> String {
474        self.entity_created.clone()
475    }
476}
477
478fn load_baseline_entities(path: &str) -> Result<HashMap<String, BaselineEntity>> {
479    let file = File::open(path).context("Failed to open baseline entities file")?;
480
481    // Check if it's JSON or CSV based on file extension
482    let path_lower = path.to_lowercase();
483    if path_lower.ends_with(".json") {
484        // JSON format from entity-list with --format json
485        let entities: Vec<BaselineEntity> =
486            serde_json::from_reader(file).context("Failed to parse baseline entities JSON")?;
487        Ok(entities
488            .into_iter()
489            .map(|e| (e.entity_id.clone(), e))
490            .collect())
491    } else {
492        // CSV format (default entity-list output)
493        let mut reader = csv::Reader::from_reader(file);
494        let mut entities = HashMap::new();
495
496        for result in reader.deserialize() {
497            let entity: BaselineEntity = result.context("Failed to parse baseline CSV row")?;
498            // Use first occurrence of each entity_id (entities can have multiple aliases)
499            entities.entry(entity.entity_id.clone()).or_insert(entity);
500        }
501
502        Ok(entities)
503    }
504}
505
506pub fn run(
507    log_files: &[String],
508    entity_map: Option<&str>,
509    baseline_entities: Option<&str>,
510    output: Option<&str>,
511    format: Option<&str>,
512) -> Result<()> {
513    println!("\n=== Multi-Day Entity Churn Analysis ===\n");
514    println!("Analyzing {} log files:", log_files.len());
515    for (i, file) in log_files.iter().enumerate() {
516        let size = get_file_size(file)?;
517        println!(
518            "  Day {}: {} ({:.2} GB)",
519            i + 1,
520            file,
521            size as f64 / 1_000_000_000.0
522        );
523    }
524    println!();
525
526    // Load baseline entities if provided
527    let baseline = if let Some(path) = baseline_entities {
528        println!(
529            "Loading baseline entity list (Vault API metadata) from {}...",
530            path
531        );
532        let baseline_set = load_baseline_entities(path)?;
533        println!(
534            "Loaded {} pre-existing entities from Vault API baseline",
535            format_number(baseline_set.len())
536        );
537        println!();
538        Some(baseline_set)
539    } else {
540        println!("No baseline entity list provided. Cannot distinguish truly NEW entities from pre-existing.");
541        println!("   All Day 1 entities will be marked as 'pre_existing_or_new_day_1'.");
542        println!("   To get accurate results, run: ./vault-audit entity-list --output baseline_entities.json\n");
543        None
544    };
545
546    // Load entity mappings if provided (historical data from audit logs)
547    let entity_mappings = if let Some(path) = entity_map {
548        println!(
549            "Loading historical entity mappings (audit log enrichment) from {}...",
550            path
551        );
552        let mappings = load_entity_mappings(path)?;
553        println!(
554            "Loaded {} entity mappings with historical audit log data",
555            format_number(mappings.len())
556        );
557        println!();
558        Some(mappings)
559    } else {
560        None
561    };
562
563    // Track all entities across all files
564    let mut entities: HashMap<String, EntityChurnRecord> = HashMap::new();
565    let mut daily_stats: Vec<DailyStats> = Vec::new();
566
567    // Process each log file in order
568    for (file_idx, log_file) in log_files.iter().enumerate() {
569        let file_name = Path::new(log_file)
570            .file_name()
571            .unwrap()
572            .to_string_lossy()
573            .to_string();
574
575        println!("\nProcessing Day {} ({})...", file_idx + 1, file_name);
576
577        let file = open_file(log_file)
578            .with_context(|| format!("Failed to open log file: {}", log_file))?;
579        let file_size = get_file_size(log_file)? as usize;
580
581        let reader = BufReader::new(file);
582        let mut progress = ProgressBar::new(file_size, "Processing");
583
584        let mut new_entities_this_file = 0;
585        let mut returning_entities_this_file = HashSet::new();
586        let mut logins_this_file = 0;
587        let mut bytes_processed = 0;
588
589        for line in reader.lines() {
590            let line = line.context("Failed to read line from log file")?;
591            bytes_processed += line.len() + 1; // +1 for newline
592
593            // Update progress periodically
594            if bytes_processed % 10_000 == 0 {
595                progress.update(bytes_processed.min(file_size));
596            }
597
598            let trimmed = line.trim();
599            if trimmed.is_empty() {
600                continue;
601            }
602
603            let entry: AuditEntry = match serde_json::from_str(trimmed) {
604                Ok(e) => e,
605                Err(_) => continue,
606            };
607
608            // Only process login operations (auth paths ending in /login)
609            let Some(ref request) = entry.request else {
610                continue;
611            };
612            let Some(ref path) = request.path else {
613                continue;
614            };
615            if !path.ends_with("/login") {
616                continue;
617            }
618
619            logins_this_file += 1;
620
621            // Extract entity info
622            let Some(ref auth) = entry.auth else {
623                continue;
624            };
625            let Some(ref entity_id) = auth.entity_id else {
626                continue;
627            };
628
629            let display_name = auth
630                .display_name
631                .clone()
632                .unwrap_or_else(|| entity_id.clone());
633            let mount_path = request.path.clone().unwrap_or_default();
634            let mount_type = request.mount_type.clone().unwrap_or_default();
635            let token_type = auth.token_type.clone().unwrap_or_default();
636
637            // Parse timestamp
638            let first_seen_time = chrono::DateTime::parse_from_rfc3339(&entry.time)
639                .ok()
640                .map(|dt| dt.with_timezone(&Utc))
641                .unwrap_or_else(Utc::now);
642
643            // Check if this entity exists from a previous file
644            if let Some(entity_record) = entities.get_mut(entity_id) {
645                // Returning entity
646                entity_record.total_logins += 1;
647                entity_record.last_seen_file = file_name.clone();
648                entity_record.last_seen_time = first_seen_time;
649                if !entity_record.files_appeared.contains(&file_name) {
650                    entity_record.files_appeared.push(file_name.clone());
651                }
652                returning_entities_this_file.insert(entity_id.clone());
653            } else {
654                // New entity (first time across all files)
655                new_entities_this_file += 1;
656
657                // Determine lifecycle based on baseline and which file this is
658                let lifecycle = if let Some(ref baseline_set) = baseline {
659                    if baseline_set.contains_key(entity_id) {
660                        "pre_existing_baseline".to_string()
661                    } else {
662                        // Not in baseline, so truly NEW during analysis period
663                        match file_idx {
664                            0 => "new_day_1".to_string(),
665                            1 => "new_day_2".to_string(),
666                            2 => "new_day_3".to_string(),
667                            _ => format!("new_day_{}", file_idx + 1),
668                        }
669                    }
670                } else {
671                    // No baseline provided, can't distinguish
672                    match file_idx {
673                        0 => "pre_existing_or_new_day_1".to_string(),
674                        1 => "new_day_2".to_string(),
675                        2 => "new_day_3".to_string(),
676                        _ => format!("new_day_{}", file_idx + 1),
677                    }
678                };
679
680                // Get baseline metadata if entity exists in baseline
681                let (
682                    baseline_entity_name,
683                    baseline_created,
684                    baseline_alias_name,
685                    baseline_mount_path,
686                ) = if let Some(ref baseline_map) = baseline {
687                    if let Some(baseline_entity) = baseline_map.get(entity_id) {
688                        let name = baseline_entity.get_name();
689                        let created = baseline_entity.get_created();
690                        (
691                            if !name.is_empty() { Some(name) } else { None },
692                            if !created.is_empty() {
693                                Some(created)
694                            } else {
695                                None
696                            },
697                            if !baseline_entity.alias_name.is_empty() {
698                                Some(baseline_entity.alias_name.clone())
699                            } else {
700                                None
701                            },
702                            if !baseline_entity.mount_path.is_empty() {
703                                Some(baseline_entity.mount_path.clone())
704                            } else {
705                                None
706                            },
707                        )
708                    } else {
709                        (None, None, None, None)
710                    }
711                } else {
712                    (None, None, None, None)
713                };
714
715                // Fetch historical data from entity_mappings
716                let (
717                    historical_display_name,
718                    historical_first_seen,
719                    historical_last_seen,
720                    historical_login_count,
721                ) = if let Some(ref mappings) = entity_mappings {
722                    if let Some(mapping) = mappings.get(entity_id) {
723                        (
724                            Some(mapping.display_name.clone()),
725                            Some(mapping.first_seen.clone()),
726                            Some(mapping.last_seen.clone()),
727                            Some(mapping.login_count),
728                        )
729                    } else {
730                        (None, None, None, None)
731                    }
732                } else {
733                    (None, None, None, None)
734                };
735
736                entities.insert(
737                    entity_id.clone(),
738                    EntityChurnRecord {
739                        entity_id: entity_id.clone(),
740                        display_name: display_name.clone(),
741                        mount_path: mount_path.clone(),
742                        mount_type: mount_type.clone(),
743                        token_type: token_type.clone(),
744                        first_seen_file: file_name.clone(),
745                        first_seen_time,
746                        last_seen_file: file_name.clone(),
747                        last_seen_time: first_seen_time,
748                        files_appeared: vec![file_name.clone()],
749                        total_logins: 1,
750                        lifecycle,
751                        activity_pattern: "unknown".to_string(), // Will be computed in second pass
752                        is_ephemeral_pattern: false,             // Will be computed in second pass
753                        ephemeral_confidence: 0.0,               // Will be computed in second pass
754                        ephemeral_reasons: Vec::new(),           // Will be computed in second pass
755                        baseline_entity_name,
756                        baseline_created,
757                        baseline_alias_name,
758                        baseline_mount_path,
759                        historical_display_name,
760                        historical_first_seen,
761                        historical_last_seen,
762                        historical_login_count,
763                    },
764                );
765            }
766        }
767
768        progress.finish();
769
770        daily_stats.push(DailyStats {
771            file_name,
772            new_entities: new_entities_this_file,
773            returning_entities: returning_entities_this_file.len(),
774            total_logins: logins_this_file,
775        });
776
777        println!(
778            "Day {} Summary: {} new entities, {} returning, {} logins",
779            file_idx + 1,
780            format_number(new_entities_this_file),
781            format_number(returning_entities_this_file.len()),
782            format_number(logins_this_file)
783        );
784    }
785
    // === SECOND PASS: Analyze patterns and classify entities ===
    // Two-step, data-driven classification: learn ephemeral naming/activity
    // patterns from the collected entities, then score every entity against
    // those learned patterns.
    println!("\nAnalyzing entity behavior patterns...");

    let mut analyzer = EphemeralPatternAnalyzer::new(log_files.len());

    // Step 1: Learn patterns from short-lived entities
    analyzer.learn_from_entities(&entities);
    println!(
        "Learned from {} short-lived entity patterns",
        format_number(analyzer.short_lived_patterns.len())
    );

    // Step 2: Classify all entities using learned patterns
    // Keys are collected first so the map can be mutably borrowed per entity
    // while the analyzer (which borrowed `entities` immutably above) is reused.
    let entity_ids: Vec<String> = entities.keys().cloned().collect();
    for entity_id in entity_ids {
        if let Some(entity) = entities.get_mut(&entity_id) {
            // Classify activity pattern
            entity.activity_pattern = analyzer.classify_activity_pattern(entity);

            // Analyze for ephemeral patterns; fills in the placeholder fields
            // set during the first pass.
            let (is_ephemeral, confidence, reasons) = analyzer.analyze_entity(entity);
            entity.is_ephemeral_pattern = is_ephemeral;
            entity.ephemeral_confidence = confidence;
            entity.ephemeral_reasons = reasons;
        }
    }
812
    // Generate final report
    println!("\n=== Entity Churn Analysis ===\n");

    // Daily breakdown: one line per input log file, in input order.
    println!("Daily Breakdown:");
    for (idx, stats) in daily_stats.iter().enumerate() {
        println!(
            "  Day {}: {} new, {} returning, {} total logins",
            idx + 1,
            format_number(stats.new_entities),
            format_number(stats.returning_entities),
            format_number(stats.total_logins)
        );
    }

    // Lifecycle classification: count entities per lifecycle label, and build
    // a histogram of how many distinct files each entity appeared in.
    let mut lifecycle_counts: HashMap<String, usize> = HashMap::new();
    let mut entities_by_file_count: HashMap<usize, usize> = HashMap::new();

    for entity in entities.values() {
        *lifecycle_counts
            .entry(entity.lifecycle.clone())
            .or_insert(0) += 1;
        *entities_by_file_count
            .entry(entity.files_appeared.len())
            .or_insert(0) += 1;
    }

    println!("\nEntity Lifecycle Classification:");
    // Sort alphabetically by lifecycle label for stable output.
    let mut lifecycle_vec: Vec<_> = lifecycle_counts.iter().collect();
    lifecycle_vec.sort_by_key(|(k, _)| *k);
    for (lifecycle, count) in lifecycle_vec {
        println!("  {}: {}", lifecycle, format_number(*count));
    }

    // Persistence histogram: how many entities appeared on exactly N days.
    println!("\nEntity Persistence:");
    for day_count in 1..=log_files.len() {
        if let Some(count) = entities_by_file_count.get(&day_count) {
            let label = if day_count == 1 {
                // NOTE(review): when only one log file is analyzed, this branch
                // wins over the "all days" branch below — presumably acceptable,
                // but confirm the intended label for single-file runs.
                "Appeared 1 day only"
            } else if day_count == log_files.len() {
                "Appeared all days (persistent)"
            } else {
                "Appeared some days"
            };
            println!(
                "  {} day(s): {} entities ({})",
                day_count,
                format_number(*count),
                label
            );
        }
    }
865
    // Activity pattern analysis: tally entities per activity-pattern label and
    // collect the entities flagged as ephemeral for the detailed report below.
    let mut activity_pattern_counts: HashMap<String, usize> = HashMap::new();
    let mut ephemeral_entities = Vec::new();

    for entity in entities.values() {
        *activity_pattern_counts
            .entry(entity.activity_pattern.clone())
            .or_insert(0) += 1;

        if entity.is_ephemeral_pattern {
            // Cloned so the list can be sorted independently of the map.
            ephemeral_entities.push(entity.clone());
        }
    }

    println!("\nActivity Pattern Distribution:");
    // Most common pattern first.
    let mut pattern_vec: Vec<_> = activity_pattern_counts.iter().collect();
    pattern_vec.sort_by(|a, b| b.1.cmp(a.1));
    for (pattern, count) in pattern_vec {
        println!("  {}: {}", pattern, format_number(*count));
    }

    println!("\nEphemeral Entity Detection:");
    // NOTE(review): the 0.4 threshold in this message presumably mirrors the
    // cutoff used by `analyze_entity` — confirm against the analyzer.
    println!(
        "  Detected {} likely ephemeral entities (confidence ≥ 0.4)",
        format_number(ephemeral_entities.len())
    );

    if !ephemeral_entities.is_empty() {
        // Sort by confidence, highest first. partial_cmp is needed because
        // confidence is a float; ties (or NaN) compare as Equal.
        ephemeral_entities.sort_by(|a, b| {
            b.ephemeral_confidence
                .partial_cmp(&a.ephemeral_confidence)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        println!("  Top 10 by confidence:");
        for (idx, entity) in ephemeral_entities.iter().take(10).enumerate() {
            println!(
                "    {}. {} (confidence: {:.1}%)",
                idx + 1,
                entity.display_name,
                entity.ephemeral_confidence * 100.0
            );
            // Each reason is a human-readable explanation from the analyzer.
            for reason in &entity.ephemeral_reasons {
                println!("       - {}", reason);
            }
        }

        // Breakdown by confidence ranges: high ≥ 0.7, medium [0.5, 0.7),
        // low [0.4, 0.5). The three bands are disjoint.
        let high_conf = ephemeral_entities
            .iter()
            .filter(|e| e.ephemeral_confidence >= 0.7)
            .count();
        let med_conf = ephemeral_entities
            .iter()
            .filter(|e| e.ephemeral_confidence >= 0.5 && e.ephemeral_confidence < 0.7)
            .count();
        let low_conf = ephemeral_entities
            .iter()
            .filter(|e| e.ephemeral_confidence >= 0.4 && e.ephemeral_confidence < 0.5)
            .count();

        println!("\n  Confidence distribution:");
        println!("    High (≥70%): {}", format_number(high_conf));
        println!("    Medium (50-69%): {}", format_number(med_conf));
        println!("    Low (40-49%): {}", format_number(low_conf));
    }
933
    // Mount path breakdown: per mount path, count entities and remember the
    // mount type (the type of the first entity seen on that path).
    let mut mount_stats: HashMap<String, (usize, String)> = HashMap::new();
    for entity in entities.values() {
        let entry = mount_stats
            .entry(entity.mount_path.clone())
            .or_insert((0, entity.mount_type.clone()));
        entry.0 += 1;
    }

    println!("\nTop Authentication Methods (Total Entities):");
    // Sort mount paths by descending entity count; show the top 20.
    let mut mount_vec: Vec<_> = mount_stats.iter().collect();
    mount_vec.sort_by(|a, b| b.1 .0.cmp(&a.1 .0));

    for (idx, (path, (count, mount_type))) in mount_vec.iter().take(20).enumerate() {
        println!(
            "  {}. {} ({}): {}",
            idx + 1,
            path,
            mount_type,
            format_number(*count)
        );
    }

    // GitHub-specific duplication analysis, only when GitHub-auth entities
    // are present (mount path containing "/github").
    let github_entities: Vec<_> = entities
        .values()
        .filter(|e| e.mount_path.contains("/github"))
        .collect();

    if !github_entities.is_empty() {
        println!("\n=== GitHub Entity Analysis ===");
        println!(
            "Total GitHub entities: {}",
            format_number(github_entities.len())
        );

        // Extract repo names and count duplicates
        let mut repo_counts: HashMap<String, usize> = HashMap::new();
        for entity in &github_entities {
            // Extract repo from "github-repo:org/repo:..." pattern —
            // the second ':'-separated field is the org/repo slug.
            if let Some(repo) = entity.display_name.split(':').nth(1) {
                *repo_counts.entry(repo.to_string()).or_insert(0) += 1;
            }
        }

        println!("Unique repositories: {}", format_number(repo_counts.len()));
        println!("\nTop repositories by entity count:");
        let mut repo_vec: Vec<_> = repo_counts.iter().collect();
        repo_vec.sort_by(|a, b| b.1.cmp(a.1));

        // NOTE(review): `take(20)` runs before the `count > 1` filter, so if
        // singleton repos land in the top 20 (by tie order) fewer than 20
        // duplicated repos are printed — confirm whether filtering should
        // happen before truncation.
        for (idx, (repo, count)) in repo_vec.iter().take(20).enumerate() {
            if **count > 1 {
                println!(
                    "  {}. {}: {} entities",
                    idx + 1,
                    repo,
                    format_number(**count)
                );
            }
        }
    }
995
    // Export to file if requested. This consumes `entities`
    // (`into_values`), so it must remain the last use of the map.
    if let Some(output_path) = output {
        // Export in chronological order of first appearance.
        let mut entities_vec: Vec<_> = entities.into_values().collect();
        entities_vec.sort_by(|a, b| a.first_seen_time.cmp(&b.first_seen_time));

        // Determine format from parameter or file extension; an explicit
        // --format wins, otherwise a ".csv" suffix selects CSV, else JSON.
        let output_format = format.unwrap_or_else(|| {
            if output_path.ends_with(".csv") {
                "csv"
            } else {
                "json"
            }
        });

        println!(
            "\nExporting detailed entity records to {} (format: {})...",
            output_path, output_format
        );

        let output_file = File::create(output_path)
            .with_context(|| format!("Failed to create output file: {}", output_path))?;

        match output_format {
            "csv" => {
                // Each record is converted to the flat CSV-friendly shape
                // before serialization.
                let mut writer = csv::Writer::from_writer(output_file);
                for entity in &entities_vec {
                    let csv_record: EntityChurnRecordCsv = entity.clone().into();
                    writer
                        .serialize(&csv_record)
                        .context("Failed to write CSV record")?;
                }
                // Explicit flush so write errors surface instead of being
                // swallowed on drop.
                writer.flush().context("Failed to flush CSV writer")?;
            }
            _ => {
                // Default to JSON (any format value other than "csv").
                serde_json::to_writer_pretty(output_file, &entities_vec)
                    .context("Failed to write JSON output")?;
            }
        }

        println!(
            "Exported {} entity records",
            format_number(entities_vec.len())
        );
    }

    println!("\n=== Analysis Complete ===\n");
    Ok(())
}