Evolution3DVisualizer/data_processor.py at main · humancuration/Evolution3DVisualizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# data_processor.py

import pandas as pd
from util_math import calculate_genetic_distance
import igraph as ig

def process_data(raw_data):
    """
    Processes raw genetic and taxonomic data.

    Parameters:
        raw_data (list of dict): Raw data entries.

    Returns:
        dict: Processed data with computed genetic distances.
    """
    # Convert raw data to DataFrame for easier manipulation
    df = pd.DataFrame(raw_data)

    # Example: Calculate genetic distance matrix
    # Assuming raw_data has 'species' and 'genetic_marker' columns
    species = df['species'].unique()
    distance_matrix = {}

    for sp1 in species:
        distance_matrix[sp1] = {}
        markers1 = df[df['species'] == sp1]['genetic_marker'].tolist()
        for sp2 in species:
            markers2 = df[df['species'] == sp2]['genetic_marker'].tolist()
            distance = calculate_genetic_distance(markers1, markers2)
            distance_matrix[sp1][sp2] = distance

    processed_data = {
        'species': species,
        'distance_matrix': distance_matrix,
        'raw_dataframe': df
    }

    return processed_data

def generate_tree(processed_data):
    species = processed_data['species']
    edges = []
    weights = []
    for sp1 in species:
        for sp2 in species:
            if sp1 != sp2:
                distance = processed_data['distance_matrix'][sp1][sp2]
                edges.append((sp1, sp2))
                weights.append(distance)
    g = ig.Graph.TupleList(edges, directed=False)
    g.es['weight'] = weights
    return g

def process_data_in_chunks(file_path, chunk_size=1000):
    # Assuming data is in CSV format
    chunks = pd.read_csv(file_path, chunksize=chunk_size)
    for chunk in chunks:
        # Process each chunk
        process_chunk(chunk)

def process_chunk(chunk):
    # Implement the logic to process each chunk
    # This could involve calculating distances, updating a global data structure, etc.
    pass

def load_data_generator(file_path):
    with open(file_path, 'r') as f:
        for line in f:
            yield process_line(line)

def process_line(line):
    # Implement the logic to process a single line of data
    # This could involve parsing the line, extracting relevant information, etc.
    pass