Reading Data

The following Python scripts and functions are used to create data files for each of the mentioned data source types:

  • Python scripts read_SCADA.py and read_SCADA_missing.py
  • Python script read_INAP.py
  • Python function read_expert_tool_data(room_name)

SCADA data

Download read_SCADA.py.

import os
from contextlib import contextmanager
import glob
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

"""
This script is a part of the MOBISTYLE project (2018-2020), namely first step before data analysis for Deliverable D6.2 
at Slovenian demonstration. This script reads the csv files from 'SCADA_data' directory. These files are gathered from 
SFTP server at IRI-UL demo site, contact IRI-UL for further information. 
Resulting pandas DataFrame is converted to csv file. 
It is further used to analyse and plot figures, see file 'Data Analysis, Slovenia.ipynb'.
"""


@contextmanager
def change_dir(destination):
    """
    This f-n reads the subfolder path and then returns back to current working directory (cwd).
    Args: 
        destination (str): Subfolder name (that is located in cwd)
    Returns: 
        Location is returned back to cwd after desired computations are completed
    """
    try:
        cwd = os.getcwd()
        os.chdir(destination)
        yield
    finally:
        os.chdir(cwd)


start = time.perf_counter()

# Read file paths for files in 'SCADA_data' directory
file_paths = []
with change_dir('SCADA_data'):
    for file_name in glob.glob('**/*.csv', recursive=True):
        # read .csv file paths from sub folder
        file_path = os.path.join(os.getcwd(), file_name)
        file_paths.append(file_path)

print('File paths are gathered. Converting to DataFrame...')

# Convert data to DataFrame format and create a list
df_list = [pd.read_csv(file_path, na_values='Bad') for file_path in tqdm(file_paths, desc='File(s) completed')]

print('\nDataFrame list is created. Concatenating...')

# Concatenate DFs
frames = pd.concat(df_list, ignore_index=True, sort=False)

# Create datetime index
frames['Timestamp'] = pd.to_datetime(frames['Timestamp'])
frames.set_index('Timestamp', inplace=True)
frames.sort_index()

# Files folder (data for each room)
os.makedirs('./Files', exist_ok=True)
os.makedirs('./Files/SCADA_files', exist_ok=True)

# Save data as one .csv file
frames.to_csv('./Files/SCADA_files/SCADA_SI.csv')

print('\nSCADA dataFrame is created. Filtering and saving data by room...')

# Outdoor climate data
outdoor_data = frames.filter(
    items=['WEATHERS_TEMP', 'WEATHERS_HUM01', 'WEATHERS_ILL_SOUTH', 'WEATHERS_ILL_EAST', 'WEATHERS_ILL_WEST'])

# Correction for SCADA weather data Temp
outdoor_data['WEATHERS_TEMP'].replace({'Error': np.nan}, inplace=True)
outdoor_data['WEATHERS_TEMP'].astype('float64')

# Separate data for each room
room_ID = ['K1N0623', 'K1N0605', 'R3N0808', 'R3N0644', 'K1N0624', 'K3N0618', 'R2N0805', 'R3N0634']
room_dct = {room_name: frames.filter(regex=room_name) for room_name in room_ID}

# Save df for each room as csv file
for name, val in room_dct.items():
    val.to_csv(f'./Files/SCADA_files/SCADA_{name}.csv')

# Save outdoor climate data
# N.B. There is another outdoor climate source which is used in the analysis due to less missing data
outdoor_data.to_csv('./Files/SCADA_files/outdoor.csv')

finish = time.perf_counter()
print(f'\nFinished in {round(finish-start, 2)} second(s)')

Download read_SCADA_missing.py.

import os
from contextlib import contextmanager
import glob
from datetime import datetime
import numpy as np
import pandas as pd

"""
This script is a part of the MOBISTYLE project (2018-2020), data analysis for Deliverable D6.2 at Slovenian demo-case. 
This script reads the csv files from 'SCADA_data' directory. These files are gathered from SFTP server at IRI-UL,
contact IRI-UL for further information. 
This folder contains .csv files with different columns, thus different way of reading is necessary. 
Resulting pandas DataFrame is stored as csv file.
It is further used to update the SCADA csv data, see file 'Data Analysis, Slovenia.ipynb'.
"""


@contextmanager
def change_dir(destination):
    """
    This f-n reads the subfolder path and then returns back to current working directory (cwd).
    Args: 
        destination (str): Subfolder name (that is located in cwd)
    Returns: 
        Location is returned back to cwd after desired computations are completed
    """
    cwd = os.getcwd()
    try:
        os.chdir(destination)
        yield
    finally:
        os.chdir(cwd)


# Read file paths from 'SCADA_missing' directory
file_paths = []

with change_dir('SCADA_missing'):
    for file_name in glob.glob('**/*.csv', recursive=True):
        
        # read .csv file paths from sub folder
        file_path = os.path.join(os.getcwd(), file_name)
        file_paths.append(file_path)


# Change date format
def date_parser(date):
    try:
        if len(date) < 11:
            return datetime.strptime(date, '%d.%m.%Y')
        else:
            return datetime.strptime(date, '%d.%m.%Y %H:%M:%S')
    except TypeError:
        return pd.NaT


# Convert csv files to DataFrame format and create a list
df_list = []
for file_path in file_paths:
    df_list.append(
        (pd
         .read_csv(file_path,
                   sep=';',
                   decimal=',',
                   header=0,
                   skiprows=[1, 2, 3],
                   parse_dates=['SOURCE:'],
                   date_parser=date_parser)
         .rename(columns={'SOURCE:': 'Timestamp'})
         .drop_duplicates(subset=['Timestamp'])
         .dropna(subset=['Timestamp'])
         .set_index('Timestamp')
         .resample('15T')
         .nearest()
         )
    )

# Concatenate DFs
frame = pd.concat(df_list, sort=False)


# Filter data from frame by room name and store in a dictionary
room_lst = ['K1N0623', 'K3N0605', 'R3N0808', 'R3N0644', 'K1N0624', 'K3N0618', 'R2N0805', 'R2N0634']

missing_scada = {room_name: frame.filter(regex=room_name).dropna(how='all') for room_name in room_lst}

room_dct = dict()

for room_name in room_lst:
    df = missing_scada[room_name].copy()

    # Rename columns as in original SCADA data prepared by read_csv_SL.py and load_scada() function
    col_labels = [name.split('.')[2] for name in df.columns]
    df.columns = col_labels
    df = df.rename(columns={
        f'{room_name}_AV_Temp_prostora': f'{room_name}_CV_TEMP',
        f'{room_name}_AV_Sp_Tpr_Aktivna': f'{room_name}_SP_TEMPR_ACT',
        f'{room_name}_AO_DO4_Ventil_FC_HL': f'{room_name}_FC_HL',
        f'{room_name}_AO_DO5_Ventil_RAD_GR': f'{room_name}_RAD_HV',
        f'{room_name}_AV_Hitrost_ventilator': f'{room_name}_FC_SPEED',
        f'{room_name}_AV_Stikalo_ventilator': f'{room_name}_FC_SWITCH',
        f'{room_name}_AV_Dnevni_rezim': f'{room_name}_MODE',
        f'{room_name}_AV_Temp_Rezim': f'{room_name}_REG_TEMP',
                             }).sort_index()

    # Extract room occupied and window opening data from _MODE column
    # In _MODE column '0' means room is occupied, '4' - window open
    # N.B.: In room K1N0623 _OCC sensor is not working properly
    mask = df[f'{room_name}_MODE'].notna()
    s = df.loc[mask, f'{room_name}_MODE']
    df.loc[mask, f'{room_name}_OCC'] = np.where((s == 0) | (s == 4), 1, 0)
    df.loc[mask, f'{room_name}_WINDOW'] = np.where((s == 4), 1, 0)
    df.loc[mask, f'{room_name}_WINDOW_Openings'] = df.loc[mask, f'{room_name}_WINDOW'].diff()

    room_dct[room_name] = df

# Files folder (to store resulting files for each room)
os.makedirs('./Files', exist_ok=True)
os.makedirs('./Files/SCADA_missing_files', exist_ok=True)

# Save df for each room as csv file
for name, val in room_dct.items():
    val.to_csv(f'./Files/SCADA_missing_files/SCADA_missing_{name}.csv')

INAP data

Download read_INAP.py.

import requests
import pandas as pd
import json
import os
from tqdm import tqdm


def read_inap(room_name='K1N0624', start='2018-02-01', end='2018-03-01'):
    """
    Function to download Indoor Environmental Quality (IEQ) data monitored by INAP sensors in SL demo-case
    using IRI-UL web api. You may need to update the token if the current one is expired (contact IRI-UL).

    Args:
        room_name (str): Room identification.
        start (str): Start date as a date string object.
        end (str): End date as a date string object.

    Returns:
        df (pandas DataFrame object): IEQ data (CO2, temperature, relative humidity, VOC).

    Example:
        df_R3N0808 = read_inap('R3N0808', '2018-02-01', '2018-03-01')

    """

    # Room INAP sensor id dictionary
    sensor_id = {'R3N0808': '00681B5B', 'R3N0644': '00682753', 'K1N0623': '0068224F', 'K3N0605': '0068272C',
                 'R2N0805': '0029DC18', 'R2N0634': '00681B21', 'K1N0624': '000CC736', 'K3N0618': '00681A09'}
    # Selected room sensor id
    ids = sensor_id[room_name]

    # Create URL with correct parameters (contact IRI-UL to get token)
    start_date = f'{start}T00:00:00Z'
    end_date = f'{end}T23:59:59Z'
    token = ''
    url = f'http://52.211.97.129:8080/api/sensors/data?token={token}&id={ids}&startDate={start_date}&endDate={end_date}'

    # Connect to IRI-UL INAP sensor api
    r = requests.get(url, stream=True)

    # Check connection status
    print(f'\nRoom {room_name}: Connection is good!') if r else print('\nAn error has occurred.')

    with open('INAP_chunks.txt', 'wb') as txt:
        for chunk in r.iter_content(chunk_size=1024):

            # writing one chunk at a time to txt file
            if chunk:
                txt.write(chunk)

    with open('INAP_chunks.txt', 'r') as f:
        f_contents = f.readlines()

    # Convert api response to DataFrame object
    lst = json.loads(f_contents[0])
    data = lst[0]['measurements']

    if lst[0]['id'] != ids:
        print('Sensor id in response do not match')

    df = pd.DataFrame(data)
    df.drop(labels=['interupt', 'rgbw', 'score', 'sound', 'type'], axis=1, inplace=True)
    df['timeStamp'] = pd.to_datetime(df['timeStamp'], format='%Y-%m-%d %H:%M:%S')
    df.set_index('timeStamp', inplace=True)
    df.columns = [f'{room_name}_INAP_{col}' for col in df.columns]
    df.index.rename('Timestamp', inplace=True)

    if 'df' in locals():
        os.remove('INAP_chunks.txt')
    else:
        print('DataFrame is not created')

    print(f'Room {room_name}: Data from {start} to {end} is downloaded.')

    return df


# Divide the length of the period and download data one month at a time (to avoid web api closing the connection)
dates1 = pd.date_range(start='2018-04-1', end='2019-03-1', freq='MS').strftime('%Y-%m-%d')
dates2 = pd.date_range(start='2018-04-1', end='2019-03-1', freq='M').strftime('%Y-%m-%d')
date_lst = list(zip(dates1, dates2))

# Files folder (data for each room)
os.makedirs('./Files', exist_ok=True)
os.makedirs('./Files/INAP_files', exist_ok=True)

# Room list to perform data analysis on. Room R2N0805 INAP sensor id is not working
# room_lst = ['K3N0605', 'K1N0623', 'R3N0808', 'R3N0644', 'K1N0624', 'K3N0618', 'R2N0634']
room_lst = ['R2N0805']

# Collect the data and save it for all rooms in room_lst
for name in tqdm(room_lst, desc='Room(s)', position=0, leave=True):
    
    # Append only new data to existing files
    # INAP_data = pd.read_csv(f'./INAP_data/INAP_{name}.csv', parse_dates=True, index_col='Timestamp')
    # df_stack = read_inap(name, '2019-11-01', '2019-12-2').resample('15Min').mean()
    # INAP_data = INAP_data.append(df_stack, sort=False)
    
    # load data from beginning
    INAP_data = pd.concat([read_inap(name, date[0], date[1]) for date in tqdm(date_lst, desc='Data download', position=1, leave=True)])
    INAP_data = INAP_data.resample('15Min').mean()
    INAP_data = INAP_data.interpolate(limit=8, limit_direction='both', limit_area='inside')
    
    INAP_data.to_csv(f'./Files/INAP_files/INAP_{name}.csv')
    print(f'Room {name}: Data is saved.')

MOBISTYLE Expert Tool data