Source code for libra_py.CP2K_methods

#*********************************************************************************
#* Copyright (C) 2020 Mohammad Shakiba, Brendan Smith, Alexey V. Akimov
#* This file is distributed under the terms of the GNU General Public License
#* as published by the Free Software Foundation, either version 3 of
#* the License, or (at your option) any later version.
#* See the file LICENSE in the root directory of this distribution
#* or <http://www.gnu.org/licenses/>.
#*
#*********************************************************************************/
"""
.. module:: CP2K_methods
   :platform: Unix, Windows
   :synopsis: This module implements functions for processing the CP2K inputs and outputs.
.. moduleauthors:: 
       Mohammad Shakiba, Brendan Smith, Alexey V. Akimov 
  
"""


import os
import sys
import math
import re
import numpy as np

if sys.platform=="cygwin":
    from cyglibra_core import *
elif sys.platform=="linux" or sys.platform=="linux2":
    from liblibra_core import *


import util.libutil as comn   


[docs]def ndigits( integer_number: int ):
    """
    This function calculates the number of digits for an integer number.

    Args:

        integer_number ( integer ): An integer number.

    Returns:
 
        digit_num ( integer ): The number of digits for 'num'.
    """

    # Setting up the digit number counter
    digit_num = 0

    while( integer_number > 0 ):

        digit_num = digit_num+1
        integer_number = integer_number//10
        
    return digit_num



[docs]def state_num_cp2k(state_num: int):
    """
    This function turns an integer number to a string. This string 
    has the format in five digits and if the number of digits is less than
    five it will append zeros before the number. This is necessary to read
    the .cube files produced by CP2K.

    Args:

        state_num (integer): Integer number which is in fact the energy level.

    Returns:

        state_num_str (string): The number in five digits (or more) in string format.

    """
    # Setting up an initial string
    state_num_str = ''

    # Now count the number of digits for the state sumber
    # and then add 0 before the number in the string

    if ndigits( state_num ) == 1:

        state_num_str = '0000%d' % state_num

    elif ndigits( state_num ) == 2:

        state_num_str = '000%d' % state_num

    elif ndigits( state_num ) == 3:

        state_num_str = '00%d' % state_num

    elif ndigits( state_num ) == 4:

        state_num_str = '0%d' % state_num

    else:
        state_num_str = '%d' % state_num
    
    return state_num_str




[docs]def cube_file_names_cp2k( params ):
    """
    This function will produce the cube file names produced by CP2K both for energy calculations
    and molecular dynamics. 

    Args:

        params( dictionary ): contains dynamical paramters
	
            project_name ( string ): The project name used in CP2K input file.
	    
            min_band ( integer ): The minimum state number to be considered.
	    
            max_band ( integer ): The maximum state number to be considered.
	    
            curr_step ( integer ): The time current step in CP2K calculations.
	    
            isUKS ( int ): 1 if spin-polarized calculations used

    Returns:

        names ( list ): The list of cube file names.
	
    """
    # Critical parameters
    critical_params = [ "project_name", "min_band", "max_band", "curr_step" ]
    # Default parameters
    default_params = { "isUKS":0 }
    # Check input
    comn.check_input(params, default_params, critical_params) 
  
    project_name = params["project_name"]
    min_band  = params["min_band"]
    max_band  = params["max_band"]
    curr_step = params["curr_step"]
    isUKS = params["isUKS"]
    es_software = params["es_software"] 

    # isUKS needs to be epressed as a string, even though it is used as a Boolean in other parts of the workflows
    spin = []
    if isUKS == 1:
        spin = [1,2]
    else:
        spin = [1]

    # Define the names of the cube files for time t.
    cube_file_names = []   

    for i in range( min_band, max_band+1 ):

        # Using the function state_num_cp2k to obtain the names of the .cube files for state 'i'.
        state_num = state_num_cp2k( i )
 
        # For the current time step "curr_step"
        if isUKS == 1:

            if es_software == "cp2k" or es_software == "gaussian":

                cube_file_name_of_band_i = str('cubefiles/%s-%d-WFN_%s_%d-1_0.cube' % ( project_name, curr_step, state_num, spin[0] ) )
                cube_file_name_of_band_j = str('cubefiles/%s-%d-WFN_%s_%d-1_0.cube' % ( project_name, curr_step, state_num, spin[1] ) )
                cube_file_names.append( cube_file_name_of_band_i )
                cube_file_names.append( cube_file_name_of_band_j )

            elif software == "dftb+":
                print("\nReading cubefiles from spin-polarized calculations not currently available")
                print("Exiting now")
                sys.exit(0)
 
        else:

            # Using the function state_num_cp2k to obtain the names of the .cube files for state 'i'.
            state_num = state_num_cp2k( i )
            cube_file_name_of_band_i = str('cubefiles/%s-%d-WFN_%s_%d-1_0.cube' % ( project_name, curr_step, state_num, spin[0] ) )
            cube_file_names.append( cube_file_name_of_band_i )
		
    return cube_file_names





[docs]def read_cp2k_tddfpt_log_file( params ):
    """
    This function reads log files generated from TD-DFPT calculations using CP2K and returns the TD-DFPT
    excitation energies, Slater determinent states in terms of the Kohn-Sham orbital indicies, 
    and the Slater determinent coefficients that comprise the multi-configurational electronic states
    
    Args:
        params ( dictionary ): parameters dictionary 
	
            logfile_name ( string ): name of the log file for this particular timestep
	    
            number_of_states ( int ): how many ci states to consider
	    
            tolerance ( float ): the tolerance for accepting SDs are members of the CI wavefunctions
	    
            isUKS ( Boolean ): flag for whether or not a spin-polarized Kohn-Sham basis is being used. TRUE means
                                 that a spin-polarized Kohn-Sham basis is being used.
    Returns:
        excitation_energies ( list ): The excitation energies in the log file.
	
        ci_basis ( list ): The CI-basis configuration.
	
        ci_coefficients ( list ): The coefficients of the CI-states.
	
        spin_components (list): The spin component of the excited states.
	
    """

    # Critical parameters
    critical_params = [ "logfile_name", "number_of_states" ]
    # Default parameters
    default_params = { "tolerance":0.05, "isUKS": 0}
    # Check input
    comn.check_input(params, default_params, critical_params)
    
    logfile_name = params["logfile_name"]
    number_of_states = int(params["number_of_states"])
    tolerance = float(params["tolerance"])
    isUKS = int(params["isUKS"])


    f = open( logfile_name, 'r' )
    lines = f.readlines()
    f.close()

    for i in range( 0, len(lines) ):
        tmp_line = lines[i].lower().split()
        if 'excitation' in tmp_line:
            if 'analysis' in tmp_line:

                # When found the line in which contains 'Excitation analysis' in 
                # the log file, append it in the variable exc_anal_line
                exc_anal_line = i

        if 'states' in tmp_line:
            if 'multiplicity' in tmp_line:

                # Here we search for the line that contains 'R-TDDFPT states of multiplicity 1'
                # or 'U-TDDFPT states of multiplicity 1' in the log file
                r_tddfpt_line = i

    excitation_energies = []

    # Start from 5 lines after finding the line contaning 'R-TDDFPT states of multiplicity 1'
    # This is because they contain the energies from that line.
    for i in range( r_tddfpt_line+5, len( lines ) ):
        tmp_line = lines[i].split()
        if len( tmp_line ) == 0:
            break
        excitation_energies.append( float( tmp_line[2] ) )

    # Start from 5 lines after finding the line contaning 'Excitation analysis'
    # From that point we have the state numbers with their configurations.
    # So, we append the lines which contain only 'State number' and stop
    # whenever we reach to a blank line.
    state_num_lines = []
    for i in range( exc_anal_line+5, len( lines ) ):
        tmp_line = lines[i].split()
        if isUKS == 1:

            if len( tmp_line ) == 0 or '----' in lines[i]:
                state_num_lines.append( i )
                break
            if len( tmp_line )==1 or len( tmp_line )==3:
                state_num_lines.append(i)

        else:

            if len( tmp_line ) == 0 or '----' in lines[i]:
                state_num_lines.append( i )
                break
            if len(lines[i].split())==1:
                state_num_lines.append(i)
            elif len( tmp_line ) > 1:
                if tmp_line[0].isdigit() and not( tmp_line[1].isdigit() ):
                    state_num_lines.append( i )


    # Setting up the CI-basis list and their coefficients list
    ci_basis = []
    ci_coefficients = []
    spin_components = []
    for i in range( number_of_states ):

        # states and their coefficients for each excited state
        tmp_state              = []
        tmp_state_coefficients = []
        tmp_spin = []
        for j in range( state_num_lines[i]+1, state_num_lines[i+1] ):
            
            # Splitting lines[j] into tmp_splitted_line
            tmp_splitted_line = lines[j].split()

            # If we are using the spin-polarized Kohn-Sham basis, then the size of the split line is different
            # from the size of the split line when using the spin-unpolarized Kohn-Sham basis
            if isUKS == 1:

                ci_coefficient = float( tmp_splitted_line[4] )
                if ci_coefficient**2 > tolerance:
                    # We need to remove the paranthesis from the 2nd element of the temporary splitted line
                    tmp_spin.append( tmp_splitted_line[1].replace('(','').replace(')','') )
                    tmp_state.append( [ int( tmp_splitted_line[0] ), int( tmp_splitted_line[2] ) ]  )
                    tmp_state_coefficients.append( ci_coefficient  )

            # Here, we have the spin-unpolarize Kohn-Sham basis
            # For this case, spin-components will just return all alpha
            else:
                ci_coefficient = float( tmp_splitted_line[2] )
                if ci_coefficient**2 > tolerance:
                    tmp_spin.append( "alp" )
                    tmp_state.append( [ int( tmp_splitted_line[0] ), int( tmp_splitted_line[1] ) ]  )
                    tmp_state_coefficients.append( ci_coefficient  )

        # Append the CI-basis and and their coefficients for
        # this state into the ci_basis and ci_coefficients lists
        ci_basis.append( tmp_state )
        ci_coefficients.append( tmp_state_coefficients )
        spin_components.append( tmp_spin )

    return excitation_energies[0:number_of_states], ci_basis, ci_coefficients, spin_components




[docs]def cp2k_distribute( istep, fstep, nsteps_this_job, cp2k_positions, cp2k_input, curr_job_number ):
    """
    Distributes cp2k jobs for trivial parallelization 
    *** Make sure that your cp2k input file has absolute paths to the following input parameters:
        BASIS
        POTENTIAL
        < any other file dependent parameter (dftd3, etc) >
    Args:
    
        istep (int): The initial time step for molecular dynamics trajectory which 
                     the user wants to start the calculations with. This paramter 
                     is set in the input file for submitting the jobs.
        
        fstep (int): The final time step for molecular dynamics trajectory which 
                     the user wants to start the calculations with. This paramter 
                     is set in the input file for submitting the jobs.
     
        nsteps_this_job (int): The number of step for the job to be distributed.
	
        cp2k_positions (str): The full path to the molecular dynamics trajectory xyz file.
	
        curr_job_number (int): The current job number.
    
    Returns:
    
        None
	
    """

    # Now we need to distribute the jobs into job batches
    # First, make a working directory where the calculations will take place
    os.chdir("wd")

    # total number of steps
    nsteps = fstep - istep + 1

    # number of jobs obtained from nsteps/nsteps_this_job
    njobs  = int( nsteps / nsteps_this_job ) 

    # The current step is set to istep
    curr_step = istep

    # Making the job directories in the wd folder like job0, job1, ...
    os.system("mkdir job"+str(curr_job_number)+"")

    # Chnage directory to job folder
    os.chdir("job"+str(curr_job_number)+"")

    # Copy the trajectory xyz file
    os.system("cp ../../"+cp2k_positions+" .")

    # Copy the CP2K sample input
    os.system("cp ../../"+cp2k_input+" .")

    # Go back to the main directory
    os.chdir("../../") 



[docs]def read_trajectory_xyz_file(file_name: str, step: int):
    """
    This function reads the trajectory of a molecular dynamics .xyz file and
    extract the 'step' th step then writes it to coord-step.xyz file. This function
    is used in single point calculations for calculations of the NACs where one has
    previously obtained the trajectory via any molecular dynamics packages. 
    
    Args:
    
        file_name (string): The trajectory .xyz file name.
            step (integer): The desired time to extract its .xyz 
                            coordinates which starts from zero.
    
    Returns:
    
        None
	
    """

    f = open(file_name,'r')
    lines = f.readlines()
    f.close()

    # The number of atoms for each time step in the .xyz file of the trajectory.
    number_of_atoms = int(lines[0].split()[0])

    # Write the coordinates of the 't' th step in file coord-t.xyz
    f = open('coord-%d'%step+'.xyz','w')

    # This is used to skip the first two lines for each time step.
    n = number_of_atoms+2

    # Write the coordinates of the 'step'th time step into the file
    for i in range( n * step, n * ( step + 1 ) ):
        f.write( lines[i] )
    f.close()





[docs]def CP2K_input_static(cp2k_sample_energy_input: str, project_name: str,
                           trajectory_file: str, time_step: int):
    """
    This function provides the inputs required for single point calculations used for 
    calculation of NACs and overlap matrix. This function requires a sample CP2K input
    used for ENERGY calculations and the project name which will used to identify the 
    produced cube files. 
    Args:
        cp2k_sample_energy_input (string): The CP2K sample input file for energy calculations.
	
        project_name (string): The poject name of the CP2K sample input (in &GLOBAL-> PROJECT)
                               This is required for identifying the cube files.
			       
        trajectory_file (string): The .xyz trajectory file obtained from previous 
                                  molecular dynamics calculations
				  
        time_step (integer): The time step of the single point calculation. It starts from 0.
    
    Returns:
    
        None
	
    """
    
    # Reading the .xyz trajectory file and extract the coordinates at time 'time_step'.
    read_trajectory_xyz_file(trajectory_file, time_step)

    # The new project name for each time_step.
    f_new = project_name+'-%d'%time_step+'.inp'

    f = open(cp2k_sample_energy_input,'r')
    lines = f.readlines()
    f.close()

    # wfn_restart_file name
    c = 0


    f = open(f_new,'w')
    for i in range(0,len(lines)):
        # Writing the COORD_FILE_NAME in the input
        if 'COORD_FILE_NAME'.lower() in lines[i].lower().split():
            f.write('      COORD_FILE_NAME')
            f.write(' coord-%d'%time_step+'.xyz')
            f.write('\n')
        # Writing the project name
        elif 'PROJECT'.lower() in lines[i].lower().split():
            f.write('  PROJECT')
            f.write(' %s'%project_name+'-%d'%time_step)
            f.write('\n')
        elif 'PROJECT_NAME'.lower() in lines[i].lower().split():
            f.write('  PROJECT')
            f.write(' %s'%project_name+'-%d'%time_step)
            f.write('\n')
        # Writing the WFN_RESTART_FILE_NAME of the previous time step
        elif 'WFN_RESTART_FILE_NAME'.lower() in lines[i].lower().split() and ".wfn" in lines[i].lower() and "!" not in lines[i].lower():	
            f.write('    WFN_RESTART_FILE_NAME')	
            f.write(' %s'%project_name+'-%d'%(time_step-1)+'-RESTART.wfn')	
            f.write('\n')	
        # Writing the WFN_RESTART_FILE_NAME of the previous time step for the tddft calculation	
        elif 'WFN_RESTART_FILE_NAME'.lower() in lines[i].lower().split() and ".tdwfn" in lines[i].lower() and "!" not in lines[i].lower():	
            f.write('    WFN_RESTART_FILE_NAME')	
            f.write(' %s'%project_name+'-%d'%(time_step-1)+'-RESTART.tdwfn')
            f.write('\n')
        # Setting the scf guess to restart, CP2K will automatically switch to atomic if the wfn file does not exist
        elif 'SCF_GUESS'.lower() in lines[i].lower().split():
            f.write('    SCF_GUESS RESTART')
            f.write('\n')
        else:
            f.write(lines[i])



[docs]def read_energies_from_cp2k_log_file( params ):
    """
    This function reads the energies from CP2K log file for a specific spin.
    
    Args:
    
        params (dictionary):
    
            logfile_name (str): The output file produced by CP2K
        
            time (int): The time step of the molecular dynamics. For single point calculations
                        it should be set to 0.
                    
            min_band (int): The minimum state number.

            max_band (int): The maximum state number.
 
            spin (int): This number can only accept two values: 1 for alpha and 2 for beta spin.
        
    Returns:
    
        E (1D numpy array): The vector consisting of the KS energies from min_band to max_band.
        
        total_energy (float): The total energy obtained from the log file.
        
    """
    
    # Critical parameters
    critical_params = [ "logfile_name", "min_band", "max_band" ]
    # Default parameters
    default_params = { "time":0, "spin": 1}
    # Check input
    comn.check_input(params, default_params, critical_params)
    
    cp2k_log_file_name = params["logfile_name"]
    # Time step, for molecular dynamics it will read the energies
    # of time step 'time', but for static calculations the time is set to 0
    time = params["time"]
    
    # Koh-Sham orbital indicies
    # ks_orbital_indicies = params["ks_orbital_indicies"]
    
    # The minimum state number
    min_band = params["min_band"] # ks_orbital_indicies[0]
    # The maximum state number
    max_band = params["max_band"] # ks_orbital_indicies[-1]
    
    spin = params["spin"]
    
    # First openning the file and reading its lines
    f = open( cp2k_log_file_name, 'r' )
    lines = f.readlines()
    f.close()
    
    # The lines containing 'Eigenvalues of the occupied subspace'
    lines_with_occupied   = []
    # The lines containing 'Eigenvalues of the unoccupied subspace'
    lines_with_unoccupied = []
    
    # The lines containing the energies of the occupied states
    occ_energies_last_line   = []
    # The lines containing the energies of the unoccupied states
    unocc_energies_last_line = []
    # Set the total energy to zero
    total_energy = 0.0

    for i in range(0,len(lines)):
        # Find the occupied states lines
        if 'Eigenvalues of the occupied subspace'.lower() in lines[i].lower():
        
            if  str( spin ) in lines[i].lower().split():
                lines_with_occupied.append( i )
                for j in range( i, len( lines ) ):
                    # Find the last line number containing the energies for occupied states
                    if len( lines[j].split() ) == 0:
                        occ_energies_last_line.append( j )
                        break
        # Find the unoccupied states lines
        if 'Eigenvalues of the unoccupied subspace'.lower() in lines[i].lower():
            if  str( spin ) in lines[i].lower().split():
                lines_with_unoccupied.append(i)
                for j in range( i, len( lines ) ):
                    # Find the last line number containing the energies for unoccupied states
                    if len( lines[j].split() ) == 0:
                        unocc_energies_last_line.append( j )
                        break
       
        if "Total energy:" in lines[i]:  
            total_energy = float( lines[i].split()[2] )


    # The Kohn-Sham energies
    ks_energies = []
    # Start after two lines of the 'Eigenvalues of the occupied subspace'
    for i in range( lines_with_occupied[time] + 2, occ_energies_last_line[time] - 1 ):
        for j in range(0,len(lines[i].split())):
            ks_energies.append(float(lines[i].split()[j]))
    
    # Start after two lines of the 'Eigenvalues of the unoccupied subspace'
    for i in range( lines_with_unoccupied[time] + 2, unocc_energies_last_line[time] ):
        if not 'Reached'.lower() in lines[i].lower().split():
            for j in range(0,len(lines[i].split())):
                ks_energies.append(float(lines[i].split()[j]))

    # Now appending all the energies into a numpy array
    ks_energies = np.array(ks_energies)
        
    # Returning the energeis from min_band to max_band
    return ks_energies[min_band-1:max_band], total_energy
  

[docs]def read_energies_from_cp2k_md_log_file( params ):
    """
    This function reads the energies from CP2K molecular dynamics log file for a specific spin. The difference between this function
    and the function `read_energies_from_cp2k_log_file` is that it is more efficient and faster for molecular dynamics energy levels.
    
    Args:
    
        params (dictionary):
    
            logfile_name (str): The output file produced by CP2K
        
            init_time (int): The initial time step of the molecular dynamics. 
                        
            final_time (int): The final time step of the molecular dynamics. 
                    
            min_band (int): The minimum state number.

            max_band (int): The maximum state number.
 
            spin (int): This number can only accept two values: 1 for alpha and 2 for beta spin.
        
    Returns:
    
        E (1D numpy array): The vector consisting of the KS energies from min_band to max_band.
        
        total_energy (float): The total energy obtained from the log file.
        
    """
    
    # Critical parameters
    critical_params = [ "logfile_name", "min_band", "max_band" ]
    # Default parameters
    default_params = { "spin": 1, "init_time": 0, "final_time": 1}
    # Check input
    comn.check_input(params, default_params, critical_params)
    
    cp2k_log_file_name = params["logfile_name"]
    # Time step, for molecular dynamics it will read the energies
    # of time step 'time', but for static calculations the time is set to 0
    init_time = params["init_time"]
    final_time = params["final_time"]
    
    # Koh-Sham orbital indicies
    # ks_orbital_indicies = params["ks_orbital_indicies"]
    
    # The minimum state number
    min_band = params["min_band"] # ks_orbital_indicies[0]
    # The maximum state number
    max_band = params["max_band"] # ks_orbital_indicies[-1]
    
    spin = params["spin"]
    
    # First openning the file and reading its lines
    f = open( cp2k_log_file_name, 'r' )
    lines = f.readlines()
    f.close()
    
    # The lines containing 'Eigenvalues of the occupied subspace'
    lines_with_occupied   = []
    # The lines containing 'Eigenvalues of the unoccupied subspace'
    lines_with_unoccupied = []
    
    # The lines containing the energies of the occupied states
    occ_energies_last_line   = []
    # The lines containing the energies of the unoccupied states
    unocc_energies_last_line = []
    # Set the total energy to zero
    total_energy = 0.0

    
    for i in range(0,len(lines)):
        # Find the occupied states lines
        if 'Eigenvalues of the occupied subspace'.lower() in lines[i].lower():
        
            if  str( spin ) in lines[i].lower().split():
                lines_with_occupied.append( i )
                for j in range( i, len( lines ) ):
                    # Find the last line number containing the energies for occupied states
                    if len( lines[j].split() ) == 0:
                        occ_energies_last_line.append( j )
                        break
        # Find the unoccupied states lines
        if 'Eigenvalues of the unoccupied subspace'.lower() in lines[i].lower():
            if  str( spin ) in lines[i].lower().split():
                lines_with_unoccupied.append(i)
                for j in range( i, len( lines ) ):
                    # Find the last line number containing the energies for unoccupied states
                    if len( lines[j].split() ) == 0:
                        unocc_energies_last_line.append( j )
                        break
       
        if "Total energy:" in lines[i]:  
            total_energy = float( lines[i].split()[2] )

    # All KS energies
    KS_energies = []
    for time in range(init_time,final_time):
        # The Kohn-Sham energies
        ks_energies = []
        # Start after two lines of the 'Eigenvalues of the occupied subspace'
        for i in range( lines_with_occupied[time] + 2, occ_energies_last_line[time] - 1 ):
            for j in range(0,len(lines[i].split())):
                ks_energies.append(float(lines[i].split()[j]))
    
        # Start after two lines of the 'Eigenvalues of the unoccupied subspace'
        for i in range( lines_with_unoccupied[time] + 2, unocc_energies_last_line[time] ):
            if not 'Reached'.lower() in lines[i].lower().split():
                for j in range(0,len(lines[i].split())):
                    ks_energies.append(float(lines[i].split()[j]))

        # Now appending all the energies into a numpy array
        ks_energies = np.array(ks_energies)
        KS_energies.append(ks_energies[min_band-1:max_band])
        
    # Returning the energeis from min_band to max_band
    return KS_energies, total_energy