python の正規表現で文字列から年月日を抽出

長い文字列から、「2021/11/12」などの年月日文字列を抽出します。


import re

# Sample path that embeds the date as three consecutive directory levels.
s = '/var/www/html/DICOM/2021/11/12/64D94C0D/A8BA92DA/5B04B6B8'
# Raw string: in a plain literal "\d" is an invalid escape sequence and
# triggers a warning on recent Python versions.
pattern = r'\d{4}/\d{2}/\d{2}'
# re.search scans the whole string and returns the first match.
res = re.search(pattern, s)
print(res.group())

結果は


2021/11/12

DICOM ファイルの保存先のパスを作成します。


import pydicom
import os
import re

class Dicom:
    """Collect DICOM files under a directory and build one info row per study."""

    def getDicomArr(self, sd):
        """Return the full path of every file found beneath directory ``sd``.

        The tree is walked recursively with ``os.walk``, so files in nested
        sub-directories are included.
        """
        return [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(sd)
            for name in names
        ]

    @classmethod
    def getBdir(cls, ef):
        """Return the first ``YYYY/MM/DD`` substring found in the path ``ef``.

        Month and day must each be exactly two digits. When ``ef`` contains
        no such substring, ``re.search`` returns ``None`` and the ``.group()``
        call raises ``AttributeError``.
        """
        # Raw string: in a plain literal "\d" is an invalid escape sequence.
        pattern = r'\d{4}/\d{2}/\d{2}'
        res = re.search(pattern, ef)
        return res.group()

    def getDicomInfo(self, dcmArr):
        """Read each file in ``dcmArr`` and return one info row per study ID.

        Only the first file seen for a given study ID contributes a row.
        Each row is a list in this order: study ID, study date, study time,
        modality, study description ('' when the tag is absent), patient
        name (with '^' replaced by a space), chart number, birthday, sex,
        age, institution, and the relative path
        ``<YYYY/MM/DD>/<chart number>/<study ID>``.
        """
        dcmInfoArr = []
        # A set gives constant-time "already seen" checks.
        seen_study_ids = set()
        for eachFile in dcmArr:
            # dcmread replaces read_file, which was removed in pydicom 3.0.
            ds = pydicom.dcmread(eachFile)
            studyID = ds[0x0020, 0x0010].value
            if studyID in seen_study_ids:
                continue
            seen_study_ids.add(studyID)

            studyDate = ds[0x0008, 0x0020].value
            studyTime = ds[0x0008, 0x0030].value
            modality = ds[0x0008, 0x0060].value
            # The study description is treated as optional; a missing tag
            # surfaces as KeyError from the dataset lookup.
            try:
                studyDscr = ds[0x0008, 0x1030].value
            except KeyError:
                studyDscr = ''
            ptName = str(ds[0x0010, 0x0010].value).replace('^', ' ')
            karteNo = ds[0x0010, 0x0020].value
            sex = ds[0x0010, 0x0040].value
            birthday = ds[0x0010, 0x0030].value
            age = ds[0x0010, 0x1010].value
            institution = ds[0x0008, 0x0080].value

            # Destination path: date directory taken from the source file
            # path, then chart number, then study ID.
            bdir = Dicom.getBdir(eachFile)
            path = '/'.join((bdir, karteNo, studyID))

            dcmInfoArr.append([
                studyID, studyDate, studyTime, modality, studyDscr, ptName,
                karteNo, birthday, sex, age, institution, path,
            ])
        return dcmInfoArr

if __name__ == "__main__":
    # Demo run: list every file under one day's directory, then print the
    # per-study rows built from those files.
    source_dir = '/var/www/html/DICOM/2021/11/12'
    dcm = Dicom()
    dcmArr = dcm.getDicomArr( source_dir )
    infos = dcm.getDicomInfo( dcmArr )
    print( infos )

infos には以下のような情報が入っています。(氏名と病院名は表示していません)


['9482', '20200129', '152540', 'CT', 'Abdomen', '21902405', '19870506', 'F', '032Y', '2021/11/12/21902405/9482']
['9464', '20200125', '142126', 'CT', 'Abdomen', '22000187', '19870910', 'M', '032Y', '2021/11/12/22000187/9464']
['9417', '20200110', '132943', 'CT', 'Chest', '21601458', '19331224', 'F', '086Y', '2021/11/12/21601458/9417']
['9463', '20200125', '094512', 'CT', 'Chest', '21800186', '19530405', 'M', '066Y', '2021/11/12/21800186/9463']
['9419', '20200110', '153217', 'CT', 'Chest', '21502808', '19411116', 'F', '078Y', '2021/11/12/21502808/9419']
['9468', '20200127', '095618', 'CT', 'Abdomen', '22000191', '19690206', 'M', '050Y', '2021/11/12/22000191/9468']
['9431', '20200115', '104327', 'CT', 'Abdomen', '21800138', '19950118', 'M', '024Y', '2021/11/12/21800138/9431']
['996', '20200123', '134853', 'RF', '', '20703388', '19531001', 'M', '066Y', '2021/11/12/20703388/996']

クラスメソッド

クラスメソッドにすると、インスタンスを作らなくても「クラス名.メソッド名()」の形(例: Dicom.getBdir(path))で呼び出せるので便利です。


    @classmethod 
    def getBdir(self, ef ):
        pattern = '\d{4}/\d{2}/\d{2}'
        res = re.search(pattern, ef)
        return res.group()

正規表現の訂正

年月日には「2021/11/6」のように月や日が1桁のものもあります。その場合は上の正規表現にマッチせず、re.search が None を返すため、res.group() の呼び出しでエラー(AttributeError)になります。
月と日は数字が1桁もしくは2桁なので、正規表現を次のように修正します。


# Month and day accept one or two digits; the raw string keeps "\d" from
# being parsed as an (invalid) string escape sequence.
pattern = r'\d{4}/\d{1,2}/\d{1,2}'