/******************************************************************************/ /* */ /* X r d C m s M a n a g e r . c c */ /* */ /* (c) 2007 by the Board of Trustees of the Leland Stanford, Jr., University */ /* All Rights Reserved */ /* Produced by Andrew Hanushevsky for Stanford University under contract */ /* DE-AC02-76-SFO0515 with the Department of Energy */ /* */ /* This file is part of the XRootD software suite. */ /* */ /* XRootD is free software: you can redistribute it and/or modify it under */ /* the terms of the GNU Lesser General Public License as published by the */ /* Free Software Foundation, either version 3 of the License, or (at your */ /* option) any later version. */ /* */ /* XRootD is distributed in the hope that it will be useful, but WITHOUT */ /* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or */ /* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public */ /* License for more details. */ /* */ /* You should have received a copy of the GNU Lesser General Public License */ /* along with XRootD in a file called COPYING.LESSER (LGPL license) and file */ /* COPYING (GPL license). If not, see <http://www.gnu.org/licenses/>. */ /* */ /* The copyright holder's institutional names and contributor's names may not */ /* be used to endorse or promote products derived from this software without */ /* specific prior written permission of the institution or contributor. 
*/ /******************************************************************************/ /* NOTE(review): the names of the five system headers that belong to the bare #include directives below are missing from this copy; restore them from the upstream file. */ #include #include #include #include #include #include "Xrd/XrdScheduler.hh" #include "XrdCms/XrdCmsConfig.hh" #include "XrdCms/XrdCmsManager.hh" #include "XrdCms/XrdCmsManTree.hh" #include "XrdCms/XrdCmsNode.hh" #include "XrdCms/XrdCmsProtocol.hh" #include "XrdCms/XrdCmsRouting.hh" #include "XrdCms/XrdCmsUtils.hh" #include "XrdCms/XrdCmsTrace.hh" #include "XrdNet/XrdNetAddr.hh" #include "XrdOuc/XrdOucTList.hh" #include "XrdOuc/XrdOucTokenizer.hh" #include "XrdSys/XrdSysError.hh" #include "XrdSys/XrdSysTimer.hh" /******************************************************************************/ /* G l o b a l O b j e c t s */ /******************************************************************************/ namespace XrdCms { extern XrdSysError Say; extern XrdOucTrace Trace; } using namespace XrdCms; /******************************************************************************/ /* S t a t i c M e m b e r s */ /******************************************************************************/ /* Manager table shared by all XrdCmsManager objects and accessed under MTMutex: MastTab[i] is the node in slot i (0 when free), MastSID[i] is the site number stored with that slot, and MTHi is the highest slot index in use (-1 while the table is empty). */ XrdSysMutex XrdCmsManager::MTMutex; XrdCmsNode *XrdCmsManager::MastTab[MTMax] = {0}; char XrdCmsManager::MastSID[MTMax] = {0}; int XrdCmsManager::MTHi = -1; /******************************************************************************/ /* L o c a l C l a s s e s */ /******************************************************************************/ /* Job that disposes of a node: the constructor hands the job to Sched, and DoIt() calls nodeP->Delete() with the table mutex and then deletes the job object itself. */ class XrdCmsDelNode : XrdJob { public: void DoIt() {nodeP->Delete(XrdCmsManager::MTMutex); delete this; } XrdCmsDelNode(XrdCmsNode *nP) : XrdJob("delete node"), nodeP(nP) {Sched->Schedule((XrdJob *)this);} ~XrdCmsDelNode() {} XrdCmsNode *nodeP; }; /******************************************************************************/ /* C o n s t r u c t o r */ /******************************************************************************/ /* Records mlP as the current manager list and snum as the site number; every other member starts out as zero or false. */ XrdCmsManager::XrdCmsManager(XrdOucTList *mlP, int snum) { myMans = 0; ManTree = 0; curManCnt = 0; curManList= mlP; newManList= 
0; theSite = 0; theHost = 0; theSID = 0; siteID = snum; wasRedir = false; } /******************************************************************************/ /* A d d */ /******************************************************************************/ /* Registers the manager connection lp in the first free MastTab slot, tags the slot with this object's siteID and marks the new node as bound and connected. Returns 0 with xit set to true when a new manager list is pending (newManList is set), and 0 with xit set to false when all MTMax slots are taken. */ XrdCmsNode *XrdCmsManager::Add(XrdLink *lp, int Lvl, bool &xit) { EPNAME("Add") XrdCmsNode *nP; int i; // Check if there is a pending reconfiguration. If so, return no node but // tell the caller to finish so we can proceed with the reconfiguration // MTMutex.Lock(); lp->setID("manager",0); if (newManList) {MTMutex.UnLock(); xit = true; return 0;} xit = false; // Find available ID for this node // for (i = 0; i < MTMax; i++) if (!MastTab[i]) break; // Check if we have too many here // if (i >= MTMax) {MTMutex.UnLock(); Say.Emsg("Manager", "Login to", lp->Name(), "failed; too many managers"); return 0; } // Obtain a new node object // /* NOTE(review): unlike the two early returns above, this failure branch returns without calling MTMutex.UnLock(). */ if (!(nP = new XrdCmsNode(lp, 0, 0, 0, Lvl, i))) {Say.Emsg("Manager", "Unable to obtain node object."); return 0;} // Assign new manager // MastTab[i] = nP; MastSID[i] = siteID; if (i > MTHi) MTHi = i; nP->isOffline = 0; nP->isNoStage = 0; nP->isBad = 0; nP->isBound = 1; nP->isConn = 1; nP->isMan = (Config.asManager() ? 
1 : 0); nP->setManager(this); MTMutex.UnLock(); // Document login // /* NOTE(review): text is missing from this copy at this point. The remainder of the DEBUG statement, the end of Add() and the beginning of the next function are absent; what follows is the tail of that next function, which (once its guard condition passes) clears this site's table slots, installs newManList as the current list and calls Run() on it. Restore the lost text from the upstream file. */ DEBUG(nP->Name() <<" to manager config; id=" < 0 || !newManList) {MTMutex.UnLock(); return;} // Remove all vestigial information // for (int i = 0; i <= MTHi; i++) {if (MastSID[i] == siteID) {MastTab[i] = 0; MastSID[i] = 0;}} // Readjust the high water mark // while(MTHi >= 0 && !MastTab[MTHi]) MTHi--; // Delete the current manager list, it is safe to do so // while((mP = curManList)) {curManList = curManList->next; delete mP;} curManList = newManList; newManList = 0; // Run the new manager setup // Say.Say("Config ","Manager subsystem reconfiguration completed; restarting."); Run(curManList); // All done // MTMutex.UnLock(); } /******************************************************************************/ /* I n f o r m */ /******************************************************************************/ /* Sends Data (Dlen bytes) to every node in the manager table that is not offline. For each such node the node is locked, the table mutex is released for the duration of the send, and the table mutex is re-acquired before the scan continues. NOTE(review): part of the DEBUG statement and the start of the Send call are missing from this copy. */ void XrdCmsManager::Inform(const char *What, const char *Data, int Dlen) { EPNAME("Inform"); XrdCmsNode *nP; int i; // Obtain a lock on the table // MTMutex.Lock(); // Run through the table looking for managers to send messages to // for (i = 0; i <= MTHi; i++) {if ((nP=MastTab[i]) && !nP->isOffline) {nP->Lock(true); MTMutex.UnLock(); DEBUG(nP->Name() <<" " <Send(Data, Dlen); nP->UnLock(); MTMutex.Lock(); } } MTMutex.UnLock(); } /******************************************************************************/ /* Same table scan as the overload above, but the message is the iovec array vP with vN elements; vT is passed through to Send() unchanged. NOTE(review): part of the DEBUG statement and the start of the Send call are missing from this copy. */ void XrdCmsManager::Inform(const char *What, struct iovec *vP, int vN, int vT) { EPNAME("Inform"); int i; XrdCmsNode *nP; // Obtain a lock on the table // MTMutex.Lock(); // Run through the table looking for managers to send messages to // for (i = 0; i <= MTHi; i++) {if ((nP=MastTab[i]) && !nP->isOffline) {nP->Lock(true); MTMutex.UnLock(); DEBUG(nP->Name() <<" " <Send(vP, vN, vT); nP->UnLock(); MTMutex.Lock(); } } MTMutex.UnLock(); } /******************************************************************************/ /* Builds a CmsRRHdr from rCode and rMod with the data length set to Alen in network byte order, then sends the header followed by Arg (the header alone when Arg is null) through the iovec overload. The total length passed on is Alen plus the header size. NOTE(review): the template argument of static_cast is missing from this copy. */ void XrdCmsManager::Inform(XrdCms::CmsReqCode rCode, int rMod, const char *Arg, 
int Alen) { CmsRRHdr Hdr = {0, (kXR_char)rCode, (kXR_char)rMod, htons(static_cast(Alen))}; struct iovec ioV[2] = {{(char *)&Hdr, sizeof(Hdr)}, {(char *)Arg, (size_t)Alen}}; Inform(Router.getName((int)rCode), ioV, (Arg ? 2 : 1), Alen+sizeof(Hdr)); } /******************************************************************************/ /* Overwrites Hdr.datalen with Alen in network byte order and sends Hdr followed by Arg (Hdr alone when Arg is null) through the iovec overload. Note that the caller's Hdr is modified. NOTE(review): the template argument of static_cast is missing from this copy. */ void XrdCmsManager::Inform(CmsRRHdr &Hdr, const char *Arg, int Alen) { struct iovec ioV[2] = {{(char *)&Hdr, sizeof(Hdr)}, {(char *)Arg, (size_t)Alen}}; Hdr.datalen = htons(static_cast(Alen)); Inform(Router.getName(Hdr.rrCode), ioV, (Arg ? 2 : 1), Alen+sizeof(Hdr)); } /******************************************************************************/ /* R e m o v e */ /******************************************************************************/ /* Checks, under MTMutex, that nP is still the node stored in the slot returned by nP->ID(); if it is not, the call returns after unlocking. The visible remainder marks a node offline, lowers MTHi past empty slots and, when reason is non-null, reports the removal through Say.Emsg. NOTE(review): several statements in the middle of this body (between the two DEBUG fragments and before the MTHi loop) are missing from this copy; presumably they clear the table slot - confirm against the upstream file. */ void XrdCmsManager::Remove(XrdCmsNode *nP, const char *reason) { EPNAME("Remove") int sinst, sent = nP->ID(sinst); // Obtain a lock on the servtab // MTMutex.Lock(); // Make sure this node is the right one // if (!(nP == MastTab[sent])) {MTMutex.UnLock(); DEBUG("manager " <isOffline = 1; DEBUG("completed " <Name() <<" manager " <= 0 && !MastTab[MTHi]) MTHi--; MTMutex.UnLock(); // Document removal // . 
if (reason) Say.Emsg("Manager", nP->Ident, "removed;", reason); } /******************************************************************************/ /* R e r u n */ /******************************************************************************/ /* Builds a pending manager list (newManList) from the tokens in newMans and sets wasRedir. Tokens that manAddr.Set() rejects, duplicates, and anything beyond MTMax accepted entries are reported and skipped. When at least one entry is accepted, the current ManTree is aborted and every table node belonging to this site is flagged isBlisted and isDoomed and sent a disconnect request. Returns without doing anything else when a new list is already pending. The whole routine runs with MTMutex held. */ void XrdCmsManager::Rerun(char *newMans) { static CmsDiscRequest discRequest = {{0, kYR_disc, 0, 0}}; XrdOucTList *tP; const char *eText; char *hP; int newManCnt = 0; // Lock ourselves // MTMutex.Lock(); wasRedir = true; // If we already have a pending new sequence, then just return // if (newManList) {MTMutex.UnLock(); return;} // Indicate that we will be re-initializing // Say.Say("Config ", "Manager subsystem reconfiguring using ", newMans); // Process the new man list // XrdNetAddr manAddr; XrdOucTokenizer mList((char *)newMans); hP = mList.GetLine(); // Add each manager in the list. These have already been expanded and are // guaranteed to contain a port number as the list is provided by the cmsd. // However, we will check for duplicates and ignore any overage. // /* NOTE(review): the duplicate test compares the raw token hP with the text of entries that were stored as manAddr.Name(); confirm the two have the same form, otherwise duplicates are never detected. */ while((hP = mList.GetToken())) {if ((eText = manAddr.Set(hP))) {Say.Emsg("Config","Ignoring manager", hP, eText); continue;} tP = newManList; while(tP && strcmp(hP, tP->text)) tP = tP->next; if (tP) {Say.Emsg("Config","Ignoring duplicate manager", hP); continue; } if (newManCnt >=MTMax) {Say.Emsg("Config","Ignoring manager", hP, "and remaining entries; limit exceeded!"); break; } newManList = new XrdOucTList(manAddr.Name(),manAddr.Port(),newManList); newManCnt++; } // If we have managers then tell the cluster builder to abort as we will // be restarting this whole process (we don't want any hung nodes here). // /* NOTE(review): ManTree is dereferenced without a null check; it is 0 after construction and only created in Run() - presumably Run() has always succeeded before this is reached, TODO confirm. */ if (newManCnt) ManTree->Abort(); // Now run through the node table and doom all current site connections as we // need to reinitialize the whole manager subsystem. Note that none of these // objects can escape without us removing them from the table. 
// if (newManCnt) {for (int i = 0; i <= MTHi; i++) if (MastTab[i] && (MastSID[i] == siteID)) {MastTab[i]->isBad |= XrdCmsNode::isBlisted|XrdCmsNode::isDoomed; MastTab[i]->Send((char *)&discRequest, sizeof(discRequest)); } } // We are done // MTMutex.UnLock(); } /******************************************************************************/ /* R e s e t */ /******************************************************************************/ /* Sends a status request carrying the kYR_Reset modifier to every table node that is not offline and has isKnown set; isKnown is cleared before the send. As in Inform(), the table mutex is released while each node is locked and sent to. NOTE(review): part of the DEBUG statement is missing from this copy. */ void XrdCmsManager::Reset() { EPNAME("Reset"); static CmsStatusRequest myState = {{0, kYR_status, CmsStatusRequest::kYR_Reset, 0}}; static const int szReqst = sizeof(CmsStatusRequest); XrdCmsNode *nP; int i; // Obtain a lock on the table // MTMutex.Lock(); // Run through the table looking for managers to send a reset request // for (i = 0; i <= MTHi; i++) {if ((nP=MastTab[i]) && !nP->isOffline && nP->isKnown) {nP->Lock(true); nP->isKnown = 0; MTMutex.UnLock(); DEBUG("sent to " <Name()); nP->Send((char *)&myState, szReqst); nP->UnLock(); MTMutex.Lock(); } } MTMutex.UnLock(); } /******************************************************************************/ /* Private: R u n */ /******************************************************************************/ /* Allocates one XrdCmsProtocol job for each entry of manL, skipping (with a message) any entry whose host equals Config.myName and whose port equals Config.PortTCP. If at least one job results, myMans and ManTree are recreated, theSID and theSite are released, and the jobs are handed to Sched. Returns the number of jobs scheduled; 0 when manL is null or every entry was skipped, in which case nothing is recreated or scheduled. */ int XrdCmsManager::Run(XrdOucTList *manL) { XrdOucTList *tP = manL; XrdJob *jP, *jFirst = 0, *jLast = 0; // This method is either called during initial start-up or if we were wholly // redirected elsewhere due to a blacklist. In the latter case, the caller // must have obtained all the required locks // curManCnt = 0; if (!manL) return 0; // Prime the manager subsystem. We check here to make sure we will not be // trying to connect to ourselves. This is possible if the manager and meta- // manager were defined to be the same and we are a manager. We would have // liked to screen this out earlier but port discovery prevents it. 
// /* NOTE(review): the sprintf below writes host:port into a 512-byte buffer with no length limit; presumably host names are always short enough - verify or switch to snprintf. */ while(tP) {if (strcmp(tP->text, Config.myName) || tP->val != Config.PortTCP) {jP = (XrdJob *)XrdCmsProtocol::Alloc(Config.myRole, this, tP->text, tP->val); if (!jFirst) jFirst = jLast = jP; else {jLast->NextJob = jP; jLast = jP;} curManCnt++; } else { char buff[512]; sprintf(buff, "%s:%d", tP->text, tP->val); Say.Emsg("Config", "Circular connection to", buff, "ignored."); } tP = tP->next; } // Make sure we have something to start up // if (!curManCnt) {Say.Emsg("Config","No managers can be started; we are now unreachable!"); return 0; } // We now know there is no pandering going on, so we need to initialize the // tree management subsystem to get it into a fresh state. // /* NOTE(review): theSID and theSite are freed here but theHost is not, although Verify() assigns all three together. */ if (myMans) delete myMans; myMans = new XrdCmsManList; if (ManTree) delete ManTree; ManTree = new XrdCmsManTree(curManCnt); if (theSID) {free(theSID); theSID = 0;} if (theSite) {free(theSite); theSite = 0;} // Now start up all of the threads // if (jFirst == jLast) Sched->Schedule(jFirst); else Sched->Schedule(curManCnt, jFirst, jLast); // All done // return curManCnt; } /******************************************************************************/ /* S t a r t */ /******************************************************************************/ /* Splits the manager list mL into one list per site (the site index is taken from ival[1] of each entry), creates an XrdCmsManager for every site that has entries and calls Run() on it. Returns true when mL is null or when the number of managers started equals the number of entries in mL; entries rejected for an invalid site ID still count toward that total, so they make the result false. */ bool XrdCmsManager::Start(const XrdOucTList *mL) { XrdOucTList *manVec[MTMax] = {0}; XrdCmsManager *manP; char buff[1024]; int n, sid, snum = 0, mtot = 0, mnum = 0, xnum = 0; // If there is no manager list then we must not be connecting to anyone // if (!mL) return true; // Segregate the manager list by site and run them that way. Unfortunately, // that means we have to copy the TList. This is ok as this happens once. 
// /* NOTE(review): sid is checked only against the upper bound MTMax; presumably ival[1] is never negative - verify, since a negative value would index manVec out of bounds. */ while(mL) {sid = mL->ival[1]; mtot++; if (sid >= MTMax) {sprintf(buff, "%d", sid); Say.Say("Config ", "Invalid site ID ", buff, " for ", mL->text); } else { manVec[sid] = new XrdOucTList(mL->text, mL->val, manVec[sid]); mnum++; } mL = mL->next; } // Count how many sites we have // for (n = 0; n < MTMax; n++) if (manVec[n]) snum++; // Indicate what we are about to do // snprintf(buff, sizeof(buff),"%d manager%s and %d site%s.", mnum, (mnum != 1 ? "s":""), snum, (snum != 1 ? "s":"")); Say.Say("Config Connecting to ", buff); // Now run each one // for (n = 0; n < MTMax; n++) {if (manVec[n]) {manP = new XrdCmsManager(manVec[n], n); xnum += manP->Run(manVec[n]); } } // Check if we should issue a warning // if (xnum < mtot) {snprintf(buff, sizeof(buff), "%d of %d", xnum, mtot); Say.Say("Config Warning! Only ", buff, " manager(s) will be contacted!"); } // All done // return xnum == mtot; } /******************************************************************************/ /* V e r i f y */ /******************************************************************************/ /* Compares the cluster id presented by a manager with the one recorded earlier. The id is the part of sid after the first blank, or all of sid when it contains no blank. When no id is recorded yet (theSID is null) the id, the link host and the site name (the literal anonymous when sname is null) are stored and true is returned. Otherwise true is returned only when the id equals the recorded one; on a mismatch a configuration error naming the recorded host and site is logged and false is returned. hMutex presumably keeps MTMutex held until the function returns - confirm against XrdSysMutexHelper. NOTE(review): theHost is assigned a fresh strdup() without the previous value being freed here or in Run(), so a re-run followed by Verify() appears to leak the old host string. */ bool XrdCmsManager::Verify(XrdLink *lP, const char *sid, const char *sname) { XrdSysMutexHelper hMutex(MTMutex); const char *sidP; // Trim off the type of service in the sid // if ((sidP = index(sid, ' '))) sidP++; else sidP = sid; // If we have no sid, just record it // if (!theSID) {theSID = strdup(sidP); if (theSite) free(theSite); theHost = strdup(lP->Host()); theSite = (sname ? strdup(sname) : strdup("anonymous")); return true; } // Make sure we are connecting to the same cluster as before // if (!strcmp(theSID, sidP)) return true; // Compute the offending site configuration // char mBuff[1024]; snprintf(mBuff,sizeof(mBuff),"%s for site %s; " "making file location unpredictable!", theHost, (wasRedir ? theSite : XrdCmsUtils::SiteName(siteID))); // There seems to be a configuration error here // Say.Emsg("Manager", lP->Host(), "manager configuration differs from", mBuff); return false; }